Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 16 Jul 2019 03:38:15 +0000 (20:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 16 Jul 2019 03:38:15 +0000 (20:38 -0700)
Pull rdma updates from Jason Gunthorpe:
 "A smaller cycle this time. Notably we see another new driver, 'Soft
  iWarp', and the deletion of an ancient unused driver for nes.

   - Revise and simplify the signature offload RDMA MR APIs

   - More progress on hoisting object allocation boiler plate code out
     of the drivers

   - Driver bug fixes and revisions for hns, hfi1, efa, cxgb4, qib,
     i40iw

   - Tree wide cleanups: struct_size, put_user_page, xarray, rst doc
     conversion

   - Removal of obsolete ib_ucm chardev and nes driver

   - netlink based discovery of chardevs and autoloading of the modules
     providing them

   - Move more of the rdmavt/hfi1 uapi to include/uapi/rdma

   - New driver 'siw' for software based iWarp running on top of netdev,
     much like rxe's software RoCE.

   - mlx5 feature to report events in their raw devx format to userspace

   - Expose per-object counters through rdma tool

   - Adaptive interrupt moderation for RDMA (DIM), sharing the DIM core
     from netdev"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (194 commits)
  RDMA/siw: Require a 64 bit arch
  RDMA/siw: Mark expected switch fall-throughs
  RDMA/core: Fix -Wunused-const-variable warnings
  rdma/siw: Remove set but not used variable 's'
  rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS
  RDMA/siw: Add missing rtnl_lock around access to ifa
  rdma/siw: Use proper enumerated type in map_cqe_status
  RDMA/siw: Remove unnecessary kthread create/destroy printouts
  IB/rdmavt: Fix variable shadowing issue in rvt_create_cq
  RDMA/core: Fix race when resolving IP address
  RDMA/core: Make rdma_counter.h compile stand alone
  IB/core: Work on the caller socket net namespace in nldev_newlink()
  RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM
  RDMA/mlx5: Set RDMA DIM to be enabled by default
  RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink
  RDMA/core: Provide RDMA DIM support for ULPs
  linux/dim: Implement RDMA adaptive moderation (DIM)
  IB/mlx5: Report correctly tag matching rendezvous capability
  docs: infiniband: add it to the driver-api bookset
  IB/mlx5: Implement VHCA tunnel mechanism in DEVX
  ...

228 files changed:
Documentation/ABI/stable/sysfs-class-infiniband
Documentation/index.rst
Documentation/infiniband/core_locking.rst [new file with mode: 0644]
Documentation/infiniband/core_locking.txt [deleted file]
Documentation/infiniband/index.rst [new file with mode: 0644]
Documentation/infiniband/ipoib.rst [new file with mode: 0644]
Documentation/infiniband/ipoib.txt [deleted file]
Documentation/infiniband/opa_vnic.rst [new file with mode: 0644]
Documentation/infiniband/opa_vnic.txt [deleted file]
Documentation/infiniband/sysfs.rst [new file with mode: 0644]
Documentation/infiniband/sysfs.txt [deleted file]
Documentation/infiniband/tag_matching.rst [new file with mode: 0644]
Documentation/infiniband/tag_matching.txt [deleted file]
Documentation/infiniband/user_mad.rst [new file with mode: 0644]
Documentation/infiniband/user_mad.txt [deleted file]
Documentation/infiniband/user_verbs.rst [new file with mode: 0644]
Documentation/infiniband/user_verbs.txt [deleted file]
MAINTAINERS
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/counters.c [new file with mode: 0644]
drivers/infiniband/core/cq.c
drivers/infiniband/core/device.c
drivers/infiniband/core/mr_pool.c
drivers/infiniband/core/nldev.c
drivers/infiniband/core/restrack.c
drivers/infiniband/core/restrack.h
drivers/infiniband/core/rw.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucm.c [deleted file]
drivers/infiniband/core/ucma.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/uverbs_std_types_cq.c
drivers/infiniband/core/uverbs_std_types_mr.c
drivers/infiniband/core/uverbs_uapi.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/bnxt_re/ib_verbs.c
drivers/infiniband/hw/bnxt_re/ib_verbs.h
drivers/infiniband/hw/bnxt_re/main.c
drivers/infiniband/hw/cxgb3/cxio_hal.c
drivers/infiniband/hw/cxgb3/cxio_hal.h
drivers/infiniband/hw/cxgb3/iwch_cm.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/cq.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/cxgb4/resource.c
drivers/infiniband/hw/efa/efa.h
drivers/infiniband/hw/efa/efa_com.c
drivers/infiniband/hw/efa/efa_com.h
drivers/infiniband/hw/efa/efa_com_cmd.c
drivers/infiniband/hw/efa/efa_main.c
drivers/infiniband/hw/efa/efa_verbs.c
drivers/infiniband/hw/hfi1/Makefile
drivers/infiniband/hw/hfi1/aspm.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/aspm.h
drivers/infiniband/hw/hfi1/debugfs.c
drivers/infiniband/hw/hfi1/mad.c
drivers/infiniband/hw/hfi1/pcie.c
drivers/infiniband/hw/hfi1/pio.c
drivers/infiniband/hw/hfi1/qp.c
drivers/infiniband/hw/hfi1/rc.c
drivers/infiniband/hw/hfi1/tid_rdma.c
drivers/infiniband/hw/hfi1/trace_ibhdrs.h
drivers/infiniband/hw/hfi1/uc.c
drivers/infiniband/hw/hfi1/ud.c
drivers/infiniband/hw/hfi1/user_pages.c
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hns/Kconfig
drivers/infiniband/hw/hns/Makefile
drivers/infiniband/hw/hns/hns_roce_alloc.c
drivers/infiniband/hw/hns/hns_roce_cmd.c
drivers/infiniband/hw/hns/hns_roce_cq.c
drivers/infiniband/hw/hns/hns_roce_db.c
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hem.c
drivers/infiniband/hw/hns/hns_roce_hem.h
drivers/infiniband/hw/hns/hns_roce_hw_v1.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.h
drivers/infiniband/hw/hns/hns_roce_main.c
drivers/infiniband/hw/hns/hns_roce_mr.c
drivers/infiniband/hw/hns/hns_roce_pd.c
drivers/infiniband/hw/hns/hns_roce_qp.c
drivers/infiniband/hw/hns/hns_roce_srq.c
drivers/infiniband/hw/i40iw/i40iw_cm.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/srq.c
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/mad.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mem.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mthca/mthca_allocator.c
drivers/infiniband/hw/mthca/mthca_memfree.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/nes/Kconfig [deleted file]
drivers/infiniband/hw/nes/Makefile [deleted file]
drivers/infiniband/hw/nes/nes.c [deleted file]
drivers/infiniband/hw/nes/nes.h [deleted file]
drivers/infiniband/hw/nes/nes_cm.c [deleted file]
drivers/infiniband/hw/nes/nes_cm.h [deleted file]
drivers/infiniband/hw/nes/nes_context.h [deleted file]
drivers/infiniband/hw/nes/nes_hw.c [deleted file]
drivers/infiniband/hw/nes/nes_hw.h [deleted file]
drivers/infiniband/hw/nes/nes_mgt.c [deleted file]
drivers/infiniband/hw/nes/nes_mgt.h [deleted file]
drivers/infiniband/hw/nes/nes_nic.c [deleted file]
drivers/infiniband/hw/nes/nes_utils.c [deleted file]
drivers/infiniband/hw/nes/nes_verbs.c [deleted file]
drivers/infiniband/hw/nes/nes_verbs.h [deleted file]
drivers/infiniband/hw/ocrdma/ocrdma_hw.c
drivers/infiniband/hw/ocrdma/ocrdma_hw.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qedr/main.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qedr/verbs.h
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_rc.c
drivers/infiniband/hw/qib/qib_uc.c
drivers/infiniband/hw/qib/qib_ud.c
drivers/infiniband/hw/qib/qib_user_pages.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib.h
drivers/infiniband/hw/usnic/usnic_ib_main.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.h
drivers/infiniband/hw/usnic/usnic_uiom.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
drivers/infiniband/sw/Makefile
drivers/infiniband/sw/rdmavt/ah.c
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/cq.h
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/qp.c
drivers/infiniband/sw/rdmavt/qp.h
drivers/infiniband/sw/rdmavt/rc.c
drivers/infiniband/sw/rdmavt/srq.c
drivers/infiniband/sw/rdmavt/trace_mr.h
drivers/infiniband/sw/rdmavt/vt.c
drivers/infiniband/sw/rdmavt/vt.h
drivers/infiniband/sw/rxe/rxe_comp.c
drivers/infiniband/sw/rxe/rxe_mr.c
drivers/infiniband/sw/rxe/rxe_pool.c
drivers/infiniband/sw/rxe/rxe_resp.c
drivers/infiniband/sw/rxe/rxe_verbs.c
drivers/infiniband/sw/rxe/rxe_verbs.h
drivers/infiniband/sw/siw/Kconfig [new file with mode: 0644]
drivers/infiniband/sw/siw/Makefile [new file with mode: 0644]
drivers/infiniband/sw/siw/iwarp.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cm.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cm.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_cq.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_main.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_mem.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_mem.h [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp_rx.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_qp_tx.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_verbs.c [new file with mode: 0644]
drivers/infiniband/sw/siw/siw_verbs.h [new file with mode: 0644]
drivers/infiniband/ulp/ipoib/Kconfig
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/nvme/host/rdma.c
include/linux/dim.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mlx5/qp.h
include/rdma/ib_umem.h
include/rdma/ib_umem_odp.h
include/rdma/ib_verbs.h
include/rdma/mr_pool.h
include/rdma/rdma_counter.h [new file with mode: 0644]
include/rdma/rdma_netlink.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_cq.h
include/rdma/rdmavt_qp.h
include/rdma/restrack.h
include/rdma/rw.h
include/rdma/signature.h [new file with mode: 0644]
include/uapi/rdma/ib_user_cm.h [deleted file]
include/uapi/rdma/mlx5_user_ioctl_cmds.h
include/uapi/rdma/mlx5_user_ioctl_verbs.h
include/uapi/rdma/rdma_netlink.h
include/uapi/rdma/rdma_user_ioctl_cmds.h
include/uapi/rdma/rvt-abi.h [new file with mode: 0644]
include/uapi/rdma/siw-abi.h [new file with mode: 0644]
lib/dim/Makefile
lib/dim/rdma_dim.c [new file with mode: 0644]
net/rds/ib_cm.c

index 17211ceb9bf43876b886fe00ae08d4fcec74fe66..aed21b8916a25ac82767c64fae32601352af239b 100644 (file)
@@ -423,23 +423,6 @@ Description:
                (e.g. driver restart on the VM which owns the VF).
 
 
-sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes)
----------------------------------------------------------------
-
-What:          /sys/class/infiniband/nesX/hw_rev
-What:          /sys/class/infiniband/nesX/hca_type
-What:          /sys/class/infiniband/nesX/board_id
-Date:          Feb, 2008
-KernelVersion: v2.6.25
-Contact:       linux-rdma@vger.kernel.org
-Description:
-               hw_rev:         (RO) Hardware revision number
-
-               hca_type:       (RO) Host Channel Adapter type (NEX020)
-
-               board_id:       (RO) Manufacturing board id
-
-
 sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4)
 -----------------------------------------------------
 
index 216dc0e1e6f2efca39167785e7d32e2078f0b183..71a77feb779b6a7a104b3d59d4163f409b909ebf 100644 (file)
@@ -90,6 +90,7 @@ needed).
 
    driver-api/index
    core-api/index
+   infiniband/index
    media/index
    networking/index
    input/index
diff --git a/Documentation/infiniband/core_locking.rst b/Documentation/infiniband/core_locking.rst
new file mode 100644 (file)
index 0000000..f34669b
--- /dev/null
@@ -0,0 +1,118 @@
+===========================
+InfiniBand Midlayer Locking
+===========================
+
+  This guide is an attempt to make explicit the locking assumptions
+  made by the InfiniBand midlayer.  It describes the requirements on
+  both low-level drivers that sit below the midlayer and upper level
+  protocols that use the midlayer.
+
+Sleeping and interrupt context
+==============================
+
+  With the following exceptions, a low-level driver implementation of
+  all of the methods in struct ib_device may sleep.  The exceptions
+  are the methods in the following list:
+
+    - create_ah
+    - modify_ah
+    - query_ah
+    - destroy_ah
+    - post_send
+    - post_recv
+    - poll_cq
+    - req_notify_cq
+    - map_phys_fmr
+
+  which may not sleep and must be callable from any context.
+
+  The corresponding functions exported to upper level protocol
+  consumers:
+
+    - ib_create_ah
+    - ib_modify_ah
+    - ib_query_ah
+    - ib_destroy_ah
+    - ib_post_send
+    - ib_post_recv
+    - ib_req_notify_cq
+    - ib_map_phys_fmr
+
+  are therefore safe to call from any context.
+
+  In addition, the function
+
+    - ib_dispatch_event
+
+  used by low-level drivers to dispatch asynchronous events through
+  the midlayer is also safe to call from any context.
+
+Reentrancy
+----------
+
+  All of the methods in struct ib_device exported by a low-level
+  driver must be fully reentrant.  The low-level driver is required to
+  perform all synchronization necessary to maintain consistency, even
+  if multiple function calls using the same object are run
+  simultaneously.
+
+  The IB midlayer does not perform any serialization of function calls.
+
+  Because low-level drivers are reentrant, upper level protocol
+  consumers are not required to perform any serialization.  However,
+  some serialization may be required to get sensible results.  For
+  example, a consumer may safely call ib_poll_cq() on multiple CPUs
+  simultaneously.  However, the ordering of the work completion
+  information between different calls of ib_poll_cq() is not defined.
+
+Callbacks
+---------
+
+  A low-level driver must not perform a callback directly from the
+  same callchain as an ib_device method call.  For example, it is not
+  allowed for a low-level driver to call a consumer's completion event
+  handler directly from its post_send method.  Instead, the low-level
+  driver should defer this callback by, for example, scheduling a
+  tasklet to perform the callback.
+
+  The low-level driver is responsible for ensuring that multiple
+  completion event handlers for the same CQ are not called
+  simultaneously.  The driver must guarantee that only one CQ event
+  handler for a given CQ is running at a time.  In other words, the
+  following situation is not allowed::
+
+          CPU1                                    CPU2
+
+    low-level driver ->
+      consumer CQ event callback:
+        /* ... */
+        ib_req_notify_cq(cq, ...);
+                                          low-level driver ->
+        /* ... */                           consumer CQ event callback:
+                                              /* ... */
+        return from CQ event handler
+
+  The context in which completion event and asynchronous event
+  callbacks run is not defined.  Depending on the low-level driver, it
+  may be process context, softirq context, or interrupt context.
+  Upper level protocol consumers may not sleep in a callback.
+
+Hot-plug
+--------
+
+  A low-level driver announces that a device is ready for use by
+  consumers when it calls ib_register_device(); all initialization
+  must be complete before this call.  The device must remain usable
+  until the driver's call to ib_unregister_device() has returned.
+
+  A low-level driver must call ib_register_device() and
+  ib_unregister_device() from process context.  It must not hold any
+  semaphores that could cause deadlock if a consumer calls back into
+  the driver across these calls.
+
+  An upper level protocol consumer may begin using an IB device as
+  soon as the add method of its struct ib_client is called for that
+  device.  A consumer must finish all cleanup and free all resources
+  relating to a device before returning from the remove method.
+
+  A consumer is permitted to sleep in its add and remove methods.
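
To illustrate the callback deferral rule above, here is a minimal, purely hypothetical sketch (all names are invented; this is not taken from any in-tree driver) of a low-level driver bouncing its consumer's CQ completion handler through a tasklet instead of invoking it from the post_send call chain. Tasklet serialization also provides the "only one handler per CQ at a time" guarantee:

  #include <linux/interrupt.h>
  #include <rdma/ib_verbs.h>

  /* Hypothetical driver-private CQ wrapper. */
  struct my_cq {
          struct ib_cq ibcq;
          struct tasklet_struct comp_task;
  };

  static void my_cq_comp_tasklet(unsigned long data)
  {
          struct my_cq *cq = (struct my_cq *)data;

          /* Runs in softirq context, outside any ib_device method call
           * chain; a given tasklet never runs on two CPUs at once. */
          if (cq->ibcq.comp_handler)
                  cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
  }

  static void my_cq_init(struct my_cq *cq)
  {
          tasklet_init(&cq->comp_task, my_cq_comp_tasklet, (unsigned long)cq);
  }

  /* Called from the driver's interrupt or progress path when a completion
   * event should be delivered for this CQ. */
  static void my_cq_completion_event(struct my_cq *cq)
  {
          tasklet_schedule(&cq->comp_task);
  }
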
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
deleted file mode 100644 (file)
index 4b1f36b..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-INFINIBAND MIDLAYER LOCKING
-
-  This guide is an attempt to make explicit the locking assumptions
-  made by the InfiniBand midlayer.  It describes the requirements on
-  both low-level drivers that sit below the midlayer and upper level
-  protocols that use the midlayer.
-
-Sleeping and interrupt context
-
-  With the following exceptions, a low-level driver implementation of
-  all of the methods in struct ib_device may sleep.  The exceptions
-  are any methods from the list:
-
-    create_ah
-    modify_ah
-    query_ah
-    destroy_ah
-    post_send
-    post_recv
-    poll_cq
-    req_notify_cq
-    map_phys_fmr
-
-  which may not sleep and must be callable from any context.
-
-  The corresponding functions exported to upper level protocol
-  consumers:
-
-    ib_create_ah
-    ib_modify_ah
-    ib_query_ah
-    ib_destroy_ah
-    ib_post_send
-    ib_post_recv
-    ib_req_notify_cq
-    ib_map_phys_fmr
-
-  are therefore safe to call from any context.
-
-  In addition, the function
-
-    ib_dispatch_event
-
-  used by low-level drivers to dispatch asynchronous events through
-  the midlayer is also safe to call from any context.
-
-Reentrancy
-
-  All of the methods in struct ib_device exported by a low-level
-  driver must be fully reentrant.  The low-level driver is required to
-  perform all synchronization necessary to maintain consistency, even
-  if multiple function calls using the same object are run
-  simultaneously.
-
-  The IB midlayer does not perform any serialization of function calls.
-
-  Because low-level drivers are reentrant, upper level protocol
-  consumers are not required to perform any serialization.  However,
-  some serialization may be required to get sensible results.  For
-  example, a consumer may safely call ib_poll_cq() on multiple CPUs
-  simultaneously.  However, the ordering of the work completion
-  information between different calls of ib_poll_cq() is not defined.
-
-Callbacks
-
-  A low-level driver must not perform a callback directly from the
-  same callchain as an ib_device method call.  For example, it is not
-  allowed for a low-level driver to call a consumer's completion event
-  handler directly from its post_send method.  Instead, the low-level
-  driver should defer this callback by, for example, scheduling a
-  tasklet to perform the callback.
-
-  The low-level driver is responsible for ensuring that multiple
-  completion event handlers for the same CQ are not called
-  simultaneously.  The driver must guarantee that only one CQ event
-  handler for a given CQ is running at a time.  In other words, the
-  following situation is not allowed:
-
-        CPU1                                    CPU2
-
-  low-level driver ->
-    consumer CQ event callback:
-      /* ... */
-      ib_req_notify_cq(cq, ...);
-                                        low-level driver ->
-      /* ... */                           consumer CQ event callback:
-                                            /* ... */
-      return from CQ event handler
-
-  The context in which completion event and asynchronous event
-  callbacks run is not defined.  Depending on the low-level driver, it
-  may be process context, softirq context, or interrupt context.
-  Upper level protocol consumers may not sleep in a callback.
-
-Hot-plug
-
-  A low-level driver announces that a device is ready for use by
-  consumers when it calls ib_register_device(), all initialization
-  must be complete before this call.  The device must remain usable
-  until the driver's call to ib_unregister_device() has returned.
-
-  A low-level driver must call ib_register_device() and
-  ib_unregister_device() from process context.  It must not hold any
-  semaphores that could cause deadlock if a consumer calls back into
-  the driver across these calls.
-
-  An upper level protocol consumer may begin using an IB device as
-  soon as the add method of its struct ib_client is called for that
-  device.  A consumer must finish all cleanup and free all resources
-  relating to a device before returning from the remove method.
-
-  A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst
new file mode 100644 (file)
index 0000000..9cd7615
--- /dev/null
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+InfiniBand
+==========
+
+.. toctree::
+   :maxdepth: 1
+
+   core_locking
+   ipoib
+   opa_vnic
+   sysfs
+   tag_matching
+   user_mad
+   user_verbs
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/infiniband/ipoib.rst b/Documentation/infiniband/ipoib.rst
new file mode 100644 (file)
index 0000000..0dd3615
--- /dev/null
@@ -0,0 +1,115 @@
+==================
+IP over InfiniBand
+==================
+
+  The ib_ipoib driver is an implementation of the IP over InfiniBand
+  protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
+  working group.  It is a "native" implementation in the sense of
+  setting the interface type to ARPHRD_INFINIBAND and the hardware
+  address length to 20 (earlier proprietary implementations
+  masqueraded to the kernel as ethernet interfaces).
+
+Partitions and P_Keys
+=====================
+
+  When the IPoIB driver is loaded, it creates one interface for each
+  port using the P_Key at index 0.  To create an interface with a
+  different P_Key, write the desired P_Key into the main interface's
+  /sys/class/net/<intf name>/create_child file.  For example::
+
+    echo 0x8001 > /sys/class/net/ib0/create_child
+
+  This will create an interface named ib0.8001 with P_Key 0x8001.  To
+  remove a subinterface, use the "delete_child" file::
+
+    echo 0x8001 > /sys/class/net/ib0/delete_child
+
+  The P_Key for any interface is given by the "pkey" file, and the
+  main interface for a subinterface is in "parent."
+
+  Child interface create/delete can also be done using IPoIB's
+  rtnl_link_ops, where children created using either way behave the same.
+
+Datagram vs Connected modes
+===========================
+
+  The IPoIB driver supports two modes of operation: datagram and
+  connected.  The mode is set and read through an interface's
+  /sys/class/net/<intf name>/mode file.
+
+  In datagram mode, the IB UD (Unreliable Datagram) transport is used
+  and so the interface MTU is equal to the IB L2 MTU minus the
+  IPoIB encapsulation header (4 bytes).  For example, in a typical IB
+  fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
+
+  In connected mode, the IB RC (Reliable Connected) transport is used.
+  Connected mode takes advantage of the connected nature of the IB
+  transport and allows an MTU up to the maximal IP packet size of 64K,
+  which reduces the number of IP packets needed for handling large UDP
+  datagrams, TCP segments, etc and increases the performance for large
+  messages.
+
+  In connected mode, the interface's UD QP is still used for multicast
+  and communication with peers that don't support connected mode. In
+  this case, RX emulation of ICMP PMTU packets is used to cause the
+  networking stack to use the smaller UD MTU for these neighbours.
+
+Stateless offloads
+==================
+
+  If the IB HW supports IPoIB stateless offloads, IPoIB advertises
+  TCP/IP checksum and/or Large Send (LSO) offloading capability to the
+  network stack.
+
+  Large Receive (LRO) offloading is also implemented and may be turned
+  on/off using ethtool calls.  Currently LRO is supported only for
+  checksum offload capable devices.
+
+  Stateless offloads are supported only in datagram mode.
+
+Interrupt moderation
+====================
+
+  If the underlying IB device supports CQ event moderation, one can
+  use ethtool to set interrupt mitigation parameters and thus reduce
+  the overhead incurred by handling interrupts.  The main code path of
+  IPoIB doesn't use events for TX completion signaling so only RX
+  moderation is supported.
+
+Debugging Information
+=====================
+
+  By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
+  to 'y', tracing messages are compiled into the driver.  They are
+  turned on by setting the module parameters debug_level and
+  mcast_debug_level to 1.  These parameters can be controlled at
+  runtime through files in /sys/module/ib_ipoib/.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
+  virtual filesystem.  By mounting this filesystem, for example with::
+
+    mount -t debugfs none /sys/kernel/debug
+
+  it is possible to get statistics about multicast groups from the
+  files /sys/kernel/debug/ipoib/ib0_mcg and so on.
+
+  The performance impact of this option is negligible, so it
+  is safe to enable this option with debug_level set to 0 for normal
+  operation.
+
+  CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
+  the data path when data_debug_level is set to 1.  However, even with
+  the output disabled, enabling this configuration option will affect
+  performance, because it adds tests to the fast path.
+
+References
+==========
+
+  Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
+    http://ietf.org/rfc/rfc4391.txt
+
+  IP over InfiniBand (IPoIB) Architecture (RFC 4392)
+    http://ietf.org/rfc/rfc4392.txt
+
+  IP over InfiniBand: Connected Mode (RFC 4755)
+    http://ietf.org/rfc/rfc4755.txt
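
As a user-space illustration of the sysfs interface described above, here is a small sketch in C (same ib0 path and 0x8001 P_Key as the echo example; error handling kept minimal) that creates the ib0.8001 child interface:

  #include <stdio.h>
  #include <stdlib.h>

  int main(void)
  {
          /* Equivalent to: echo 0x8001 > /sys/class/net/ib0/create_child */
          FILE *f = fopen("/sys/class/net/ib0/create_child", "w");

          if (!f) {
                  perror("open create_child");
                  return EXIT_FAILURE;
          }
          fprintf(f, "0x8001\n");
          if (fclose(f) != 0) {
                  /* The kernel reports failures (e.g. the child already
                   * exists) when the buffered write is flushed on close. */
                  perror("create_child");
                  return EXIT_FAILURE;
          }
          return EXIT_SUCCESS;
  }

Writing the same value to the delete_child file removes the interface again.
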
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
deleted file mode 100644 (file)
index 47c1dd9..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-IP OVER INFINIBAND
-
-  The ib_ipoib driver is an implementation of the IP over InfiniBand
-  protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
-  working group.  It is a "native" implementation in the sense of
-  setting the interface type to ARPHRD_INFINIBAND and the hardware
-  address length to 20 (earlier proprietary implementations
-  masqueraded to the kernel as ethernet interfaces).
-
-Partitions and P_Keys
-
-  When the IPoIB driver is loaded, it creates one interface for each
-  port using the P_Key at index 0.  To create an interface with a
-  different P_Key, write the desired P_Key into the main interface's
-  /sys/class/net/<intf name>/create_child file.  For example:
-
-    echo 0x8001 > /sys/class/net/ib0/create_child
-
-  This will create an interface named ib0.8001 with P_Key 0x8001.  To
-  remove a subinterface, use the "delete_child" file:
-
-    echo 0x8001 > /sys/class/net/ib0/delete_child
-
-  The P_Key for any interface is given by the "pkey" file, and the
-  main interface for a subinterface is in "parent."
-
-  Child interface create/delete can also be done using IPoIB's
-  rtnl_link_ops, where children created using either way behave the same.
-
-Datagram vs Connected modes
-
-  The IPoIB driver supports two modes of operation: datagram and
-  connected.  The mode is set and read through an interface's
-  /sys/class/net/<intf name>/mode file.
-
-  In datagram mode, the IB UD (Unreliable Datagram) transport is used
-  and so the interface MTU has is equal to the IB L2 MTU minus the
-  IPoIB encapsulation header (4 bytes).  For example, in a typical IB
-  fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
-
-  In connected mode, the IB RC (Reliable Connected) transport is used.
-  Connected mode takes advantage of the connected nature of the IB
-  transport and allows an MTU up to the maximal IP packet size of 64K,
-  which reduces the number of IP packets needed for handling large UDP
-  datagrams, TCP segments, etc and increases the performance for large
-  messages.
-
-  In connected mode, the interface's UD QP is still used for multicast
-  and communication with peers that don't support connected mode. In
-  this case, RX emulation of ICMP PMTU packets is used to cause the
-  networking stack to use the smaller UD MTU for these neighbours.
-
-Stateless offloads
-
-  If the IB HW supports IPoIB stateless offloads, IPoIB advertises
-  TCP/IP checksum and/or Large Send (LSO) offloading capability to the
-  network stack.
-
-  Large Receive (LRO) offloading is also implemented and may be turned
-  on/off using ethtool calls.  Currently LRO is supported only for
-  checksum offload capable devices.
-
-  Stateless offloads are supported only in datagram mode.  
-
-Interrupt moderation
-
-  If the underlying IB device supports CQ event moderation, one can
-  use ethtool to set interrupt mitigation parameters and thus reduce
-  the overhead incurred by handling interrupts.  The main code path of
-  IPoIB doesn't use events for TX completion signaling so only RX
-  moderation is supported.
-
-Debugging Information
-
-  By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
-  to 'y', tracing messages are compiled into the driver.  They are
-  turned on by setting the module parameters debug_level and
-  mcast_debug_level to 1.  These parameters can be controlled at
-  runtime through files in /sys/module/ib_ipoib/.
-
-  CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
-  virtual filesystem.  By mounting this filesystem, for example with
-
-    mount -t debugfs none /sys/kernel/debug
-
-  it is possible to get statistics about multicast groups from the
-  files /sys/kernel/debug/ipoib/ib0_mcg and so on.
-
-  The performance impact of this option is negligible, so it
-  is safe to enable this option with debug_level set to 0 for normal
-  operation.
-
-  CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
-  the data path when data_debug_level is set to 1.  However, even with
-  the output disabled, enabling this configuration option will affect
-  performance, because it adds tests to the fast path.
-
-References
-
-  Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
-    http://ietf.org/rfc/rfc4391.txt 
-  IP over InfiniBand (IPoIB) Architecture (RFC 4392)
-    http://ietf.org/rfc/rfc4392.txt 
-  IP over InfiniBand: Connected Mode (RFC 4755)
-    http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/infiniband/opa_vnic.rst b/Documentation/infiniband/opa_vnic.rst
new file mode 100644 (file)
index 0000000..2f888d9
--- /dev/null
@@ -0,0 +1,159 @@
+=================================================================
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC)
+=================================================================
+
+The Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
+supports Ethernet functionality over Omni-Path fabric by encapsulating
+the Ethernet packets between HFI nodes.
+
+Architecture
+=============
+The patterns of exchanges of Omni-Path encapsulated Ethernet packets
+involve one or more virtual Ethernet switches overlaid on the Omni-Path
+fabric topology. A subset of HFI nodes on the Omni-Path fabric are
+permitted to exchange encapsulated Ethernet packets across a particular
+virtual Ethernet switch. The virtual Ethernet switches are logical
+abstractions achieved by configuring the HFI nodes on the fabric for
+header generation and processing. In the simplest configuration all HFI
+nodes across the fabric exchange encapsulated Ethernet packets over a
+single virtual Ethernet switch. A virtual Ethernet switch is effectively
+an independent Ethernet network. The configuration is performed by an
+Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
+application. HFI nodes can have multiple VNICs each connected to a
+different virtual Ethernet switch. The below diagram presents a case
+of two virtual Ethernet switches with two HFI nodes::
+
+                               +-------------------+
+                               |      Subnet/      |
+                               |     Ethernet      |
+                               |      Manager      |
+                               +-------------------+
+                                  /          /
+                                /           /
+                              /            /
+                            /             /
+  +-----------------------------+  +------------------------------+
+  |  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
+  |  +---------+    +---------+ |  | +---------+    +---------+   |
+  |  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
+  +--+---------+----+---------+-+  +-+---------+----+---------+---+
+           |                 \        /                 |
+           |                   \    /                   |
+           |                     \/                     |
+           |                    /  \                    |
+           |                  /      \                  |
+       +-----------+------------+  +-----------+------------+
+       |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
+       +-----------+------------+  +-----------+------------+
+       |          HFI           |  |          HFI           |
+       +------------------------+  +------------------------+
+
+
+The Omni-Path encapsulated Ethernet packet format is as described below.
+
+==================== ================================
+Bits                 Field
+==================== ================================
+Quad Word 0:
+0-19                 SLID (lower 20 bits)
+20-30                Length (in Quad Words)
+31                   BECN bit
+32-51                DLID (lower 20 bits)
+52-56                SC (Service Class)
+57-59                RC (Routing Control)
+60                   FECN bit
+61-62                L2 (=10, 16B format)
+63                   LT (=1, Link Transfer Head Flit)
+
+Quad Word 1:
+0-7                  L4 type (=0x78 ETHERNET)
+8-11                 SLID[23:20]
+12-15                DLID[23:20]
+16-31                PKEY
+32-47                Entropy
+48-63                Reserved
+
+Quad Word 2:
+0-15                 Reserved
+16-31                L4 header
+32-63                Ethernet Packet
+
+Quad Words 3 to N-1:
+0-63                 Ethernet packet (pad extended)
+
+Quad Word N (last):
+0-23                 Ethernet packet (pad extended)
+24-55                ICRC
+56-61                Tail
+62-63                LT (=01, Link Transfer Tail Flit)
+==================== ================================
+
+Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
+packet is quad word aligned. The 'Tail' field contains the number of bytes
+padded. On the receive side the 'Tail' field is read and the padding is
+removed (along with ICRC, Tail and OPA header) before passing packet up
+the network stack.
+
+The L4 header field contains the virtual Ethernet switch id the VNIC port
+belongs to. On the receive side, this field is used to de-multiplex the
+received VNIC packets to different VNIC ports.
+
+Driver Design
+==============
+Intel OPA VNIC software design is presented in the below diagram.
+OPA VNIC functionality has a HW dependent component and a HW
+independent component.
+
+Support has been added for an IB device to allocate and free RDMA
+netdev devices. The RDMA netdev supports interfacing with the network
+stack, thus creating standard network interfaces. OPA_VNIC is an RDMA
+netdev device type.
+
+The HW dependent VNIC functionality is part of the HFI1 driver. It
+implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
+It involves HW resource allocation/management for VNIC functionality.
+It interfaces with the network stack and implements the required
+net_device_ops functions. It expects Omni-Path encapsulated Ethernet
+packets in the transmit path and provides HW access to them. It strips
+the Omni-Path header from the received packets before passing them up
+the network stack. It also implements the RDMA netdev control operations.
+
+The OPA VNIC module implements the HW independent VNIC functionality.
+It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
+registers itself with IB core as an IB client and interfaces with the
+IB MAD stack. It exchanges the management information with the Ethernet
+Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
+the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
+set by HW dependent VNIC driver where required to accommodate any control
+operation. It also handles the encapsulation of Ethernet packets with an
+Omni-Path header in the transmit path. For each VNIC interface, the
+information required for encapsulation is configured by the EM via VEMA MAD
+interface. It also passes any control information to the HW dependent driver
+by invoking the RDMA netdev control operations::
+
+        +-------------------+ +----------------------+
+        |                   | |       Linux          |
+        |     IB MAD        | |      Network         |
+        |                   | |       Stack          |
+        +-------------------+ +----------------------+
+                 |               |          |
+                 |               |          |
+        +----------------------------+      |
+        |                            |      |
+        |      OPA VNIC Module       |      |
+        |  (OPA VNIC RDMA Netdev     |      |
+        |     & EMA functions)       |      |
+        |                            |      |
+        +----------------------------+      |
+                    |                       |
+                    |                       |
+           +------------------+             |
+           |     IB core      |             |
+           +------------------+             |
+                    |                       |
+                    |                       |
+        +--------------------------------------------+
+        |                                            |
+        |      HFI1 Driver with VNIC support         |
+        |                                            |
+        +--------------------------------------------+
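
As a small worked example of the quad word alignment rule described above (a hypothetical helper, not taken from the driver), the pad byte count carried in the 'Tail' field is whatever rounds the unpadded OPA VNIC packet length up to the next multiple of 8 bytes:

  #include <stdio.h>

  /* Pad bytes needed so that len becomes a multiple of 8 (one quad word). */
  static unsigned int vnic_pad_bytes(unsigned int len)
  {
          return (8 - (len & 7)) & 7;
  }

  int main(void)
  {
          unsigned int lengths[] = { 60, 1514, 2048 };
          unsigned int i;

          for (i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++)
                  printf("unpadded %u bytes -> %u pad bytes\n",
                         lengths[i], vnic_pad_bytes(lengths[i]));
          return 0;
  }
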
diff --git a/Documentation/infiniband/opa_vnic.txt b/Documentation/infiniband/opa_vnic.txt
deleted file mode 100644 (file)
index 282e17b..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
-supports Ethernet functionality over Omni-Path fabric by encapsulating
-the Ethernet packets between HFI nodes.
-
-Architecture
-=============
-The patterns of exchanges of Omni-Path encapsulated Ethernet packets
-involves one or more virtual Ethernet switches overlaid on the Omni-Path
-fabric topology. A subset of HFI nodes on the Omni-Path fabric are
-permitted to exchange encapsulated Ethernet packets across a particular
-virtual Ethernet switch. The virtual Ethernet switches are logical
-abstractions achieved by configuring the HFI nodes on the fabric for
-header generation and processing. In the simplest configuration all HFI
-nodes across the fabric exchange encapsulated Ethernet packets over a
-single virtual Ethernet switch. A virtual Ethernet switch, is effectively
-an independent Ethernet network. The configuration is performed by an
-Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
-application. HFI nodes can have multiple VNICs each connected to a
-different virtual Ethernet switch. The below diagram presents a case
-of two virtual Ethernet switches with two HFI nodes.
-
-                             +-------------------+
-                             |      Subnet/      |
-                             |     Ethernet      |
-                             |      Manager      |
-                             +-------------------+
-                                /          /
-                              /           /
-                            /            /
-                          /             /
-+-----------------------------+  +------------------------------+
-|  Virtual Ethernet Switch    |  |  Virtual Ethernet Switch     |
-|  +---------+    +---------+ |  | +---------+    +---------+   |
-|  | VPORT   |    |  VPORT  | |  | |  VPORT  |    |  VPORT  |   |
-+--+---------+----+---------+-+  +-+---------+----+---------+---+
-         |                 \        /                 |
-         |                   \    /                   |
-         |                     \/                     |
-         |                    /  \                    |
-         |                  /      \                  |
-     +-----------+------------+  +-----------+------------+
-     |   VNIC    |    VNIC    |  |    VNIC   |    VNIC    |
-     +-----------+------------+  +-----------+------------+
-     |          HFI           |  |          HFI           |
-     +------------------------+  +------------------------+
-
-
-The Omni-Path encapsulated Ethernet packet format is as described below.
-
-Bits          Field
-------------------------------------
-Quad Word 0:
-0-19      SLID (lower 20 bits)
-20-30     Length (in Quad Words)
-31        BECN bit
-32-51     DLID (lower 20 bits)
-52-56     SC (Service Class)
-57-59     RC (Routing Control)
-60        FECN bit
-61-62     L2 (=10, 16B format)
-63        LT (=1, Link Transfer Head Flit)
-
-Quad Word 1:
-0-7       L4 type (=0x78 ETHERNET)
-8-11      SLID[23:20]
-12-15     DLID[23:20]
-16-31     PKEY
-32-47     Entropy
-48-63     Reserved
-
-Quad Word 2:
-0-15      Reserved
-16-31     L4 header
-32-63     Ethernet Packet
-
-Quad Words 3 to N-1:
-0-63      Ethernet packet (pad extended)
-
-Quad Word N (last):
-0-23      Ethernet packet (pad extended)
-24-55     ICRC
-56-61     Tail
-62-63     LT (=01, Link Transfer Tail Flit)
-
-Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
-packet is quad word aligned. The 'Tail' field contains the number of bytes
-padded. On the receive side the 'Tail' field is read and the padding is
-removed (along with ICRC, Tail and OPA header) before passing packet up
-the network stack.
-
-The L4 header field contains the virtual Ethernet switch id the VNIC port
-belongs to. On the receive side, this field is used to de-multiplex the
-received VNIC packets to different VNIC ports.
-
-Driver Design
-==============
-Intel OPA VNIC software design is presented in the below diagram.
-OPA VNIC functionality has a HW dependent component and a HW
-independent component.
-
-The support has been added for IB device to allocate and free the RDMA
-netdev devices. The RDMA netdev supports interfacing with the network
-stack thus creating standard network interfaces. OPA_VNIC is an RDMA
-netdev device type.
-
-The HW dependent VNIC functionality is part of the HFI1 driver. It
-implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
-It involves HW resource allocation/management for VNIC functionality.
-It interfaces with the network stack and implements the required
-net_device_ops functions. It expects Omni-Path encapsulated Ethernet
-packets in the transmit path and provides HW access to them. It strips
-the Omni-Path header from the received packets before passing them up
-the network stack. It also implements the RDMA netdev control operations.
-
-The OPA VNIC module implements the HW independent VNIC functionality.
-It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
-registers itself with IB core as an IB client and interfaces with the
-IB MAD stack. It exchanges the management information with the Ethernet
-Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
-the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
-set by HW dependent VNIC driver where required to accommodate any control
-operation. It also handles the encapsulation of Ethernet packets with an
-Omni-Path header in the transmit path. For each VNIC interface, the
-information required for encapsulation is configured by the EM via VEMA MAD
-interface. It also passes any control information to the HW dependent driver
-by invoking the RDMA netdev control operations.
-
-        +-------------------+ +----------------------+
-        |                   | |       Linux          |
-        |     IB MAD        | |      Network         |
-        |                   | |       Stack          |
-        +-------------------+ +----------------------+
-                 |               |          |
-                 |               |          |
-        +----------------------------+      |
-        |                            |      |
-        |      OPA VNIC Module       |      |
-        |  (OPA VNIC RDMA Netdev     |      |
-        |     & EMA functions)       |      |
-        |                            |      |
-        +----------------------------+      |
-                    |                       |
-                    |                       |
-           +------------------+             |
-           |     IB core      |             |
-           +------------------+             |
-                    |                       |
-                    |                       |
-        +--------------------------------------------+
-        |                                            |
-        |      HFI1 Driver with VNIC support         |
-        |                                            |
-        +--------------------------------------------+
diff --git a/Documentation/infiniband/sysfs.rst b/Documentation/infiniband/sysfs.rst
new file mode 100644 (file)
index 0000000..f0abd6f
--- /dev/null
@@ -0,0 +1,6 @@
+===========
+Sysfs files
+===========
+
+The sysfs interface has moved to
+Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
deleted file mode 100644 (file)
index 9fab506..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-SYSFS FILES
-
-The sysfs interface has moved to
-Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/tag_matching.rst b/Documentation/infiniband/tag_matching.rst
new file mode 100644 (file)
index 0000000..ef56ea5
--- /dev/null
@@ -0,0 +1,69 @@
+==================
+Tag matching logic
+==================
+
+The MPI standard defines a set of rules, known as tag-matching, for matching
+source send operations to destination receives.  The following parameters must
+match the following source and destination parameters:
+
+*      Communicator
+*      User tag - wild card may be specified by the receiver
+*      Source rank - wild card may be specified by the receiver
+*      Destination rank - wild
+
+The ordering rules require that when more than one pair of send and receive
+message envelopes may match, the pair that includes the earliest posted-send
+and the earliest posted-receive is the pair that must be used to satisfy the
+matching operation. However, this doesn't imply that tags are consumed in
+the order they are created, e.g., a later generated tag may be consumed, if
+earlier tags can't be used to satisfy the matching rules.
+
+When a message is sent from the sender to the receiver, the communication
+library may attempt to process the operation either after or before the
+corresponding matching receive is posted.  If a matching receive is posted,
+this is an expected message, otherwise it is called an unexpected message.
+Implementations frequently use different matching schemes for these two
+different matching instances.
+
+To keep MPI library memory footprint down, MPI implementations typically use
+two different protocols for this purpose:
+
+1.     The Eager protocol - the complete message is sent when the send is
+processed by the sender. A send completion is received in the send_cq,
+notifying that the buffer can be reused.
+
+2.     The Rendezvous Protocol - the sender sends the tag-matching header,
+and perhaps a portion of data when first notifying the receiver. When the
+corresponding buffer is posted, the responder will use the information from
+the header to initiate an RDMA READ operation directly to the matching buffer.
+A fin message needs to be received in order for the buffer to be reused.
+
+Tag matching implementation
+===========================
+
+There are two types of matching objects used, the posted receive list and the
+unexpected message list. The application posts receive buffers through calls
+to the MPI receive routines in the posted receive list and posts send messages
+using the MPI send routines. The head of the posted receive list may be
+maintained by the hardware, with the software expected to shadow this list.
+
+When a send is initiated and arrives at the receive side, if there is no
+pre-posted receive for this arriving message, it is passed to the software and
+placed in the unexpected message list. Otherwise the match is processed,
+including rendezvous processing, if appropriate, delivering the data to the
+specified receive buffer. This allows overlapping receive-side MPI tag
+matching with computation.
+
+When a receive-message is posted, the communication library will first check
+the software unexpected message list for a matching receive. If a match is
+found, data is delivered to the user buffer, using a software controlled
+protocol. The UCX implementation uses either an eager or rendezvous protocol,
+depending on data size. If no match is found, and the entire pre-posted
+receive list is maintained by the hardware with space to add one more
+pre-posted receive to it, this receive is passed to the hardware.
+Software is expected to shadow this list, to help with processing MPI cancel
+operations. In addition, because hardware and software are not expected to be
+tightly synchronized with respect to the tag-matching operation, this shadow
+list is used to detect the case that a pre-posted receive is passed to the
+hardware, as the matching unexpected message is being passed from the hardware
+to the software.
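
To make the matching rule above concrete, here is a small didactic sketch (with assumed wildcard encodings; this is neither kernel nor UCX code) of comparing a posted receive's envelope against an arriving send. The ordering rule then says the earliest posted receive whose envelope matches is the one that must be consumed:

  #include <stdbool.h>

  #define ANY_TAG     (-1)        /* assumed wildcard encodings */
  #define ANY_SOURCE  (-1)

  struct envelope {
          int communicator;
          int tag;
          int source_rank;
  };

  /* A posted receive matches an arriving send when the communicators are
   * equal and the receive's tag and source rank either equal the send's
   * values or are wildcards. */
  static bool envelope_matches(const struct envelope *recv,
                               const struct envelope *send)
  {
          if (recv->communicator != send->communicator)
                  return false;
          if (recv->tag != ANY_TAG && recv->tag != send->tag)
                  return false;
          if (recv->source_rank != ANY_SOURCE &&
              recv->source_rank != send->source_rank)
                  return false;
          return true;
  }
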
diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.txt
deleted file mode 100644 (file)
index d2a3bf8..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-Tag matching logic
-
-The MPI standard defines a set of rules, known as tag-matching, for matching
-source send operations to destination receives.  The following parameters must
-match the following source and destination parameters:
-*      Communicator
-*      User tag - wild card may be specified by the receiver
-*      Source rank - wild card may be specified by the receiver
-*      Destination rank - wild
-The ordering rules require that when more than one pair of send and receive
-message envelopes may match, the pair that includes the earliest posted-send
-and the earliest posted-receive is the pair that must be used to satisfy the
-matching operation. However, this doesn't imply that tags are consumed in
-the order they are created, e.g., a later generated tag may be consumed, if
-earlier tags can't be used to satisfy the matching rules.
-
-When a message is sent from the sender to the receiver, the communication
-library may attempt to process the operation either after or before the
-corresponding matching receive is posted.  If a matching receive is posted,
-this is an expected message, otherwise it is called an unexpected message.
-Implementations frequently use different matching schemes for these two
-different matching instances.
-
-To keep MPI library memory footprint down, MPI implementations typically use
-two different protocols for this purpose:
-
-1.     The Eager protocol- the complete message is sent when the send is
-processed by the sender. A completion send is received in the send_cq
-notifying that the buffer can be reused.
-
-2.     The Rendezvous Protocol - the sender sends the tag-matching header,
-and perhaps a portion of data when first notifying the receiver. When the
-corresponding buffer is posted, the responder will use the information from
-the header to initiate an RDMA READ operation directly to the matching buffer.
-A fin message needs to be received in order for the buffer to be reused.
-
-Tag matching implementation
-
-There are two types of matching objects used, the posted receive list and the
-unexpected message list. The application posts receive buffers through calls
-to the MPI receive routines in the posted receive list and posts send messages
-using the MPI send routines. The head of the posted receive list may be
-maintained by the hardware, with the software expected to shadow this list.
-
-When send is initiated and arrives at the receive side, if there is no
-pre-posted receive for this arriving message, it is passed to the software and
-placed in the unexpected message list. Otherwise the match is processed,
-including rendezvous processing, if appropriate, delivering the data to the
-specified receive buffer. This allows overlapping receive-side MPI tag
-matching with computation.
-
-When a receive-message is posted, the communication library will first check
-the software unexpected message list for a matching receive. If a match is
-found, data is delivered to the user buffer, using a software controlled
-protocol. The UCX implementation uses either an eager or rendezvous protocol,
-depending on data size. If no match is found, the entire pre-posted receive
-list is maintained by the hardware, and there is space to add one more
-pre-posted receive to this list, this receive is passed to the hardware.
-Software is expected to shadow this list, to help with processing MPI cancel
-operations. In addition, because hardware and software are not expected to be
-tightly synchronized with respect to the tag-matching operation, this shadow
-list is used to detect the case that a pre-posted receive is passed to the
-hardware, as the matching unexpected message is being passed from the hardware
-to the software.
diff --git a/Documentation/infiniband/user_mad.rst b/Documentation/infiniband/user_mad.rst
new file mode 100644 (file)
index 0000000..d88abfc
--- /dev/null
@@ -0,0 +1,166 @@
+====================
+Userspace MAD access
+====================
+
+Device files
+============
+
+  Each port of each InfiniBand device has a "umad" device and an
+  "issm" device attached.  For example, a two-port HCA will have two
+  umad devices and two issm devices, while a switch will have one
+  device of each type (for switch port 0).
+
+Creating MAD agents
+===================
+
+  A MAD agent can be created by filling in a struct ib_user_mad_reg_req
+  and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
+  descriptor for the appropriate device file.  If the registration
+  request succeeds, a 32-bit id will be returned in the structure.
+  For example::
+
+       struct ib_user_mad_reg_req req = { /* ... */ };
+       ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
+        if (!ret)
+               my_agent = req.id;
+       else
+               perror("agent register");
+
+  Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
+  ioctl.  Also, all agents registered through a file descriptor will
+  be unregistered when the descriptor is closed.
+
+  2014
+       a new registration ioctl is now provided which allows additional
+       fields to be provided during registration.
+       Users of this registration call are implicitly setting the use of
+       pkey_index (see below).
+
+Receiving MADs
+==============
+
+  MADs are received using read().  The receive side now supports
+  RMPP.  The buffer passed to read() must be at least one
+  struct ib_user_mad + 256 bytes.
+
+  If the buffer passed is not large enough to hold the received
+  MAD (RMPP), errno is set to ENOSPC and the length of the
+  buffer needed is set in mad.length.
+
+  Example for normal MAD (non RMPP) reads::
+
+       struct ib_user_mad *mad;
+       mad = malloc(sizeof *mad + 256);
+       ret = read(fd, mad, sizeof *mad + 256);
+       if (ret != sizeof *mad + 256) {
+               perror("read");
+               free(mad);
+       }
+
+  Example for RMPP reads::
+
+       struct ib_user_mad *mad;
+       mad = malloc(sizeof *mad + 256);
+       ret = read(fd, mad, sizeof *mad + 256);
+       if (ret == -1 && errno == ENOSPC) {
+               length = mad->hdr.length;
+               free(mad);
+               mad = malloc(sizeof *mad + length);
+               ret = read(fd, mad, sizeof *mad + length);
+       }
+       if (ret < 0) {
+               perror("read");
+               free(mad);
+       }
+
+  In addition to the actual MAD contents, the other struct ib_user_mad
+  fields will be filled in with information on the received MAD.  For
+  example, the remote LID will be in mad.lid.
+
+  If a send times out, a receive will be generated with mad.status set
+  to ETIMEDOUT.  Otherwise when a MAD has been successfully received,
+  mad.status will be 0.
+
+  poll()/select() may be used to wait until a MAD can be read.
+
+Sending MADs
+============
+
+  MADs are sent using write().  The agent ID for sending should be
+  filled into the id field of the MAD, the destination LID should be
+  filled into the lid field, and so on.  The send side also supports
+  RMPP, so MADs of arbitrary length can be sent. For example::
+
+       struct ib_user_mad *mad;
+
+       mad = malloc(sizeof *mad + mad_length);
+
+       /* fill in mad->data */
+
+       mad->hdr.id  = my_agent;        /* req.id from agent registration */
+       mad->hdr.lid = my_dest;         /* in network byte order... */
+       /* etc. */
+
+       ret = write(fd, mad, sizeof *mad + mad_length);
+       if (ret != sizeof *mad + mad_length)
+               perror("write");
+
+Transaction IDs
+===============
+
+  Users of the umad devices can use the lower 32 bits of the
+  transaction ID field (that is, the least significant half of the
+  field in network byte order) in MADs being sent to match
+  request/response pairs.  The upper 32 bits are reserved for use by
+  the kernel and will be overwritten before a MAD is sent.
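+
+  As an illustrative sketch only (my_tid is a hypothetical
+  application-chosen cookie), the 64-bit transaction ID sits at byte
+  offset 8 of the MAD itself, so its least significant half in network
+  byte order occupies bytes 12-15::
+
+       uint32_t be_tid = htonl(my_tid);
+
+       /* stamp our cookie into the lower 32 bits of the TID */
+       memcpy((uint8_t *)mad->data + 12, &be_tid, sizeof(be_tid));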
+
+P_Key Index Handling
+====================
+
+  The old ib_umad interface did not allow setting the P_Key index for
+  MADs that are sent and did not provide a way for obtaining the P_Key
+  index of received MADs.  A new layout for struct ib_user_mad_hdr
+  with a pkey_index member has been defined; however, to preserve binary
+  compatibility with older applications, this new layout will not be used
+  unless the IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl
+  is called before a file descriptor is used for anything else.
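+
+  A minimal sketch of opting in, assuming fd has just been opened and
+  not yet used for anything else::
+
+       if (ioctl(fd, IB_USER_MAD_ENABLE_PKEY) < 0)
+               perror("enable pkey_index");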
+
+  In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
+  to 6, the new layout of struct ib_user_mad_hdr will be used by
+  default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
+
+Setting IsSM Capability Bit
+===========================
+
+  To set the IsSM capability bit for a port, simply open the
+  corresponding issm device file.  If the IsSM bit is already set,
+  then the open call will block until the bit is cleared (or return
+  immediately with errno set to EAGAIN if the O_NONBLOCK flag is
+  passed to open()).  The IsSM bit will be cleared when the issm file
+  is closed.  No read, write or other operations can be performed on
+  the issm file.
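+
+  For instance, a non-blocking attempt to claim the IsSM bit might look
+  like this (the device path assumes the udev rules shown below; closing
+  the descriptor clears the bit again)::
+
+       int issm_fd = open("/dev/infiniband/issm0", O_RDONLY | O_NONBLOCK);
+
+       if (issm_fd < 0 && errno == EAGAIN)
+               fprintf(stderr, "IsSM bit already set\n");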
+
+/dev files
+==========
+
+  To create the appropriate character device files automatically with
+  udev, a rule like::
+
+    KERNEL=="umad*", NAME="infiniband/%k"
+    KERNEL=="issm*", NAME="infiniband/%k"
+
+  can be used.  This will create device nodes named::
+
+    /dev/infiniband/umad0
+    /dev/infiniband/issm0
+
+  for the first port, and so on.  The InfiniBand device and port
+  associated with these devices can be determined from the files::
+
+    /sys/class/infiniband_mad/umad0/ibdev
+    /sys/class/infiniband_mad/umad0/port
+
+  and::
+
+    /sys/class/infiniband_mad/issm0/ibdev
+    /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt
deleted file mode 100644 (file)
index 7aca13a..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-USERSPACE MAD ACCESS
-
-Device files
-
-  Each port of each InfiniBand device has a "umad" device and an
-  "issm" device attached.  For example, a two-port HCA will have two
-  umad devices and two issm devices, while a switch will have one
-  device of each type (for switch port 0).
-
-Creating MAD agents
-
-  A MAD agent can be created by filling in a struct ib_user_mad_reg_req
-  and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
-  descriptor for the appropriate device file.  If the registration
-  request succeeds, a 32-bit id will be returned in the structure.
-  For example:
-
-       struct ib_user_mad_reg_req req = { /* ... */ };
-       ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
-        if (!ret)
-               my_agent = req.id;
-       else
-               perror("agent register");
-
-  Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
-  ioctl.  Also, all agents registered through a file descriptor will
-  be unregistered when the descriptor is closed.
-
-  2014 -- a new registration ioctl is now provided which allows additional
-       fields to be provided during registration.
-       Users of this registration call are implicitly setting the use of
-       pkey_index (see below).
-
-Receiving MADs
-
-  MADs are received using read().  The receive side now supports
-  RMPP. The buffer passed to read() must be at least one
-  struct ib_user_mad + 256 bytes. For example:
-
-  If the buffer passed is not large enough to hold the received
-  MAD (RMPP), the errno is set to ENOSPC and the length of the
-  buffer needed is set in mad.length.
-
-  Example for normal MAD (non RMPP) reads:
-       struct ib_user_mad *mad;
-       mad = malloc(sizeof *mad + 256);
-       ret = read(fd, mad, sizeof *mad + 256);
-       if (ret != sizeof mad + 256) {
-               perror("read");
-               free(mad);
-       }
-
-  Example for RMPP reads:
-       struct ib_user_mad *mad;
-       mad = malloc(sizeof *mad + 256);
-       ret = read(fd, mad, sizeof *mad + 256);
-       if (ret == -ENOSPC)) {
-               length = mad.length;
-               free(mad);
-               mad = malloc(sizeof *mad + length);
-               ret = read(fd, mad, sizeof *mad + length);
-       }
-       if (ret < 0) {
-               perror("read");
-               free(mad);
-       }
-
-  In addition to the actual MAD contents, the other struct ib_user_mad
-  fields will be filled in with information on the received MAD.  For
-  example, the remote LID will be in mad.lid.
-
-  If a send times out, a receive will be generated with mad.status set
-  to ETIMEDOUT.  Otherwise when a MAD has been successfully received,
-  mad.status will be 0.
-
-  poll()/select() may be used to wait until a MAD can be read.
-
-Sending MADs
-
-  MADs are sent using write().  The agent ID for sending should be
-  filled into the id field of the MAD, the destination LID should be
-  filled into the lid field, and so on.  The send side does support
-  RMPP so arbitrary length MAD can be sent. For example:
-
-       struct ib_user_mad *mad;
-
-       mad = malloc(sizeof *mad + mad_length);
-
-       /* fill in mad->data */
-
-       mad->hdr.id  = my_agent;        /* req.id from agent registration */
-       mad->hdr.lid = my_dest;         /* in network byte order... */
-       /* etc. */
-
-       ret = write(fd, &mad, sizeof *mad + mad_length);
-       if (ret != sizeof *mad + mad_length)
-               perror("write");
-
-Transaction IDs
-
-  Users of the umad devices can use the lower 32 bits of the
-  transaction ID field (that is, the least significant half of the
-  field in network byte order) in MADs being sent to match
-  request/response pairs.  The upper 32 bits are reserved for use by
-  the kernel and will be overwritten before a MAD is sent.
-
-P_Key Index Handling
-
-  The old ib_umad interface did not allow setting the P_Key index for
-  MADs that are sent and did not provide a way for obtaining the P_Key
-  index of received MADs.  A new layout for struct ib_user_mad_hdr
-  with a pkey_index member has been defined; however, to preserve binary
-  compatibility with older applications, this new layout will not be used
-  unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's
-  are called before a file descriptor is used for anything else.
-
-  In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
-  to 6, the new layout of struct ib_user_mad_hdr will be used by
-  default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
-
-Setting IsSM Capability Bit
-
-  To set the IsSM capability bit for a port, simply open the
-  corresponding issm device file.  If the IsSM bit is already set,
-  then the open call will block until the bit is cleared (or return
-  immediately with errno set to EAGAIN if the O_NONBLOCK flag is
-  passed to open()).  The IsSM bit will be cleared when the issm file
-  is closed.  No read, write or other operations can be performed on
-  the issm file.
-
-/dev files
-
-  To create the appropriate character device files automatically with
-  udev, a rule like
-
-    KERNEL=="umad*", NAME="infiniband/%k"
-    KERNEL=="issm*", NAME="infiniband/%k"
-
-  can be used.  This will create device nodes named
-
-    /dev/infiniband/umad0
-    /dev/infiniband/issm0
-
-  for the first port, and so on.  The InfiniBand device and port
-  associated with these devices can be determined from the files
-
-    /sys/class/infiniband_mad/umad0/ibdev
-    /sys/class/infiniband_mad/umad0/port
-
-  and
-
-    /sys/class/infiniband_mad/issm0/ibdev
-    /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_verbs.rst b/Documentation/infiniband/user_verbs.rst
new file mode 100644 (file)
index 0000000..8ddc4b1
--- /dev/null
@@ -0,0 +1,75 @@
+======================
+Userspace verbs access
+======================
+
+  The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
+  enables direct userspace access to IB hardware via "verbs," as
+  described in chapter 11 of the InfiniBand Architecture Specification.
+
+  To use the verbs, the libibverbs library, available from
+  https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
+  device-independent API for using the ib_uverbs interface.
+  libibverbs also requires an appropriate device-dependent kernel and
+  userspace driver for your InfiniBand hardware.  For example, to use
+  a Mellanox HCA, you will need the ib_mthca kernel module and the
+  libmthca userspace driver to be installed.
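+
+  As a rough illustration (error handling trimmed, resource creation
+  omitted), a libibverbs application typically starts like this::
+
+       struct ibv_device **dev_list;
+       struct ibv_context *ctx;
+       int num;
+
+       dev_list = ibv_get_device_list(&num);
+       if (!dev_list || num == 0)
+               return 1;       /* no RDMA devices found */
+
+       /* opening a device opens the corresponding uverbsN char device */
+       ctx = ibv_open_device(dev_list[0]);
+
+       /* ... allocate PDs, CQs, QPs etc. through the ibv_* verbs ... */
+
+       ibv_close_device(ctx);
+       ibv_free_device_list(dev_list);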
+
+User-kernel communication
+=========================
+
+  Userspace communicates with the kernel for slow path, resource
+  management operations via the /dev/infiniband/uverbsN character
+  devices.  Fast path operations are typically performed by writing
+  directly to hardware registers mmap()ed into userspace, with no
+  system call or context switch into the kernel.
+
+  Commands are sent to the kernel via write()s on these device files.
+  The ABI is defined in include/uapi/rdma/ib_user_verbs.h.
+  The structs for commands that require a response from the kernel
+  contain a 64-bit field used to pass a pointer to an output buffer.
+  Status is returned to userspace as the return value of the write()
+  system call.
+
+Resource management
+===================
+
+  Since creation and destruction of all IB resources is done by
+  commands passed through a file descriptor, the kernel can keep track
+  of which resources are attached to a given userspace context.  The
+  ib_uverbs module maintains idr tables that are used to translate
+  between kernel pointers and opaque userspace handles, so that kernel
+  pointers are never exposed to userspace and userspace cannot trick
+  the kernel into following a bogus pointer.
+
+  This also allows the kernel to clean up when a process exits and
+  prevent one process from touching another process's resources.
+
+Memory pinning
+==============
+
+  Direct userspace I/O requires that memory regions that are potential
+  I/O targets be kept resident at the same physical address.  The
+  ib_uverbs module manages pinning and unpinning memory regions via
+  get_user_pages() and put_page() calls.  It also accounts for the
+  amount of memory pinned in the process's pinned_vm, and checks that
+  unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
+
+  Pages that are pinned multiple times are counted each time they are
+  pinned, so the value of pinned_vm may be an overestimate of the
+  number of pages pinned by a process.
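+
+  As a rough sketch of the userspace side (pd, buf and region_size are
+  assumed to exist), each memory registration pins its range, so an
+  unprivileged process may want to check RLIMIT_MEMLOCK before
+  registering large regions::
+
+       struct rlimit rl;
+       struct ibv_mr *mr;
+
+       getrlimit(RLIMIT_MEMLOCK, &rl);
+       if (rl.rlim_cur != RLIM_INFINITY && rl.rlim_cur < region_size)
+               fprintf(stderr, "RLIMIT_MEMLOCK too small for %zu bytes\n",
+                       region_size);
+
+       mr = ibv_reg_mr(pd, buf, region_size,
+                       IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);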
+
+/dev files
+==========
+
+  To create the appropriate character device files automatically with
+  udev, a rule like::
+
+    KERNEL=="uverbs*", NAME="infiniband/%k"
+
+  can be used.  This will create device nodes named::
+
+    /dev/infiniband/uverbs0
+
+  and so on.  Since the InfiniBand userspace verbs should be safe for
+  use by non-privileged processes, it may be useful to add an
+  appropriate MODE or GROUP to the udev rule.
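+
+  For example, granting a dedicated group access (the "rdma" group name
+  is only a local policy choice) could be done with::
+
+    KERNEL=="uverbs*", NAME="infiniband/%k", GROUP="rdma", MODE="0660"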
diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt
deleted file mode 100644 (file)
index 47ebf2f..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-USERSPACE VERBS ACCESS
-
-  The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
-  enables direct userspace access to IB hardware via "verbs," as
-  described in chapter 11 of the InfiniBand Architecture Specification.
-
-  To use the verbs, the libibverbs library, available from
-  https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
-  device-independent API for using the ib_uverbs interface.
-  libibverbs also requires appropriate device-dependent kernel and
-  userspace driver for your InfiniBand hardware.  For example, to use
-  a Mellanox HCA, you will need the ib_mthca kernel module and the
-  libmthca userspace driver be installed.
-
-User-kernel communication
-
-  Userspace communicates with the kernel for slow path, resource
-  management operations via the /dev/infiniband/uverbsN character
-  devices.  Fast path operations are typically performed by writing
-  directly to hardware registers mmap()ed into userspace, with no
-  system call or context switch into the kernel.
-
-  Commands are sent to the kernel via write()s on these device files.
-  The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
-  The structs for commands that require a response from the kernel
-  contain a 64-bit field used to pass a pointer to an output buffer.
-  Status is returned to userspace as the return value of the write()
-  system call.
-
-Resource management
-
-  Since creation and destruction of all IB resources is done by
-  commands passed through a file descriptor, the kernel can keep track
-  of which resources are attached to a given userspace context.  The
-  ib_uverbs module maintains idr tables that are used to translate
-  between kernel pointers and opaque userspace handles, so that kernel
-  pointers are never exposed to userspace and userspace cannot trick
-  the kernel into following a bogus pointer.
-
-  This also allows the kernel to clean up when a process exits and
-  prevent one process from touching another process's resources.
-
-Memory pinning
-
-  Direct userspace I/O requires that memory regions that are potential
-  I/O targets be kept resident at the same physical address.  The
-  ib_uverbs module manages pinning and unpinning memory regions via
-  get_user_pages() and put_page() calls.  It also accounts for the
-  amount of memory pinned in the process's pinned_vm, and checks that
-  unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
-
-  Pages that are pinned multiple times are counted each time they are
-  pinned, so the value of pinned_vm may be an overestimate of the
-  number of pages pinned by a process.
-
-/dev files
-
-  To create the appropriate character device files automatically with
-  udev, a rule like
-
-    KERNEL=="uverbs*", NAME="infiniband/%k"
-
-  can be used.  This will create device nodes named
-
-    /dev/infiniband/uverbs0
-
-  and so on.  Since the InfiniBand userspace verbs should be safe for
-  use by non-privileged processes, it may be useful to add an
-  appropriate MODE or GROUP to the udev rule.
index a7901e1df0584166f9d777e5e84a418a614a239b..350bb27a1c25f77c713987a3c464757281dfc719 100644 (file)
@@ -11018,14 +11018,6 @@ F:     driver/net/net_failover.c
 F:     include/net/net_failover.h
 F:     Documentation/networking/net_failover.rst
 
-NETEFFECT IWARP RNIC DRIVER (IW_NES)
-M:     Faisal Latif <faisal.latif@intel.com>
-L:     linux-rdma@vger.kernel.org
-W:     http://www.intel.com/Products/Server/Adapters/Server-Cluster/Server-Cluster-overview.htm
-S:     Supported
-F:     drivers/infiniband/hw/nes/
-F:     include/uapi/rdma/nes-abi.h
-
 NETEM NETWORK EMULATOR
 M:     Stephen Hemminger <stephen@networkplumber.org>
 L:     netem@lists.linux-foundation.org (moderated for non-subscribers)
@@ -14755,6 +14747,13 @@ M:     Chris Boot <bootc@bootc.net>
 S:     Maintained
 F:     drivers/leds/leds-net48xx.c
 
+SOFT-IWARP DRIVER (siw)
+M:     Bernard Metzler <bmt@zurich.ibm.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+F:     drivers/infiniband/sw/siw/
+F:     include/uapi/rdma/siw-abi.h
+
 SOFT-ROCE DRIVER (rxe)
 M:     Moni Shoua <monis@mellanox.com>
 L:     linux-rdma@vger.kernel.org
index 8ba41cbf18697d1045bd0b9f04c1f7bccc528497..85e103b147cc387cdf1a9a96fa3bf97341d206a7 100644 (file)
@@ -7,6 +7,7 @@ menuconfig INFINIBAND
        depends on m || IPV6 != m
        depends on !ALPHA
        select IRQ_POLL
+       select DIMLIB
        ---help---
          Core support for InfiniBand (IB).  Make sure to also select
          any protocols you wish to use as well as drivers for your
@@ -36,17 +37,6 @@ config INFINIBAND_USER_ACCESS
          libibverbs, libibcm and a hardware driver library from
          rdma-core <https://github.com/linux-rdma/rdma-core>.
 
-config INFINIBAND_USER_ACCESS_UCM
-       tristate "Userspace CM (UCM, DEPRECATED)"
-       depends on BROKEN || COMPILE_TEST
-       depends on INFINIBAND_USER_ACCESS
-       help
-         The UCM module has known security flaws, which no one is
-         interested to fix. The user-space part of this code was
-         dropped from the upstream a long time ago.
-
-         This option is DEPRECATED and planned to be removed.
-
 config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI
        bool "Allow experimental legacy verbs in new ioctl uAPI  (EXPERIMENTAL)"
        depends on INFINIBAND_USER_ACCESS
@@ -98,7 +88,6 @@ source "drivers/infiniband/hw/efa/Kconfig"
 source "drivers/infiniband/hw/i40iw/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
-source "drivers/infiniband/hw/nes/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
 source "drivers/infiniband/hw/vmw_pvrdma/Kconfig"
 source "drivers/infiniband/hw/usnic/Kconfig"
@@ -108,6 +97,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/qedr/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
+source "drivers/infiniband/sw/siw/Kconfig"
 endif
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
index 313f2349b518430c27cce117a9c36d3a304387e0..09881bd5f12dd030df685002d69c4731bf18abb7 100644 (file)
@@ -6,13 +6,12 @@ obj-$(CONFIG_INFINIBAND) +=           ib_core.o ib_cm.o iw_cm.o \
                                        $(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=   ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
-obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y)
 
 ib_core-y :=                   packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
                                roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
                                multicast.o mad.o smi.o agent.o mad_rmpp.o \
-                               nldev.o restrack.o
+                               nldev.o restrack.o counters.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
@@ -29,8 +28,6 @@ rdma_ucm-y :=                 ucma.o
 
 ib_umad-y :=                   user_mad.o
 
-ib_ucm-y :=                    ucm.o
-
 ib_uverbs-y :=                 uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
                                rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
                                uverbs_std_types_cq.o \
index 2f7d14159841f833e26a1974c4e14b5be15f7677..9b76a8fcdd2479bc696184eee6848fde6de8d994 100644 (file)
@@ -337,7 +337,7 @@ static int dst_fetch_ha(const struct dst_entry *dst,
                neigh_event_send(n, NULL);
                ret = -ENODATA;
        } else {
-               memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
+               neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev);
        }
 
        neigh_release(n);
index ff40a450b5d28ed56b38eb590e7a08084a70ce49..888d89ce81df07118fd21683241921a1499e6aea 100644 (file)
@@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns;
 int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
 int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
              struct net_device *idev, void *cookie);
@@ -88,6 +89,15 @@ typedef int (*nldev_callback)(struct ib_device *device,
 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
                     struct netlink_callback *cb);
 
+struct ib_client_nl_info {
+       struct sk_buff *nl_msg;
+       struct device *cdev;
+       unsigned int port;
+       u64 abi;
+};
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+                         struct ib_client_nl_info *res);
+
 enum ib_cache_gid_default_mode {
        IB_CACHE_GID_DEFAULT_MODE_SET,
        IB_CACHE_GID_DEFAULT_MODE_DELETE
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644 (file)
index 0000000..01faef7
--- /dev/null
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+
+static int __counter_set_mode(struct rdma_counter_mode *curr,
+                             enum rdma_nl_counter_mode new_mode,
+                             enum rdma_nl_counter_mask new_mask)
+{
+       if ((new_mode == RDMA_COUNTER_MODE_AUTO) &&
+           ((new_mask & (~ALL_AUTO_MODE_MASKS)) ||
+            (curr->mode != RDMA_COUNTER_MODE_NONE)))
+               return -EINVAL;
+
+       curr->mode = new_mode;
+       curr->mask = new_mask;
+       return 0;
+}
+
+/**
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * When @on is true, @mask must be set.  When @on is false, the port
+ * falls back to manual mode if any counters remain, so that the user
+ * is still able to access them manually.
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+                              bool on, enum rdma_nl_counter_mask mask)
+{
+       struct rdma_port_counter *port_counter;
+       int ret;
+
+       port_counter = &dev->port_data[port].port_counter;
+       mutex_lock(&port_counter->lock);
+       if (on) {
+               ret = __counter_set_mode(&port_counter->mode,
+                                        RDMA_COUNTER_MODE_AUTO, mask);
+       } else {
+               if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               if (port_counter->num_counters)
+                       ret = __counter_set_mode(&port_counter->mode,
+                                                RDMA_COUNTER_MODE_MANUAL, 0);
+               else
+                       ret = __counter_set_mode(&port_counter->mode,
+                                                RDMA_COUNTER_MODE_NONE, 0);
+       }
+
+out:
+       mutex_unlock(&port_counter->lock);
+       return ret;
+}
+
+static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
+                                              enum rdma_nl_counter_mode mode)
+{
+       struct rdma_port_counter *port_counter;
+       struct rdma_counter *counter;
+       int ret;
+
+       if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
+               return NULL;
+
+       counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+       if (!counter)
+               return NULL;
+
+       counter->device    = dev;
+       counter->port      = port;
+       counter->res.type  = RDMA_RESTRACK_COUNTER;
+       counter->stats     = dev->ops.counter_alloc_stats(counter);
+       if (!counter->stats)
+               goto err_stats;
+
+       port_counter = &dev->port_data[port].port_counter;
+       mutex_lock(&port_counter->lock);
+       if (mode == RDMA_COUNTER_MODE_MANUAL) {
+               ret = __counter_set_mode(&port_counter->mode,
+                                        RDMA_COUNTER_MODE_MANUAL, 0);
+               if (ret)
+                       goto err_mode;
+       }
+
+       port_counter->num_counters++;
+       mutex_unlock(&port_counter->lock);
+
+       counter->mode.mode = mode;
+       kref_init(&counter->kref);
+       mutex_init(&counter->lock);
+
+       return counter;
+
+err_mode:
+       mutex_unlock(&port_counter->lock);
+       kfree(counter->stats);
+err_stats:
+       kfree(counter);
+       return NULL;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+       struct rdma_port_counter *port_counter;
+
+       port_counter = &counter->device->port_data[counter->port].port_counter;
+       mutex_lock(&port_counter->lock);
+       port_counter->num_counters--;
+       if (!port_counter->num_counters &&
+           (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+               __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE,
+                                  0);
+
+       mutex_unlock(&port_counter->lock);
+
+       rdma_restrack_del(&counter->res);
+       kfree(counter->stats);
+       kfree(counter);
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+                                  const struct ib_qp *qp,
+                                  enum rdma_nl_counter_mask new_mask)
+{
+       struct auto_mode_param *param = &counter->mode.param;
+
+       counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+       counter->mode.mask = new_mask;
+
+       if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+               param->qp_type = qp->qp_type;
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+                           enum rdma_nl_counter_mask auto_mask)
+{
+       struct auto_mode_param *param = &counter->mode.param;
+       bool match = true;
+
+       if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res))
+               return false;
+
+       /* Ensure that the counter belongs to the right PID */
+       if (!rdma_is_kernel_res(&counter->res) &&
+           !rdma_is_kernel_res(&qp->res) &&
+           (task_pid_vnr(counter->res.task) != current->pid))
+               return false;
+
+       if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+               match &= (param->qp_type == qp->qp_type);
+
+       return match;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+                                 struct ib_qp *qp)
+{
+       int ret;
+
+       if (qp->counter)
+               return -EINVAL;
+
+       if (!qp->device->ops.counter_bind_qp)
+               return -EOPNOTSUPP;
+
+       mutex_lock(&counter->lock);
+       ret = qp->device->ops.counter_bind_qp(counter, qp);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+       struct rdma_counter *counter = qp->counter;
+       int ret;
+
+       if (!qp->device->ops.counter_unbind_qp)
+               return -EOPNOTSUPP;
+
+       mutex_lock(&counter->lock);
+       ret = qp->device->ops.counter_unbind_qp(qp);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static void counter_history_stat_update(const struct rdma_counter *counter)
+{
+       struct ib_device *dev = counter->device;
+       struct rdma_port_counter *port_counter;
+       int i;
+
+       port_counter = &dev->port_data[counter->port].port_counter;
+       if (!port_counter->hstats)
+               return;
+
+       for (i = 0; i < counter->stats->num_counters; i++)
+               port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
+/**
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ *     with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+                                                      u8 port)
+{
+       struct rdma_port_counter *port_counter;
+       struct rdma_counter *counter = NULL;
+       struct ib_device *dev = qp->device;
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       unsigned long id = 0;
+
+       port_counter = &dev->port_data[port].port_counter;
+       rt = &dev->res[RDMA_RESTRACK_COUNTER];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_is_visible_in_pid_ns(res))
+                       continue;
+
+               counter = container_of(res, struct rdma_counter, res);
+               if ((counter->device != qp->device) || (counter->port != port))
+                       goto next;
+
+               if (auto_mode_match(qp, counter, port_counter->mode.mask))
+                       break;
+next:
+               counter = NULL;
+       }
+
+       if (counter && !kref_get_unless_zero(&counter->kref))
+               counter = NULL;
+
+       xa_unlock(&rt->xa);
+       return counter;
+}
+
+static void rdma_counter_res_add(struct rdma_counter *counter,
+                                struct ib_qp *qp)
+{
+       if (rdma_is_kernel_res(&qp->res)) {
+               rdma_restrack_set_task(&counter->res, qp->res.kern_name);
+               rdma_restrack_kadd(&counter->res);
+       } else {
+               rdma_restrack_attach_task(&counter->res, qp->res.task);
+               rdma_restrack_uadd(&counter->res);
+       }
+}
+
+static void counter_release(struct kref *kref)
+{
+       struct rdma_counter *counter;
+
+       counter = container_of(kref, struct rdma_counter, kref);
+       counter_history_stat_update(counter);
+       counter->device->ops.counter_dealloc(counter);
+       rdma_counter_free(counter);
+}
+
+/**
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter based on
+ *   the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+{
+       struct rdma_port_counter *port_counter;
+       struct ib_device *dev = qp->device;
+       struct rdma_counter *counter;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       port_counter = &dev->port_data[port].port_counter;
+       if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+               return 0;
+
+       counter = rdma_get_counter_auto_mode(qp, port);
+       if (counter) {
+               ret = __rdma_counter_bind_qp(counter, qp);
+               if (ret) {
+                       kref_put(&counter->kref, counter_release);
+                       return ret;
+               }
+       } else {
+               counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO);
+               if (!counter)
+                       return -ENOMEM;
+
+               auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+
+               ret = __rdma_counter_bind_qp(counter, qp);
+               if (ret) {
+                       rdma_counter_free(counter);
+                       return ret;
+               }
+
+               rdma_counter_res_add(counter, qp);
+       }
+
+       return 0;
+}
+
+/**
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ *   true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+       struct rdma_counter *counter = qp->counter;
+       int ret;
+
+       if (!counter)
+               return -EINVAL;
+
+       ret = __rdma_counter_unbind_qp(qp);
+       if (ret && !force)
+               return ret;
+
+       kref_put(&counter->kref, counter_release);
+       return 0;
+}
+
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+       struct ib_device *dev = counter->device;
+       int ret;
+
+       if (!dev->ops.counter_update_stats)
+               return -EINVAL;
+
+       mutex_lock(&counter->lock);
+       ret = dev->ops.counter_update_stats(counter);
+       mutex_unlock(&counter->lock);
+
+       return ret;
+}
+
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+                                          u8 port, u32 index)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       struct rdma_counter *counter;
+       unsigned long id = 0;
+       u64 sum = 0;
+
+       rt = &dev->res[RDMA_RESTRACK_COUNTER];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_restrack_get(res))
+                       continue;
+
+               xa_unlock(&rt->xa);
+
+               counter = container_of(res, struct rdma_counter, res);
+               if ((counter->device != dev) || (counter->port != port) ||
+                   rdma_counter_query_stats(counter))
+                       goto next;
+
+               sum += counter->stats->value[index];
+
+next:
+               xa_lock(&rt->xa);
+               rdma_restrack_put(res);
+       }
+
+       xa_unlock(&rt->xa);
+       return sum;
+}
+
+/**
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ *   specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+{
+       struct rdma_port_counter *port_counter;
+       u64 sum;
+
+       port_counter = &dev->port_data[port].port_counter;
+       sum = get_running_counters_hwstat_sum(dev, port, index);
+       sum += port_counter->hstats->value[index];
+
+       return sum;
+}
+
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+       struct rdma_restrack_entry *res = NULL;
+       struct ib_qp *qp = NULL;
+
+       res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+       if (IS_ERR(res))
+               return NULL;
+
+       if (!rdma_is_visible_in_pid_ns(res))
+               goto err;
+
+       qp = container_of(res, struct ib_qp, res);
+       if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+               goto err;
+
+       return qp;
+
+err:
+       rdma_restrack_put(res);
+       return NULL;
+}
+
+static int rdma_counter_bind_qp_manual(struct rdma_counter *counter,
+                                      struct ib_qp *qp)
+{
+       if ((counter->device != qp->device) || (counter->port != qp->port))
+               return -EINVAL;
+
+       return __rdma_counter_bind_qp(counter, qp);
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+                                                  u32 counter_id)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_counter *counter;
+
+       res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+       if (IS_ERR(res))
+               return NULL;
+
+       if (!rdma_is_visible_in_pid_ns(res)) {
+               rdma_restrack_put(res);
+               return NULL;
+       }
+
+       counter = container_of(res, struct rdma_counter, res);
+       kref_get(&counter->kref);
+       rdma_restrack_put(res);
+
+       return counter;
+}
+
+/**
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+                         u32 qp_num, u32 counter_id)
+{
+       struct rdma_counter *counter;
+       struct ib_qp *qp;
+       int ret;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       counter = rdma_get_counter_by_id(dev, counter_id);
+       if (!counter) {
+               ret = -ENOENT;
+               goto err;
+       }
+
+       if (counter->res.task != qp->res.task) {
+               ret = -EINVAL;
+               goto err_task;
+       }
+
+       ret = rdma_counter_bind_qp_manual(counter, qp);
+       if (ret)
+               goto err_task;
+
+       rdma_restrack_put(&qp->res);
+       return 0;
+
+err_task:
+       kref_put(&counter->kref, counter_release);
+err:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
+
+/**
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ *   The id of new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+                               u32 qp_num, u32 *counter_id)
+{
+       struct rdma_counter *counter;
+       struct ib_qp *qp;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL);
+       if (!counter) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       ret = rdma_counter_bind_qp_manual(counter, qp);
+       if (ret)
+               goto err_bind;
+
+       if (counter_id)
+               *counter_id = counter->id;
+
+       rdma_counter_res_add(counter, qp);
+
+       rdma_restrack_put(&qp->res);
+       return ret;
+
+err_bind:
+       rdma_counter_free(counter);
+err:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
+
+/**
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+                           u32 qp_num, u32 counter_id)
+{
+       struct rdma_port_counter *port_counter;
+       struct ib_qp *qp;
+       int ret;
+
+       if (!rdma_is_port_valid(dev, port))
+               return -EINVAL;
+
+       qp = rdma_counter_get_qp(dev, qp_num);
+       if (!qp)
+               return -ENOENT;
+
+       if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       port_counter = &dev->port_data[port].port_counter;
+       if (!qp->counter || qp->counter->id != counter_id ||
+           port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+       rdma_restrack_put(&qp->res);
+       return ret;
+}
+
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+                         enum rdma_nl_counter_mode *mode,
+                         enum rdma_nl_counter_mask *mask)
+{
+       struct rdma_port_counter *port_counter;
+
+       port_counter = &dev->port_data[port].port_counter;
+       *mode = port_counter->mode.mode;
+       *mask = port_counter->mode.mask;
+
+       return 0;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+       struct rdma_port_counter *port_counter;
+       u32 port;
+
+       if (!dev->ops.alloc_hw_stats || !dev->port_data)
+               return;
+
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+               mutex_init(&port_counter->lock);
+
+               port_counter->hstats = dev->ops.alloc_hw_stats(dev, port);
+               if (!port_counter->hstats)
+                       goto fail;
+       }
+
+       return;
+
+fail:
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               kfree(port_counter->hstats);
+               port_counter->hstats = NULL;
+       }
+
+       return;
+}
+
+void rdma_counter_release(struct ib_device *dev)
+{
+       struct rdma_port_counter *port_counter;
+       u32 port;
+
+       if (!dev->ops.alloc_hw_stats)
+               return;
+
+       rdma_for_each_port(dev, port) {
+               port_counter = &dev->port_data[port].port_counter;
+               kfree(port_counter->hstats);
+       }
+}
index a24c900fbdf606ad0b4504b56fdcdfd9577b8a5f..7c599878ccf711e22d70771c67a1b093f101670f 100644 (file)
 #define IB_POLL_FLAGS \
        (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+       {1,   0, 1,  0},
+       {1,   0, 4,  0},
+       {2,   0, 4,  0},
+       {2,   0, 8,  0},
+       {4,   0, 8,  0},
+       {16,  0, 8,  0},
+       {16,  0, 16, 0},
+       {32,  0, 16, 0},
+       {32,  0, 32, 0},
+};
+
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+       struct dim *dim = container_of(w, struct dim, work);
+       struct ib_cq *cq = dim->priv;
+
+       u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+       u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+       dim->state = DIM_START_MEASURE;
+
+       cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+       struct dim *dim;
+
+       if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+           cq->poll_ctx == IB_POLL_DIRECT)
+               return;
+
+       dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+       if (!dim)
+               return;
+
+       dim->state = DIM_START_MEASURE;
+       dim->tune_state = DIM_GOING_RIGHT;
+       dim->profile_ix = RDMA_DIM_START_PROFILE;
+       dim->priv = cq;
+       cq->dim = dim;
+
+       INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
                           int batch)
 {
@@ -78,6 +125,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
 static int ib_poll_handler(struct irq_poll *iop, int budget)
 {
        struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+       struct dim *dim = cq->dim;
        int completed;
 
        completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -87,6 +135,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
                        irq_poll_sched(&cq->iop);
        }
 
+       if (dim)
+               rdma_dim(dim, completed);
+
        return completed;
 }
 
@@ -105,6 +156,8 @@ static void ib_cq_poll_work(struct work_struct *work)
        if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
            ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
                queue_work(cq->comp_wq, &cq->work);
+       else if (cq->dim)
+               rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -113,7 +166,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 }
 
 /**
- * __ib_alloc_cq - allocate a completion queue
+ * __ib_alloc_cq_user - allocate a completion queue
  * @dev:               device to allocate the CQ for
  * @private:           driver private data, accessible from cq->cq_context
  * @nr_cqe:            number of CQEs to allocate
@@ -139,25 +192,30 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
        struct ib_cq *cq;
        int ret = -ENOMEM;
 
-       cq = dev->ops.create_cq(dev, &cq_attr, NULL);
-       if (IS_ERR(cq))
-               return cq;
+       cq = rdma_zalloc_drv_obj(dev, ib_cq);
+       if (!cq)
+               return ERR_PTR(ret);
 
        cq->device = dev;
-       cq->uobject = NULL;
-       cq->event_handler = NULL;
        cq->cq_context = private;
        cq->poll_ctx = poll_ctx;
        atomic_set(&cq->usecnt, 0);
 
        cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
        if (!cq->wc)
-               goto out_destroy_cq;
+               goto out_free_cq;
 
        cq->res.type = RDMA_RESTRACK_CQ;
        rdma_restrack_set_task(&cq->res, caller);
+
+       ret = dev->ops.create_cq(cq, &cq_attr, NULL);
+       if (ret)
+               goto out_free_wc;
+
        rdma_restrack_kadd(&cq->res);
 
+       rdma_dim_init(cq);
+
        switch (cq->poll_ctx) {
        case IB_POLL_DIRECT:
                cq->comp_handler = ib_cq_completion_direct;
@@ -178,29 +236,29 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
                break;
        default:
                ret = -EINVAL;
-               goto out_free_wc;
+               goto out_destroy_cq;
        }
 
        return cq;
 
-out_free_wc:
-       kfree(cq->wc);
-       rdma_restrack_del(&cq->res);
 out_destroy_cq:
+       rdma_restrack_del(&cq->res);
        cq->device->ops.destroy_cq(cq, udata);
+out_free_wc:
+       kfree(cq->wc);
+out_free_cq:
+       kfree(cq);
        return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(__ib_alloc_cq_user);
 
 /**
- * ib_free_cq - free a completion queue
+ * ib_free_cq_user - free a completion queue
  * @cq:                completion queue to free.
  * @udata:     User data or NULL for kernel object
  */
 void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
-       int ret;
-
        if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
                return;
 
@@ -218,9 +276,12 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
                WARN_ON_ONCE(1);
        }
 
-       kfree(cq->wc);
        rdma_restrack_del(&cq->res);
-       ret = cq->device->ops.destroy_cq(cq, udata);
-       WARN_ON_ONCE(ret);
+       cq->device->ops.destroy_cq(cq, udata);
+       if (cq->dim)
+               cancel_work_sync(&cq->dim->work);
+       kfree(cq->dim);
+       kfree(cq->wc);
+       kfree(cq);
 }
 EXPORT_SYMBOL(ib_free_cq_user);
index 3352a107b4a36756518087eef46615b55372e2a0..9773145dee0996d0d058230bc6ce18f9c138d34f 100644 (file)
@@ -46,6 +46,7 @@
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 #include "core_priv.h"
 #include "restrack.h"
@@ -270,7 +271,7 @@ struct ib_port_data_rcu {
        struct ib_port_data pdata[];
 };
 
-static int ib_device_check_mandatory(struct ib_device *device)
+static void ib_device_check_mandatory(struct ib_device *device)
 {
 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
        static const struct {
@@ -305,8 +306,6 @@ static int ib_device_check_mandatory(struct ib_device *device)
                        break;
                }
        }
-
-       return 0;
 }
 
 /*
@@ -375,7 +374,7 @@ struct ib_device *ib_device_get_by_name(const char *name,
        down_read(&devices_rwsem);
        device = __ib_device_get_by_name(name);
        if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
-           device->driver_id != driver_id)
+           device->ops.driver_id != driver_id)
                device = NULL;
 
        if (device) {
@@ -449,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
        return 0;
 }
 
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+       if (use_dim > 1)
+               return -EINVAL;
+       ibdev->use_cq_dim = use_dim;
+
+       return 0;
+}
+
 static int alloc_name(struct ib_device *ibdev, const char *name)
 {
        struct ib_device *device;
@@ -494,10 +502,12 @@ static void ib_device_release(struct device *device)
        if (dev->port_data) {
                ib_cache_release_one(dev);
                ib_security_release_port_pkey_list(dev);
+               rdma_counter_release(dev);
                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
                                       pdata[0]),
                          rcu_head);
        }
+
        xa_destroy(&dev->compat_devs);
        xa_destroy(&dev->client_data);
        kfree_rcu(dev, rcu_head);
@@ -1193,10 +1203,7 @@ static int setup_device(struct ib_device *device)
        int ret;
 
        setup_dma_device(device);
-
-       ret = ib_device_check_mandatory(device);
-       if (ret)
-               return ret;
+       ib_device_check_mandatory(device);
 
        ret = setup_port_data(device);
        if (ret) {
@@ -1321,6 +1328,8 @@ int ib_register_device(struct ib_device *device, const char *name)
 
        ib_device_register_rdmacg(device);
 
+       rdma_counter_init(device);
+
        /*
         * Ensure that ADD uevent is not fired because it
         * is too early and device is not initialized yet.
@@ -1479,7 +1488,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id)
 
        down_read(&devices_rwsem);
        xa_for_each (&devices, index, ib_dev) {
-               if (ib_dev->driver_id != driver_id)
+               if (ib_dev->ops.driver_id != driver_id)
                        continue;
 
                get_device(&ib_dev->dev);
@@ -1749,6 +1758,104 @@ void ib_unregister_client(struct ib_client *client)
 }
 EXPORT_SYMBOL(ib_unregister_client);
 
+static int __ib_get_global_client_nl_info(const char *client_name,
+                                         struct ib_client_nl_info *res)
+{
+       struct ib_client *client;
+       unsigned long index;
+       int ret = -ENOENT;
+
+       down_read(&clients_rwsem);
+       xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+               if (strcmp(client->name, client_name) != 0)
+                       continue;
+               if (!client->get_global_nl_info) {
+                       ret = -EOPNOTSUPP;
+                       break;
+               }
+               ret = client->get_global_nl_info(res);
+               if (WARN_ON(ret == -ENOENT))
+                       ret = -EINVAL;
+               if (!ret && res->cdev)
+                       get_device(res->cdev);
+               break;
+       }
+       up_read(&clients_rwsem);
+       return ret;
+}
+
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
+                                  const char *client_name,
+                                  struct ib_client_nl_info *res)
+{
+       unsigned long index;
+       void *client_data;
+       int ret = -ENOENT;
+
+       down_read(&ibdev->client_data_rwsem);
+       xan_for_each_marked (&ibdev->client_data, index, client_data,
+                            CLIENT_DATA_REGISTERED) {
+               struct ib_client *client = xa_load(&clients, index);
+
+               if (!client || strcmp(client->name, client_name) != 0)
+                       continue;
+               if (!client->get_nl_info) {
+                       ret = -EOPNOTSUPP;
+                       break;
+               }
+               ret = client->get_nl_info(ibdev, client_data, res);
+               if (WARN_ON(ret == -ENOENT))
+                       ret = -EINVAL;
+
+               /*
+                * The cdev is guaranteed valid as long as we are inside the
+                * client_data_rwsem as remove_one can't be called. Keep it
+                * valid for the caller.
+                */
+               if (!ret && res->cdev)
+                       get_device(res->cdev);
+               break;
+       }
+       up_read(&ibdev->client_data_rwsem);
+
+       return ret;
+}
+
+/**
+ * ib_get_client_nl_info - Fetch the nl_info from a client
+ * @ibdev: IB device
+ * @client_name: Name of the client
+ * @res: Result of the query
+ */
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+                         struct ib_client_nl_info *res)
+{
+       int ret;
+
+       if (ibdev)
+               ret = __ib_get_client_nl_info(ibdev, client_name, res);
+       else
+               ret = __ib_get_global_client_nl_info(client_name, res);
+#ifdef CONFIG_MODULES
+       if (ret == -ENOENT) {
+               request_module("rdma-client-%s", client_name);
+               if (ibdev)
+                       ret = __ib_get_client_nl_info(ibdev, client_name, res);
+               else
+                       ret = __ib_get_global_client_nl_info(client_name, res);
+       }
+#endif
+       if (ret) {
+               if (ret == -ENOENT)
+                       return -EOPNOTSUPP;
+               return ret;
+       }
+
+       if (WARN_ON(!res->cdev))
+               return -EINVAL;
+       return 0;
+}
+
 /**
  * ib_set_client_data - Set IB client context
  * @device:Device to set context for
@@ -2039,7 +2146,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
                                    (uintptr_t)ndev) {
                if (rcu_access_pointer(cur->netdev) == ndev &&
                    (driver_id == RDMA_DRIVER_UNKNOWN ||
-                    cur->ib_dev->driver_id == driver_id) &&
+                    cur->ib_dev->ops.driver_id == driver_id) &&
                    ib_device_try_get(cur->ib_dev)) {
                        res = cur->ib_dev;
                        break;
@@ -2344,12 +2451,28 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 
 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
 
+       if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
+               WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
+                       dev_ops->driver_id != ops->driver_id);
+               dev_ops->driver_id = ops->driver_id;
+       }
+       if (ops->owner) {
+               WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
+               dev_ops->owner = ops->owner;
+       }
+       if (ops->uverbs_abi_ver)
+               dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
+
+       dev_ops->uverbs_no_driver_id_binding |=
+               ops->uverbs_no_driver_id_binding;
+
        SET_DEVICE_OP(dev_ops, add_gid);
        SET_DEVICE_OP(dev_ops, advise_mr);
        SET_DEVICE_OP(dev_ops, alloc_dm);
        SET_DEVICE_OP(dev_ops, alloc_fmr);
        SET_DEVICE_OP(dev_ops, alloc_hw_stats);
        SET_DEVICE_OP(dev_ops, alloc_mr);
+       SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
        SET_DEVICE_OP(dev_ops, alloc_mw);
        SET_DEVICE_OP(dev_ops, alloc_pd);
        SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
@@ -2357,6 +2480,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, alloc_xrcd);
        SET_DEVICE_OP(dev_ops, attach_mcast);
        SET_DEVICE_OP(dev_ops, check_mr_status);
+       SET_DEVICE_OP(dev_ops, counter_alloc_stats);
+       SET_DEVICE_OP(dev_ops, counter_bind_qp);
+       SET_DEVICE_OP(dev_ops, counter_dealloc);
+       SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+       SET_DEVICE_OP(dev_ops, counter_update_stats);
        SET_DEVICE_OP(dev_ops, create_ah);
        SET_DEVICE_OP(dev_ops, create_counters);
        SET_DEVICE_OP(dev_ops, create_cq);
@@ -2409,6 +2537,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, iw_reject);
        SET_DEVICE_OP(dev_ops, iw_rem_ref);
        SET_DEVICE_OP(dev_ops, map_mr_sg);
+       SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
        SET_DEVICE_OP(dev_ops, map_phys_fmr);
        SET_DEVICE_OP(dev_ops, mmap);
        SET_DEVICE_OP(dev_ops, modify_ah);
@@ -2445,6 +2574,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, unmap_fmr);
 
        SET_OBJ_SIZE(dev_ops, ib_ah);
+       SET_OBJ_SIZE(dev_ops, ib_cq);
        SET_OBJ_SIZE(dev_ops, ib_pd);
        SET_OBJ_SIZE(dev_ops, ib_srq);
        SET_OBJ_SIZE(dev_ops, ib_ucontext);
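Besides the per-verb copying, ib_set_device_ops() now also merges driver identity out of the ops table: driver_id, owner and uverbs_abi_ver are taken from the first ops structure that sets them, with a WARN_ON if a later one disagrees, and uverbs_no_driver_id_binding is ORed in. A hedged driver-side sketch of what that enables (all example_* names are invented, and a real driver uses its own enum rdma_driver_id value):

#include <linux/module.h>
#include <rdma/ib_verbs.h>

static const struct ib_device_ops example_dev_ops = {
	.owner		= THIS_MODULE,
	.driver_id	= RDMA_DRIVER_RXE,	/* anything except RDMA_DRIVER_UNKNOWN is copied */
	.uverbs_abi_ver	= 2,
	/* ... verb callbacks ... */
};

static int example_register(struct ib_device *ibdev)
{
	/* identity and callbacks now both travel through the ops table */
	ib_set_device_ops(ibdev, &example_dev_ops);
	return ib_register_device(ibdev, "example%d");
}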
index d117f21ce9fdeedb5ce5e5bfb62c51fd83661092..c0e2df128b3449948fd9d0d7a80c158b1cd1c887 100644 (file)
@@ -34,14 +34,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
 EXPORT_SYMBOL(ib_mr_pool_put);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-               enum ib_mr_type type, u32 max_num_sg)
+               enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg)
 {
        struct ib_mr *mr;
        unsigned long flags;
        int ret, i;
 
        for (i = 0; i < nr; i++) {
-               mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+               if (type == IB_MR_TYPE_INTEGRITY)
+                       mr = ib_alloc_mr_integrity(qp->pd, max_num_sg,
+                                                  max_num_meta_sg);
+               else
+                       mr = ib_alloc_mr(qp->pd, type, max_num_sg);
                if (IS_ERR(mr)) {
                        ret = PTR_ERR(mr);
                        goto out;
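ib_mr_pool_init() gains a max_num_meta_sg argument so that a pool of IB_MR_TYPE_INTEGRITY MRs can be sized for both data and metadata scatterlists; other MR types simply ignore the new argument. The in-tree caller is rdma_rw_init_mrs() further down; the following hedged sketch only isolates the call shape (the wrapper name is invented):

#include <rdma/ib_verbs.h>
#include "mr_pool.h"	/* core-internal header declaring ib_mr_pool_init() */

static int example_alloc_pi_mr_pool(struct ib_qp *qp, int nr_mrs,
				    u32 max_sg, u32 max_meta_sg)
{
	/* IB_MR_TYPE_INTEGRITY routes allocation to ib_alloc_mr_integrity() */
	return ib_mr_pool_init(qp, &qp->sig_mrs, nr_mrs,
			       IB_MR_TYPE_INTEGRITY, max_sg, max_meta_sg);
}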
index 69188cbbd99bd53ffbf4e49e738805dc240344fa..783e465e7c412988903088911fa655c4aa169346 100644 (file)
 #include "cma_priv.h"
 #include "restrack.h"
 
+/*
+ * Sort array elements by the netlink attribute name
+ */
 static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
-       [RDMA_NLDEV_ATTR_DEV_INDEX]     = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_DEV_NAME]      = { .type = NLA_NUL_STRING,
-                                           .len = IB_DEVICE_NAME_MAX - 1},
-       [RDMA_NLDEV_ATTR_PORT_INDEX]    = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_FW_VERSION]    = { .type = NLA_NUL_STRING,
-                                           .len = IB_FW_VERSION_NAME_MAX - 1},
-       [RDMA_NLDEV_ATTR_NODE_GUID]     = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_LID]           = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_SM_LID]        = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_LMC]           = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_PORT_STATE]    = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY]   = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]     = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING,
-                                            .len = 16 },
-       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_RES_QP]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_QP_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_LQPN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_RQPN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_RQ_PSN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_SQ_PSN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_TYPE]              = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_STATE]             = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_PID]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_KERN_NAME]         = { .type = NLA_NUL_STRING,
-                                                   .len = TASK_COMM_LEN },
+       [RDMA_NLDEV_ATTR_CHARDEV]               = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_CHARDEV_ABI]           = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_CHARDEV_NAME]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_CHARDEV_TYPE]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+       [RDMA_NLDEV_ATTR_DEV_DIM]               = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DEV_INDEX]             = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_DEV_NAME]              = { .type = NLA_NUL_STRING,
+                                       .len = IB_DEVICE_NAME_MAX },
+       [RDMA_NLDEV_ATTR_DEV_NODE_TYPE]         = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DEV_PROTOCOL]          = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_DRIVER]                = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_DRIVER_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]     = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DRIVER_STRING]         = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_DRIVER_S32]            = { .type = NLA_S32 },
+       [RDMA_NLDEV_ATTR_DRIVER_S64]            = { .type = NLA_S64 },
+       [RDMA_NLDEV_ATTR_DRIVER_U32]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_DRIVER_U64]            = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_FW_VERSION]            = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_LID]                   = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_LINK_TYPE]             = { .type = NLA_NUL_STRING,
+                                       .len = IFNAMSIZ },
+       [RDMA_NLDEV_ATTR_LMC]                   = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_NDEV_INDEX]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_NDEV_NAME]             = { .type = NLA_NUL_STRING,
+                                       .len = IFNAMSIZ },
+       [RDMA_NLDEV_ATTR_NODE_GUID]             = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_PORT_INDEX]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_PORT_PHYS_STATE]       = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_PORT_STATE]            = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_RES_CM_ID]             = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY]       = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_PS]                = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_SRC_ADDR]  = {
-                       .len = sizeof(struct __kernel_sockaddr_storage) },
-       [RDMA_NLDEV_ATTR_RES_DST_ADDR]  = {
-                       .len = sizeof(struct __kernel_sockaddr_storage) },
        [RDMA_NLDEV_ATTR_RES_CQ]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_CQ_ENTRY]          = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_CQE]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_USECNT]            = { .type = NLA_U64 },
-       [RDMA_NLDEV_ATTR_RES_POLL_CTX]          = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_RES_MR]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_MR_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_RKEY]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_LKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_CQ_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_DST_ADDR]          = {
+                       .len = sizeof(struct __kernel_sockaddr_storage) },
        [RDMA_NLDEV_ATTR_RES_IOVA]              = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_KERN_NAME]         = { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_RES_LKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]    = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_LQPN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_MR]                = { .type = NLA_NESTED },
        [RDMA_NLDEV_ATTR_RES_MRLEN]             = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_MR_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE]    = { .type = NLA_U8 },
        [RDMA_NLDEV_ATTR_RES_PD]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_PD_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY]    = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_NDEV_INDEX]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_NDEV_NAME]             = { .type = NLA_NUL_STRING,
-                                                   .len = IFNAMSIZ },
-       [RDMA_NLDEV_ATTR_DRIVER]                = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_DRIVER_ENTRY]          = { .type = NLA_NESTED },
-       [RDMA_NLDEV_ATTR_DRIVER_STRING]         = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
-       [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE]     = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DRIVER_S32]            = { .type = NLA_S32 },
-       [RDMA_NLDEV_ATTR_DRIVER_U32]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_DRIVER_S64]            = { .type = NLA_S64 },
-       [RDMA_NLDEV_ATTR_DRIVER_U64]            = { .type = NLA_U64 },
        [RDMA_NLDEV_ATTR_RES_PDN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CQN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_MRN]               = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CM_IDN]            = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
-       [RDMA_NLDEV_ATTR_LINK_TYPE]             = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
-       [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
-       [RDMA_NLDEV_ATTR_DEV_PROTOCOL]          = { .type = NLA_NUL_STRING,
-                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+       [RDMA_NLDEV_ATTR_RES_PD_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_PID]               = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_POLL_CTX]          = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_PS]                = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_QP]                = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_QP_ENTRY]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_RKEY]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_RQPN]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_RQ_PSN]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_SQ_PSN]            = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_SRC_ADDR]          = {
+                       .len = sizeof(struct __kernel_sockaddr_storage) },
+       [RDMA_NLDEV_ATTR_RES_STATE]             = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY]           = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]     = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
+                                       .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+       [RDMA_NLDEV_ATTR_RES_TYPE]              = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_RES_USECNT]            = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_SM_LID]                = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_SUBNET_PREFIX]         = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]   = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_MODE]             = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_RES]              = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER]          = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY]    = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_COUNTER_ID]       = { .type = NLA_U32 },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]       = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY]  = { .type = NLA_NESTED },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+       [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]        = { .type = NLA_U64 },
+       [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID]      = { .type = NLA_U32 },
        [RDMA_NLDEV_NET_NS_FD]                  = { .type = NLA_U32 },
+       [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -232,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
                return -EMSGSIZE;
        if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
                return -EMSGSIZE;
+       if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+               return -EMSGSIZE;
 
        /*
         * Link type is determined on first port and mlx4 device
@@ -532,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
            nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
                goto err;
 
+       if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+               goto err;
+
        if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
                goto err;
        if (!rdma_is_kernel_res(res) &&
@@ -623,6 +649,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
 err:   return -EMSGSIZE;
 }
 
+static int fill_stat_counter_mode(struct sk_buff *msg,
+                                 struct rdma_counter *counter)
+{
+       struct rdma_counter_mode *m = &counter->mode;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+               return -EMSGSIZE;
+
+       if (m->mode == RDMA_COUNTER_MODE_AUTO)
+               if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+                   nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+                       return -EMSGSIZE;
+
+       return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+       struct nlattr *entry_attr;
+
+       entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+       if (!entry_attr)
+               return -EMSGSIZE;
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
+               goto err;
+
+       nla_nest_end(msg, entry_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, entry_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_stat_counter_qps(struct sk_buff *msg,
+                                struct rdma_counter *counter)
+{
+       struct rdma_restrack_entry *res;
+       struct rdma_restrack_root *rt;
+       struct nlattr *table_attr;
+       struct ib_qp *qp = NULL;
+       unsigned long id = 0;
+       int ret = 0;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+
+       rt = &counter->device->res[RDMA_RESTRACK_QP];
+       xa_lock(&rt->xa);
+       xa_for_each(&rt->xa, id, res) {
+               if (!rdma_is_visible_in_pid_ns(res))
+                       continue;
+
+               qp = container_of(res, struct ib_qp, res);
+               if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+                       continue;
+
+               if (!qp->counter || (qp->counter->id != counter->id))
+                       continue;
+
+               ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+               if (ret)
+                       goto err;
+       }
+
+       xa_unlock(&rt->xa);
+       nla_nest_end(msg, table_attr);
+       return 0;
+
+err:
+       xa_unlock(&rt->xa);
+       nla_nest_cancel(msg, table_attr);
+       return ret;
+}
+
+static int fill_stat_hwcounter_entry(struct sk_buff *msg,
+                                    const char *name, u64 value)
+{
+       struct nlattr *entry_attr;
+
+       entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+       if (!entry_attr)
+               return -EMSGSIZE;
+
+       if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+                          name))
+               goto err;
+       if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+                             value, RDMA_NLDEV_ATTR_PAD))
+               goto err;
+
+       nla_nest_end(msg, entry_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, entry_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+                                       struct rdma_counter *counter)
+{
+       struct rdma_hw_stats *st = counter->stats;
+       struct nlattr *table_attr;
+       int i;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+       if (!table_attr)
+               return -EMSGSIZE;
+
+       for (i = 0; i < st->num_counters; i++)
+               if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+                       goto err;
+
+       nla_nest_end(msg, table_attr);
+       return 0;
+
+err:
+       nla_nest_cancel(msg, table_attr);
+       return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+                                 struct rdma_restrack_entry *res,
+                                 uint32_t port)
+{
+       struct rdma_counter *counter =
+               container_of(res, struct rdma_counter, res);
+
+       if (port && port != counter->port)
+               return 0;
+
+       /* Dump it even if the query failed */
+       rdma_counter_query_stats(counter);
+
+       if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+           fill_res_name_pid(msg, &counter->res) ||
+           fill_stat_counter_mode(msg, counter) ||
+           fill_stat_counter_qps(msg, counter) ||
+           fill_stat_counter_hwcounters(msg, counter))
+               return -EMSGSIZE;
+
+       return 0;
+}
+
 static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
 {
@@ -704,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                goto put_done;
        }
 
+       if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+               u8 use_dim;
+
+               use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+               err = ib_device_set_dim(device, use_dim);
+               goto done;
+       }
+
 done:
        ib_device_put(device);
 put_done:
@@ -990,19 +1170,15 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
                .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
                .id = RDMA_NLDEV_ATTR_RES_PDN,
        },
+       [RDMA_RESTRACK_COUNTER] = {
+               .fill_res_func = fill_res_counter_entry,
+               .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
+               .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+               .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+               .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+       },
 };
 
-static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res)
-{
-       /*
-        * 1. Kern resources should be visible in init name space only
-        * 2. Present only resources visible in the current namespace
-        */
-       if (rdma_is_kernel_res(res))
-               return task_active_pid_ns(current) == &init_pid_ns;
-       return task_active_pid_ns(current) == task_active_pid_ns(res->task);
-}
-
 static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack,
                               enum rdma_restrack_type res_type)
@@ -1047,7 +1223,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                goto err;
        }
 
-       if (!is_visible_in_pid_ns(res)) {
+       if (!rdma_is_visible_in_pid_ns(res)) {
                ret = -ENOENT;
                goto err_get;
        }
@@ -1159,7 +1335,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
         * objects.
         */
        xa_for_each(&rt->xa, id, res) {
-               if (!is_visible_in_pid_ns(res))
+               if (!rdma_is_visible_in_pid_ns(res))
                        continue;
 
                if (idx < start || !rdma_restrack_get(res))
@@ -1237,6 +1413,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
 RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
 RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
 RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
 
 static LIST_HEAD(link_ops);
 static DECLARE_RWSEM(link_ops_rwsem);
@@ -1299,7 +1476,7 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
        nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
                    sizeof(ndev_name));
 
-       ndev = dev_get_by_name(&init_net, ndev_name);
+       ndev = dev_get_by_name(sock_net(skb->sk), ndev_name);
        if (!ndev)
                return -ENODEV;
 
@@ -1347,6 +1524,90 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
        return 0;
 }
 
+static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
+                            struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE];
+       struct ib_client_nl_info data = {};
+       struct ib_device *ibdev = NULL;
+       struct sk_buff *msg;
+       u32 index;
+       int err;
+
+       err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+                         extack);
+       if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
+               return -EINVAL;
+
+       nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+                   sizeof(client_name));
+
+       if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
+               index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+               ibdev = ib_device_get_by_index(sock_net(skb->sk), index);
+               if (!ibdev)
+                       return -EINVAL;
+
+               if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+                       data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+                       if (!rdma_is_port_valid(ibdev, data.port)) {
+                               err = -EINVAL;
+                               goto out_put;
+                       }
+               } else {
+                       data.port = -1;
+               }
+       } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+               return -EINVAL;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               err = -ENOMEM;
+               goto out_put;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_GET_CHARDEV),
+                       0, 0);
+
+       data.nl_msg = msg;
+       err = ib_get_client_nl_info(ibdev, client_name, &data);
+       if (err)
+               goto out_nlmsg;
+
+       err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV,
+                               huge_encode_dev(data.cdev->devt),
+                               RDMA_NLDEV_ATTR_PAD);
+       if (err)
+               goto out_data;
+       err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi,
+                               RDMA_NLDEV_ATTR_PAD);
+       if (err)
+               goto out_data;
+       if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME,
+                          dev_name(data.cdev))) {
+               err = -EMSGSIZE;
+               goto out_data;
+       }
+
+       nlmsg_end(msg, nlh);
+       put_device(data.cdev);
+       if (ibdev)
+               ib_device_put(ibdev);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+out_data:
+       put_device(data.cdev);
+out_nlmsg:
+       nlmsg_free(msg);
+out_put:
+       if (ibdev)
+               ib_device_put(ibdev);
+       return err;
+}
+
 static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
 {
@@ -1399,11 +1660,375 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
        return err;
 }
 
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       u32 index, port, mode, mask = 0, qpn, cntn = 0;
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       struct ib_device *device;
+       struct sk_buff *msg;
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       /* Currently only counters for QPs are supported */
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+           !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
+               return -EINVAL;
+
+       if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_SET),
+                       0, 0);
+
+       mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+       if (mode == RDMA_COUNTER_MODE_AUTO) {
+               if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+                       mask = nla_get_u32(
+                               tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+               ret = rdma_counter_set_auto_mode(device, port,
+                                                mask ? true : false, mask);
+               if (ret)
+                       goto err_msg;
+       } else {
+               qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+               if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+                       cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+                       ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+               } else {
+                       ret = rdma_counter_bind_qpn_alloc(device, port,
+                                                         qpn, &cntn);
+               }
+               if (ret)
+                       goto err_msg;
+
+               if (fill_nldev_handle(msg, device) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+                   nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+                       ret = -EMSGSIZE;
+                       goto err_fill;
+               }
+       }
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+       rdma_counter_unbind_qpn(device, port, qpn, cntn);
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       struct ib_device *device;
+       struct sk_buff *msg;
+       u32 index, port, qpn, cntn;
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+           !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+           !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+               return -EINVAL;
+
+       if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_SET),
+                       0, 0);
+
+       cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+       qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+       ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
+       if (ret)
+               goto err_unbind;
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+               ret = -EMSGSIZE;
+               goto err_fill;
+       }
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+       rdma_counter_bind_qpn(device, port, qpn, cntn);
+err_unbind:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+                                        struct nlmsghdr *nlh,
+                                        struct netlink_ext_ack *extack,
+                                        struct nlattr *tb[])
+{
+       struct rdma_hw_stats *stats;
+       struct nlattr *table_attr;
+       struct ib_device *device;
+       int ret, num_cnts, i;
+       struct sk_buff *msg;
+       u32 index, port;
+       u64 v;
+
+       if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_GET),
+                       0, 0);
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+               ret = -EMSGSIZE;
+               goto err_msg;
+       }
+
+       stats = device->port_data ? device->port_data[port].hw_stats : NULL;
+       if (stats == NULL) {
+               ret = -EINVAL;
+               goto err_msg;
+       }
+       mutex_lock(&stats->lock);
+
+       num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+       if (num_cnts < 0) {
+               ret = -EINVAL;
+               goto err_stats;
+       }
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+       if (!table_attr) {
+               ret = -EMSGSIZE;
+               goto err_stats;
+       }
+       for (i = 0; i < num_cnts; i++) {
+               v = stats->value[i] +
+                       rdma_counter_get_hwstat_value(device, port, i);
+               if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+                       ret = -EMSGSIZE;
+                       goto err_table;
+               }
+       }
+       nla_nest_end(msg, table_attr);
+
+       mutex_unlock(&stats->lock);
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_table:
+       nla_nest_cancel(msg, table_attr);
+err_stats:
+       mutex_unlock(&stats->lock);
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+                           struct netlink_ext_ack *extack, struct nlattr *tb[])
+
+{
+       static enum rdma_nl_counter_mode mode;
+       static enum rdma_nl_counter_mask mask;
+       struct ib_device *device;
+       struct sk_buff *msg;
+       u32 index, port;
+       int ret;
+
+       if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+               return nldev_res_get_counter_doit(skb, nlh, extack);
+
+       if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] ||
+           !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+               return -EINVAL;
+
+       index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
+       if (!device)
+               return -EINVAL;
+
+       port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+       if (!rdma_is_port_valid(device, port)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!msg) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_STAT_GET),
+                       0, 0);
+
+       ret = rdma_counter_get_mode(device, port, &mode, &mask);
+       if (ret)
+               goto err_msg;
+
+       if (fill_nldev_handle(msg, device) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode))
+               goto err_msg;
+
+       if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+           nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask))
+               goto err_msg;
+
+       nlmsg_end(msg, nlh);
+       ib_device_put(device);
+       return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_msg:
+       nlmsg_free(msg);
+err:
+       ib_device_put(device);
+       return ret;
+}
+
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                              struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       int ret;
+
+       ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       if (ret)
+               return -EINVAL;
+
+       if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+               return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
+       switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+       case RDMA_NLDEV_ATTR_RES_QP:
+               ret = stat_get_doit_qp(skb, nlh, extack, tb);
+               break;
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+                                struct netlink_callback *cb)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       int ret;
+
+       ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, NULL);
+       if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+               return -EINVAL;
+
+       switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+       case RDMA_NLDEV_ATTR_RES_QP:
+               ret = nldev_res_get_counter_dumpit(skb, cb);
+               break;
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        [RDMA_NLDEV_CMD_GET] = {
                .doit = nldev_get_doit,
                .dump = nldev_get_dumpit,
        },
+       [RDMA_NLDEV_CMD_GET_CHARDEV] = {
+               .doit = nldev_get_chardev,
+       },
        [RDMA_NLDEV_CMD_SET] = {
                .doit = nldev_set_doit,
                .flags = RDMA_NL_ADMIN_PERM,
@@ -1449,6 +2074,17 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        },
        [RDMA_NLDEV_CMD_SYS_SET] = {
                .doit = nldev_set_sys_set_doit,
+       },
+       [RDMA_NLDEV_CMD_STAT_SET] = {
+               .doit = nldev_stat_set_doit,
+               .flags = RDMA_NL_ADMIN_PERM,
+       },
+       [RDMA_NLDEV_CMD_STAT_GET] = {
+               .doit = nldev_stat_get_doit,
+               .dump = nldev_stat_get_dumpit,
+       },
+       [RDMA_NLDEV_CMD_STAT_DEL] = {
+               .doit = nldev_stat_del_doit,
                .flags = RDMA_NL_ADMIN_PERM,
        },
 };
index 3b5ff2f7b5f8759416f56b69758c161c18afbfc2..bddff426ee0f0d37dfb32a9e03f97dc9cde7a9ad 100644 (file)
@@ -6,6 +6,7 @@
 #include <rdma/rdma_cm.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mutex.h>
 #include <linux/sched/task.h>
 #include <linux/pid_namespace.h>
@@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type)
                [RDMA_RESTRACK_CM_ID] = "CM_ID",
                [RDMA_RESTRACK_MR] = "MR",
                [RDMA_RESTRACK_CTX] = "CTX",
+               [RDMA_RESTRACK_COUNTER] = "COUNTER",
        };
 
        return names[type];
@@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
                return container_of(res, struct ib_mr, res)->device;
        case RDMA_RESTRACK_CTX:
                return container_of(res, struct ib_ucontext, res)->device;
+       case RDMA_RESTRACK_COUNTER:
+               return container_of(res, struct rdma_counter, res)->device;
        default:
                WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
                return NULL;
@@ -190,6 +194,20 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res,
 }
 EXPORT_SYMBOL(rdma_restrack_set_task);
 
+/**
+ * rdma_restrack_attach_task() - attach the task onto this resource
+ * @res:  resource entry
+ * @task: the task to attach; the current task will be used if it is NULL.
+ */
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+                              struct task_struct *task)
+{
+       if (res->task)
+               put_task_struct(res->task);
+       get_task_struct(task);
+       res->task = task;
+}
+
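rdma_restrack_attach_task() gives a resource an explicit owning task, which the new counter objects need because they can belong to a user task without going through a ucontext. A hedged sketch of a plausible caller (not part of this hunk; the helper name is invented):

static void example_counter_set_owner(struct rdma_counter *counter)
{
	counter->res.type = RDMA_RESTRACK_COUNTER;
	rdma_restrack_attach_task(&counter->res, current);
	/* the COUNTER special case below keys the xarray slot by counter->id */
	rdma_restrack_uadd(&counter->res);
}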
 static void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
        struct ib_device *dev = res_to_dev(res);
@@ -203,15 +221,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
 
        kref_init(&res->kref);
        init_completion(&res->comp);
-       if (res->type != RDMA_RESTRACK_QP)
-               ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
-                               &rt->next_id, GFP_KERNEL);
-       else {
+       if (res->type == RDMA_RESTRACK_QP) {
                /* Special case to ensure that LQPN points to right QP */
                struct ib_qp *qp = container_of(res, struct ib_qp, res);
 
                ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
                res->id = ret ? 0 : qp->qp_num;
+       } else if (res->type == RDMA_RESTRACK_COUNTER) {
+               /* Special case to ensure that cntn points to the right counter */
+               struct rdma_counter *counter;
+
+               counter = container_of(res, struct rdma_counter, res);
+               ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+               res->id = ret ? 0 : counter->id;
+       } else {
+               ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+                                     &rt->next_id, GFP_KERNEL);
        }
 
        if (!ret)
@@ -237,7 +262,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd);
  */
 void rdma_restrack_uadd(struct rdma_restrack_entry *res)
 {
-       if (res->type != RDMA_RESTRACK_CM_ID)
+       if ((res->type != RDMA_RESTRACK_CM_ID) &&
+           (res->type != RDMA_RESTRACK_COUNTER))
                res->task = NULL;
 
        if (!res->task)
@@ -323,3 +349,16 @@ out:
        }
 }
 EXPORT_SYMBOL(rdma_restrack_del);
+
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+       /*
+        * 1. Kernel resources should be visible in the init
+        *    namespace only
+        * 2. Present only resources visible in the current
+        *    namespace
+        */
+       if (rdma_is_kernel_res(res))
+               return task_active_pid_ns(current) == &init_pid_ns;
+       return task_active_pid_ns(current) == task_active_pid_ns(res->task);
+}
index 09a1fbdf578ed1a87d61ecb9aeca9b314b15424a..7bd177cc0a6179c635532ed7f718ab432d07a069 100644 (file)
@@ -25,4 +25,7 @@ struct rdma_restrack_root {
 
 int rdma_restrack_init(struct ib_device *dev);
 void rdma_restrack_clean(struct ib_device *dev);
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+                              struct task_struct *task);
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res);
 #endif /* _RDMA_CORE_RESTRACK_H_ */
index 32ca8429eaaea17448b456ada31d59b6cc467b7d..dce06108c8c3fe39d9bd2d28ae9d88dd6cab82c7 100644 (file)
@@ -51,10 +51,34 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
        return false;
 }
 
-static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+                                          bool pi_support)
 {
+       u32 max_pages;
+
+       if (pi_support)
+               max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+       else
+               max_pages = dev->attrs.max_fast_reg_page_list_len;
+
        /* arbitrary limit to avoid allocating gigantic resources */
-       return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+       return min_t(u32, max_pages, 256);
+}
+
+static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
+{
+       int count = 0;
+
+       if (reg->mr->need_inval) {
+               reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+               reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+               reg->inv_wr.next = &reg->reg_wr.wr;
+               count++;
+       } else {
+               reg->inv_wr.next = NULL;
+       }
+
+       return count;
 }
 
 /* Caller must have zero-initialized *reg. */
@@ -62,7 +86,8 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
                struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                u32 sg_cnt, u32 offset)
 {
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        u32 nents = min(sg_cnt, pages_per_mr);
        int count = 0, ret;
 
@@ -70,14 +95,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
        if (!reg->mr)
                return -EAGAIN;
 
-       if (reg->mr->need_inval) {
-               reg->inv_wr.opcode = IB_WR_LOCAL_INV;
-               reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
-               reg->inv_wr.next = &reg->reg_wr.wr;
-               count++;
-       } else {
-               reg->inv_wr.next = NULL;
-       }
+       count += rdma_rw_inv_key(reg);
 
        ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
        if (ret < 0 || ret < nents) {
@@ -102,7 +120,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct rdma_rw_reg_ctx *prev = NULL;
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        int i, j, ret = 0, count = 0;
 
        ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
@@ -343,13 +362,14 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct ib_device *dev = qp->pd->device;
-       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+       u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+                                                   qp->integrity_en);
        struct ib_rdma_wr *rdma_wr;
-       struct ib_send_wr *prev_wr = NULL;
        int count = 0, ret;
 
        if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
-               pr_err("SG count too large\n");
+               pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n",
+                      sg_cnt, prot_sg_cnt, pages_per_mr);
                return -EINVAL;
        }
 
@@ -358,75 +378,58 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                return -ENOMEM;
        sg_cnt = ret;
 
-       ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-       if (!ret) {
-               ret = -ENOMEM;
-               goto out_unmap_sg;
+       if (prot_sg_cnt) {
+               ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+               if (!ret) {
+                       ret = -ENOMEM;
+                       goto out_unmap_sg;
+               }
+               prot_sg_cnt = ret;
        }
-       prot_sg_cnt = ret;
 
        ctx->type = RDMA_RW_SIG_MR;
        ctx->nr_ops = 1;
-       ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
-       if (!ctx->sig) {
+       ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL);
+       if (!ctx->reg) {
                ret = -ENOMEM;
                goto out_unmap_prot_sg;
        }
 
-       ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
-       if (ret < 0)
-               goto out_free_ctx;
-       count += ret;
-       prev_wr = &ctx->sig->data.reg_wr.wr;
-
-       ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
-                                 prot_sg, prot_sg_cnt, 0);
-       if (ret < 0)
-               goto out_destroy_data_mr;
-       count += ret;
-
-       if (ctx->sig->prot.inv_wr.next)
-               prev_wr->next = &ctx->sig->prot.inv_wr;
-       else
-               prev_wr->next = &ctx->sig->prot.reg_wr.wr;
-       prev_wr = &ctx->sig->prot.reg_wr.wr;
-
-       ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
-       if (!ctx->sig->sig_mr) {
+       ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+       if (!ctx->reg->mr) {
                ret = -EAGAIN;
-               goto out_destroy_prot_mr;
+               goto out_free_ctx;
        }
 
-       if (ctx->sig->sig_mr->need_inval) {
-               memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+       count += rdma_rw_inv_key(ctx->reg);
 
-               ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
-               ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+       memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
 
-               prev_wr->next = &ctx->sig->sig_inv_wr;
-               prev_wr = &ctx->sig->sig_inv_wr;
+       ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg,
+                             prot_sg_cnt, NULL, SZ_4K);
+       if (unlikely(ret)) {
+               pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt);
+               goto out_destroy_sig_mr;
        }
 
-       ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
-       ctx->sig->sig_wr.wr.wr_cqe = NULL;
-       ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
-       ctx->sig->sig_wr.wr.num_sge = 1;
-       ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
-       ctx->sig->sig_wr.sig_attrs = sig_attrs;
-       ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
-       if (prot_sg_cnt)
-               ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
-       prev_wr->next = &ctx->sig->sig_wr.wr;
-       prev_wr = &ctx->sig->sig_wr.wr;
+       ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+       ctx->reg->reg_wr.wr.wr_cqe = NULL;
+       ctx->reg->reg_wr.wr.num_sge = 0;
+       ctx->reg->reg_wr.wr.send_flags = 0;
+       ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+       if (rdma_protocol_iwarp(qp->device, port_num))
+               ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+       ctx->reg->reg_wr.mr = ctx->reg->mr;
+       ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
        count++;
 
-       ctx->sig->sig_sge.addr = 0;
-       ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
-       if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
-               ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+       ctx->reg->sge.addr = ctx->reg->mr->iova;
+       ctx->reg->sge.length = ctx->reg->mr->length;
+       if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+               ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
 
-       rdma_wr = &ctx->sig->data.wr;
-       rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+       rdma_wr = &ctx->reg->wr;
+       rdma_wr->wr.sg_list = &ctx->reg->sge;
        rdma_wr->wr.num_sge = 1;
        rdma_wr->remote_addr = remote_addr;
        rdma_wr->rkey = rkey;
@@ -434,21 +437,18 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
        else
                rdma_wr->wr.opcode = IB_WR_RDMA_READ;
-       prev_wr->next = &rdma_wr->wr;
-       prev_wr = &rdma_wr->wr;
+       ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
        count++;
 
        return count;
 
-out_destroy_prot_mr:
-       if (prot_sg_cnt)
-               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
-out_destroy_data_mr:
-       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_destroy_sig_mr:
+       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 out_free_ctx:
-       kfree(ctx->sig);
+       kfree(ctx->reg);
 out_unmap_prot_sg:
-       ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+       if (prot_sg_cnt)
+               ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
        ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
        return ret;
@@ -491,22 +491,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 
        switch (ctx->type) {
        case RDMA_RW_SIG_MR:
-               rdma_rw_update_lkey(&ctx->sig->data, true);
-               if (ctx->sig->prot.mr)
-                       rdma_rw_update_lkey(&ctx->sig->prot, true);
-       
-               ctx->sig->sig_mr->need_inval = true;
-               ib_update_fast_reg_key(ctx->sig->sig_mr,
-                       ib_inc_rkey(ctx->sig->sig_mr->lkey));
-               ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
-
-               if (ctx->sig->data.inv_wr.next)
-                       first_wr = &ctx->sig->data.inv_wr;
-               else
-                       first_wr = &ctx->sig->data.reg_wr.wr;
-               last_wr = &ctx->sig->data.wr.wr;
-               break;
        case RDMA_RW_MR:
+               /* fallthrough */
                for (i = 0; i < ctx->nr_ops; i++) {
                        rdma_rw_update_lkey(&ctx->reg[i],
                                ctx->reg[i].wr.wr.opcode !=
@@ -605,7 +591,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 
 /**
  * rdma_rw_ctx_destroy_signature - release all resources allocated by
- *     rdma_rw_ctx_init_signature
+ *     rdma_rw_ctx_signature_init
  * @ctx:       context to release
  * @qp:                queue pair to operate on
  * @port_num:  port num to which the connection is bound
@@ -623,16 +609,12 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
        if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
                return;
 
-       ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
-       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+       kfree(ctx->reg);
 
-       if (ctx->sig->prot.mr) {
-               ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+       ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+       if (prot_sg_cnt)
                ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-       }
-
-       ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
-       kfree(ctx->sig);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
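With the signature context collapsed into a single rdma_rw_reg_ctx, a ULP's protected I/O path becomes one init, one post and one destroy around a lone IB_WR_REG_MR_INTEGRITY registration. A hedged usage sketch (function and variable names are invented; error handling trimmed):

#include <rdma/rw.h>

static int example_pi_read(struct ib_qp *qp, u8 port,
			   struct scatterlist *sg, u32 sg_cnt,
			   struct scatterlist *prot_sg, u32 prot_sg_cnt,
			   struct ib_sig_attrs *sig_attrs,
			   u64 raddr, u32 rkey, struct ib_cqe *done)
{
	struct rdma_rw_ctx ctx = {};
	int ret;

	ret = rdma_rw_ctx_signature_init(&ctx, qp, port, sg, sg_cnt,
					 prot_sg, prot_sg_cnt, sig_attrs,
					 raddr, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* posts the integrity registration chained in front of the RDMA READ */
	ret = rdma_rw_ctx_post(&ctx, qp, port, done, NULL);
	if (ret)
		rdma_rw_ctx_destroy_signature(&ctx, qp, port, sg, sg_cnt,
					      prot_sg, prot_sg_cnt,
					      DMA_FROM_DEVICE);
	return ret;
}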
 
@@ -653,7 +635,7 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
        unsigned int mr_pages;
 
        if (rdma_rw_can_use_mr(device, port_num))
-               mr_pages = rdma_rw_fr_page_list_len(device);
+               mr_pages = rdma_rw_fr_page_list_len(device, false);
        else
                mr_pages = device->attrs.max_sge_rd;
        return DIV_ROUND_UP(maxpages, mr_pages);
@@ -679,9 +661,8 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
         * we'll need two additional MRs for the registrations and the
         * invalidation.
         */
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-               factor += 6;    /* (inv + reg) * (data + prot + sig) */
-       else if (rdma_rw_can_use_mr(dev, attr->port_num))
+       if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+           rdma_rw_can_use_mr(dev, attr->port_num))
                factor += 2;    /* inv + reg */
 
        attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
@@ -697,20 +678,22 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 {
        struct ib_device *dev = qp->pd->device;
-       u32 nr_mrs = 0, nr_sig_mrs = 0;
+       u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
        int ret = 0;
 
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+       if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
                nr_sig_mrs = attr->cap.max_rdma_ctxs;
-               nr_mrs = attr->cap.max_rdma_ctxs * 2;
+               nr_mrs = attr->cap.max_rdma_ctxs;
+               max_num_sg = rdma_rw_fr_page_list_len(dev, true);
        } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
                nr_mrs = attr->cap.max_rdma_ctxs;
+               max_num_sg = rdma_rw_fr_page_list_len(dev, false);
        }
 
        if (nr_mrs) {
                ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
                                IB_MR_TYPE_MEM_REG,
-                               rdma_rw_fr_page_list_len(dev));
+                               max_num_sg, 0);
                if (ret) {
                        pr_err("%s: failed to allocated %d MRs\n",
                                __func__, nr_mrs);
@@ -720,10 +703,10 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
 
        if (nr_sig_mrs) {
                ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
-                               IB_MR_TYPE_SIGNATURE, 2);
+                               IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
                if (ret) {
                        pr_err("%s: failed to allocated %d SIG MRs\n",
-                               __func__, nr_mrs);
+                               __func__, nr_sig_mrs);
                        goto out_free_rdma_mrs;
                }
        }
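The pools above are only populated when the ULP opts in at QP creation time; IB_QP_CREATE_INTEGRITY_EN replaces the old IB_QP_CREATE_SIGNATURE_EN for that. A brief hedged sketch (the wrapper name is invented):

#include <rdma/ib_verbs.h>

static struct ib_qp *example_create_pi_qp(struct ib_pd *pd,
					  struct ib_qp_init_attr *attr,
					  bool pi_capable)
{
	if (pi_capable)
		attr->create_flags |= IB_QP_CREATE_INTEGRITY_EN;
	/* rdma_rw_init_qp()/rdma_rw_init_mrs() then size the send queue
	 * and the sig_mrs pool accordingly during ib_create_qp() */
	return ib_create_qp(pd, attr);
}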
index c78d0c9646ae5d990ccf72a4cf9b531b4ccf57de..b477295a96c2a6bb2ee47cd4950060f08b92cd1c 100644 (file)
@@ -43,6 +43,7 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_pma.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
 
 struct ib_port;
 
@@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
        return 0;
 }
 
-static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+static ssize_t print_hw_stat(struct ib_device *dev, int port_num,
+                            struct rdma_hw_stats *stats, int index, char *buf)
 {
-       return sprintf(buf, "%llu\n", stats->value[index]);
+       u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+       return sprintf(buf, "%llu\n", stats->value[index] + v);
 }
 
 static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
@@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
        ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
        if (ret)
                goto unlock;
-       ret = print_hw_stat(stats, hsa->index, buf);
+       ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf);
 unlock:
        mutex_unlock(&stats->lock);
 
@@ -999,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
                        goto err;
                port->hw_stats_ag = hsag;
                port->hw_stats = stats;
+               if (device->port_data)
+                       device->port_data[port_num].hw_stats = stats;
        } else {
                struct kobject *kobj = &device->dev.kobj;
                ret = sysfs_create_group(kobj, hsag);
@@ -1289,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = {
 
 void ib_free_port_attrs(struct ib_core_device *coredev)
 {
+       struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+       bool is_full_dev = &device->coredev == coredev;
        struct kobject *p, *t;
 
        list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
@@ -1298,6 +1306,8 @@ void ib_free_port_attrs(struct ib_core_device *coredev)
                if (port->hw_stats_ag)
                        free_hsag(&port->kobj, port->hw_stats_ag);
                kfree(port->hw_stats);
+               if (device->port_data && is_full_dev)
+                       device->port_data[port->port_num].hw_stats = NULL;
 
                if (port->pma_table)
                        sysfs_remove_group(p, port->pma_table);
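
In effect, the per-port hardware counter exposed through sysfs now combines the driver-maintained value with whatever the bound rdma statistic counters have accumulated; conceptually (illustrative, not part of the patch):

	/* Value reported by show_hw_stats() after this change. */
	u64 shown = stats->value[index] +
		    rdma_counter_get_hwstat_value(dev, port_num, index);
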
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
deleted file mode 100644 (file)
index 8e7da2d..0000000
+++ /dev/null
@@ -1,1350 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/completion.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/poll.h>
-#include <linux/sched.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/cdev.h>
-#include <linux/xarray.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-
-#include <linux/nospec.h>
-
-#include <linux/uaccess.h>
-
-#include <rdma/ib.h>
-#include <rdma/ib_cm.h>
-#include <rdma/ib_user_cm.h>
-#include <rdma/ib_marshall.h>
-
-#include "core_priv.h"
-
-MODULE_AUTHOR("Libor Michalek");
-MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
-MODULE_LICENSE("Dual BSD/GPL");
-
-struct ib_ucm_device {
-       int                     devnum;
-       struct cdev             cdev;
-       struct device           dev;
-       struct ib_device        *ib_dev;
-};
-
-struct ib_ucm_file {
-       struct mutex file_mutex;
-       struct file *filp;
-       struct ib_ucm_device *device;
-
-       struct list_head  ctxs;
-       struct list_head  events;
-       wait_queue_head_t poll_wait;
-};
-
-struct ib_ucm_context {
-       int                 id;
-       struct completion   comp;
-       atomic_t            ref;
-       int                 events_reported;
-
-       struct ib_ucm_file *file;
-       struct ib_cm_id    *cm_id;
-       __u64              uid;
-
-       struct list_head    events;    /* list of pending events. */
-       struct list_head    file_list; /* member in file ctx list */
-};
-
-struct ib_ucm_event {
-       struct ib_ucm_context *ctx;
-       struct list_head file_list; /* member in file event list */
-       struct list_head ctx_list;  /* member in ctx event list */
-
-       struct ib_cm_id *cm_id;
-       struct ib_ucm_event_resp resp;
-       void *data;
-       void *info;
-       int data_len;
-       int info_len;
-};
-
-enum {
-       IB_UCM_MAJOR = 231,
-       IB_UCM_BASE_MINOR = 224,
-       IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
-       IB_UCM_NUM_FIXED_MINOR = 32,
-       IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
-};
-
-#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
-static dev_t dynamic_ucm_dev;
-
-static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
-
-static struct ib_client ucm_client = {
-       .name   = "ucm",
-       .add    = ib_ucm_add_one,
-       .remove = ib_ucm_remove_one
-};
-
-static DEFINE_XARRAY_ALLOC(ctx_id_table);
-static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
-
-static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
-{
-       struct ib_ucm_context *ctx;
-
-       xa_lock(&ctx_id_table);
-       ctx = xa_load(&ctx_id_table, id);
-       if (!ctx)
-               ctx = ERR_PTR(-ENOENT);
-       else if (ctx->file != file)
-               ctx = ERR_PTR(-EINVAL);
-       else
-               atomic_inc(&ctx->ref);
-       xa_unlock(&ctx_id_table);
-
-       return ctx;
-}
-
-static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
-{
-       if (atomic_dec_and_test(&ctx->ref))
-               complete(&ctx->comp);
-}
-
-static inline int ib_ucm_new_cm_id(int event)
-{
-       return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
-}
-
-static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
-{
-       struct ib_ucm_event *uevent;
-
-       mutex_lock(&ctx->file->file_mutex);
-       list_del(&ctx->file_list);
-       while (!list_empty(&ctx->events)) {
-
-               uevent = list_entry(ctx->events.next,
-                                   struct ib_ucm_event, ctx_list);
-               list_del(&uevent->file_list);
-               list_del(&uevent->ctx_list);
-               mutex_unlock(&ctx->file->file_mutex);
-
-               /* clear incoming connections. */
-               if (ib_ucm_new_cm_id(uevent->resp.event))
-                       ib_destroy_cm_id(uevent->cm_id);
-
-               kfree(uevent);
-               mutex_lock(&ctx->file->file_mutex);
-       }
-       mutex_unlock(&ctx->file->file_mutex);
-}
-
-static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
-{
-       struct ib_ucm_context *ctx;
-
-       ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
-       if (!ctx)
-               return NULL;
-
-       atomic_set(&ctx->ref, 1);
-       init_completion(&ctx->comp);
-       ctx->file = file;
-       INIT_LIST_HEAD(&ctx->events);
-
-       if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
-               goto error;
-
-       list_add_tail(&ctx->file_list, &file->ctxs);
-       return ctx;
-
-error:
-       kfree(ctx);
-       return NULL;
-}
-
-static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
-                                const struct ib_cm_req_event_param *kreq)
-{
-       ureq->remote_ca_guid             = kreq->remote_ca_guid;
-       ureq->remote_qkey                = kreq->remote_qkey;
-       ureq->remote_qpn                 = kreq->remote_qpn;
-       ureq->qp_type                    = kreq->qp_type;
-       ureq->starting_psn               = kreq->starting_psn;
-       ureq->responder_resources        = kreq->responder_resources;
-       ureq->initiator_depth            = kreq->initiator_depth;
-       ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
-       ureq->flow_control               = kreq->flow_control;
-       ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
-       ureq->retry_count                = kreq->retry_count;
-       ureq->rnr_retry_count            = kreq->rnr_retry_count;
-       ureq->srq                        = kreq->srq;
-       ureq->port                       = kreq->port;
-
-       ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
-       if (kreq->alternate_path)
-               ib_copy_path_rec_to_user(&ureq->alternate_path,
-                                        kreq->alternate_path);
-}
-
-static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
-                                const struct ib_cm_rep_event_param *krep)
-{
-       urep->remote_ca_guid      = krep->remote_ca_guid;
-       urep->remote_qkey         = krep->remote_qkey;
-       urep->remote_qpn          = krep->remote_qpn;
-       urep->starting_psn        = krep->starting_psn;
-       urep->responder_resources = krep->responder_resources;
-       urep->initiator_depth     = krep->initiator_depth;
-       urep->target_ack_delay    = krep->target_ack_delay;
-       urep->failover_accepted   = krep->failover_accepted;
-       urep->flow_control        = krep->flow_control;
-       urep->rnr_retry_count     = krep->rnr_retry_count;
-       urep->srq                 = krep->srq;
-}
-
-static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
-                                     const struct ib_cm_sidr_rep_event_param *krep)
-{
-       urep->status = krep->status;
-       urep->qkey   = krep->qkey;
-       urep->qpn    = krep->qpn;
-};
-
-static int ib_ucm_event_process(const struct ib_cm_event *evt,
-                               struct ib_ucm_event *uvt)
-{
-       void *info = NULL;
-
-       switch (evt->event) {
-       case IB_CM_REQ_RECEIVED:
-               ib_ucm_event_req_get(&uvt->resp.u.req_resp,
-                                    &evt->param.req_rcvd);
-               uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
-               uvt->resp.present  = IB_UCM_PRES_PRIMARY;
-               uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
-                                     IB_UCM_PRES_ALTERNATE : 0);
-               break;
-       case IB_CM_REP_RECEIVED:
-               ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
-                                    &evt->param.rep_rcvd);
-               uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_RTU_RECEIVED:
-               uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_DREQ_RECEIVED:
-               uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_DREP_RECEIVED:
-               uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       case IB_CM_MRA_RECEIVED:
-               uvt->resp.u.mra_resp.timeout =
-                                       evt->param.mra_rcvd.service_timeout;
-               uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_REJ_RECEIVED:
-               uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
-               uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.rej_rcvd.ari_length;
-               info          = evt->param.rej_rcvd.ari;
-               break;
-       case IB_CM_LAP_RECEIVED:
-               ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
-                                        evt->param.lap_rcvd.alternate_path);
-               uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
-               uvt->resp.present = IB_UCM_PRES_ALTERNATE;
-               break;
-       case IB_CM_APR_RECEIVED:
-               uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
-               uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.apr_rcvd.info_len;
-               info          = evt->param.apr_rcvd.apr_info;
-               break;
-       case IB_CM_SIDR_REQ_RECEIVED:
-               uvt->resp.u.sidr_req_resp.pkey =
-                                       evt->param.sidr_req_rcvd.pkey;
-               uvt->resp.u.sidr_req_resp.port =
-                                       evt->param.sidr_req_rcvd.port;
-               uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
-               break;
-       case IB_CM_SIDR_REP_RECEIVED:
-               ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
-                                         &evt->param.sidr_rep_rcvd);
-               uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
-               uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
-               info          = evt->param.sidr_rep_rcvd.info;
-               break;
-       default:
-               uvt->resp.u.send_status = evt->param.send_status;
-               break;
-       }
-
-       if (uvt->data_len) {
-               uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
-               if (!uvt->data)
-                       goto err1;
-
-               uvt->resp.present |= IB_UCM_PRES_DATA;
-       }
-
-       if (uvt->info_len) {
-               uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
-               if (!uvt->info)
-                       goto err2;
-
-               uvt->resp.present |= IB_UCM_PRES_INFO;
-       }
-       return 0;
-
-err2:
-       kfree(uvt->data);
-err1:
-       return -ENOMEM;
-}
-
-static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
-                               const struct ib_cm_event *event)
-{
-       struct ib_ucm_event *uevent;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       ctx = cm_id->context;
-
-       uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
-       if (!uevent)
-               goto err1;
-
-       uevent->ctx = ctx;
-       uevent->cm_id = cm_id;
-       uevent->resp.uid = ctx->uid;
-       uevent->resp.id = ctx->id;
-       uevent->resp.event = event->event;
-
-       result = ib_ucm_event_process(event, uevent);
-       if (result)
-               goto err2;
-
-       mutex_lock(&ctx->file->file_mutex);
-       list_add_tail(&uevent->file_list, &ctx->file->events);
-       list_add_tail(&uevent->ctx_list, &ctx->events);
-       wake_up_interruptible(&ctx->file->poll_wait);
-       mutex_unlock(&ctx->file->file_mutex);
-       return 0;
-
-err2:
-       kfree(uevent);
-err1:
-       /* Destroy new cm_id's */
-       return ib_ucm_new_cm_id(event->event);
-}
-
-static ssize_t ib_ucm_event(struct ib_ucm_file *file,
-                           const char __user *inbuf,
-                           int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_event_get cmd;
-       struct ib_ucm_event *uevent;
-       int result = 0;
-
-       if (out_len < sizeof(struct ib_ucm_event_resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       mutex_lock(&file->file_mutex);
-       while (list_empty(&file->events)) {
-               mutex_unlock(&file->file_mutex);
-
-               if (file->filp->f_flags & O_NONBLOCK)
-                       return -EAGAIN;
-
-               if (wait_event_interruptible(file->poll_wait,
-                                            !list_empty(&file->events)))
-                       return -ERESTARTSYS;
-
-               mutex_lock(&file->file_mutex);
-       }
-
-       uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
-
-       if (ib_ucm_new_cm_id(uevent->resp.event)) {
-               ctx = ib_ucm_ctx_alloc(file);
-               if (!ctx) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-
-               ctx->cm_id = uevent->cm_id;
-               ctx->cm_id->context = ctx;
-               uevent->resp.id = ctx->id;
-       }
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &uevent->resp, sizeof(uevent->resp))) {
-               result = -EFAULT;
-               goto done;
-       }
-
-       if (uevent->data) {
-               if (cmd.data_len < uevent->data_len) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-               if (copy_to_user(u64_to_user_ptr(cmd.data),
-                                uevent->data, uevent->data_len)) {
-                       result = -EFAULT;
-                       goto done;
-               }
-       }
-
-       if (uevent->info) {
-               if (cmd.info_len < uevent->info_len) {
-                       result = -ENOMEM;
-                       goto done;
-               }
-               if (copy_to_user(u64_to_user_ptr(cmd.info),
-                                uevent->info, uevent->info_len)) {
-                       result = -EFAULT;
-                       goto done;
-               }
-       }
-
-       list_del(&uevent->file_list);
-       list_del(&uevent->ctx_list);
-       uevent->ctx->events_reported++;
-
-       kfree(uevent->data);
-       kfree(uevent->info);
-       kfree(uevent);
-done:
-       mutex_unlock(&file->file_mutex);
-       return result;
-}
-
-static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       struct ib_ucm_create_id cmd;
-       struct ib_ucm_create_id_resp resp;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       mutex_lock(&file->file_mutex);
-       ctx = ib_ucm_ctx_alloc(file);
-       mutex_unlock(&file->file_mutex);
-       if (!ctx)
-               return -ENOMEM;
-
-       ctx->uid = cmd.uid;
-       ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
-                                    ib_ucm_event_handler, ctx);
-       if (IS_ERR(ctx->cm_id)) {
-               result = PTR_ERR(ctx->cm_id);
-               goto err1;
-       }
-
-       resp.id = ctx->id;
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp))) {
-               result = -EFAULT;
-               goto err2;
-       }
-       return 0;
-
-err2:
-       ib_destroy_cm_id(ctx->cm_id);
-err1:
-       xa_erase(&ctx_id_table, ctx->id);
-       kfree(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
-                                const char __user *inbuf,
-                                int in_len, int out_len)
-{
-       struct ib_ucm_destroy_id cmd;
-       struct ib_ucm_destroy_id_resp resp;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       xa_lock(&ctx_id_table);
-       ctx = xa_load(&ctx_id_table, cmd.id);
-       if (!ctx)
-               ctx = ERR_PTR(-ENOENT);
-       else if (ctx->file != file)
-               ctx = ERR_PTR(-EINVAL);
-       else
-               __xa_erase(&ctx_id_table, ctx->id);
-       xa_unlock(&ctx_id_table);
-
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       ib_ucm_ctx_put(ctx);
-       wait_for_completion(&ctx->comp);
-
-       /* No new events will be generated after destroying the cm_id. */
-       ib_destroy_cm_id(ctx->cm_id);
-       /* Cleanup events not yet reported to the user. */
-       ib_ucm_cleanup_events(ctx);
-
-       resp.events_reported = ctx->events_reported;
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-       kfree(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
-                             const char __user *inbuf,
-                             int in_len, int out_len)
-{
-       struct ib_ucm_attr_id_resp resp;
-       struct ib_ucm_attr_id cmd;
-       struct ib_ucm_context *ctx;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       resp.service_id   = ctx->cm_id->service_id;
-       resp.service_mask = ctx->cm_id->service_mask;
-       resp.local_id     = ctx->cm_id->local_id;
-       resp.remote_id    = ctx->cm_id->remote_id;
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
-                                  const char __user *inbuf,
-                                  int in_len, int out_len)
-{
-       struct ib_uverbs_qp_attr resp;
-       struct ib_ucm_init_qp_attr cmd;
-       struct ib_ucm_context *ctx;
-       struct ib_qp_attr qp_attr;
-       int result = 0;
-
-       if (out_len < sizeof(resp))
-               return -ENOSPC;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       resp.qp_attr_mask = 0;
-       memset(&qp_attr, 0, sizeof qp_attr);
-       qp_attr.qp_state = cmd.qp_state;
-       result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
-       if (result)
-               goto out;
-
-       ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
-
-       if (copy_to_user(u64_to_user_ptr(cmd.response),
-                        &resp, sizeof(resp)))
-               result = -EFAULT;
-
-out:
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
-{
-       service_id &= service_mask;
-
-       if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
-           ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
-               return -EINVAL;
-
-       return 0;
-}
-
-static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
-                            const char __user *inbuf,
-                            int in_len, int out_len)
-{
-       struct ib_ucm_listen cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
-       if (result)
-               goto out;
-
-       result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
-out:
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
-                            const char __user *inbuf,
-                            int in_len, int out_len)
-{
-       struct ib_ucm_notify cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (IS_ERR(ctx))
-               return PTR_ERR(ctx);
-
-       result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
-       ib_ucm_ctx_put(ctx);
-       return result;
-}
-
-static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
-{
-       void *data;
-
-       *dest = NULL;
-
-       if (!len)
-               return 0;
-
-       data = memdup_user(u64_to_user_ptr(src), len);
-       if (IS_ERR(data))
-               return PTR_ERR(data);
-
-       *dest = data;
-       return 0;
-}
-
-static int ib_ucm_path_get(struct sa_path_rec **path, u64 src)
-{
-       struct ib_user_path_rec upath;
-       struct sa_path_rec  *sa_path;
-
-       *path = NULL;
-
-       if (!src)
-               return 0;
-
-       sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
-       if (!sa_path)
-               return -ENOMEM;
-
-       if (copy_from_user(&upath, u64_to_user_ptr(src),
-                          sizeof(upath))) {
-
-               kfree(sa_path);
-               return -EFAULT;
-       }
-
-       ib_copy_path_rec_from_user(sa_path, &upath);
-       *path = sa_path;
-       return 0;
-}
-
-static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_cm_req_param param;
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_req cmd;
-       int result;
-
-       param.private_data   = NULL;
-       param.primary_path   = NULL;
-       param.alternate_path = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
-       if (result)
-               goto done;
-
-       param.private_data_len           = cmd.len;
-       param.service_id                 = cmd.sid;
-       param.qp_num                     = cmd.qpn;
-       param.qp_type                    = cmd.qp_type;
-       param.starting_psn               = cmd.psn;
-       param.peer_to_peer               = cmd.peer_to_peer;
-       param.responder_resources        = cmd.responder_resources;
-       param.initiator_depth            = cmd.initiator_depth;
-       param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
-       param.flow_control               = cmd.flow_control;
-       param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
-       param.retry_count                = cmd.retry_count;
-       param.rnr_retry_count            = cmd.rnr_retry_count;
-       param.max_cm_retries             = cmd.max_cm_retries;
-       param.srq                        = cmd.srq;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_req(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.primary_path);
-       kfree(param.alternate_path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_cm_rep_param param;
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_rep cmd;
-       int result;
-
-       param.private_data = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       param.qp_num              = cmd.qpn;
-       param.starting_psn        = cmd.psn;
-       param.private_data_len    = cmd.len;
-       param.responder_resources = cmd.responder_resources;
-       param.initiator_depth     = cmd.initiator_depth;
-       param.failover_accepted   = cmd.failover_accepted;
-       param.flow_control        = cmd.flow_control;
-       param.rnr_retry_count     = cmd.rnr_retry_count;
-       param.srq                 = cmd.srq;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               ctx->uid = cmd.uid;
-               result = ib_send_cm_rep(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(param.private_data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
-                                       const char __user *inbuf, int in_len,
-                                       int (*func)(struct ib_cm_id *cm_id,
-                                                   const void *private_data,
-                                                   u8 private_data_len))
-{
-       struct ib_ucm_private_data cmd;
-       struct ib_ucm_context *ctx;
-       const void *private_data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = func(ctx->cm_id, private_data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(private_data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
-}
-
-static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
-}
-
-static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
-                               const char __user *inbuf,
-                               int in_len, int out_len)
-{
-       return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
-}
-
-static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
-                               const char __user *inbuf, int in_len,
-                               int (*func)(struct ib_cm_id *cm_id,
-                                           int status,
-                                           const void *info,
-                                           u8 info_len,
-                                           const void *data,
-                                           u8 data_len))
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_info cmd;
-       const void *data = NULL;
-       const void *info = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
-       if (result)
-               goto done;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
-                             data, cmd.data_len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(data);
-       kfree(info);
-       return result;
-}
-
-static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
-}
-
-static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
-}
-
-static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_mra cmd;
-       const void *data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-       if (result)
-               return result;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-       kfree(data);
-       return result;
-}
-
-static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
-                              const char __user *inbuf,
-                              int in_len, int out_len)
-{
-       struct ib_ucm_context *ctx;
-       struct sa_path_rec *path = NULL;
-       struct ib_ucm_lap cmd;
-       const void *data = NULL;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&path, cmd.path);
-       if (result)
-               goto done;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(data);
-       kfree(path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
-                                   const char __user *inbuf,
-                                   int in_len, int out_len)
-{
-       struct ib_cm_sidr_req_param param = {};
-       struct ib_ucm_context *ctx;
-       struct ib_ucm_sidr_req cmd;
-       int result;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_path_get(&param.path, cmd.path);
-       if (result)
-               goto done;
-
-       param.private_data_len = cmd.len;
-       param.service_id       = cmd.sid;
-       param.timeout_ms       = cmd.timeout;
-       param.max_cm_retries   = cmd.max_cm_retries;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_sidr_req(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.path);
-       return result;
-}
-
-static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
-                                   const char __user *inbuf,
-                                   int in_len, int out_len)
-{
-       struct ib_cm_sidr_rep_param param;
-       struct ib_ucm_sidr_rep cmd;
-       struct ib_ucm_context *ctx;
-       int result;
-
-       param.info = NULL;
-
-       if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
-               return -EFAULT;
-
-       result = ib_ucm_alloc_data(&param.private_data,
-                                  cmd.data, cmd.data_len);
-       if (result)
-               goto done;
-
-       result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
-       if (result)
-               goto done;
-
-       param.qp_num            = cmd.qpn;
-       param.qkey              = cmd.qkey;
-       param.status            = cmd.status;
-       param.info_length       = cmd.info_len;
-       param.private_data_len  = cmd.data_len;
-
-       ctx = ib_ucm_ctx_get(file, cmd.id);
-       if (!IS_ERR(ctx)) {
-               result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
-               ib_ucm_ctx_put(ctx);
-       } else
-               result = PTR_ERR(ctx);
-
-done:
-       kfree(param.private_data);
-       kfree(param.info);
-       return result;
-}
-
-static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
-                                 const char __user *inbuf,
-                                 int in_len, int out_len) = {
-       [IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
-       [IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
-       [IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
-       [IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
-       [IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
-       [IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
-       [IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
-       [IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
-       [IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
-       [IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
-       [IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
-       [IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
-       [IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
-       [IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
-       [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
-       [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
-       [IB_USER_CM_CMD_EVENT]         = ib_ucm_event,
-       [IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
-};
-
-static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
-                           size_t len, loff_t *pos)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       struct ib_ucm_cmd_hdr hdr;
-       ssize_t result;
-
-       if (!ib_safe_file_access(filp)) {
-               pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
-                           task_tgid_vnr(current), current->comm);
-               return -EACCES;
-       }
-
-       if (len < sizeof(hdr))
-               return -EINVAL;
-
-       if (copy_from_user(&hdr, buf, sizeof(hdr)))
-               return -EFAULT;
-
-       if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
-               return -EINVAL;
-       hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table));
-
-       if (hdr.in + sizeof(hdr) > len)
-               return -EINVAL;
-
-       result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
-                                       hdr.in, hdr.out);
-       if (!result)
-               result = len;
-
-       return result;
-}
-
-static __poll_t ib_ucm_poll(struct file *filp,
-                               struct poll_table_struct *wait)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       __poll_t mask = 0;
-
-       poll_wait(filp, &file->poll_wait, wait);
-
-       if (!list_empty(&file->events))
-               mask = EPOLLIN | EPOLLRDNORM;
-
-       return mask;
-}
-
-/*
- * ib_ucm_open() does not need the BKL:
- *
- *  - no global state is referred to;
- *  - there is no ioctl method to race against;
- *  - no further module initialization is required for open to work
- *    after the device is registered.
- */
-static int ib_ucm_open(struct inode *inode, struct file *filp)
-{
-       struct ib_ucm_file *file;
-
-       file = kmalloc(sizeof(*file), GFP_KERNEL);
-       if (!file)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&file->events);
-       INIT_LIST_HEAD(&file->ctxs);
-       init_waitqueue_head(&file->poll_wait);
-
-       mutex_init(&file->file_mutex);
-
-       filp->private_data = file;
-       file->filp = filp;
-       file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
-
-       return stream_open(inode, filp);
-}
-
-static int ib_ucm_close(struct inode *inode, struct file *filp)
-{
-       struct ib_ucm_file *file = filp->private_data;
-       struct ib_ucm_context *ctx;
-
-       mutex_lock(&file->file_mutex);
-       while (!list_empty(&file->ctxs)) {
-               ctx = list_entry(file->ctxs.next,
-                                struct ib_ucm_context, file_list);
-               mutex_unlock(&file->file_mutex);
-
-               xa_erase(&ctx_id_table, ctx->id);
-               ib_destroy_cm_id(ctx->cm_id);
-               ib_ucm_cleanup_events(ctx);
-               kfree(ctx);
-
-               mutex_lock(&file->file_mutex);
-       }
-       mutex_unlock(&file->file_mutex);
-       kfree(file);
-       return 0;
-}
-
-static void ib_ucm_release_dev(struct device *dev)
-{
-       struct ib_ucm_device *ucm_dev;
-
-       ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-       kfree(ucm_dev);
-}
-
-static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
-{
-       clear_bit(ucm_dev->devnum, dev_map);
-}
-
-static const struct file_operations ucm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = ib_ucm_open,
-       .release = ib_ucm_close,
-       .write   = ib_ucm_write,
-       .poll    = ib_ucm_poll,
-       .llseek  = no_llseek,
-};
-
-static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
-                         char *buf)
-{
-       struct ib_ucm_device *ucm_dev;
-
-       ucm_dev = container_of(dev, struct ib_ucm_device, dev);
-       return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
-}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
-
-static void ib_ucm_add_one(struct ib_device *device)
-{
-       int devnum;
-       dev_t base;
-       struct ib_ucm_device *ucm_dev;
-
-       if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1))
-               return;
-
-       ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
-       if (!ucm_dev)
-               return;
-
-       device_initialize(&ucm_dev->dev);
-       ucm_dev->ib_dev = device;
-       ucm_dev->dev.release = ib_ucm_release_dev;
-
-       devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-       if (devnum >= IB_UCM_MAX_DEVICES)
-               goto err;
-       ucm_dev->devnum = devnum;
-       set_bit(devnum, dev_map);
-       if (devnum >= IB_UCM_NUM_FIXED_MINOR)
-               base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
-       else
-               base = IB_UCM_BASE_DEV + devnum;
-
-       cdev_init(&ucm_dev->cdev, &ucm_fops);
-       ucm_dev->cdev.owner = THIS_MODULE;
-       kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-
-       ucm_dev->dev.class = &cm_class;
-       ucm_dev->dev.parent = device->dev.parent;
-       ucm_dev->dev.devt = base;
-
-       dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
-       if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev))
-               goto err_devnum;
-
-       if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
-               goto err_dev;
-
-       ib_set_client_data(device, &ucm_client, ucm_dev);
-       return;
-
-err_dev:
-       cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-err_devnum:
-       ib_ucm_free_dev(ucm_dev);
-err:
-       put_device(&ucm_dev->dev);
-       return;
-}
-
-static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
-{
-       struct ib_ucm_device *ucm_dev = client_data;
-
-       if (!ucm_dev)
-               return;
-
-       cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev);
-       ib_ucm_free_dev(ucm_dev);
-       put_device(&ucm_dev->dev);
-}
-
-static CLASS_ATTR_STRING(abi_version, S_IRUGO,
-                        __stringify(IB_USER_CM_ABI_VERSION));
-
-static int __init ib_ucm_init(void)
-{
-       int ret;
-
-       ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
-                                    "infiniband_cm");
-       if (ret) {
-               pr_err("ucm: couldn't register device number\n");
-               goto error1;
-       }
-
-       ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
-                                 "infiniband_cm");
-       if (ret) {
-               pr_err("ucm: couldn't register dynamic device number\n");
-               goto err_alloc;
-       }
-
-       ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
-       if (ret) {
-               pr_err("ucm: couldn't create abi_version attribute\n");
-               goto error2;
-       }
-
-       ret = ib_register_client(&ucm_client);
-       if (ret) {
-               pr_err("ucm: couldn't register client\n");
-               goto error3;
-       }
-       return 0;
-
-error3:
-       class_remove_file(&cm_class, &class_attr_abi_version.attr);
-error2:
-       unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-err_alloc:
-       unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-error1:
-       return ret;
-}
-
-static void __exit ib_ucm_cleanup(void)
-{
-       ib_unregister_client(&ucm_client);
-       class_remove_file(&cm_class, &class_attr_abi_version.attr);
-       unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
-       unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-       WARN_ON(!xa_empty(&ctx_id_table));
-}
-
-module_init(ib_ucm_init);
-module_exit(ib_ucm_cleanup);
index 140a338a135f5e46a281ace1919f9827d6116edd..0274e9b704be5930cedb52b2f9d036edec69b6bb 100644 (file)
@@ -52,6 +52,8 @@
 #include <rdma/rdma_cm_ib.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -81,7 +83,7 @@ struct ucma_file {
 };
 
 struct ucma_context {
-       int                     id;
+       u32                     id;
        struct completion       comp;
        atomic_t                ref;
        int                     events_reported;
@@ -94,7 +96,7 @@ struct ucma_context {
        struct list_head        list;
        struct list_head        mc_list;
        /* mark that device is in process of destroying the internal HW
-        * resources, protected by the global mut
+        * resources, protected by the ctx_table lock
         */
        int                     closing;
        /* sync between removal event and id destroy, protected by file mut */
@@ -104,7 +106,7 @@ struct ucma_context {
 
 struct ucma_multicast {
        struct ucma_context     *ctx;
-       int                     id;
+       u32                     id;
        int                     events_reported;
 
        u64                     uid;
@@ -122,9 +124,8 @@ struct ucma_event {
        struct work_struct      close_work;
 };
 
-static DEFINE_MUTEX(mut);
-static DEFINE_IDR(ctx_idr);
-static DEFINE_IDR(multicast_idr);
+static DEFINE_XARRAY_ALLOC(ctx_table);
+static DEFINE_XARRAY_ALLOC(multicast_table);
 
 static const struct file_operations ucma_fops;
 
@@ -133,7 +134,7 @@ static inline struct ucma_context *_ucma_find_context(int id,
 {
        struct ucma_context *ctx;
 
-       ctx = idr_find(&ctx_idr, id);
+       ctx = xa_load(&ctx_table, id);
        if (!ctx)
                ctx = ERR_PTR(-ENOENT);
        else if (ctx->file != file || !ctx->cm_id)
@@ -145,7 +146,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 {
        struct ucma_context *ctx;
 
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        ctx = _ucma_find_context(id, file);
        if (!IS_ERR(ctx)) {
                if (ctx->closing)
@@ -153,7 +154,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
                else
                        atomic_inc(&ctx->ref);
        }
-       mutex_unlock(&mut);
+       xa_unlock(&ctx_table);
        return ctx;
 }
 
@@ -216,10 +217,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
        INIT_LIST_HEAD(&ctx->mc_list);
        ctx->file = file;
 
-       mutex_lock(&mut);
-       ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
-       mutex_unlock(&mut);
-       if (ctx->id < 0)
+       if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
                goto error;
 
        list_add_tail(&ctx->list, &file->ctx_list);
@@ -238,13 +236,10 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
        if (!mc)
                return NULL;
 
-       mutex_lock(&mut);
-       mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL);
-       mutex_unlock(&mut);
-       if (mc->id < 0)
+       mc->ctx = ctx;
+       if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL))
                goto error;
 
-       mc->ctx = ctx;
        list_add_tail(&mc->list, &ctx->mc_list);
        return mc;
 
@@ -319,9 +314,9 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
         * handled separately below.
         */
        if (ctx->cm_id == cm_id) {
-               mutex_lock(&mut);
+               xa_lock(&ctx_table);
                ctx->closing = 1;
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
                queue_work(ctx->file->close_wq, &ctx->close_work);
                return;
        }
@@ -523,9 +518,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
 err2:
        rdma_destroy_id(cm_id);
 err1:
-       mutex_lock(&mut);
-       idr_remove(&ctx_idr, ctx->id);
-       mutex_unlock(&mut);
+       xa_erase(&ctx_table, ctx->id);
        mutex_lock(&file->mut);
        list_del(&ctx->list);
        mutex_unlock(&file->mut);
@@ -537,13 +530,13 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx)
 {
        struct ucma_multicast *mc, *tmp;
 
-       mutex_lock(&mut);
+       mutex_lock(&ctx->file->mut);
        list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
                list_del(&mc->list);
-               idr_remove(&multicast_idr, mc->id);
+               xa_erase(&multicast_table, mc->id);
                kfree(mc);
        }
-       mutex_unlock(&mut);
+       mutex_unlock(&ctx->file->mut);
 }
 
 static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
@@ -614,11 +607,11 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
        if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
                return -EFAULT;
 
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        ctx = _ucma_find_context(cmd.id, file);
        if (!IS_ERR(ctx))
-               idr_remove(&ctx_idr, ctx->id);
-       mutex_unlock(&mut);
+               __xa_erase(&ctx_table, ctx->id);
+       xa_unlock(&ctx_table);
 
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
@@ -630,14 +623,14 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
        flush_workqueue(ctx->file->close_wq);
        /* At this point it's guaranteed that there is no inflight
         * closing task */
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
        if (!ctx->closing) {
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
                ucma_put_ctx(ctx);
                wait_for_completion(&ctx->comp);
                rdma_destroy_id(ctx->cm_id);
        } else {
-               mutex_unlock(&mut);
+               xa_unlock(&ctx_table);
        }
 
        resp.events_reported = ucma_free_ctx(ctx);
@@ -951,8 +944,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx,
                }
        }
 
-       if (copy_to_user(response, resp,
-                        sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+       if (copy_to_user(response, resp, struct_size(resp, path_data, i)))
                ret = -EFAULT;
 
        kfree(resp);
@@ -1432,9 +1424,7 @@ static ssize_t ucma_process_join(struct ucma_file *file,
                goto err3;
        }
 
-       mutex_lock(&mut);
-       idr_replace(&multicast_idr, mc, mc->id);
-       mutex_unlock(&mut);
+       xa_store(&multicast_table, mc->id, mc, 0);
 
        mutex_unlock(&file->mut);
        ucma_put_ctx(ctx);
@@ -1444,9 +1434,7 @@ err3:
        rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
        ucma_cleanup_mc_events(mc);
 err2:
-       mutex_lock(&mut);
-       idr_remove(&multicast_idr, mc->id);
-       mutex_unlock(&mut);
+       xa_erase(&multicast_table, mc->id);
        list_del(&mc->list);
        kfree(mc);
 err1:
@@ -1508,8 +1496,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
        if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
                return -EFAULT;
 
-       mutex_lock(&mut);
-       mc = idr_find(&multicast_idr, cmd.id);
+       xa_lock(&multicast_table);
+       mc = xa_load(&multicast_table, cmd.id);
        if (!mc)
                mc = ERR_PTR(-ENOENT);
        else if (mc->ctx->file != file)
@@ -1517,8 +1505,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
        else if (!atomic_inc_not_zero(&mc->ctx->ref))
                mc = ERR_PTR(-ENXIO);
        else
-               idr_remove(&multicast_idr, mc->id);
-       mutex_unlock(&mut);
+               __xa_erase(&multicast_table, mc->id);
+       xa_unlock(&multicast_table);
 
        if (IS_ERR(mc)) {
                ret = PTR_ERR(mc);
@@ -1615,14 +1603,14 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
         * events being added before existing events.
         */
        ucma_lock_files(cur_file, new_file);
-       mutex_lock(&mut);
+       xa_lock(&ctx_table);
 
        list_move_tail(&ctx->list, &new_file->ctx_list);
        ucma_move_events(ctx, new_file);
        ctx->file = new_file;
        resp.events_reported = ctx->events_reported;
 
-       mutex_unlock(&mut);
+       xa_unlock(&ctx_table);
        ucma_unlock_files(cur_file, new_file);
 
 response:
@@ -1757,18 +1745,15 @@ static int ucma_close(struct inode *inode, struct file *filp)
                ctx->destroying = 1;
                mutex_unlock(&file->mut);
 
-               mutex_lock(&mut);
-               idr_remove(&ctx_idr, ctx->id);
-               mutex_unlock(&mut);
-
+               xa_erase(&ctx_table, ctx->id);
                flush_workqueue(file->close_wq);
                /* At that step once ctx was marked as destroying and workqueue
                 * was flushed we are safe from any inflights handlers that
                 * might put other closing task.
                 */
-               mutex_lock(&mut);
+               xa_lock(&ctx_table);
                if (!ctx->closing) {
-                       mutex_unlock(&mut);
+                       xa_unlock(&ctx_table);
                        ucma_put_ctx(ctx);
                        wait_for_completion(&ctx->comp);
                        /* rdma_destroy_id ensures that no event handlers are
@@ -1776,7 +1761,7 @@ static int ucma_close(struct inode *inode, struct file *filp)
                         */
                        rdma_destroy_id(ctx->cm_id);
                } else {
-                       mutex_unlock(&mut);
+                       xa_unlock(&ctx_table);
                }
 
                ucma_free_ctx(ctx);
@@ -1805,6 +1790,19 @@ static struct miscdevice ucma_misc = {
        .fops           = &ucma_fops,
 };
 
+static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
+{
+       res->abi = RDMA_USER_CM_ABI_VERSION;
+       res->cdev = ucma_misc.this_device;
+       return 0;
+}
+
+static struct ib_client rdma_cma_client = {
+       .name = "rdma_cm",
+       .get_global_nl_info = ucma_get_global_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
+
 static ssize_t show_abi_version(struct device *dev,
                                struct device_attribute *attr,
                                char *buf)
@@ -1833,7 +1831,14 @@ static int __init ucma_init(void)
                ret = -ENOMEM;
                goto err2;
        }
+
+       ret = ib_register_client(&rdma_cma_client);
+       if (ret)
+               goto err3;
+
        return 0;
+err3:
+       unregister_net_sysctl_table(ucma_ctl_table_hdr);
 err2:
        device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
 err1:
@@ -1843,11 +1848,10 @@ err1:
 
 static void __exit ucma_cleanup(void)
 {
+       ib_unregister_client(&rdma_cma_client);
        unregister_net_sysctl_table(ucma_ctl_table_hdr);
        device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
        misc_deregister(&ucma_misc);
-       idr_destroy(&ctx_idr);
-       idr_destroy(&multicast_idr);
 }
 
 module_init(ucma_init);
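
For reference, a minimal sketch (not part of the patch) of the lookup-and-erase idiom the idr-to-xarray conversion above relies on. The table and entry names here are hypothetical; only the xarray calls mirror the new ucma code.

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(example_table);

struct example_entry {
	u32 id;
};

/* Replaces the old "mutex_lock(&mut); idr_find(); idr_remove(); mutex_unlock(&mut)"
 * sequence: lookup and erase happen under the xarray's internal spinlock. */
static struct example_entry *example_take(u32 id)
{
	struct example_entry *entry;

	xa_lock(&example_table);
	entry = xa_load(&example_table, id);
	if (entry)
		__xa_erase(&example_table, id);
	xa_unlock(&example_table);
	return entry;
}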
index e7ea819fcb116dd8bb45a34160af950f4b87316d..08da840ed7eebc151b67a61b15626d90578dea54 100644 (file)
@@ -54,9 +54,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 
        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
                page = sg_page_iter_page(&sg_iter);
-               if (!PageDirty(page) && umem->writable && dirty)
-                       set_page_dirty_lock(page);
-               put_page(page);
+               if (umem->writable && dirty)
+                       put_user_pages_dirty_lock(&page, 1);
+               else
+                       put_user_page(page);
        }
 
        sg_free_table(&umem->sg_head);
@@ -244,7 +245,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        umem->context    = context;
        umem->length     = size;
        umem->address    = addr;
-       umem->page_shift = PAGE_SHIFT;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);
@@ -361,6 +361,9 @@ static void __ib_umem_release_tail(struct ib_umem *umem)
  */
 void ib_umem_release(struct ib_umem *umem)
 {
+       if (!umem)
+               return;
+
        if (umem->is_odp) {
                ib_umem_odp_release(to_ib_umem_odp(umem));
                __ib_umem_release_tail(umem);
@@ -385,7 +388,7 @@ int ib_umem_page_count(struct ib_umem *umem)
 
        n = 0;
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
-               n += sg_dma_len(sg) >> umem->page_shift;
+               n += sg_dma_len(sg) >> PAGE_SHIFT;
 
        return n;
 }
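
A hedged sketch of the release pattern __ib_umem_release() switches to above, assuming the 5.3-era put_user_page() helpers from linux/mm.h: pages obtained through get_user_pages() are returned via the put_user_page*() family instead of put_page().

#include <linux/mm.h>

/* dirty tells whether the pages may have been written through the mapping */
static void example_release_user_pages(struct page **pages, unsigned long npages,
				       bool dirty)
{
	if (dirty)
		put_user_pages_dirty_lock(pages, npages);
	else
		put_user_pages(pages, npages);
}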
index f962b5bbfa40e4268de37f99c78b6653895c9599..2a75c6f8d8270e5ca07610684fbe9e5156f72708 100644 (file)
@@ -59,7 +59,7 @@ static u64 node_start(struct umem_odp_node *n)
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);
 
-       return ib_umem_start(&umem_odp->umem);
+       return ib_umem_start(umem_odp);
 }
 
 /* Note that the representation of the intervals in the interval tree
@@ -72,7 +72,7 @@ static u64 node_last(struct umem_odp_node *n)
        struct ib_umem_odp *umem_odp =
                        container_of(n, struct ib_umem_odp, interval_tree);
 
-       return ib_umem_end(&umem_odp->umem) - 1;
+       return ib_umem_end(umem_odp) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
@@ -107,8 +107,6 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
                                               u64 start, u64 end, void *cookie)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-
        /*
         * Increase the number of notifiers running, to
         * prevent any further fault handling on this MR.
@@ -119,8 +117,8 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
         * all pending page faults. */
        smp_wmb();
        complete_all(&umem_odp->notifier_completion);
-       umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
-                                       ib_umem_end(umem));
+       umem_odp->umem.context->invalidate_range(
+               umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
        return 0;
 }
 
@@ -151,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 {
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);
+       int rc;
 
        if (mmu_notifier_range_blockable(range))
                down_read(&per_mm->umem_rwsem);
@@ -167,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
                return 0;
        }
 
-       return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
-                                            range->end,
-                                            invalidate_range_start_trampoline,
-                                            mmu_notifier_range_blockable(range),
-                                            NULL);
+       rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
+                                          range->end,
+                                          invalidate_range_start_trampoline,
+                                          mmu_notifier_range_blockable(range),
+                                          NULL);
+       if (rc)
+               up_read(&per_mm->umem_rwsem);
+       return rc;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
@@ -205,10 +207,9 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
 static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 {
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-       struct ib_umem *umem = &umem_odp->umem;
 
        down_write(&per_mm->umem_rwsem);
-       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+       if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
                rbt_ib_umem_insert(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        up_write(&per_mm->umem_rwsem);
@@ -217,10 +218,9 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
 {
        struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-       struct ib_umem *umem = &umem_odp->umem;
 
        down_write(&per_mm->umem_rwsem);
-       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+       if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
                rbt_ib_umem_remove(&umem_odp->interval_tree,
                                   &per_mm->umem_tree);
        complete_all(&umem_odp->notifier_completion);
@@ -351,7 +351,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
        umem->context    = ctx;
        umem->length     = size;
        umem->address    = addr;
-       umem->page_shift = PAGE_SHIFT;
+       odp_data->page_shift = PAGE_SHIFT;
        umem->writable   = root->umem.writable;
        umem->is_odp = 1;
        odp_data->per_mm = per_mm;
@@ -405,18 +405,19 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
        struct mm_struct *mm = umem->owning_mm;
        int ret_val;
 
+       umem_odp->page_shift = PAGE_SHIFT;
        if (access & IB_ACCESS_HUGETLB) {
                struct vm_area_struct *vma;
                struct hstate *h;
 
                down_read(&mm->mmap_sem);
-               vma = find_vma(mm, ib_umem_start(umem));
+               vma = find_vma(mm, ib_umem_start(umem_odp));
                if (!vma || !is_vm_hugetlb_page(vma)) {
                        up_read(&mm->mmap_sem);
                        return -EINVAL;
                }
                h = hstate_vma(vma);
-               umem->page_shift = huge_page_shift(h);
+               umem_odp->page_shift = huge_page_shift(h);
                up_read(&mm->mmap_sem);
        }
 
@@ -424,16 +425,16 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 
        init_completion(&umem_odp->notifier_completion);
 
-       if (ib_umem_num_pages(umem)) {
+       if (ib_umem_odp_num_pages(umem_odp)) {
                umem_odp->page_list =
                        vzalloc(array_size(sizeof(*umem_odp->page_list),
-                                          ib_umem_num_pages(umem)));
+                                          ib_umem_odp_num_pages(umem_odp)));
                if (!umem_odp->page_list)
                        return -ENOMEM;
 
                umem_odp->dma_list =
                        vzalloc(array_size(sizeof(*umem_odp->dma_list),
-                                          ib_umem_num_pages(umem)));
+                                          ib_umem_odp_num_pages(umem_odp)));
                if (!umem_odp->dma_list) {
                        ret_val = -ENOMEM;
                        goto out_page_list;
@@ -456,16 +457,14 @@ out_page_list:
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-
        /*
         * Ensure that no more pages are mapped in the umem.
         *
         * It is the driver's responsibility to ensure, before calling us,
         * that the hardware will not attempt to access the MR any more.
         */
-       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
-                                   ib_umem_end(umem));
+       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+                                   ib_umem_end(umem_odp));
 
        remove_umem_from_per_mm(umem_odp);
        put_per_mm(umem_odp);
@@ -487,7 +486,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
  * The function returns -EFAULT if the DMA mapping operation fails. It returns
  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
  *
- * The page is released via put_page even if the operation failed. For
+ * The page is released via put_user_page even if the operation failed. For
  * on-demand pinning, the page is released whenever it isn't stored in the
  * umem.
  */
@@ -498,8 +497,8 @@ static int ib_umem_odp_map_dma_single_page(
                u64 access_mask,
                unsigned long current_seq)
 {
-       struct ib_umem *umem = &umem_odp->umem;
-       struct ib_device *dev = umem->context->device;
+       struct ib_ucontext *context = umem_odp->umem.context;
+       struct ib_device *dev = context->device;
        dma_addr_t dma_addr;
        int remove_existing_mapping = 0;
        int ret = 0;
@@ -514,10 +513,9 @@ static int ib_umem_odp_map_dma_single_page(
                goto out;
        }
        if (!(umem_odp->dma_list[page_index])) {
-               dma_addr = ib_dma_map_page(dev,
-                                          page,
-                                          0, BIT(umem->page_shift),
-                                          DMA_BIDIRECTIONAL);
+               dma_addr =
+                       ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
+                                       DMA_BIDIRECTIONAL);
                if (ib_dma_mapping_error(dev, dma_addr)) {
                        ret = -EFAULT;
                        goto out;
@@ -536,15 +534,16 @@ static int ib_umem_odp_map_dma_single_page(
        }
 
 out:
-       put_page(page);
+       put_user_page(page);
 
        if (remove_existing_mapping) {
                ib_umem_notifier_start_account(umem_odp);
-               umem->context->invalidate_range(
+               context->invalidate_range(
                        umem_odp,
-                       ib_umem_start(umem) + (page_index << umem->page_shift),
-                       ib_umem_start(umem) +
-                               ((page_index + 1) << umem->page_shift));
+                       ib_umem_start(umem_odp) +
+                               (page_index << umem_odp->page_shift),
+                       ib_umem_start(umem_odp) +
+                               ((page_index + 1) << umem_odp->page_shift));
                ib_umem_notifier_end_account(umem_odp);
                ret = -EAGAIN;
        }
@@ -581,27 +580,26 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                              u64 bcnt, u64 access_mask,
                              unsigned long current_seq)
 {
-       struct ib_umem *umem = &umem_odp->umem;
        struct task_struct *owning_process  = NULL;
        struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
        struct page       **local_page_list = NULL;
        u64 page_mask, off;
-       int j, k, ret = 0, start_idx, npages = 0, page_shift;
-       unsigned int flags = 0;
+       int j, k, ret = 0, start_idx, npages = 0;
+       unsigned int flags = 0, page_shift;
        phys_addr_t p = 0;
 
        if (access_mask == 0)
                return -EINVAL;
 
-       if (user_virt < ib_umem_start(umem) ||
-           user_virt + bcnt > ib_umem_end(umem))
+       if (user_virt < ib_umem_start(umem_odp) ||
+           user_virt + bcnt > ib_umem_end(umem_odp))
                return -EFAULT;
 
        local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
        if (!local_page_list)
                return -ENOMEM;
 
-       page_shift = umem->page_shift;
+       page_shift = umem_odp->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        off = user_virt & (~page_mask);
        user_virt = user_virt & page_mask;
@@ -621,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
        if (access_mask & ODP_WRITE_ALLOWED_BIT)
                flags |= FOLL_WRITE;
 
-       start_idx = (user_virt - ib_umem_start(umem)) >> page_shift;
+       start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
        k = start_idx;
 
        while (bcnt > 0) {
@@ -659,7 +657,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                                        ret = -EFAULT;
                                        break;
                                }
-                               put_page(local_page_list[j]);
+                               put_user_page(local_page_list[j]);
                                continue;
                        }
 
@@ -686,8 +684,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
                         * ib_umem_odp_map_dma_single_page().
                         */
                        if (npages - (j + 1) > 0)
-                               release_pages(&local_page_list[j+1],
-                                             npages - (j + 1));
+                               put_user_pages(&local_page_list[j+1],
+                                              npages - (j + 1));
                        break;
                }
        }
@@ -711,21 +709,20 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 u64 bound)
 {
-       struct ib_umem *umem = &umem_odp->umem;
        int idx;
        u64 addr;
-       struct ib_device *dev = umem->context->device;
+       struct ib_device *dev = umem_odp->umem.context->device;
 
-       virt  = max_t(u64, virt,  ib_umem_start(umem));
-       bound = min_t(u64, bound, ib_umem_end(umem));
+       virt = max_t(u64, virt, ib_umem_start(umem_odp));
+       bound = min_t(u64, bound, ib_umem_end(umem_odp));
        /* Note that during the run of this function, the
         * notifiers_count of the MR is > 0, preventing any racing
         * faults from completion. We might be racing with other
         * invalidations, so we must make sure we free each page only
         * once. */
        mutex_lock(&umem_odp->umem_mutex);
-       for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
-               idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+       for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+               idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
                if (umem_odp->page_list[idx]) {
                        struct page *page = umem_odp->page_list[idx];
                        dma_addr_t dma = umem_odp->dma_list[idx];
@@ -733,7 +730,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 
                        WARN_ON(!dma_addr);
 
-                       ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+                       ib_dma_unmap_page(dev, dma_addr,
+                                         BIT(umem_odp->page_shift),
                                          DMA_BIDIRECTIONAL);
                        if (dma & ODP_WRITE_ALLOWED_BIT) {
                                struct page *head_page = compound_head(page);
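
As a small illustration of the umem_odp.c changes above (page_shift moving from ib_umem into ib_umem_odp), a hypothetical helper computing the page-list index for an address, mirroring the expressions used in the hunks; it assumes <rdma/ib_umem_odp.h>.

/* ib_umem_start() and page_shift now both operate on the ib_umem_odp, so
 * hugepage-backed ODP MRs index their page_list/dma_list consistently. */
static inline unsigned long example_odp_page_index(struct ib_umem_odp *umem_odp,
						   u64 addr)
{
	return (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
}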
index 671f07ba1fad66e8300d93c9b85a65091bcbc9ae..9f8a48016b4152a248f781d69760ce4fa433897c 100644 (file)
@@ -54,6 +54,7 @@
 
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
+#include <rdma/rdma_netlink.h>
 
 #include "core_priv.h"
 
@@ -744,7 +745,7 @@ found:
                                "process %s did not enable P_Key index support.\n",
                                current->comm);
                        dev_warn(&file->port->dev,
-                               "   Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+                               "   Documentation/infiniband/user_mad.rst has info on the new ABI.\n");
                }
        }
 
@@ -1124,11 +1125,48 @@ static const struct file_operations umad_sm_fops = {
        .llseek  = no_llseek,
 };
 
+static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data,
+                              struct ib_client_nl_info *res)
+{
+       struct ib_umad_device *umad_dev = client_data;
+
+       if (!rdma_is_port_valid(ibdev, res->port))
+               return -EINVAL;
+
+       res->abi = IB_USER_MAD_ABI_VERSION;
+       res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev;
+
+       return 0;
+}
+
 static struct ib_client umad_client = {
        .name   = "umad",
        .add    = ib_umad_add_one,
-       .remove = ib_umad_remove_one
+       .remove = ib_umad_remove_one,
+       .get_nl_info = ib_umad_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("umad");
+
+static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data,
+                              struct ib_client_nl_info *res)
+{
+       struct ib_umad_device *umad_dev =
+               ib_get_client_data(ibdev, &umad_client);
+
+       if (!rdma_is_port_valid(ibdev, res->port))
+               return -EINVAL;
+
+       res->abi = IB_USER_MAD_ABI_VERSION;
+       res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev;
+
+       return 0;
+}
+
+static struct ib_client issm_client = {
+       .name = "issm",
+       .get_nl_info = ib_issm_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("issm");
 
 static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
@@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void)
        }
 
        ret = ib_register_client(&umad_client);
-       if (ret) {
-               pr_err("couldn't register ib_umad client\n");
+       if (ret)
                goto out_class;
-       }
+
+       ret = ib_register_client(&issm_client);
+       if (ret)
+               goto out_client;
 
        return 0;
 
+out_client:
+       ib_unregister_client(&umad_client);
 out_class:
        class_unregister(&umad_class);
 
@@ -1411,6 +1453,7 @@ out:
 
 static void __exit ib_umad_cleanup(void)
 {
+       ib_unregister_client(&issm_client);
        ib_unregister_client(&umad_client);
        class_unregister(&umad_class);
        unregister_chrdev_region(base_umad_dev,
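
Following the user_mad.c changes above, a hedged sketch of how a chardev-providing client now hooks into netlink discovery and module autoloading, using a hypothetical "example" client; the fields mirror the umad/issm registrations shown in the hunks.

#include <rdma/ib_verbs.h>
#include <rdma/rdma_netlink.h>

static int example_get_nl_info(struct ib_device *ibdev, void *client_data,
			       struct ib_client_nl_info *res)
{
	/* report the ABI version and the chardev backing this device */
	res->abi = 1;		/* placeholder ABI version for the sketch */
	res->cdev = NULL;	/* a real client points this at its struct device */
	return 0;
}

static struct ib_client example_client = {
	.name = "example_client",
	.get_nl_info = example_get_nl_info,
};
MODULE_ALIAS_RDMA_CLIENT("example_client");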
index 63fe14c7c68fc31586fad4cd3559a52d03d62ea0..7ddd0e5bc6b3419f79217a59030037c7b5f5b14c 100644 (file)
@@ -756,7 +756,9 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 
        mr->device  = pd->device;
        mr->pd      = pd;
+       mr->type    = IB_MR_TYPE_USER;
        mr->dm      = NULL;
+       mr->sig_attrs = NULL;
        mr->uobject = uobj;
        atomic_inc(&pd->usecnt);
        mr->res.type = RDMA_RESTRACK_MR;
@@ -1021,12 +1023,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        attr.comp_vector = cmd->comp_vector;
        attr.flags = cmd->flags;
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-       if (IS_ERR(cq)) {
-               ret = PTR_ERR(cq);
+       cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+       if (!cq) {
+               ret = -ENOMEM;
                goto err_file;
        }
-
        cq->device        = ib_dev;
        cq->uobject       = &obj->uobject;
        cq->comp_handler  = ib_uverbs_comp_handler;
@@ -1034,6 +1035,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
        atomic_set(&cq->usecnt, 0);
 
+       ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+       if (ret)
+               goto err_free;
+
        obj->uobject.object = cq;
        memset(&resp, 0, sizeof resp);
        resp.base.cq_handle = obj->uobject.id;
@@ -1054,7 +1059,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
 
 err_cb:
        ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
-
+       cq = NULL;
+err_free:
+       kfree(cq);
 err_file:
        if (ev_file)
                ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
@@ -2541,7 +2548,7 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
        struct ib_uqp_object         *obj;
        struct ib_qp                 *qp;
        struct ib_uverbs_mcast_entry *mcast;
-       int                           ret = -EINVAL;
+       int                           ret;
        bool                          found = false;
 
        ret = uverbs_request(attrs, &cmd, sizeof(cmd));
@@ -3715,9 +3722,6 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
  * trailing driver_data flex array. In this case the size of the base struct
  * cannot be changed.
  */
-#define offsetof_after(_struct, _member)                                       \
-       (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member))
-
 #define UAPI_DEF_WRITE_IO(req, resp)                                           \
        .write.has_resp = 1 +                                                  \
                          BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) +    \
@@ -3748,11 +3752,11 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
  */
 #define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member)     \
        .write.has_resp = 1,                                                   \
-       .write.req_size = offsetof_after(req, req_last_member),                \
-       .write.resp_size = offsetof_after(resp, resp_last_member)
+       .write.req_size = offsetofend(req, req_last_member),                   \
+       .write.resp_size = offsetofend(resp, resp_last_member)
 
 #define UAPI_DEF_WRITE_I_EX(req, req_last_member)                              \
-       .write.req_size = offsetof_after(req, req_last_member)
+       .write.req_size = offsetofend(req, req_last_member)
 
 const struct uapi_definition uverbs_def_write_intf[] = {
        DECLARE_UVERBS_OBJECT(
index 84a5e9a6d483e8933502f76e1004267fef509063..11c13c1381cf5c9d6afdb4596c36f8d4a7203c8b 100644 (file)
@@ -51,6 +51,7 @@
 
 #include <rdma/ib.h>
 #include <rdma/uverbs_std_types.h>
+#include <rdma/rdma_netlink.h>
 
 #include "uverbs.h"
 #include "core_priv.h"
@@ -198,7 +199,7 @@ void ib_uverbs_release_file(struct kref *ref)
        ib_dev = srcu_dereference(file->device->ib_dev,
                                  &file->device->disassociate_srcu);
        if (ib_dev && !ib_dev->ops.disassociate_ucontext)
-               module_put(ib_dev->owner);
+               module_put(ib_dev->ops.owner);
        srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
        if (atomic_dec_and_test(&file->device->refcount))
@@ -1065,7 +1066,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        module_dependent = !(ib_dev->ops.disassociate_ucontext);
 
        if (module_dependent) {
-               if (!try_module_get(ib_dev->owner)) {
+               if (!try_module_get(ib_dev->ops.owner)) {
                        ret = -ENODEV;
                        goto err;
                }
@@ -1100,7 +1101,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        return stream_open(inode, filp);
 
 err_module:
-       module_put(ib_dev->owner);
+       module_put(ib_dev->ops.owner);
 
 err:
        mutex_unlock(&dev->lists_mutex);
@@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = {
        .compat_ioctl = ib_uverbs_ioctl,
 };
 
+static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
+                                struct ib_client_nl_info *res)
+{
+       struct ib_uverbs_device *uverbs_dev = client_data;
+       int ret;
+
+       if (res->port != -1)
+               return -EINVAL;
+
+       res->abi = ibdev->ops.uverbs_abi_ver;
+       res->cdev = &uverbs_dev->dev;
+
+       /*
+        * To support DRIVER_ID binding in userspace some of the drivers need
+        * upgrading to expose their PCI dependent revision information
+        * through get_context instead of relying on modalias matching. When
+        * the drivers are fixed they can drop this flag.
+        */
+       if (!ibdev->ops.uverbs_no_driver_id_binding) {
+               ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
+                                 ibdev->ops.driver_id);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 static struct ib_client uverbs_client = {
        .name   = "uverbs",
        .no_kverbs_req = true,
        .add    = ib_uverbs_add_one,
-       .remove = ib_uverbs_remove_one
+       .remove = ib_uverbs_remove_one,
+       .get_nl_info = ib_uverbs_get_nl_info,
 };
+MODULE_ALIAS_RDMA_CLIENT("uverbs");
 
 static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
                          char *buf)
@@ -1186,7 +1216,7 @@ static ssize_t abi_version_show(struct device *device,
        srcu_key = srcu_read_lock(&dev->disassociate_srcu);
        ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
        if (ib_dev)
-               ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+               ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
        srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
        return ret;
index 07ea4e3c45663a5fef7d889e986ddd3918a0efe4..e39fe6a8aac43382eb016e82f91bd6f35e4d7986 100644 (file)
@@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        INIT_LIST_HEAD(&obj->comp_list);
        INIT_LIST_HEAD(&obj->async_list);
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
-       if (IS_ERR(cq)) {
-               ret = PTR_ERR(cq);
+       cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+       if (!cq) {
+               ret = -ENOMEM;
                goto err_event_file;
        }
 
@@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        cq->comp_handler  = ib_uverbs_comp_handler;
        cq->event_handler = ib_uverbs_cq_event_handler;
        cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
-       obj->uobject.object = cq;
-       obj->uobject.user_handle = user_handle;
        atomic_set(&cq->usecnt, 0);
        cq->res.type = RDMA_RESTRACK_CQ;
+
+       ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+       if (ret)
+               goto err_free;
+
+       obj->uobject.object = cq;
+       obj->uobject.user_handle = user_handle;
        rdma_restrack_uadd(&cq->res);
 
        ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
@@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        return 0;
 err_cq:
        ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs));
-
+       cq = NULL;
+err_free:
+       kfree(cq);
 err_event_file:
        if (ev_file)
                uverbs_uobject_put(ev_file_uobj);
index 997f7a3a558af96c839ca3b74aacc9767bd5fcf8..c1286a52dc8451d4a81eb918c1f0f0482630e732 100644 (file)
@@ -128,6 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
 
        mr->device  = pd->device;
        mr->pd      = pd;
+       mr->type    = IB_MR_TYPE_DM;
        mr->dm      = dm;
        mr->uobject = uobj;
        atomic_inc(&pd->usecnt);
index 7a987acf0c0bbdf0b48460371ca164b6cb34a194..00c5478871322a7a85274140b593eb23ebf451e2 100644 (file)
@@ -22,6 +22,8 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
                return ERR_PTR(-EOVERFLOW);
 
        elm = kzalloc(alloc_size, GFP_KERNEL);
+       if (!elm)
+               return ERR_PTR(-ENOMEM);
        rc = radix_tree_insert(&uapi->radix, key, elm);
        if (rc) {
                kfree(elm);
@@ -645,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
                return ERR_PTR(-ENOMEM);
 
        INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
-       uapi->driver_id = ibdev->driver_id;
+       uapi->driver_id = ibdev->ops.driver_id;
 
        rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
        if (rc)
index e666a1f7608d868621cdd279f455adef15b3d30b..92349bf37589f79d4fa6a589882c6de5c5c21aa0 100644 (file)
@@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
 EXPORT_SYMBOL(ib_rate_to_mbps);
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type)
+rdma_node_get_transport(unsigned int node_type)
 {
 
        if (node_type == RDMA_NODE_USNIC)
@@ -299,6 +299,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 
                mr->device      = pd->device;
                mr->pd          = pd;
+               mr->type        = IB_MR_TYPE_DMA;
                mr->uobject     = NULL;
                mr->need_inval  = false;
 
@@ -316,7 +317,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 EXPORT_SYMBOL(__ib_alloc_pd);
 
 /**
- * ib_dealloc_pd - Deallocates a protection domain.
+ * ib_dealloc_pd_user - Deallocates a protection domain.
  * @pd: The protection domain to deallocate.
  * @udata: Valid user data or NULL for kernel object
  *
@@ -1157,6 +1158,10 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
            qp_init_attr->cap.max_recv_sge))
                return ERR_PTR(-EINVAL);
 
+       if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) &&
+           !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER))
+               return ERR_PTR(-EINVAL);
+
        /*
         * If the caller is using the RDMA API, calculate the resources
         * needed for the RDMA READ/WRITE operations.
@@ -1232,6 +1237,8 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
        qp->max_write_sge = qp_init_attr->cap.max_send_sge;
        qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
                                 device->attrs.max_sge_rd);
+       if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+               qp->integrity_en = true;
 
        return qp;
 
@@ -1683,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
                }
        }
 
+       /*
+        * Bind this qp to a counter automatically based on the rdma counter
+        * rules. This is only done in RST2INIT when a port is specified.
+        */
+       if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+           ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+               rdma_counter_bind_qp_auto(qp, attr->port_num);
+
        ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
        if (ret)
                goto out;
@@ -1878,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
        if (!qp->uobject)
                rdma_rw_cleanup_mrs(qp);
 
+       rdma_counter_unbind_qp(qp, true);
        rdma_restrack_del(&qp->res);
        ret = qp->device->ops.destroy_qp(qp, udata);
        if (!ret) {
@@ -1916,21 +1932,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
                             const char *caller)
 {
        struct ib_cq *cq;
+       int ret;
+
+       cq = rdma_zalloc_drv_obj(device, ib_cq);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
 
-       cq = device->ops.create_cq(device, cq_attr, NULL);
-
-       if (!IS_ERR(cq)) {
-               cq->device        = device;
-               cq->uobject       = NULL;
-               cq->comp_handler  = comp_handler;
-               cq->event_handler = event_handler;
-               cq->cq_context    = cq_context;
-               atomic_set(&cq->usecnt, 0);
-               cq->res.type = RDMA_RESTRACK_CQ;
-               rdma_restrack_set_task(&cq->res, caller);
-               rdma_restrack_kadd(&cq->res);
+       cq->device = device;
+       cq->uobject = NULL;
+       cq->comp_handler = comp_handler;
+       cq->event_handler = event_handler;
+       cq->cq_context = cq_context;
+       atomic_set(&cq->usecnt, 0);
+       cq->res.type = RDMA_RESTRACK_CQ;
+       rdma_restrack_set_task(&cq->res, caller);
+
+       ret = device->ops.create_cq(cq, cq_attr, NULL);
+       if (ret) {
+               kfree(cq);
+               return ERR_PTR(ret);
        }
 
+       rdma_restrack_kadd(&cq->res);
        return cq;
 }
 EXPORT_SYMBOL(__ib_create_cq);
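
A sketch, for a hypothetical "foo" driver, of what the converted CQ hooks look like after this change: the core allocates the ib_cq via rdma_zalloc_drv_obj() using the size declared with INIT_RDMA_OBJ_SIZE(), so .create_cq receives the pre-allocated object and returns an errno, and .destroy_cq returns void (the bnxt_re and cxgb3 hunks further down follow the same shape).

#include <rdma/ib_verbs.h>

struct foo_cq {
	struct ib_cq ibcq;	/* embedded so the driver can container_of() back */
	/* hardware-specific CQ state would follow */
};

static int foo_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
			 struct ib_udata *udata)
{
	struct foo_cq *cq = container_of(ibcq, struct foo_cq, ibcq);

	/* program the hardware CQ using cq->...; on failure just return the
	 * errno, the core frees the ib_cq it allocated */
	return 0;
}

static void foo_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	/* tear down the hardware CQ; the core kfree()s the ib_cq afterwards */
}

static const struct ib_device_ops foo_dev_ops = {
	.create_cq  = foo_create_cq,
	.destroy_cq = foo_destroy_cq,
	INIT_RDMA_OBJ_SIZE(ib_cq, foo_cq, ibcq),
};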
@@ -1949,7 +1972,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
                return -EBUSY;
 
        rdma_restrack_del(&cq->res);
-       return cq->device->ops.destroy_cq(cq, udata);
+       cq->device->ops.destroy_cq(cq, udata);
+       kfree(cq);
+       return 0;
 }
 EXPORT_SYMBOL(ib_destroy_cq_user);
 
@@ -1966,6 +1991,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
        struct ib_pd *pd = mr->pd;
        struct ib_dm *dm = mr->dm;
+       struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
        int ret;
 
        rdma_restrack_del(&mr->res);
@@ -1974,6 +2000,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
                atomic_dec(&pd->usecnt);
                if (dm)
                        atomic_dec(&dm->usecnt);
+               kfree(sig_attrs);
        }
 
        return ret;
@@ -1981,7 +2008,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 EXPORT_SYMBOL(ib_dereg_mr_user);
 
 /**
- * ib_alloc_mr() - Allocates a memory region
+ * ib_alloc_mr_user() - Allocates a memory region
  * @pd:            protection domain associated with the region
  * @mr_type:       memory region type
  * @max_num_sg:    maximum sg entries available for registration.
@@ -2001,6 +2028,9 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
        if (!pd->device->ops.alloc_mr)
                return ERR_PTR(-EOPNOTSUPP);
 
+       if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY))
+               return ERR_PTR(-EINVAL);
+
        mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
        if (!IS_ERR(mr)) {
                mr->device  = pd->device;
@@ -2011,12 +2041,66 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
                mr->need_inval = false;
                mr->res.type = RDMA_RESTRACK_MR;
                rdma_restrack_kadd(&mr->res);
+               mr->type = mr_type;
+               mr->sig_attrs = NULL;
        }
 
        return mr;
 }
 EXPORT_SYMBOL(ib_alloc_mr_user);
 
+/**
+ * ib_alloc_mr_integrity() - Allocates an integrity memory region
+ * @pd:                      protection domain associated with the region
+ * @max_num_data_sg:         maximum data sg entries available for registration
+ * @max_num_meta_sg:         maximum metadata sg entries available for
+ *                           registration
+ *
+ * Notes:
+ * Data memory registration page/sg lists must not exceed max_num_data_sg,
+ * and the integrity page/sg lists must not exceed max_num_meta_sg.
+ *
+ */
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+                                   u32 max_num_data_sg,
+                                   u32 max_num_meta_sg)
+{
+       struct ib_mr *mr;
+       struct ib_sig_attrs *sig_attrs;
+
+       if (!pd->device->ops.alloc_mr_integrity ||
+           !pd->device->ops.map_mr_sg_pi)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       if (!max_num_meta_sg)
+               return ERR_PTR(-EINVAL);
+
+       sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
+       if (!sig_attrs)
+               return ERR_PTR(-ENOMEM);
+
+       mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
+                                               max_num_meta_sg);
+       if (IS_ERR(mr)) {
+               kfree(sig_attrs);
+               return mr;
+       }
+
+       mr->device = pd->device;
+       mr->pd = pd;
+       mr->dm = NULL;
+       mr->uobject = NULL;
+       atomic_inc(&pd->usecnt);
+       mr->need_inval = false;
+       mr->res.type = RDMA_RESTRACK_MR;
+       rdma_restrack_kadd(&mr->res);
+       mr->type = IB_MR_TYPE_INTEGRITY;
+       mr->sig_attrs = sig_attrs;
+
+       return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr_integrity);
+
 /* "Fast" memory regions */
 
 struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -2226,19 +2310,17 @@ EXPORT_SYMBOL(ib_create_wq);
  */
 int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
-       int err;
        struct ib_cq *cq = wq->cq;
        struct ib_pd *pd = wq->pd;
 
        if (atomic_read(&wq->usecnt))
                return -EBUSY;
 
-       err = wq->device->ops.destroy_wq(wq, udata);
-       if (!err) {
-               atomic_dec(&pd->usecnt);
-               atomic_dec(&cq->usecnt);
-       }
-       return err;
+       wq->device->ops.destroy_wq(wq, udata);
+       atomic_dec(&pd->usecnt);
+       atomic_dec(&cq->usecnt);
+
+       return 0;
 }
 EXPORT_SYMBOL(ib_destroy_wq);
 
@@ -2375,6 +2457,43 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
 }
 EXPORT_SYMBOL(ib_set_vf_guid);
 
+/**
+ * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
+ *     information) and set an appropriate memory region for registration.
+ * @mr:             memory region
+ * @data_sg:        dma mapped scatterlist for data
+ * @data_sg_nents:  number of entries in data_sg
+ * @data_sg_offset: offset in bytes into data_sg
+ * @meta_sg:        dma mapped scatterlist for metadata
+ * @meta_sg_nents:  number of entries in meta_sg
+ * @meta_sg_offset: offset in bytes into meta_sg
+ * @page_size:      page vector desired page size
+ *
+ * Constraints:
+ * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
+ *
+ * Return: 0 on success.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+                   int data_sg_nents, unsigned int *data_sg_offset,
+                   struct scatterlist *meta_sg, int meta_sg_nents,
+                   unsigned int *meta_sg_offset, unsigned int page_size)
+{
+       if (unlikely(!mr->device->ops.map_mr_sg_pi ||
+                    WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
+               return -EOPNOTSUPP;
+
+       mr->page_size = page_size;
+
+       return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
+                                           data_sg_offset, meta_sg,
+                                           meta_sg_nents, meta_sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg_pi);
+
 /**
  * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
  *     and set it as the memory region.
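
Closing out the verbs.c changes, a hedged usage sketch of the new integrity MR API exported above; the PD and the two DMA-mapped scatterlists are assumed to be set up by a ULP caller, and the helper name is hypothetical.

#include <rdma/ib_verbs.h>

static struct ib_mr *example_reg_pi_mr(struct ib_pd *pd,
					struct scatterlist *data_sg, int data_nents,
					struct scatterlist *meta_sg, int meta_nents)
{
	unsigned int data_off = 0, meta_off = 0;
	struct ib_mr *mr;
	int ret;

	mr = ib_alloc_mr_integrity(pd, data_nents, meta_nents);
	if (IS_ERR(mr))
		return mr;

	/* mr->sig_attrs would be filled with the T10-DIF/CRC parameters here */

	ret = ib_map_mr_sg_pi(mr, data_sg, data_nents, &data_off,
			      meta_sg, meta_nents, &meta_off, PAGE_SIZE);
	if (ret < 0) {
		ib_dereg_mr(mr);
		return ERR_PTR(ret);
	}
	return mr;
}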
index 77094be1b2627de8d541cb57a06350e67a3281e9..433fca59febdffe3392996b8ed6dbfdfdbeaf28a 100644 (file)
@@ -7,7 +7,6 @@ obj-$(CONFIG_INFINIBAND_EFA)            += efa/
 obj-$(CONFIG_INFINIBAND_I40IW)         += i40iw/
 obj-$(CONFIG_MLX4_INFINIBAND)          += mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)          += mlx5/
-obj-$(CONFIG_INFINIBAND_NES)           += nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)                += ocrdma/
 obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/
 obj-$(CONFIG_INFINIBAND_USNIC)         += usnic/
index 2c3685faa57a42defe87428d5cdf3f01f4136b2f..a91653aabf3899d9c7e3f5dc4be6ab8ff168ac6c 100644 (file)
@@ -805,10 +805,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
                rdev->sqp_ah = NULL;
        }
 
-       if (!IS_ERR_OR_NULL(qp->rumem))
-               ib_umem_release(qp->rumem);
-       if (!IS_ERR_OR_NULL(qp->sumem))
-               ib_umem_release(qp->sumem);
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 
        mutex_lock(&rdev->qp_lock);
        list_del(&qp->list);
@@ -1201,12 +1199,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 qp_destroy:
        bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 free_umem:
-       if (udata) {
-               if (qp->rumem)
-                       ib_umem_release(qp->rumem);
-               if (qp->sumem)
-                       ib_umem_release(qp->sumem);
-       }
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 fail:
        kfree(qp);
        return ERR_PTR(rc);
@@ -1302,8 +1296,7 @@ void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
        if (qplib_srq->cq)
                nq = qplib_srq->cq->nq;
        bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
-       if (srq->umem)
-               ib_umem_release(srq->umem);
+       ib_umem_release(srq->umem);
        atomic_dec(&rdev->srq_count);
        if (nq)
                nq->budget--;
@@ -1412,8 +1405,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
        return 0;
 
 fail:
-       if (srq->umem)
-               ib_umem_release(srq->umem);
+       ib_umem_release(srq->umem);
 exit:
        return rc;
 }
@@ -2517,9 +2509,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
 }
 
 /* Completion Queues */
-int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
-       int rc;
        struct bnxt_re_cq *cq;
        struct bnxt_qplib_nq *nq;
        struct bnxt_re_dev *rdev;
@@ -2528,29 +2519,20 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
        rdev = cq->rdev;
        nq = cq->qplib_cq.nq;
 
-       rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
-       if (rc) {
-               dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ");
-               return rc;
-       }
-       if (!IS_ERR_OR_NULL(cq->umem))
-               ib_umem_release(cq->umem);
+       bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
+       ib_umem_release(cq->umem);
 
        atomic_dec(&rdev->cq_count);
        nq->budget--;
        kfree(cq->cql);
-       kfree(cq);
-
-       return 0;
 }
 
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata)
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata)
 {
-       struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+       struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev);
        struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-       struct bnxt_re_cq *cq = NULL;
+       struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq);
        int rc, entries;
        int cqe = attr->cqe;
        struct bnxt_qplib_nq *nq = NULL;
@@ -2559,11 +2541,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
        /* Validate CQ fields */
        if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
                dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
 
        cq->rdev = rdev;
        cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq);
@@ -2641,15 +2620,13 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                }
        }
 
-       return &cq->ib_cq;
+       return 0;
 
 c2fail:
-       if (udata)
-               ib_umem_release(cq->umem);
+       ib_umem_release(cq->umem);
 fail:
        kfree(cq->cql);
-       kfree(cq);
-       return ERR_PTR(rc);
+       return rc;
 }
 
 static u8 __req_to_ib_wc_status(u8 qstatus)
@@ -3353,8 +3330,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
                mr->npages = 0;
                mr->pages = NULL;
        }
-       if (!IS_ERR_OR_NULL(mr->ib_umem))
-               ib_umem_release(mr->ib_umem);
+       ib_umem_release(mr->ib_umem);
 
        kfree(mr);
        atomic_dec(&rdev->mr_count);
@@ -3630,10 +3606,10 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
        u32 chip_met_rev_num = 0;
        int rc;
 
-       dev_dbg(rdev_to_dev(rdev), "ABI version requested %d",
-               ibdev->uverbs_abi_ver);
+       dev_dbg(rdev_to_dev(rdev), "ABI version requested %u",
+               ibdev->ops.uverbs_abi_ver);
 
-       if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
+       if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
                dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
                        BNXT_RE_ABI_VERSION);
                return -EPERM;
index 09a33049e42f23388f91294624d550d97bc2b6fb..31662b1ee35ad4a186e23ed234e9f27691882977 100644 (file)
@@ -94,11 +94,11 @@ struct bnxt_re_qp {
 };
 
 struct bnxt_re_cq {
+       struct ib_cq            ib_cq;
        struct bnxt_re_dev      *rdev;
        spinlock_t              cq_lock;        /* protect cq */
        u16                     cq_count;
        u16                     cq_period;
-       struct ib_cq            ib_cq;
        struct bnxt_qplib_cq    qplib_cq;
        struct bnxt_qplib_cqe   *cql;
 #define MAX_CQL_PER_POLL       1024
@@ -190,10 +190,9 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr,
                      const struct ib_send_wr **bad_send_wr);
 int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
                      const struct ib_recv_wr **bad_recv_wr);
-struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata);
-int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata);
+void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
index 814f959c7db9656566e1731ae7065c4a579eebe0..029babe713f3d6e002475fc89c7cbe5689dd5cb1 100644 (file)
@@ -596,6 +596,10 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 }
 
 static const struct ib_device_ops bnxt_re_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_BNXT_RE,
+       .uverbs_abi_ver = BNXT_RE_ABI_VERSION,
+
        .add_gid = bnxt_re_add_gid,
        .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats,
        .alloc_mr = bnxt_re_alloc_mr,
@@ -637,6 +641,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
        .reg_user_mr = bnxt_re_reg_user_mr,
        .req_notify_cq = bnxt_re_req_notify_cq,
        INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq),
        INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
        INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
@@ -648,7 +653,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
        int ret;
 
        /* ib device init */
-       ibdev->owner = THIS_MODULE;
        ibdev->node_type = RDMA_NODE_IB_CA;
        strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA",
                strlen(BNXT_RE_DESC) + 5);
@@ -661,7 +665,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
        ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY;
 
        /* User space */
-       ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION;
        ibdev->uverbs_cmd_mask =
                        (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                        (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -691,7 +694,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 
 
        rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
-       ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
        ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
        ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1);
        if (ret)
index 8ac72ac7cbac7f9a6dd72a31c190fcd5a5c2af72..95b22a651673a100102ec1cf4a091de1a7716d4e 100644 (file)
@@ -174,7 +174,6 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
                return -ENOMEM;
        }
        dma_unmap_addr_set(cq, mapping, cq->dma_addr);
-       memset(cq->queue, 0, size);
        setup.id = cq->cqid;
        setup.base_addr = (u64) (cq->dma_addr);
        setup.size = 1UL << cq->size_log2;
@@ -187,20 +186,6 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
        return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
 }
 
-#ifdef notyet
-int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
-{
-       struct rdma_cq_setup setup;
-       setup.id = cq->cqid;
-       setup.base_addr = (u64) (cq->dma_addr);
-       setup.size = 1UL << cq->size_log2;
-       setup.credits = setup.size;
-       setup.credit_thres = setup.size;        /* TBD: overflow recovery */
-       setup.ovfl_mode = 1;
-       return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
-#endif
-
 static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
 {
        struct cxio_qpid_list *entry;
@@ -219,7 +204,7 @@ static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
                if (!qpid)
                        goto out;
                for (i = qpid+1; i & rdev_p->qpmask; i++) {
-                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                break;
                        entry->qpid = i;
@@ -237,7 +222,7 @@ static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid,
 {
        struct cxio_qpid_list *entry;
 
-       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return;
        pr_debug("%s qpid 0x%x\n", __func__, qpid);
@@ -317,17 +302,15 @@ err1:
        return -ENOMEM;
 }
 
-int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
 {
-       int err;
-       err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+       cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
        kfree(cq->sw_queue);
        dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
                          (1UL << (cq->size_log2))
                          * sizeof(struct t3_cqe) + 1, cq->queue,
                          dma_unmap_addr(cq, mapping));
        cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
-       return err;
 }
 
 int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
@@ -538,8 +521,6 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
        dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
                           rdev_p->ctrl_qp.dma_addr);
        rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
-       memset(rdev_p->ctrl_qp.workq, 0,
-              (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
 
        mutex_init(&rdev_p->ctrl_qp.lock);
        init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
@@ -565,9 +546,9 @@ static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
        wqe->sge_cmd = cpu_to_be64(sge_cmd);
        wqe->ctx1 = cpu_to_be64(ctx1);
        wqe->ctx0 = cpu_to_be64(ctx0);
-       pr_debug("CtrlQP dma_addr 0x%llx workq %p size %d\n",
-                (unsigned long long)rdev_p->ctrl_qp.dma_addr,
-                rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
+       pr_debug("CtrlQP dma_addr %pad workq %p size %d\n",
+                &rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq,
+                1 << T3_CTRL_QP_SIZE_LOG2);
        skb->priority = CPL_PRIORITY_CONTROL;
        return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
 err:
index c64e50b5a548546a2ebadfd8a98bb52cf73685ad..40c029ffa4256a4b353420f1e6a4a22dcf4d7fbd 100644 (file)
@@ -158,8 +158,7 @@ void cxio_rdev_close(struct cxio_rdev *rdev);
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
                   enum t3_cq_opcode op, u32 credit);
 int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
-int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
-int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
 int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
index 1c90c86fc8b83d2bd3a9c2fe75a8ff16beda1925..0bca72cb4d9a1b21041cb0ce11a513185a260d52 100644 (file)
@@ -170,7 +170,7 @@ static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
 {
        struct cpl_tid_release *req;
 
-       skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+       skb = get_skb(skb, sizeof(*req), GFP_KERNEL);
        if (!skb)
                return;
        req = skb_put(skb, sizeof(*req));
index 3a481dfb1607a2556d1afa1664b92d30dfd955de..e775c1a1a4506c147dfe369b2edcd7bc8cef68a5 100644 (file)
@@ -88,7 +88,7 @@ static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
        return 0;
 }
 
-static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct iwch_cq *chp;
 
@@ -100,17 +100,16 @@ static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
        wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
        cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-       kfree(chp);
-       return 0;
 }
 
-static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
-                                   const struct ib_cq_init_attr *attr,
-                                   struct ib_udata *udata)
+static int iwch_create_cq(struct ib_cq *ibcq,
+                         const struct ib_cq_init_attr *attr,
+                         struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
-       struct iwch_dev *rhp;
-       struct iwch_cq *chp;
+       struct iwch_dev *rhp = to_iwch_dev(ibcq->device);
+       struct iwch_cq *chp = to_iwch_cq(ibcq);
        struct iwch_create_cq_resp uresp;
        struct iwch_create_cq_req ureq;
        static int warned;
@@ -118,19 +117,13 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
 
        pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       rhp = to_iwch_dev(ibdev);
-       chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-       if (!chp)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        if (udata) {
                if (!t3a_device(rhp)) {
-                       if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
-                               kfree(chp);
-                               return ERR_PTR(-EFAULT);
-                       }
+                       if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+                               return  -EFAULT;
+
                        chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
                }
        }
@@ -151,10 +144,9 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        entries = roundup_pow_of_two(entries);
        chp->cq.size_log2 = ilog2(entries);
 
-       if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) {
-               kfree(chp);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata))
+               return -ENOMEM;
+
        chp->rhp = rhp;
        chp->ibcq.cqe = 1 << chp->cq.size_log2;
        spin_lock_init(&chp->lock);
@@ -163,8 +155,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        init_waitqueue_head(&chp->wait);
        if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) {
                cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-               kfree(chp);
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        }
 
        if (udata) {
@@ -172,10 +163,10 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
                struct iwch_ucontext *ucontext = rdma_udata_to_drv_context(
                        udata, struct iwch_ucontext, ibucontext);
 
-               mm = kmalloc(sizeof *mm, GFP_KERNEL);
+               mm = kmalloc(sizeof(*mm), GFP_KERNEL);
                if (!mm) {
                        iwch_destroy_cq(&chp->ibcq, udata);
-                       return ERR_PTR(-ENOMEM);
+                       return -ENOMEM;
                }
                uresp.cqid = chp->cq.cqid;
                uresp.size_log2 = chp->cq.size_log2;
@@ -185,7 +176,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
                spin_unlock(&ucontext->mmap_lock);
                mm->key = uresp.key;
                mm->addr = virt_to_phys(chp->cq.queue);
-               if (udata->outlen < sizeof uresp) {
+               if (udata->outlen < sizeof(uresp)) {
                        if (!warned++)
                                pr_warn("Warning - downlevel libcxgb3 (non-fatal)\n");
                        mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
@@ -196,86 +187,19 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
                                             sizeof(struct t3_cqe));
                        uresp.memsize = mm->len;
                        uresp.reserved = 0;
-                       resplen = sizeof uresp;
+                       resplen = sizeof(uresp);
                }
                if (ib_copy_to_udata(udata, &uresp, resplen)) {
                        kfree(mm);
                        iwch_destroy_cq(&chp->ibcq, udata);
-                       return ERR_PTR(-EFAULT);
+                       return -EFAULT;
                }
                insert_mmap(ucontext, mm);
        }
-       pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n",
+       pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n",
                 chp->cq.cqid, chp, (1 << chp->cq.size_log2),
-                (unsigned long long)chp->cq.dma_addr);
-       return &chp->ibcq;
-}
-
-static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
-{
-#ifdef notyet
-       struct iwch_cq *chp = to_iwch_cq(cq);
-       struct t3_cq oldcq, newcq;
-       int ret;
-
-       pr_debug("%s ib_cq %p cqe %d\n", __func__, cq, cqe);
-
-       /* We don't downsize... */
-       if (cqe <= cq->cqe)
-               return 0;
-
-       /* create new t3_cq with new size */
-       cqe = roundup_pow_of_two(cqe+1);
-       newcq.size_log2 = ilog2(cqe);
-
-       /* Dont allow resize to less than the current wce count */
-       if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
-               return -ENOMEM;
-       }
-
-       /* Quiesce all QPs using this CQ */
-       ret = iwch_quiesce_qps(chp);
-       if (ret) {
-               return ret;
-       }
-
-       ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
-       if (ret) {
-               return ret;
-       }
-
-       /* copy CQEs */
-       memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
-                                       sizeof(struct t3_cqe));
-
-       /* old iwch_qp gets new t3_cq but keeps old cqid */
-       oldcq = chp->cq;
-       chp->cq = newcq;
-       chp->cq.cqid = oldcq.cqid;
-
-       /* resize new t3_cq to update the HW context */
-       ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
-       if (ret) {
-               chp->cq = oldcq;
-               return ret;
-       }
-       chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
-
-       /* destroy old t3_cq */
-       oldcq.cqid = newcq.cqid;
-       ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
-       if (ret) {
-               pr_err("%s - cxio_destroy_cq failed %d\n", __func__, ret);
-       }
-
-       /* add user hooks here */
-
-       /* resume qps */
-       ret = iwch_resume_qps(chp);
-       return ret;
-#else
-       return -ENOSYS;
-#endif
+                &chp->cq.dma_addr);
+       return 0;
 }
 
 static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
@@ -422,8 +346,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
        xa_erase_irq(&rhp->mrs, mmid);
        if (mhp->kva)
                kfree((void *) (unsigned long) mhp->kva);
-       if (mhp->umem)
-               ib_umem_release(mhp->umem);
+       ib_umem_release(mhp->umem);
        pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
        kfree(mhp);
        return 0;
@@ -553,7 +476,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
                pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
-               if (i == PAGE_SIZE / sizeof *pages) {
+               if (i == PAGE_SIZE / sizeof(*pages)) {
                        err = iwch_write_pbl(mhp, pages, i, n);
                        if (err)
                                goto pbl_done;
@@ -587,7 +510,7 @@ pbl_done:
                pr_debug("%s user resp pbl_addr 0x%x\n", __func__,
                         uresp.pbl_addr);
 
-               if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+               if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
                        iwch_dereg_mr(&mhp->ibmr, udata);
                        err = -EFAULT;
                        goto err;
@@ -880,13 +803,13 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 
                struct iwch_mm_entry *mm1, *mm2;
 
-               mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+               mm1 = kmalloc(sizeof(*mm1), GFP_KERNEL);
                if (!mm1) {
                        iwch_destroy_qp(&qhp->ibqp, udata);
                        return ERR_PTR(-ENOMEM);
                }
 
-               mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+               mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL);
                if (!mm2) {
                        kfree(mm1);
                        iwch_destroy_qp(&qhp->ibqp, udata);
@@ -903,7 +826,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
                uresp.db_key = ucontext->key;
                ucontext->key += PAGE_SIZE;
                spin_unlock(&ucontext->mmap_lock);
-               if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+               if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
                        kfree(mm1);
                        kfree(mm2);
                        iwch_destroy_qp(&qhp->ibqp, udata);
@@ -911,7 +834,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
                }
                mm1->key = uresp.key;
                mm1->addr = virt_to_phys(qhp->wq.queue);
-               mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr));
+               mm1->len = PAGE_ALIGN(wqsize * sizeof(union t3_wr));
                insert_mmap(ucontext, mm1);
                mm2->key = uresp.db_key;
                mm2->addr = qhp->wq.udb & PAGE_MASK;
@@ -919,10 +842,11 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
                insert_mmap(ucontext, mm2);
        }
        qhp->ibqp.qp_num = qhp->wq.qpid;
-       pr_debug("%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
-                __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
-                qhp->wq.qpid, qhp, (unsigned long long)qhp->wq.dma_addr,
-                1 << qhp->wq.size_log2, qhp->wq.rq_addr);
+       pr_debug(
+               "%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr %pad size %d rq_addr 0x%x\n",
+               __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+               qhp->wq.qpid, qhp, &qhp->wq.dma_addr, 1 << qhp->wq.size_log2,
+               qhp->wq.rq_addr);
        return &qhp->ibqp;
 }
 
@@ -932,7 +856,7 @@ static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        struct iwch_dev *rhp;
        struct iwch_qp *qhp;
        enum iwch_qp_attr_mask mask = 0;
-       struct iwch_qp_attributes attrs;
+       struct iwch_qp_attributes attrs = {};
 
        pr_debug("%s ib_qp %p\n", __func__, ibqp);
 
@@ -944,7 +868,6 @@ static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        if (!attr_mask)
                return 0;
 
-       memset(&attrs, 0, sizeof attrs);
        qhp = to_iwch_qp(ibqp);
        rhp = qhp->rhp;
 
@@ -1040,7 +963,6 @@ static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *pro
                return -EINVAL;
 
        dev = to_iwch_dev(ibdev);
-       memset(props, 0, sizeof *props);
        memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
        props->hw_ver = dev->rdev.t3cdev_p->type;
        props->fw_ver = fw_vers_string_to_u64(dev);
@@ -1304,6 +1226,11 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 }
 
 static const struct ib_device_ops iwch_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_CXGB3,
+       .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
+       .uverbs_no_driver_id_binding = 1,
+
        .alloc_hw_stats = iwch_alloc_stats,
        .alloc_mr = iwch_alloc_mr,
        .alloc_mw = iwch_alloc_mw,
@@ -1341,8 +1268,8 @@ static const struct ib_device_ops iwch_dev_ops = {
        .query_port = iwch_query_port,
        .reg_user_mr = iwch_reg_user_mr,
        .req_notify_cq = iwch_arm_cq,
-       .resize_cq = iwch_resize_cq,
        INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
 };
 
@@ -1351,7 +1278,6 @@ int iwch_register_device(struct iwch_dev *dev)
        pr_debug("%s iwch_dev %p\n", __func__, dev);
        memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
        memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-       dev->ibdev.owner = THIS_MODULE;
        dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
                                IB_DEVICE_MEM_WINDOW |
                                IB_DEVICE_MEM_MGT_EXTENSIONS;
@@ -1383,12 +1309,10 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
        dev->ibdev.num_comp_vectors = 1;
        dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
-       dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
 
        memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
               sizeof(dev->ibdev.iw_ifname));
 
-       dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
        rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
        ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
        return ib_register_device(&dev->ibdev, "cxgb3_%d");
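[Editorial note] Taken together, the iwch hunks above follow the "hoist object allocation out of the drivers" theme from the merge summary: the core sizes and allocates the CQ container from INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq), hands .create_cq a pre-zeroed ib_cq and expects an errno back, and frees the memory itself once the void .destroy_cq returns. A minimal sketch of that calling convention, using hypothetical drv_* names rather than the cxgb3 ones:

    #include <rdma/ib_verbs.h>

    struct drv_cq {
            struct ib_cq ibcq;              /* core object embedded in the driver CQ */
            /* driver-private CQ state */
    };

    static int drv_create_cq(struct ib_cq *ibcq,
                             const struct ib_cq_init_attr *attr,
                             struct ib_udata *udata)
    {
            struct drv_cq *cq = container_of(ibcq, struct drv_cq, ibcq);

            /* on failure just return -Exxx; the core frees the ib_cq it allocated */
            return drv_alloc_cq_hw(cq, attr->cqe);  /* assumed helper */
    }

    static void drv_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
    {
            struct drv_cq *cq = container_of(ibcq, struct drv_cq, ibcq);

            drv_release_cq_hw(cq);          /* assumed helper; no kfree(cq) here */
    }

    static const struct ib_device_ops drv_dev_ops = {
            .create_cq = drv_create_cq,
            .destroy_cq = drv_destroy_cq,
            INIT_RDMA_OBJ_SIZE(ib_cq, drv_cq, ibcq),
    };
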
index 09fcfc9e052d7a61b69c12e53688a39cc9b5f104..e87fc0408470452c4bc5600c127f69848c2c3cb3 100644 (file)
@@ -953,7 +953,7 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
        mpalen = sizeof(*mpa) + ep->plen;
        if (mpa_rev_to_use == 2)
                mpalen += sizeof(struct mpa_v2_conn_params);
-       wrlen = roundup(mpalen + sizeof *req, 16);
+       wrlen = roundup(mpalen + sizeof(*req), 16);
        skb = get_skb(skb, wrlen, GFP_KERNEL);
        if (!skb) {
                connect_reply_upcall(ep, -ENOMEM);
@@ -997,8 +997,9 @@ static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
        }
 
        if (mpa_rev_to_use == 2) {
-               mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-                                              sizeof (struct mpa_v2_conn_params));
+               mpa->private_data_size =
+                       htons(ntohs(mpa->private_data_size) +
+                             sizeof(struct mpa_v2_conn_params));
                pr_debug("initiator ird %u ord %u\n", ep->ird,
                         ep->ord);
                mpa_v2_params.ird = htons((u16)ep->ird);
@@ -1057,7 +1058,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
        mpalen = sizeof(*mpa) + plen;
        if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
                mpalen += sizeof(struct mpa_v2_conn_params);
-       wrlen = roundup(mpalen + sizeof *req, 16);
+       wrlen = roundup(mpalen + sizeof(*req), 16);
 
        skb = get_skb(NULL, wrlen, GFP_KERNEL);
        if (!skb) {
@@ -1088,8 +1089,9 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
 
        if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
                mpa->flags |= MPA_ENHANCED_RDMA_CONN;
-               mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-                                              sizeof (struct mpa_v2_conn_params));
+               mpa->private_data_size =
+                       htons(ntohs(mpa->private_data_size) +
+                             sizeof(struct mpa_v2_conn_params));
                mpa_v2_params.ird = htons(((u16)ep->ird) |
                                          (peer2peer ? MPA_V2_PEER2PEER_MODEL :
                                           0));
@@ -1136,7 +1138,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
        mpalen = sizeof(*mpa) + plen;
        if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
                mpalen += sizeof(struct mpa_v2_conn_params);
-       wrlen = roundup(mpalen + sizeof *req, 16);
+       wrlen = roundup(mpalen + sizeof(*req), 16);
 
        skb = get_skb(NULL, wrlen, GFP_KERNEL);
        if (!skb) {
@@ -1171,8 +1173,9 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
 
        if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
                mpa->flags |= MPA_ENHANCED_RDMA_CONN;
-               mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
-                                              sizeof (struct mpa_v2_conn_params));
+               mpa->private_data_size =
+                       htons(ntohs(mpa->private_data_size) +
+                             sizeof(struct mpa_v2_conn_params));
                mpa_v2_params.ird = htons((u16)ep->ird);
                mpa_v2_params.ord = htons((u16)ep->ord);
                if (peer2peer && (ep->mpa_attr.p2p_type !=
index 52ce586621c6fd1e5f96f2c1c9d4c560d8467ba8..b1bb61c65f4f684bdc34dc7f18e972f5f0905bc5 100644 (file)
 
 #include "iw_cxgb4.h"
 
-static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
-                     struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
-                     struct c4iw_wr_wait *wr_waitp)
+static void destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+                      struct c4iw_dev_ucontext *uctx, struct sk_buff *skb,
+                      struct c4iw_wr_wait *wr_waitp)
 {
        struct fw_ri_res_wr *res_wr;
        struct fw_ri_res *res;
        int wr_len;
-       int ret;
 
-       wr_len = sizeof *res_wr + sizeof *res;
+       wr_len = sizeof(*res_wr) + sizeof(*res);
        set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
 
        res_wr = __skb_put_zero(skb, wr_len);
@@ -59,14 +58,13 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
        res->u.cq.iqid = cpu_to_be32(cq->cqid);
 
        c4iw_init_wr_wait(wr_waitp);
-       ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
+       c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__);
 
        kfree(cq->sw_queue);
        dma_free_coherent(&(rdev->lldi.pdev->dev),
                          cq->memsize, cq->queue,
                          dma_unmap_addr(cq, mapping));
        c4iw_put_cqid(rdev, cq->cqid, uctx);
-       return ret;
 }
 
 static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
@@ -104,7 +102,6 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
                goto err3;
        }
        dma_unmap_addr_set(cq, mapping, cq->dma_addr);
-       memset(cq->queue, 0, cq->memsize);
 
        if (user && ucontext->is_32b_cqe) {
                cq->qp_errp = &((struct t4_status_page *)
@@ -117,7 +114,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
        }
 
        /* build fw_ri_res_wr */
-       wr_len = sizeof *res_wr + sizeof *res;
+       wr_len = sizeof(*res_wr) + sizeof(*res);
 
        skb = alloc_skb(wr_len, GFP_KERNEL);
        if (!skb) {
@@ -970,7 +967,7 @@ int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
        return !err || err == -ENODATA ? npolled : err;
 }
 
-int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct c4iw_cq *chp;
        struct c4iw_ucontext *ucontext;
@@ -988,18 +985,16 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
                   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
                   chp->destroy_skb, chp->wr_waitp);
        c4iw_put_wr_wait(chp->wr_waitp);
-       kfree(chp);
-       return 0;
 }
 
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_udata *udata)
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                  struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
        int vector = attr->comp_vector;
-       struct c4iw_dev *rhp;
-       struct c4iw_cq *chp;
+       struct c4iw_dev *rhp = to_c4iw_dev(ibcq->device);
+       struct c4iw_cq *chp = to_c4iw_cq(ibcq);
        struct c4iw_create_cq ucmd;
        struct c4iw_create_cq_resp uresp;
        int ret, wr_len;
@@ -1010,22 +1005,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
 
        pr_debug("ib_dev %p entries %d\n", ibdev, entries);
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       rhp = to_c4iw_dev(ibdev);
+               return -EINVAL;
 
        if (vector >= rhp->rdev.lldi.nciq)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (udata) {
                if (udata->inlen < sizeof(ucmd))
                        ucontext->is_32b_cqe = 1;
        }
 
-       chp = kzalloc(sizeof(*chp), GFP_KERNEL);
-       if (!chp)
-               return ERR_PTR(-ENOMEM);
-
        chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
        if (!chp->wr_waitp) {
                ret = -ENOMEM;
@@ -1095,10 +1084,10 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
 
        if (ucontext) {
                ret = -ENOMEM;
-               mm = kmalloc(sizeof *mm, GFP_KERNEL);
+               mm = kmalloc(sizeof(*mm), GFP_KERNEL);
                if (!mm)
                        goto err_remove_handle;
-               mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+               mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL);
                if (!mm2)
                        goto err_free_mm;
 
@@ -1135,10 +1124,11 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
                mm2->len = PAGE_SIZE;
                insert_mmap(ucontext, mm2);
        }
-       pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
-                chp->cq.cqid, chp, chp->cq.size,
-                chp->cq.memsize, (unsigned long long)chp->cq.dma_addr);
-       return &chp->ibcq;
+
+       pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n",
+                chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize,
+                &chp->cq.dma_addr);
+       return 0;
 err_free_mm2:
        kfree(mm2);
 err_free_mm:
@@ -1154,8 +1144,7 @@ err_free_skb:
 err_free_wr_wait:
        c4iw_put_wr_wait(chp->wr_waitp);
 err_free_chp:
-       kfree(chp);
-       return ERR_PTR(ret);
+       return ret;
 }
 
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
index 4c0d925c5ff5a85bf6f14ded2b2880f366196a8b..a8b9548bd1a260e259d1c31ce5384b2a28820861 100644 (file)
@@ -327,7 +327,7 @@ static int qp_open(struct inode *inode, struct file *file)
        unsigned long index;
        int count = 1;
 
-       qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
+       qpd = kmalloc(sizeof(*qpd), GFP_KERNEL);
        if (!qpd)
                return -ENOMEM;
 
@@ -421,7 +421,7 @@ static int stag_open(struct inode *inode, struct file *file)
        int ret = 0;
        int count = 1;
 
-       stagd = kmalloc(sizeof *stagd, GFP_KERNEL);
+       stagd = kmalloc(sizeof(*stagd), GFP_KERNEL);
        if (!stagd) {
                ret = -ENOMEM;
                goto out;
@@ -1075,7 +1075,7 @@ static void *c4iw_uld_add(const struct cxgb4_lld_info *infop)
                pr_info("Chelsio T4/T5 RDMA Driver - version %s\n",
                        DRV_VERSION);
 
-       ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx) {
                ctx = ERR_PTR(-ENOMEM);
                goto out;
@@ -1243,10 +1243,9 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
        case CXGB4_STATE_START_RECOVERY:
                pr_info("%s: Fatal Error\n", pci_name(ctx->lldi.pdev));
                if (ctx->dev) {
-                       struct ib_event event;
+                       struct ib_event event = {};
 
                        ctx->dev->rdev.flags |= T4_FATAL_ERROR;
-                       memset(&event, 0, sizeof event);
                        event.event  = IB_EVENT_DEVICE_FATAL;
                        event.device = &ctx->dev->ibdev;
                        ib_dispatch_event(&event);
index 916ef982172e96f919eca838fe0a52c165a56c43..7d06b0f8d49a00a388375383bd89bf79b48ce4a6 100644 (file)
@@ -490,13 +490,13 @@ struct c4iw_qp {
        struct t4_wq wq;
        spinlock_t lock;
        struct mutex mutex;
-       struct kref kref;
        wait_queue_head_t wait;
        int sq_sig_all;
        struct c4iw_srq *srq;
-       struct work_struct free_work;
        struct c4iw_ucontext *ucontext;
        struct c4iw_wr_wait *wr_waitp;
+       struct completion qp_rel_comp;
+       refcount_t qp_refcnt;
 };
 
 static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
@@ -992,10 +992,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
                                           struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
 int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
-int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
-struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_udata *udata);
+void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                  struct ib_udata *udata);
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr,
                    enum ib_srq_attr_mask srq_attr_mask,
index 811c0c8c5b16f03250b5f1202c8959712b66827f..aa772ee0706f9991ced0a0f6644e15f916d9151c 100644 (file)
@@ -130,8 +130,9 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
 
                copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE :
                           len;
-               wr_len = roundup(sizeof *req + sizeof *sc +
-                                roundup(copy_len, T4_ULPTX_MIN_IO), 16);
+               wr_len = roundup(sizeof(*req) + sizeof(*sc) +
+                                        roundup(copy_len, T4_ULPTX_MIN_IO),
+                                16);
 
                if (!skb) {
                        skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
@@ -807,8 +808,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
                                  mhp->attr.pbl_size << 3);
        if (mhp->kva)
                kfree((void *) (unsigned long) mhp->kva);
-       if (mhp->umem)
-               ib_umem_release(mhp->umem);
+       ib_umem_release(mhp->umem);
        pr_debug("mmid 0x%x ptr %p\n", mmid, mhp);
        c4iw_put_wr_wait(mhp->wr_waitp);
        kfree(mhp);
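[Editorial note] Both the cxgb3 and cxgb4 dereg paths above drop the "if (mhp->umem)" guard; this leans on ib_umem_release() tolerating a NULL pointer (the same convention as kfree()), so MR teardown shrinks to one unconditional call. A tiny illustration (a sketch, not the driver code):

    static void drv_free_mr(struct drv_mr *mhp)
    {
            ib_umem_release(mhp->umem);     /* no-op when no user memory was pinned */
            kfree(mhp);
    }
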
index 74b795642fca224bd02b208095a1285ee93662c3..5e59c570872989b7941cc2ce3906e5ba95b08a16 100644 (file)
@@ -271,7 +271,6 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro
                return -EINVAL;
 
        dev = to_c4iw_dev(ibdev);
-       memset(props, 0, sizeof *props);
        memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
        props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
        props->fw_ver = dev->rdev.lldi.fw_vers;
@@ -490,6 +489,10 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 }
 
 static const struct ib_device_ops c4iw_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_CXGB4,
+       .uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
+
        .alloc_hw_stats = c4iw_alloc_stats,
        .alloc_mr = c4iw_alloc_mr,
        .alloc_mw = c4iw_alloc_mw,
@@ -534,6 +537,7 @@ static const struct ib_device_ops c4iw_dev_ops = {
        .reg_user_mr = c4iw_reg_user_mr,
        .req_notify_cq = c4iw_arm_cq,
        INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
 };
@@ -561,7 +565,6 @@ void c4iw_register_device(struct work_struct *work)
        pr_debug("c4iw_dev %p\n", dev);
        memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
        memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
-       dev->ibdev.owner = THIS_MODULE;
        dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
        if (fastreg_support)
                dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
@@ -594,13 +597,11 @@ void c4iw_register_device(struct work_struct *work)
        dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
        dev->ibdev.num_comp_vectors =  dev->rdev.lldi.nciq;
        dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
-       dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
 
        memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
               sizeof(dev->ibdev.iw_ifname));
 
        rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
-       dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
        ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
        ret = set_netdevs(&dev->ibdev, &dev->rdev);
        if (ret)
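[Editorial note] As with cxgb3 earlier (and efa below), the constant driver identity moves out of the registration function and into the const ib_device_ops table: .owner, .driver_id and .uverbs_abi_ver are declared once next to the verbs callbacks and attached through ib_set_device_ops(). A compact sketch of the resulting shape, with an assumed ABI version:

    static const struct ib_device_ops drv_dev_ops = {
            .owner = THIS_MODULE,
            .driver_id = RDMA_DRIVER_CXGB4,         /* enum value from the hunk above */
            .uverbs_abi_ver = 1,                    /* assumed value for illustration */

            /* ... verbs callbacks ... */
    };

    /* at registration time the ops, identity fields included, go in one call */
    ib_set_device_ops(&dev->ibdev, &drv_dev_ops);
    ret = ib_register_device(&dev->ibdev, "drv_%d");
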
index e92b9544357aeee2d93078671a3da1b68b166a7d..eb9368be28c1df457f948b19ff4ef103c8dac12f 100644 (file)
@@ -274,7 +274,6 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
                         (unsigned long long)virt_to_phys(wq->sq.queue),
                         wq->rq.queue,
                         (unsigned long long)virt_to_phys(wq->rq.queue));
-               memset(wq->rq.queue, 0, wq->rq.memsize);
                dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
        }
 
@@ -303,7 +302,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
        wq->rq.msn = 1;
 
        /* build fw_ri_res_wr */
-       wr_len = sizeof *res_wr + 2 * sizeof *res;
+       wr_len = sizeof(*res_wr) + 2 * sizeof(*res);
        if (need_rq)
                wr_len += sizeof(*res);
        skb = alloc_skb(wr_len, GFP_KERNEL);
@@ -439,7 +438,7 @@ static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
                        rem -= len;
                }
        }
-       len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp);
+       len = roundup(plen + sizeof(*immdp), 16) - (plen + sizeof(*immdp));
        if (len)
                memset(dstp, 0, len);
        immdp->op = FW_RI_DATA_IMMD;
@@ -528,7 +527,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
                                         T4_MAX_SEND_INLINE, &plen);
                        if (ret)
                                return ret;
-                       size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
+                       size = sizeof(wqe->send) + sizeof(struct fw_ri_immd) +
                               plen;
                } else {
                        ret = build_isgl((__be64 *)sq->queue,
@@ -537,7 +536,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
                                         wr->sg_list, wr->num_sge, &plen);
                        if (ret)
                                return ret;
-                       size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
+                       size = sizeof(wqe->send) + sizeof(struct fw_ri_isgl) +
                               wr->num_sge * sizeof(struct fw_ri_sge);
                }
        } else {
@@ -545,7 +544,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
                wqe->send.u.immd_src[0].r1 = 0;
                wqe->send.u.immd_src[0].r2 = 0;
                wqe->send.u.immd_src[0].immdlen = 0;
-               size = sizeof wqe->send + sizeof(struct fw_ri_immd);
+               size = sizeof(wqe->send) + sizeof(struct fw_ri_immd);
                plen = 0;
        }
        *len16 = DIV_ROUND_UP(size, 16);
@@ -579,7 +578,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
                                         T4_MAX_WRITE_INLINE, &plen);
                        if (ret)
                                return ret;
-                       size = sizeof wqe->write + sizeof(struct fw_ri_immd) +
+                       size = sizeof(wqe->write) + sizeof(struct fw_ri_immd) +
                               plen;
                } else {
                        ret = build_isgl((__be64 *)sq->queue,
@@ -588,7 +587,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
                                         wr->sg_list, wr->num_sge, &plen);
                        if (ret)
                                return ret;
-                       size = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
+                       size = sizeof(wqe->write) + sizeof(struct fw_ri_isgl) +
                               wr->num_sge * sizeof(struct fw_ri_sge);
                }
        } else {
@@ -596,7 +595,7 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
                wqe->write.u.immd_src[0].r1 = 0;
                wqe->write.u.immd_src[0].r2 = 0;
                wqe->write.u.immd_src[0].immdlen = 0;
-               size = sizeof wqe->write + sizeof(struct fw_ri_immd);
+               size = sizeof(wqe->write) + sizeof(struct fw_ri_immd);
                plen = 0;
        }
        *len16 = DIV_ROUND_UP(size, 16);
@@ -683,7 +682,7 @@ static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr,
        }
        wqe->read.r2 = 0;
        wqe->read.r5 = 0;
-       *len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+       *len16 = DIV_ROUND_UP(sizeof(wqe->read), 16);
        return 0;
 }
 
@@ -766,8 +765,8 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
                         &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
        if (ret)
                return ret;
-       *len16 = DIV_ROUND_UP(sizeof wqe->recv +
-                             wr->num_sge * sizeof(struct fw_ri_sge), 16);
+       *len16 = DIV_ROUND_UP(
+               sizeof(wqe->recv) + wr->num_sge * sizeof(struct fw_ri_sge), 16);
        return 0;
 }
 
@@ -886,47 +885,21 @@ static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr,
 {
        wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
        wqe->inv.r2 = 0;
-       *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
+       *len16 = DIV_ROUND_UP(sizeof(wqe->inv), 16);
        return 0;
 }
 
-static void free_qp_work(struct work_struct *work)
-{
-       struct c4iw_ucontext *ucontext;
-       struct c4iw_qp *qhp;
-       struct c4iw_dev *rhp;
-
-       qhp = container_of(work, struct c4iw_qp, free_work);
-       ucontext = qhp->ucontext;
-       rhp = qhp->rhp;
-
-       pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
-       destroy_qp(&rhp->rdev, &qhp->wq,
-                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
-
-       c4iw_put_wr_wait(qhp->wr_waitp);
-       kfree(qhp);
-}
-
-static void queue_qp_free(struct kref *kref)
-{
-       struct c4iw_qp *qhp;
-
-       qhp = container_of(kref, struct c4iw_qp, kref);
-       pr_debug("qhp %p\n", qhp);
-       queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
-}
-
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
        pr_debug("ib_qp %p\n", qp);
-       kref_get(&to_c4iw_qp(qp)->kref);
+       refcount_inc(&to_c4iw_qp(qp)->qp_refcnt);
 }
 
 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
        pr_debug("ib_qp %p\n", qp);
-       kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
+       if (refcount_dec_and_test(&to_c4iw_qp(qp)->qp_refcnt))
+               complete(&to_c4iw_qp(qp)->qp_rel_comp);
 }
 
 static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -1606,7 +1579,7 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
                FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16)));
 
        wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
-       wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
+       wqe->u.terminate.immdlen = cpu_to_be32(sizeof(*term));
        term = (struct terminate_message *)wqe->u.terminate.termmsg;
        if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) {
                term->layer_etype = qhp->attr.layer_etype;
@@ -1751,16 +1724,15 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
 {
        pr_debug("p2p_type = %d\n", p2p_type);
-       memset(&init->u, 0, sizeof init->u);
+       memset(&init->u, 0, sizeof(init->u));
        switch (p2p_type) {
        case FW_RI_INIT_P2PTYPE_RDMA_WRITE:
                init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
                init->u.write.stag_sink = cpu_to_be32(1);
                init->u.write.to_sink = cpu_to_be64(1);
                init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
-               init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
-                                                  sizeof(struct fw_ri_immd),
-                                                  16);
+               init->u.write.len16 = DIV_ROUND_UP(
+                       sizeof(init->u.write) + sizeof(struct fw_ri_immd), 16);
                break;
        case FW_RI_INIT_P2PTYPE_READ_REQ:
                init->u.write.opcode = FW_RI_RDMA_READ_WR;
@@ -1768,7 +1740,7 @@ static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
                init->u.read.to_src_lo = cpu_to_be32(1);
                init->u.read.stag_sink = cpu_to_be32(1);
                init->u.read.to_sink_lo = cpu_to_be32(1);
-               init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
+               init->u.read.len16 = DIV_ROUND_UP(sizeof(init->u.read), 16);
                break;
        }
 }
@@ -1782,7 +1754,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
        pr_debug("qhp %p qid 0x%x tid %u ird %u ord %u\n", qhp,
                 qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord);
 
-       skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+       skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
        if (!skb) {
                ret = -ENOMEM;
                goto out;
@@ -2099,10 +2071,12 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_qp *qhp;
+       struct c4iw_ucontext *ucontext;
        struct c4iw_qp_attributes attrs;
 
        qhp = to_c4iw_qp(ib_qp);
        rhp = qhp->rhp;
+       ucontext = qhp->ucontext;
 
        attrs.next_state = C4IW_QP_STATE_ERROR;
        if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
@@ -2120,7 +2094,17 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 
        c4iw_qp_rem_ref(ib_qp);
 
+       wait_for_completion(&qhp->qp_rel_comp);
+
        pr_debug("ib_qp %p qpid 0x%0x\n", ib_qp, qhp->wq.sq.qid);
+       pr_debug("qhp %p ucontext %p\n", qhp, ucontext);
+
+       destroy_qp(&rhp->rdev, &qhp->wq,
+                  ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq);
+
+       c4iw_put_wr_wait(qhp->wr_waitp);
+
+       kfree(qhp);
        return 0;
 }
 
@@ -2230,8 +2214,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
        spin_lock_init(&qhp->lock);
        mutex_init(&qhp->mutex);
        init_waitqueue_head(&qhp->wait);
-       kref_init(&qhp->kref);
-       INIT_WORK(&qhp->free_work, free_qp_work);
+       init_completion(&qhp->qp_rel_comp);
+       refcount_set(&qhp->qp_refcnt, 1);
 
        ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
        if (ret)
@@ -2302,7 +2286,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
                        ucontext->key += PAGE_SIZE;
                }
                spin_unlock(&ucontext->mmap_lock);
-               ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+               ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
                if (ret)
                        goto err_free_ma_sync_key;
                sq_key_mm->key = uresp.sq_key;
@@ -2386,7 +2370,7 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        struct c4iw_dev *rhp;
        struct c4iw_qp *qhp;
        enum c4iw_qp_attr_mask mask = 0;
-       struct c4iw_qp_attributes attrs;
+       struct c4iw_qp_attributes attrs = {};
 
        pr_debug("ib_qp %p\n", ibqp);
 
@@ -2398,7 +2382,6 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        if (!attr_mask)
                return 0;
 
-       memset(&attrs, 0, sizeof attrs);
        qhp = to_c4iw_qp(ibqp);
        rhp = qhp->rhp;
 
@@ -2482,8 +2465,8 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 {
        struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 
-       memset(attr, 0, sizeof *attr);
-       memset(init_attr, 0, sizeof *init_attr);
+       memset(attr, 0, sizeof(*attr));
+       memset(init_attr, 0, sizeof(*init_attr));
        attr->qp_state = to_ib_qp_state(qhp->attr.state);
        init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
        init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
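[Editorial note] The qp.c hunks above replace the kref + free_qp_work deferral with a plain refcount and a completion, so c4iw_destroy_qp() can wait in place for the last user and then free the QP synchronously. The lifetime rules, as a minimal sketch with hypothetical names:

    #include <linux/refcount.h>
    #include <linux/completion.h>
    #include <linux/slab.h>

    struct drv_qp {
            refcount_t refcnt;
            struct completion rel_comp;
            /* ... */
    };

    static void drv_qp_init_life(struct drv_qp *qp)
    {
            refcount_set(&qp->refcnt, 1);           /* creator's reference */
            init_completion(&qp->rel_comp);
    }

    static void drv_qp_get(struct drv_qp *qp)
    {
            refcount_inc(&qp->refcnt);
    }

    static void drv_qp_put(struct drv_qp *qp)
    {
            if (refcount_dec_and_test(&qp->refcnt))
                    complete(&qp->rel_comp);        /* wake the destroyer */
    }

    static void drv_qp_destroy(struct drv_qp *qp)
    {
            drv_qp_put(qp);                         /* drop the creator's reference */
            wait_for_completion(&qp->rel_comp);     /* wait out in-flight users */
            /* now safe to release HW state and free right here,
             * no workqueue hand-off needed */
            kfree(qp);
    }
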
index 57ed26b3cc21a6b244af622252fa1646eeeb6edd..5c95c789f302dab389e529fbc926e3a0f3736a83 100644 (file)
@@ -126,7 +126,7 @@ u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
                rdev->stats.qid.cur += rdev->qpmask + 1;
                mutex_unlock(&rdev->stats.lock);
                for (i = qid+1; i & rdev->qpmask; i++) {
-                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                goto out;
                        entry->qid = i;
@@ -137,13 +137,13 @@ u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
                 * now put the same ids on the qp list since they all
                 * map to the same db/gts page.
                 */
-               entry = kmalloc(sizeof *entry, GFP_KERNEL);
+               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry)
                        goto out;
                entry->qid = qid;
                list_add_tail(&entry->entry, &uctx->qpids);
                for (i = qid+1; i & rdev->qpmask; i++) {
-                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                goto out;
                        entry->qid = i;
@@ -165,7 +165,7 @@ void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
 {
        struct c4iw_qid_list *entry;
 
-       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return;
        pr_debug("qid 0x%x\n", qid);
@@ -200,7 +200,7 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
                rdev->stats.qid.cur += rdev->qpmask + 1;
                mutex_unlock(&rdev->stats.lock);
                for (i = qid+1; i & rdev->qpmask; i++) {
-                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                goto out;
                        entry->qid = i;
@@ -211,13 +211,13 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
                 * now put the same ids on the cq list since they all
                 * map to the same db/gts page.
                 */
-               entry = kmalloc(sizeof *entry, GFP_KERNEL);
+               entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry)
                        goto out;
                entry->qid = qid;
                list_add_tail(&entry->entry, &uctx->cqids);
                for (i = qid; i & rdev->qpmask; i++) {
-                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
                        if (!entry)
                                goto out;
                        entry->qid = i;
@@ -239,7 +239,7 @@ void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
 {
        struct c4iw_qid_list *entry;
 
-       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return;
        pr_debug("qid 0x%x\n", qid);
index 9e3cc3239c13da2947fafe887230016acbce23d9..119f8efec56474340bb6a7cb9f13fee456e4e67d 100644 (file)
@@ -7,10 +7,8 @@
 #define _EFA_H_
 
 #include <linux/bitops.h>
-#include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
-#include <linux/sched.h>
 
 #include <rdma/efa-abi.h>
 #include <rdma/ib_verbs.h>
@@ -136,10 +134,9 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata);
-int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
-struct ib_cq *efa_create_cq(struct ib_device *ibdev,
-                           const struct ib_cq_init_attr *attr,
-                           struct ib_udata *udata);
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata);
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
                         u64 virt_addr, int access_flags,
                         struct ib_udata *udata);
index a5c788741a04a17cde71241327fd0d43391f54bc..2cb42484b0f8e6a7d8ab23642df47d261a0c7bd6 100644 (file)
@@ -39,8 +39,6 @@
 enum efa_cmd_status {
        EFA_CMD_SUBMITTED,
        EFA_CMD_COMPLETED,
-       /* Abort - canceled by the driver */
-       EFA_CMD_ABORTED,
 };
 
 struct efa_comp_ctx {
@@ -280,36 +278,34 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq,
 static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
                                        struct efa_comp_ctx *comp_ctx)
 {
-       u16 comp_id = comp_ctx->user_cqe->acq_common_descriptor.command &
-                     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+       u16 cmd_id = comp_ctx->user_cqe->acq_common_descriptor.command &
+                    EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+       u16 ctx_id = cmd_id & (aq->depth - 1);
 
-       ibdev_dbg(aq->efa_dev, "Putting completion command_id %d\n", comp_id);
+       ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id);
        comp_ctx->occupied = 0;
-       efa_com_dealloc_ctx_id(aq, comp_id);
+       efa_com_dealloc_ctx_id(aq, ctx_id);
 }
 
 static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq,
-                                                u16 command_id, bool capture)
+                                                u16 cmd_id, bool capture)
 {
-       if (command_id >= aq->depth) {
-               ibdev_err(aq->efa_dev,
-                         "command id is larger than the queue size. cmd_id: %u queue size %d\n",
-                         command_id, aq->depth);
-               return NULL;
-       }
+       u16 ctx_id = cmd_id & (aq->depth - 1);
 
-       if (aq->comp_ctx[command_id].occupied && capture) {
-               ibdev_err(aq->efa_dev, "Completion context is occupied\n");
+       if (aq->comp_ctx[ctx_id].occupied && capture) {
+               ibdev_err(aq->efa_dev,
+                         "Completion context for command_id %#x is occupied\n",
+                         cmd_id);
                return NULL;
        }
 
        if (capture) {
-               aq->comp_ctx[command_id].occupied = 1;
-               ibdev_dbg(aq->efa_dev, "Taking completion ctxt command_id %d\n",
-                         command_id);
+               aq->comp_ctx[ctx_id].occupied = 1;
+               ibdev_dbg(aq->efa_dev,
+                         "Take completion ctxt for command_id %#x\n", cmd_id);
        }
 
-       return &aq->comp_ctx[command_id];
+       return &aq->comp_ctx[ctx_id];
 }
 
 static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
@@ -320,6 +316,7 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu
 {
        struct efa_comp_ctx *comp_ctx;
        u16 queue_size_mask;
+       u16 cmd_id;
        u16 ctx_id;
        u16 pi;
 
@@ -328,13 +325,16 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu
 
        ctx_id = efa_com_alloc_ctx_id(aq);
 
+       /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */
+       cmd_id = ctx_id & queue_size_mask;
+       cmd_id |= aq->sq.pc & ~queue_size_mask;
+       cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
+
+       cmd->aq_common_descriptor.command_id = cmd_id;
        cmd->aq_common_descriptor.flags |= aq->sq.phase &
                EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
 
-       cmd->aq_common_descriptor.command_id |= ctx_id &
-               EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
-
-       comp_ctx = efa_com_get_comp_ctx(aq, ctx_id, true);
+       comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true);
        if (!comp_ctx) {
                efa_com_dealloc_ctx_id(aq, ctx_id);
                return ERR_PTR(-EINVAL);
@@ -532,16 +532,6 @@ static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_c
                msleep(aq->poll_interval);
        }
 
-       if (comp_ctx->status == EFA_CMD_ABORTED) {
-               ibdev_err(aq->efa_dev, "Command was aborted\n");
-               atomic64_inc(&aq->stats.aborted_cmd);
-               err = -ENODEV;
-               goto out;
-       }
-
-       WARN_ONCE(comp_ctx->status != EFA_CMD_COMPLETED,
-                 "Invalid completion status %d\n", comp_ctx->status);
-
        err = efa_com_comp_status_to_errno(comp_ctx->comp_status);
 out:
        efa_com_put_comp_ctx(aq, comp_ctx);
@@ -665,66 +655,6 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
        return err;
 }
 
-/**
- * efa_com_abort_admin_commands - Abort all the outstanding admin commands.
- * @edev: EFA communication layer struct
- *
- * This method aborts all the outstanding admin commands.
- * The caller should then call efa_com_wait_for_abort_completion to make sure
- * all the commands were completed.
- */
-static void efa_com_abort_admin_commands(struct efa_com_dev *edev)
-{
-       struct efa_com_admin_queue *aq = &edev->aq;
-       struct efa_comp_ctx *comp_ctx;
-       unsigned long flags;
-       u16 i;
-
-       spin_lock(&aq->sq.lock);
-       spin_lock_irqsave(&aq->cq.lock, flags);
-       for (i = 0; i < aq->depth; i++) {
-               comp_ctx = efa_com_get_comp_ctx(aq, i, false);
-               if (!comp_ctx)
-                       break;
-
-               comp_ctx->status = EFA_CMD_ABORTED;
-
-               complete(&comp_ctx->wait_event);
-       }
-       spin_unlock_irqrestore(&aq->cq.lock, flags);
-       spin_unlock(&aq->sq.lock);
-}
-
-/**
- * efa_com_wait_for_abort_completion - Wait for admin commands abort.
- * @edev: EFA communication layer struct
- *
- * This method wait until all the outstanding admin commands will be completed.
- */
-static void efa_com_wait_for_abort_completion(struct efa_com_dev *edev)
-{
-       struct efa_com_admin_queue *aq = &edev->aq;
-       int i;
-
-       /* all mine */
-       for (i = 0; i < aq->depth; i++)
-               down(&aq->avail_cmds);
-
-       /* let it go */
-       for (i = 0; i < aq->depth; i++)
-               up(&aq->avail_cmds);
-}
-
-static void efa_com_admin_flush(struct efa_com_dev *edev)
-{
-       struct efa_com_admin_queue *aq = &edev->aq;
-
-       clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
-
-       efa_com_abort_admin_commands(edev);
-       efa_com_wait_for_abort_completion(edev);
-}
-
 /**
  * efa_com_admin_destroy - Destroy the admin and the async events queues.
  * @edev: EFA communication layer struct
@@ -737,7 +667,7 @@ void efa_com_admin_destroy(struct efa_com_dev *edev)
        struct efa_com_admin_sq *sq = &aq->sq;
        u16 size;
 
-       efa_com_admin_flush(edev);
+       clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
 
        devm_kfree(edev->dmadev, aq->comp_ctx_pool);
        devm_kfree(edev->dmadev, aq->comp_ctx);
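[Editorial note] In the efa_com.c hunks above, the admin command_id stops being a bare slot index: the low bits still select the completion context, while the high bits are taken from the submission-queue producer counter, which lets the completion-side lookup catch stale or duplicated command ids. Roughly, assuming a power-of-two queue depth (constants are illustrative, not the EFA ABI):

    #include <linux/types.h>

    #define CMD_ID_FIELD_MASK 0xffffu       /* stand-in for the admin descriptor mask */

    static u16 make_cmd_id(u16 ctx_id, u16 sq_pc, u16 depth)
    {
            u16 queue_size_mask = depth - 1;        /* depth is a power of two */
            u16 cmd_id;

            cmd_id  = ctx_id & queue_size_mask;     /* LSBs: completion-context slot */
            cmd_id |= sq_pc & ~queue_size_mask;     /* MSBs: producer-counter entropy */
            return cmd_id & CMD_ID_FIELD_MASK;
    }

    static u16 cmd_id_to_ctx_id(u16 cmd_id, u16 depth)
    {
            return cmd_id & (depth - 1);            /* recover the slot on completion */
    }
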
index 84d96724a74b7520f6bb5684306b46db54b4b8a6..c67dd8109d1cd6a93b59d45c88a0f16fec2fbc16 100644 (file)
@@ -45,7 +45,6 @@ struct efa_com_admin_sq {
 
 /* Don't use anything other than atomic64 */
 struct efa_com_stats_admin {
-       atomic64_t aborted_cmd;
        atomic64_t submitted_cmd;
        atomic64_t completed_cmd;
        atomic64_t no_completion;
index c0016648804cd711056154cf33b514b8932b1725..62345d8abf3ca20bde1ffbe79a1ab8253fb2a81c 100644 (file)
@@ -3,7 +3,6 @@
  * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
-#include "efa.h"
 #include "efa_com.h"
 #include "efa_com_cmd.h"
 
@@ -57,7 +56,7 @@ int efa_com_create_qp(struct efa_com_dev *edev,
        res->send_sub_cq_idx = cmd_completion.send_sub_cq_idx;
        res->recv_sub_cq_idx = cmd_completion.recv_sub_cq_idx;
 
-       return err;
+       return 0;
 }
 
 int efa_com_modify_qp(struct efa_com_dev *edev,
@@ -181,7 +180,7 @@ int efa_com_create_cq(struct efa_com_dev *edev,
        result->cq_idx = cmd_completion.cq_idx;
        result->actual_depth = params->cq_depth;
 
-       return err;
+       return 0;
 }
 
 int efa_com_destroy_cq(struct efa_com_dev *edev,
@@ -307,7 +306,8 @@ int efa_com_create_ah(struct efa_com_dev *edev,
                               (struct efa_admin_acq_entry *)&cmd_completion,
                               sizeof(cmd_completion));
        if (err) {
-               ibdev_err(edev->efa_dev, "Failed to create ah [%d]\n", err);
+               ibdev_err(edev->efa_dev, "Failed to create ah for %pI6 [%d]\n",
+                         ah_cmd.dest_addr, err);
                return err;
        }
 
index db974caf1eb1a55d0cbbece5b97f1e2b8ad4854a..dd1c6d49466f5837fa4c33f1cf368e7cfdc77277 100644 (file)
@@ -100,7 +100,7 @@ static int efa_request_mgmnt_irq(struct efa_dev *dev)
                nr_cpumask_bits, &irq->affinity_hint_mask, irq->vector);
        irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask);
 
-       return err;
+       return 0;
 }
 
 static void efa_setup_mgmnt_irq(struct efa_dev *dev)
@@ -197,6 +197,10 @@ static void efa_stats_init(struct efa_dev *dev)
 }
 
 static const struct ib_device_ops efa_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_EFA,
+       .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION,
+
        .alloc_pd = efa_alloc_pd,
        .alloc_ucontext = efa_alloc_ucontext,
        .create_ah = efa_create_ah,
@@ -220,6 +224,7 @@ static const struct ib_device_ops efa_dev_ops = {
        .reg_user_mr = efa_reg_mr,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext),
 };
@@ -259,12 +264,10 @@ static int efa_ib_device_add(struct efa_dev *dev)
        if (err)
                goto err_release_doorbell_bar;
 
-       dev->ibdev.owner = THIS_MODULE;
        dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED;
        dev->ibdev.phys_port_cnt = 1;
        dev->ibdev.num_comp_vectors = 1;
        dev->ibdev.dev.parent = &pdev->dev;
-       dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION;
 
        dev->ibdev.uverbs_cmd_mask =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -287,7 +290,6 @@ static int efa_ib_device_add(struct efa_dev *dev)
        dev->ibdev.uverbs_ex_cmd_mask =
                (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
 
-       dev->ibdev.driver_id = RDMA_DRIVER_EFA;
        ib_set_device_ops(&dev->ibdev, &efa_dev_ops);
 
        err = ib_register_device(&dev->ibdev, "efa_%d");
index fb6115244d4cbbb22c84e39a27bb4720a160f634..df77bc312a25388bee6e84c95f317daf542352cb 100644 (file)
@@ -447,12 +447,6 @@ void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
        struct efa_dev *dev = to_edev(ibpd->device);
        struct efa_pd *pd = to_epd(ibpd);
 
-       if (udata->inlen &&
-           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
-               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
-               return;
-       }
-
        ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
        efa_pd_dealloc(dev, pd->pdn);
 }
@@ -470,12 +464,6 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
        struct efa_qp *qp = to_eqp(ibqp);
        int err;
 
-       if (udata->inlen &&
-           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
-               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
-               return -EINVAL;
-       }
-
        ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
        err = efa_destroy_qp_handle(dev, qp->qp_handle);
        if (err)
@@ -870,31 +858,18 @@ static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
        return efa_com_destroy_cq(&dev->edev, &params);
 }
 
-int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct efa_dev *dev = to_edev(ibcq->device);
        struct efa_cq *cq = to_ecq(ibcq);
-       int err;
-
-       if (udata->inlen &&
-           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
-               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
-               return -EINVAL;
-       }
 
        ibdev_dbg(&dev->ibdev,
                  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
                  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
 
-       err = efa_destroy_cq_idx(dev, cq->cq_idx);
-       if (err)
-               return err;
-
+       efa_destroy_cq_idx(dev, cq->cq_idx);
        dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
                         DMA_FROM_DEVICE);
-
-       kfree(cq);
-       return 0;
 }
 
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
@@ -910,17 +885,20 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
        return 0;
 }
 
-static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
-                                 int vector, struct ib_ucontext *ibucontext,
-                                 struct ib_udata *udata)
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata)
 {
+       struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct efa_ucontext, ibucontext);
        struct efa_ibv_create_cq_resp resp = {};
        struct efa_com_create_cq_params params;
        struct efa_com_create_cq_result result;
+       struct ib_device *ibdev = ibcq->device;
        struct efa_dev *dev = to_edev(ibdev);
        struct efa_ibv_create_cq cmd = {};
+       struct efa_cq *cq = to_ecq(ibcq);
        bool cq_entry_inserted = false;
-       struct efa_cq *cq;
+       int entries = attr->cqe;
        int err;
 
        ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
@@ -978,19 +956,13 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
                goto err_out;
        }
 
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               err = -ENOMEM;
-               goto err_out;
-       }
-
-       cq->ucontext = to_eucontext(ibucontext);
+       cq->ucontext = ucontext;
        cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
        cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
                                         DMA_FROM_DEVICE);
        if (!cq->cpu_addr) {
                err = -ENOMEM;
-               goto err_free_cq;
+               goto err_out;
        }
 
        params.uarn = cq->ucontext->uarn;
@@ -1009,8 +981,8 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
 
        err = cq_mmap_entries_setup(dev, cq, &resp);
        if (err) {
-               ibdev_dbg(ibdev,
-                         "Could not setup cq[%u] mmap entries\n", cq->cq_idx);
+               ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
+                         cq->cq_idx);
                goto err_destroy_cq;
        }
 
@@ -1026,11 +998,10 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
                }
        }
 
-       ibdev_dbg(ibdev,
-                 "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
+       ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
                  cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
 
-       return &cq->ibcq;
+       return 0;
 
 err_destroy_cq:
        efa_destroy_cq_idx(dev, cq->cq_idx);
@@ -1039,23 +1010,9 @@ err_free_mapped:
                         DMA_FROM_DEVICE);
        if (!cq_entry_inserted)
                free_pages_exact(cq->cpu_addr, cq->size);
-err_free_cq:
-       kfree(cq);
 err_out:
        atomic64_inc(&dev->stats.sw_stats.create_cq_err);
-       return ERR_PTR(err);
-}
-
-struct ib_cq *efa_create_cq(struct ib_device *ibdev,
-                           const struct ib_cq_init_attr *attr,
-                           struct ib_udata *udata)
-{
-       struct efa_ucontext *ucontext = rdma_udata_to_drv_context(udata,
-                                                                 struct efa_ucontext,
-                                                                 ibucontext);
-
-       return do_create_cq(ibdev, attr->cqe, attr->comp_vector,
-                           &ucontext->ibucontext, udata);
+       return err;
 }
 
 static int umem_to_page_list(struct efa_dev *dev,
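
The CQ hunks above follow this cycle's conversion in which ib_core, not the driver, allocates and frees the CQ object: the create handler receives a pre-allocated struct ib_cq and returns an errno, and destroy becomes void. A minimal sketch of that model, assuming a hypothetical driver (my_cq, my_create_cq and my_dev_ops are illustrative names; the ib_* types and INIT_RDMA_OBJ_SIZE are the real core API):

struct my_cq {
	struct ib_cq ibcq;	/* must be the first member: INIT_RDMA_OBJ_SIZE requires offset 0 */
	void *ring;
};

static int my_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
			struct ib_udata *udata)
{
	struct my_cq *cq = container_of(ibcq, struct my_cq, ibcq);

	cq->ring = NULL;	/* ibcq itself was already allocated by ib_core */
	return 0;		/* plain errno on failure, never ERR_PTR() */
}

static void my_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	/* release driver resources only; ib_core frees the containing struct */
}

static const struct ib_device_ops my_dev_ops = {
	.create_cq = my_create_cq,
	.destroy_cq = my_destroy_cq,
	INIT_RDMA_OBJ_SIZE(ib_cq, my_cq, ibcq),	/* core allocates sizeof(struct my_cq) */
};
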
@@ -1065,21 +1022,15 @@ static int umem_to_page_list(struct efa_dev *dev,
                             u8 hp_shift)
 {
        u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
-       struct sg_dma_page_iter sg_iter;
-       unsigned int page_idx = 0;
+       struct ib_block_iter biter;
        unsigned int hp_idx = 0;
 
        ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
                  hp_cnt, pages_in_hp);
 
-       for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-               if (page_idx % pages_in_hp == 0) {
-                       page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter);
-                       hp_idx++;
-               }
-
-               page_idx++;
-       }
+       rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+                           BIT(hp_shift))
+               page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
 
        return 0;
 }
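
rdma_for_each_block(), adopted above, walks a DMA-mapped scatterlist in fixed-size aligned blocks and yields one DMA address per block, which is what makes the manual page_idx % pages_in_hp bookkeeping unnecessary. A minimal, hypothetical usage sketch:

static void fill_huge_page_list(struct ib_umem *umem, u64 *page_list,
				unsigned int hp_shift)
{
	struct ib_block_iter biter;
	unsigned int i = 0;

	/* one iteration per BIT(hp_shift)-sized block of the umem */
	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
			    BIT(hp_shift))
		page_list[i++] = rdma_block_iter_dma_address(&biter);
}
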
@@ -1114,14 +1065,14 @@ err:
  */
 static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
 {
-       unsigned int entry, payloads_in_sg, chunk_list_size, chunk_idx, payload_idx;
        struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
        int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages;
        struct scatterlist *pages_sgl = pbl->phys.indirect.sgl;
+       unsigned int chunk_list_size, chunk_idx, payload_idx;
        int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt;
        struct efa_com_ctrl_buff_info *ctrl_buf;
        u64 *cur_chunk_buf, *prev_chunk_buf;
-       struct scatterlist *sg;
+       struct ib_block_iter biter;
        dma_addr_t dma_addr;
        int i;
 
@@ -1155,18 +1106,15 @@ static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
        chunk_idx = 0;
        payload_idx = 0;
        cur_chunk_buf = chunk_list->chunks[0].buf;
-       for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) {
-               payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT;
-               for (i = 0; i < payloads_in_sg; i++) {
-                       cur_chunk_buf[payload_idx++] =
-                               (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) +
-                               (EFA_CHUNK_PAYLOAD_SIZE * i);
-
-                       if (payload_idx == EFA_PTRS_PER_CHUNK) {
-                               chunk_idx++;
-                               cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
-                               payload_idx = 0;
-                       }
+       rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt,
+                           EFA_CHUNK_PAYLOAD_SIZE) {
+               cur_chunk_buf[payload_idx++] =
+                       rdma_block_iter_dma_address(&biter);
+
+               if (payload_idx == EFA_PTRS_PER_CHUNK) {
+                       chunk_idx++;
+                       cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
+                       payload_idx = 0;
                }
        }
 
@@ -1314,30 +1262,30 @@ static int pbl_create(struct efa_dev *dev,
        int err;
 
        pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE;
-       pbl->pbl_buf = kzalloc(pbl->pbl_buf_size_in_bytes,
-                              GFP_KERNEL | __GFP_NOWARN);
-       if (pbl->pbl_buf) {
-               pbl->physically_continuous = 1;
+       pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL);
+       if (!pbl->pbl_buf)
+               return -ENOMEM;
+
+       if (is_vmalloc_addr(pbl->pbl_buf)) {
+               pbl->physically_continuous = 0;
                err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
                                        hp_shift);
                if (err)
-                       goto err_continuous;
-               err = pbl_continuous_initialize(dev, pbl);
+                       goto err_free;
+
+               err = pbl_indirect_initialize(dev, pbl);
                if (err)
-                       goto err_continuous;
+                       goto err_free;
        } else {
-               pbl->physically_continuous = 0;
-               pbl->pbl_buf = vzalloc(pbl->pbl_buf_size_in_bytes);
-               if (!pbl->pbl_buf)
-                       return -ENOMEM;
-
+               pbl->physically_continuous = 1;
                err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
                                        hp_shift);
                if (err)
-                       goto err_indirect;
-               err = pbl_indirect_initialize(dev, pbl);
+                       goto err_free;
+
+               err = pbl_continuous_initialize(dev, pbl);
                if (err)
-                       goto err_indirect;
+                       goto err_free;
        }
 
        ibdev_dbg(&dev->ibdev,
@@ -1346,24 +1294,20 @@ static int pbl_create(struct efa_dev *dev,
 
        return 0;
 
-err_continuous:
-       kfree(pbl->pbl_buf);
-       return err;
-err_indirect:
-       vfree(pbl->pbl_buf);
+err_free:
+       kvfree(pbl->pbl_buf);
        return err;
 }
 
 static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl)
 {
-       if (pbl->physically_continuous) {
+       if (pbl->physically_continuous)
                dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr,
                                 pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
-               kfree(pbl->pbl_buf);
-       } else {
+       else
                pbl_indirect_terminate(dev, pbl);
-               vfree(pbl->pbl_buf);
-       }
+
+       kvfree(pbl->pbl_buf);
 }
 
 static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr,
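
The pbl_create()/pbl_destroy() rework above leans on the kvmalloc family: kvzalloc() tries kmalloc() first and transparently falls back to vmalloc(), is_vmalloc_addr() reveals which backing was used (and therefore whether the buffer is physically contiguous), and kvfree() releases either. A minimal sketch of the pattern, with pr_debug() standing in for the driver-specific continuous/indirect handling:

static int pbl_pattern_example(size_t size)
{
	void *buf;

	buf = kvzalloc(size, GFP_KERNEL);	/* kmalloc first, vmalloc fallback */
	if (!buf)
		return -ENOMEM;

	if (is_vmalloc_addr(buf))
		pr_debug("vmalloc-backed: not physically contiguous, use indirect PBL\n");
	else
		pr_debug("kmalloc-backed: physically contiguous, DMA-map directly\n");

	kvfree(buf);				/* correct for both backings */
	return 0;
}
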
@@ -1417,56 +1361,6 @@ static int efa_create_pbl(struct efa_dev *dev,
        return 0;
 }
 
-static void efa_cont_pages(struct ib_umem *umem, u64 addr,
-                          unsigned long max_page_shift,
-                          int *count, u8 *shift, u32 *ncont)
-{
-       struct scatterlist *sg;
-       u64 base = ~0, p = 0;
-       unsigned long tmp;
-       unsigned long m;
-       u64 len, pfn;
-       int i = 0;
-       int entry;
-
-       addr = addr >> PAGE_SHIFT;
-       tmp = (unsigned long)addr;
-       m = find_first_bit(&tmp, BITS_PER_LONG);
-       if (max_page_shift)
-               m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
-
-       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE);
-               pfn = sg_dma_address(sg) >> PAGE_SHIFT;
-               if (base + p != pfn) {
-                       /*
-                        * If either the offset or the new
-                        * base are unaligned update m
-                        */
-                       tmp = (unsigned long)(pfn | p);
-                       if (!IS_ALIGNED(tmp, 1 << m))
-                               m = find_first_bit(&tmp, BITS_PER_LONG);
-
-                       base = pfn;
-                       p = 0;
-               }
-
-               p += len;
-               i += len;
-       }
-
-       if (i) {
-               m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
-               *ncont = DIV_ROUND_UP(i, (1 << m));
-       } else {
-               m = 0;
-               *ncont = 0;
-       }
-
-       *shift = PAGE_SHIFT + m;
-       *count = i;
-}
-
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
                         u64 virt_addr, int access_flags,
                         struct ib_udata *udata)
@@ -1474,11 +1368,10 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
        struct efa_dev *dev = to_edev(ibpd->device);
        struct efa_com_reg_mr_params params = {};
        struct efa_com_reg_mr_result result = {};
-       unsigned long max_page_shift;
        struct pbl_context pbl;
+       unsigned int pg_sz;
        struct efa_mr *mr;
        int inline_size;
-       int npages;
        int err;
 
        if (udata->inlen &&
@@ -1515,13 +1408,24 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
        params.iova = virt_addr;
        params.mr_length_in_bytes = length;
        params.permissions = access_flags & 0x1;
-       max_page_shift = fls64(dev->dev_attr.page_size_cap);
 
-       efa_cont_pages(mr->umem, start, max_page_shift, &npages,
-                      &params.page_shift, &params.page_num);
+       pg_sz = ib_umem_find_best_pgsz(mr->umem,
+                                      dev->dev_attr.page_size_cap,
+                                      virt_addr);
+       if (!pg_sz) {
+               err = -EOPNOTSUPP;
+               ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
+                         dev->dev_attr.page_size_cap);
+               goto err_unmap;
+       }
+
+       params.page_shift = __ffs(pg_sz);
+       params.page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)),
+                                      pg_sz);
+
        ibdev_dbg(&dev->ibdev,
-                 "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n",
-                 start, length, npages, params.page_shift, params.page_num);
+                 "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
+                 start, length, params.page_shift, params.page_num);
 
        inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
        if (params.page_num <= inline_size) {
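
ib_umem_find_best_pgsz(), used above in place of the driver-private efa_cont_pages(), takes a bitmap of page sizes the device supports plus the I/O virtual address, and returns the largest size that keeps every umem page properly aligned; 0 means nothing in the bitmap works. A hedged sketch of the calling pattern (SZ_4K | SZ_2M is an illustrative capability mask, not EFA's):

static int pick_mr_page_size(struct ib_umem *umem, u64 start, u64 length,
			     u64 virt_addr, u8 *page_shift, u32 *page_num)
{
	unsigned long pg_sz;

	pg_sz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M, virt_addr);
	if (!pg_sz)
		return -EOPNOTSUPP;	/* no supported page size fits this mapping */

	*page_shift = __ffs(pg_sz);	/* log2 of the chosen page size */
	*page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)), pg_sz);
	return 0;
}
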
@@ -1567,12 +1471,6 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
        struct efa_mr *mr = to_emr(ibmr);
        int err;
 
-       if (udata->inlen &&
-           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
-               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
-               return -EINVAL;
-       }
-
        ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
 
        if (mr->umem) {
@@ -1580,8 +1478,8 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
                err = efa_com_dereg_mr(&dev->edev, &params);
                if (err)
                        return err;
-               ib_umem_release(mr->umem);
        }
+       ib_umem_release(mr->umem);
 
        kfree(mr);
 
@@ -1707,13 +1605,15 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
                err = -EINVAL;
        }
 
-       if (err)
+       if (err) {
                ibdev_dbg(
                        &dev->ibdev,
                        "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n",
                        entry->address, length, entry->mmap_flag, err);
+               return err;
+       }
 
-       return err;
+       return 0;
 }
 
 int efa_mmap(struct ib_ucontext *ibucontext,
index 4044a8c8dbf4d62c7b7f258e6a280983ce673ba1..0405d26d0833d8073c479dc531debc5a8396f307 100644 (file)
@@ -10,6 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
 hfi1-y := \
        affinity.o \
+       aspm.o \
        chip.o \
        device.o \
        driver.o \
diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c
new file mode 100644 (file)
index 0000000..a3c53be
--- /dev/null
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ */
+
+#include "aspm.h"
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+#define ASPM_L1_SUPPORTED(reg) \
+       ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, 0444);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+static bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+       u32 up, dn;
+
+       /*
+        * If the driver does not have access to the upstream component,
+        * it cannot support ASPM L1 at all.
+        */
+       if (!parent)
+               return false;
+
+       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
+       dn = ASPM_L1_SUPPORTED(dn);
+
+       pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
+       up = ASPM_L1_SUPPORTED(up);
+
+       /* ASPM works on A-step but is reported as not supported */
+       return (!!dn || is_ax(dd)) && !!up;
+}
+
+/* Set L1 entrance latency for slower entry to L1 */
+static void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+       u32 l1_ent_lat = 0x4u;
+       u32 reg32;
+
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
+       reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
+       reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
+}
+
+static void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+
+       /*
+        * If the driver does not have access to the upstream component,
+        * it cannot support ASPM L1 at all.
+        */
+       if (!parent)
+               return;
+
+       /* Enable ASPM L1 first in upstream component and then downstream */
+       pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC,
+                                          PCI_EXP_LNKCTL_ASPM_L1);
+       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC,
+                                          PCI_EXP_LNKCTL_ASPM_L1);
+}
+
+void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent = dd->pcidev->bus->self;
+
+       /* Disable ASPM L1 first in downstream component and then upstream */
+       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
+       if (parent)
+               pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+                                                  PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+static  void aspm_enable(struct hfi1_devdata *dd)
+{
+       if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
+           !dd->aspm_supported)
+               return;
+
+       aspm_hw_enable_l1(dd);
+       dd->aspm_enabled = true;
+}
+
+static  void aspm_disable(struct hfi1_devdata *dd)
+{
+       if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
+               return;
+
+       aspm_hw_disable_l1(dd);
+       dd->aspm_enabled = false;
+}
+
+static  void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->aspm_lock, flags);
+       aspm_disable(dd);
+       atomic_inc(&dd->aspm_disabled_cnt);
+       spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+static  void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->aspm_lock, flags);
+       if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+               aspm_enable(dd);
+       spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+/* ASPM processing for each receive context interrupt */
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+       bool restart_timer;
+       bool close_interrupts;
+       unsigned long flags;
+       ktime_t now, prev;
+
+       spin_lock_irqsave(&rcd->aspm_lock, flags);
+       /* PSM contexts are open */
+       if (!rcd->aspm_intr_enable)
+               goto unlock;
+
+       prev = rcd->aspm_ts_last_intr;
+       now = ktime_get();
+       rcd->aspm_ts_last_intr = now;
+
+       /* An interrupt pair close together in time */
+       close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
+
+       /* Don't push out our timer till this much time has elapsed */
+       restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
+                                   ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+       restart_timer = restart_timer && close_interrupts;
+
+       /* Disable ASPM and schedule timer */
+       if (rcd->aspm_enabled && close_interrupts) {
+               aspm_disable_inc(rcd->dd);
+               rcd->aspm_enabled = false;
+               restart_timer = true;
+       }
+
+       if (restart_timer) {
+               mod_timer(&rcd->aspm_timer,
+                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+               rcd->aspm_ts_timer_sched = now;
+       }
+unlock:
+       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Timer function for re-enabling ASPM in the absence of interrupt activity */
+static  void aspm_ctx_timer_function(struct timer_list *t)
+{
+       struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
+       unsigned long flags;
+
+       spin_lock_irqsave(&rcd->aspm_lock, flags);
+       aspm_enable_dec(rcd->dd);
+       rcd->aspm_enabled = true;
+       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/*
+ * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
+ * are open.
+ */
+void aspm_disable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned long flags;
+       u16 i;
+
+       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+               rcd = hfi1_rcd_get_by_index(dd, i);
+               if (rcd) {
+                       del_timer_sync(&rcd->aspm_timer);
+                       spin_lock_irqsave(&rcd->aspm_lock, flags);
+                       rcd->aspm_intr_enable = false;
+                       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+                       hfi1_rcd_put(rcd);
+               }
+       }
+
+       aspm_disable(dd);
+       atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/* Re-enable interrupt processing for verbs contexts */
+void aspm_enable_all(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned long flags;
+       u16 i;
+
+       aspm_enable(dd);
+
+       if (aspm_mode != ASPM_MODE_DYNAMIC)
+               return;
+
+       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+               rcd = hfi1_rcd_get_by_index(dd, i);
+               if (rcd) {
+                       spin_lock_irqsave(&rcd->aspm_lock, flags);
+                       rcd->aspm_intr_enable = true;
+                       rcd->aspm_enabled = true;
+                       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+                       hfi1_rcd_put(rcd);
+               }
+       }
+}
+
+static  void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+       spin_lock_init(&rcd->aspm_lock);
+       timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
+       rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
+               aspm_mode == ASPM_MODE_DYNAMIC &&
+               rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
+}
+
+void aspm_init(struct hfi1_devdata *dd)
+{
+       struct hfi1_ctxtdata *rcd;
+       u16 i;
+
+       spin_lock_init(&dd->aspm_lock);
+       dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+               rcd = hfi1_rcd_get_by_index(dd, i);
+               if (rcd)
+                       aspm_ctx_init(rcd);
+               hfi1_rcd_put(rcd);
+       }
+
+       /* Start with ASPM disabled */
+       aspm_hw_set_l1_ent_latency(dd);
+       dd->aspm_enabled = false;
+       aspm_hw_disable_l1(dd);
+
+       /* Now turn on ASPM if configured */
+       aspm_enable_all(dd);
+}
+
+void aspm_exit(struct hfi1_devdata *dd)
+{
+       aspm_disable_all(dd);
+
+       /* Turn on ASPM on exit to conserve power */
+       aspm_enable(dd);
+}
+
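
The slow path above is what moves into aspm.c; the hot-path check stays inline in aspm.h (see the next file), so with aspm=2 (dynamic) a pair of interrupts within ASPM_TRIGGER_MS disables L1 and the ASPM_TIMER_MS timer re-enables it. A hedged sketch of how a receive interrupt handler is expected to use the hook; the handler name and wiring are illustrative, not the exact hfi1 call site:

static irqreturn_t my_receive_intr(int irq, void *data)
{
	struct hfi1_ctxtdata *rcd = data;

	/* cheap inline test; __aspm_ctx_disable() runs only when dynamic ASPM is armed */
	aspm_ctx_disable(rcd);

	/* ... normal receive processing ... */
	return IRQ_HANDLED;
}
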
index e8133870ee87fbf768a6fd8f6e825cbb01fc94ef..75d5d18da3da0db4a358d630285cf2395cd5d6f1 100644 (file)
@@ -57,266 +57,20 @@ enum aspm_mode {
        ASPM_MODE_DYNAMIC = 2,  /* ASPM enabled/disabled dynamically */
 };
 
-/* Time after which the timer interrupt will re-enable ASPM */
-#define ASPM_TIMER_MS 1000
-/* Time for which interrupts are ignored after a timer has been scheduled */
-#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
-/* Two interrupts within this time trigger ASPM disable */
-#define ASPM_TRIGGER_MS 1
-#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
-#define ASPM_L1_SUPPORTED(reg) \
-       (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+void aspm_init(struct hfi1_devdata *dd);
+void aspm_exit(struct hfi1_devdata *dd);
+void aspm_hw_disable_l1(struct hfi1_devdata *dd);
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd);
+void aspm_disable_all(struct hfi1_devdata *dd);
+void aspm_enable_all(struct hfi1_devdata *dd);
 
-static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-       u32 up, dn;
-
-       /*
-        * If the driver does not have access to the upstream component,
-        * it cannot support ASPM L1 at all.
-        */
-       if (!parent)
-               return false;
-
-       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
-       dn = ASPM_L1_SUPPORTED(dn);
-
-       pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
-       up = ASPM_L1_SUPPORTED(up);
-
-       /* ASPM works on A-step but is reported as not supported */
-       return (!!dn || is_ax(dd)) && !!up;
-}
-
-/* Set L1 entrance latency for slower entry to L1 */
-static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
-{
-       u32 l1_ent_lat = 0x4u;
-       u32 reg32;
-
-       pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
-       reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
-       reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
-       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
-}
-
-static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-
-       /*
-        * If the driver does not have access to the upstream component,
-        * it cannot support ASPM L1 at all.
-        */
-       if (!parent)
-               return;
-
-       /* Enable ASPM L1 first in upstream component and then downstream */
-       pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC,
-                                          PCI_EXP_LNKCTL_ASPM_L1);
-       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC,
-                                          PCI_EXP_LNKCTL_ASPM_L1);
-}
-
-static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
-{
-       struct pci_dev *parent = dd->pcidev->bus->self;
-
-       /* Disable ASPM L1 first in downstream component and then upstream */
-       pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
-                                          PCI_EXP_LNKCTL_ASPMC, 0x0);
-       if (parent)
-               pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
-                                                  PCI_EXP_LNKCTL_ASPMC, 0x0);
-}
-
-static inline void aspm_enable(struct hfi1_devdata *dd)
-{
-       if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
-           !dd->aspm_supported)
-               return;
-
-       aspm_hw_enable_l1(dd);
-       dd->aspm_enabled = true;
-}
-
-static inline void aspm_disable(struct hfi1_devdata *dd)
-{
-       if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
-               return;
-
-       aspm_hw_disable_l1(dd);
-       dd->aspm_enabled = false;
-}
-
-static inline void aspm_disable_inc(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->aspm_lock, flags);
-       aspm_disable(dd);
-       atomic_inc(&dd->aspm_disabled_cnt);
-       spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-static inline void aspm_enable_dec(struct hfi1_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->aspm_lock, flags);
-       if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
-               aspm_enable(dd);
-       spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-/* ASPM processing for each receive context interrupt */
 static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
 {
-       bool restart_timer;
-       bool close_interrupts;
-       unsigned long flags;
-       ktime_t now, prev;
-
        /* Quickest exit for minimum impact */
-       if (!rcd->aspm_intr_supported)
-               return;
-
-       spin_lock_irqsave(&rcd->aspm_lock, flags);
-       /* PSM contexts are open */
-       if (!rcd->aspm_intr_enable)
-               goto unlock;
-
-       prev = rcd->aspm_ts_last_intr;
-       now = ktime_get();
-       rcd->aspm_ts_last_intr = now;
-
-       /* An interrupt pair close together in time */
-       close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
-
-       /* Don't push out our timer till this much time has elapsed */
-       restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
-                                   ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
-       restart_timer = restart_timer && close_interrupts;
-
-       /* Disable ASPM and schedule timer */
-       if (rcd->aspm_enabled && close_interrupts) {
-               aspm_disable_inc(rcd->dd);
-               rcd->aspm_enabled = false;
-               restart_timer = true;
-       }
-
-       if (restart_timer) {
-               mod_timer(&rcd->aspm_timer,
-                         jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
-               rcd->aspm_ts_timer_sched = now;
-       }
-unlock:
-       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/* Timer function for re-enabling ASPM in the absence of interrupt activity */
-static inline void aspm_ctx_timer_function(struct timer_list *t)
-{
-       struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
-       unsigned long flags;
-
-       spin_lock_irqsave(&rcd->aspm_lock, flags);
-       aspm_enable_dec(rcd->dd);
-       rcd->aspm_enabled = true;
-       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/*
- * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
- * are open.
- */
-static inline void aspm_disable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned long flags;
-       u16 i;
-
-       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-               rcd = hfi1_rcd_get_by_index(dd, i);
-               if (rcd) {
-                       del_timer_sync(&rcd->aspm_timer);
-                       spin_lock_irqsave(&rcd->aspm_lock, flags);
-                       rcd->aspm_intr_enable = false;
-                       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-                       hfi1_rcd_put(rcd);
-               }
-       }
-
-       aspm_disable(dd);
-       atomic_set(&dd->aspm_disabled_cnt, 0);
-}
-
-/* Re-enable interrupt processing for verbs contexts */
-static inline void aspm_enable_all(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       unsigned long flags;
-       u16 i;
-
-       aspm_enable(dd);
-
-       if (aspm_mode != ASPM_MODE_DYNAMIC)
+       if (likely(!rcd->aspm_intr_supported))
                return;
 
-       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-               rcd = hfi1_rcd_get_by_index(dd, i);
-               if (rcd) {
-                       spin_lock_irqsave(&rcd->aspm_lock, flags);
-                       rcd->aspm_intr_enable = true;
-                       rcd->aspm_enabled = true;
-                       spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-                       hfi1_rcd_put(rcd);
-               }
-       }
-}
-
-static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
-{
-       spin_lock_init(&rcd->aspm_lock);
-       timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
-       rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
-               aspm_mode == ASPM_MODE_DYNAMIC &&
-               rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
-}
-
-static inline void aspm_init(struct hfi1_devdata *dd)
-{
-       struct hfi1_ctxtdata *rcd;
-       u16 i;
-
-       spin_lock_init(&dd->aspm_lock);
-       dd->aspm_supported = aspm_hw_l1_supported(dd);
-
-       for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
-               rcd = hfi1_rcd_get_by_index(dd, i);
-               if (rcd)
-                       aspm_ctx_init(rcd);
-               hfi1_rcd_put(rcd);
-       }
-
-       /* Start with ASPM disabled */
-       aspm_hw_set_l1_ent_latency(dd);
-       dd->aspm_enabled = false;
-       aspm_hw_disable_l1(dd);
-
-       /* Now turn on ASPM if configured */
-       aspm_enable_all(dd);
-}
-
-static inline void aspm_exit(struct hfi1_devdata *dd)
-{
-       aspm_disable_all(dd);
-
-       /* Turn on ASPM on exit to conserve power */
-       aspm_enable(dd);
+       __aspm_ctx_disable(rcd);
 }
 
 #endif /* _ASPM_H */
index 15efb4a380b2112dc8ecc67adc6c6c8a9c1132d8..d268bf9c42eef0087c4cbff7bb47c93e210713c6 100644 (file)
@@ -987,9 +987,6 @@ static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
        struct hfi1_pportdata *ppd;
        int ret;
 
-       if (!try_module_get(THIS_MODULE))
-               return -ENODEV;
-
        ppd = private2ppd(fp);
 
        ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
@@ -1155,6 +1152,7 @@ static int exprom_wp_debugfs_release(struct inode *in, struct file *fp)
 { \
        .name = nm, \
        .ops = { \
+               .owner = THIS_MODULE, \
                .read = readroutine, \
                .write = writeroutine, \
                .llseek = generic_file_llseek, \
@@ -1165,6 +1163,7 @@ static int exprom_wp_debugfs_release(struct inode *in, struct file *fp)
 { \
        .name = nm, \
        .ops = { \
+               .owner = THIS_MODULE, \
                .read = readf, \
                .write = writef, \
                .llseek = generic_file_llseek, \
index 4228393e6c4cc30ee4e2f08a1d1198f55b010163..184dba3c28284b13ea3b232d2a37f909851654a4 100644 (file)
@@ -2744,8 +2744,7 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
        u16 link_width;
        u16 link_speed;
 
-       response_data_size = sizeof(struct opa_port_status_rsp) +
-                               num_vls * sizeof(struct _vls_pctrs);
+       response_data_size = struct_size(rsp, vls, num_vls);
        if (response_data_size > sizeof(pmp->data)) {
                pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
                return reply((struct ib_mad_hdr *)pmp);
@@ -3014,8 +3013,7 @@ static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
        }
 
        /* Sanity check */
-       response_data_size = sizeof(struct opa_port_data_counters_msg) +
-                               num_vls * sizeof(struct _vls_dctrs);
+       response_data_size = struct_size(req, port[0].vls, num_vls);
 
        if (response_data_size > sizeof(pmp->data)) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
@@ -3232,8 +3230,7 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
                return reply((struct ib_mad_hdr *)pmp);
        }
 
-       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
-                               num_vls * sizeof(struct _vls_ectrs);
+       response_data_size = struct_size(req, port[0].vls, num_vls);
 
        if (response_data_size > sizeof(pmp->data)) {
                pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
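
struct_size(), used in the three hunks above, comes from <linux/overflow.h>: struct_size(p, member, n) evaluates to sizeof(*p) + n * sizeof(p->member[0]) with overflow checking, saturating to SIZE_MAX instead of wrapping. A minimal sketch against the response struct used above (only the pointer's type is consulted, so it may be NULL):

static size_t portstatus_rsp_size(unsigned int num_vls)
{
	struct opa_port_status_rsp *rsp = NULL;	/* never dereferenced */

	/* overflow-checked sizeof(*rsp) + num_vls * sizeof(rsp->vls[0]) */
	return struct_size(rsp, vls, num_vls);
}
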
index c96d193bb2364cd88e1dcefcf4d8752014edc30c..61aa5504d7c3799b640989cd68b74b4d2379119d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -450,10 +450,6 @@ static int hfi1_pcie_caps;
 module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444);
 MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
 
-uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, 0444);
-MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
-
 /**
  * tune_pcie_caps() - Code to adjust PCIe capabilities.
  * @dd: Valid device data structure
index 4e5c2d1b8cfa77a1618613c60ee83e286d69e75e..79126b2b14ab036f6865a5a06fa7db00256202ad 100644 (file)
@@ -1594,9 +1594,8 @@ void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
        else
                sc_del_credit_return_intr(sc);
        trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
-       if (needint) {
+       if (needint)
                sc_return_credits(sc);
-       }
 }
 
 /**
index 4e0e9fc0a777c2b4f1184964333797115242ea57..f8e733aa3bb862d394b7cd068151a002587ac8e1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
                break;
        case IB_QPT_GSI:
        case IB_QPT_UD:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               ah = rvt_get_swqe_ah(wqe);
                if (wqe->length > (1 << ah->log_pmtu))
                        return -EINVAL;
                if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
@@ -702,8 +702,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
                   sde ? sde->this_idx : 0,
                   send_context,
                   send_context ? send_context->sw_index : 0,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
-                  ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+                  ib_cq_head(qp->ibqp.send_cq),
+                  ib_cq_tail(qp->ibqp.send_cq),
                   qp->pid,
                   qp->s_state,
                   qp->s_ack_state,
index 7c8cfb149da09c12595685779c619bda8c81e972..0477c14633ab8365849a8eecb24d2f0bcafb4feb 100644 (file)
@@ -1830,23 +1830,14 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
        }
 
        while (qp->s_last != qp->s_acked) {
-               u32 s_last;
-
                wqe = rvt_get_swqe_ptr(qp, qp->s_last);
                if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
                    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
                        break;
                trdma_clean_swqe(qp, wqe);
                rvt_qp_wqe_unreserve(qp, wqe);
-               s_last = qp->s_last;
-               trace_hfi1_qp_send_completion(qp, wqe, s_last);
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               rvt_put_qp_swqe(qp, wqe);
-               rvt_qp_swqe_complete(qp,
+               trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+               rvt_qp_complete_swqe(qp,
                                     wqe,
                                     ib_hfi1_wc_opcode[wqe->wr.opcode],
                                     IB_WC_SUCCESS);
@@ -1890,19 +1881,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
        trace_hfi1_rc_completion(qp, wqe->lpsn);
        if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
            cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               u32 s_last;
-
                trdma_clean_swqe(qp, wqe);
-               rvt_put_qp_swqe(qp, wqe);
                rvt_qp_wqe_unreserve(qp, wqe);
-               s_last = qp->s_last;
-               trace_hfi1_qp_send_completion(qp, wqe, s_last);
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               rvt_qp_swqe_complete(qp,
+               trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+               rvt_qp_complete_swqe(qp,
                                     wqe,
                                     ib_hfi1_wc_opcode[wqe->wr.opcode],
                                     IB_WC_SUCCESS);
@@ -3026,8 +3008,7 @@ send_last:
                wc.dlid_path_bits = 0;
                wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            ib_bth_is_solicited(ohdr));
+               rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
                break;
 
        case OP(RDMA_WRITE_ONLY):
index aa9c8d3ef87b6b40e1634bf580b5253d1b8ddb30..92acccaaaa86d66dd233cb8fee5ba8b8cf1537be 100644 (file)
@@ -475,7 +475,7 @@ static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
  * Must hold the qp s_lock and the exp_lock.
  *
  * Return:
- * false if either of the conditions below are statisfied:
+ * false if either of the conditions below are satisfied:
  * 1. The list is empty or
  * 2. The indicated qp is at the head of the list and the
  *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
@@ -2024,7 +2024,6 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
        trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
        if (e->opcode == TID_OP(READ_REQ)) {
                struct ib_reth *reth;
-               u32 offset;
                u32 len;
                u32 rkey;
                u64 vaddr;
@@ -2036,7 +2035,6 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
                 * The requester always restarts from the start of the original
                 * request.
                 */
-               offset = delta_psn(psn, e->psn) * qp->pmtu;
                len = be32_to_cpu(reth->length);
                if (psn != e->psn || len != req->total_len)
                        goto unlock;
@@ -4550,7 +4548,7 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
        struct rvt_swqe *wqe;
        struct tid_rdma_request *req;
        struct tid_rdma_flow *flow;
-       u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
+       u32 aeth, psn, req_psn, ack_psn, resync_psn, ack_kpsn;
        unsigned long flags;
        u16 fidx;
 
@@ -4754,7 +4752,6 @@ done:
                        IB_AETH_CREDIT_MASK) {
                case 0: /* PSN sequence error */
                        flow = &req->flows[req->acked_tail];
-                       fspsn = full_flow_psn(flow, flow->flow_state.spsn);
                        trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
                                                        flow);
                        req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
index d1372cc66de67769d4085397c8e7a61551471e9e..2f84290a88caff10c867f00b8bb5b6b414404a1d 100644 (file)
@@ -79,6 +79,8 @@ __print_symbolic(opcode,                                   \
        ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
        ib_opcode_name(RC_COMPARE_SWAP),                   \
        ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
        ib_opcode_name(TID_RDMA_WRITE_REQ),                \
        ib_opcode_name(TID_RDMA_WRITE_RESP),               \
        ib_opcode_name(TID_RDMA_WRITE_DATA),               \
index 4ed4fcfabd6c67f3d85f6491f752258db432ca62..0c77f18120edb3d8f230eaee49e1f933dd5aa122 100644 (file)
@@ -476,8 +476,7 @@ last_imm:
                wc.dlid_path_bits = 0;
                wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            ib_bth_is_solicited(ohdr));
+               rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
                break;
 
        case OP(RDMA_WRITE_FIRST):
index 4cb0fce5c096a6909fd05b2bb541c4bf37bf8157..e804af71b629d3887821af47cefb1d717919490c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        rcu_read_lock();
 
        qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-                           swqe->ud_wr.remote_qpn);
+                           rvt_get_swqe_remote_qpn(swqe));
        if (!qp) {
                ibp->rvp.n_pkt_drops++;
                rcu_read_unlock();
@@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
                goto drop;
        }
 
-       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(swqe);
        ppd = ppd_from_ibp(ibp);
 
        if (qp->ibqp.qp_num > 1) {
@@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        if (qp->ibqp.qp_num) {
                u32 qkey;
 
-               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-                       sqp->qkey : swqe->ud_wr.remote_qkey;
+               qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+                       sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
                if (unlikely(qkey != qp->qkey))
                        goto drop; /* silently drop per IBTA spec */
        }
@@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
                if (sqp->ibqp.qp_type == IB_QPT_GSI ||
                    sqp->ibqp.qp_type == IB_QPT_SMI)
-                       wc.pkey_index = swqe->ud_wr.pkey_index;
+                       wc.pkey_index = rvt_get_swqe_pkey_index(swqe);
                else
                        wc.pkey_index = sqp->s_pkey_index;
        } else {
@@ -255,8 +255,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
        wc.port_num = qp->port_num;
        /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    swqe->wr.send_flags & IB_SEND_SOLICITED);
+       rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
        ibp->rvp.n_loop_pkts++;
 bail_unlock:
        spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -283,20 +282,21 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe,
                bth0 |= IB_BTH_SOLICITED;
        bth0 |= extra_bytes << 20;
        if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
-               *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+               *pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
        else
                *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
        if (!bypass)
                bth0 |= *pkey;
        ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+       ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
        ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
        /*
         * Qkeys with the high order bit set mean use the
         * qkey from the QP context instead of the WR (see 10.2.5).
         */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-                                        qp->qkey : wqe->ud_wr.remote_qkey);
+       ohdr->u.ud.deth[0] =
+               cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+                           rvt_get_swqe_remote_qkey(wqe));
        ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 }
 
@@ -316,7 +316,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
        ibp = to_iport(qp->ibqp.device, qp->port_num);
        ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(wqe);
 
        extra_bytes = -wqe->length & 3;
        nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC;
@@ -380,7 +380,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
        struct hfi1_pportdata *ppd;
        struct hfi1_ibport *ibp;
        u32 dlid, slid, nwords, extra_bytes;
-       u32 dest_qp = wqe->ud_wr.remote_qpn;
+       u32 dest_qp = rvt_get_swqe_remote_qpn(wqe);
        u32 src_qp = qp->ibqp.qp_num;
        u16 len, pkey;
        u8 l4, sc5;
@@ -388,7 +388,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
        ibp = to_iport(qp->ibqp.device, qp->port_num);
        ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(wqe);
 
        /*
         * Build 16B Management Packet if either the destination
@@ -450,7 +450,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
        if (is_mgmt) {
                l4 = OPA_16B_L4_FM;
-               pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+               pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
                hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt,
                                 dest_qp, src_qp);
        } else {
@@ -515,7 +515,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
        /* Construct the header. */
        ibp = to_iport(qp->ibqp.device, qp->port_num);
        ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(wqe);
        priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
        if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) ||
            (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) {
@@ -1061,7 +1061,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
        wc.port_num = qp->port_num;
        /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited);
+       rvt_recv_cq(qp, &wc, solicited);
        return;
 
 drop:
index 02eee8eff1db138d69d1dc4c2b8c29f4b3a8b599..b89a9b9aef7ae71ca9cc4a076206f41aab91f418 100644 (file)
@@ -118,13 +118,10 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
                             size_t npages, bool dirty)
 {
-       size_t i;
-
-       for (i = 0; i < npages; i++) {
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
+       if (dirty)
+               put_user_pages_dirty_lock(p, npages);
+       else
+               put_user_pages(p, npages);
 
        if (mm) { /* during close after signal, mm can be NULL */
                atomic64_sub(npages, &mm->pinned_vm);
index bad3229bad3737d633a0861b3aa6e663442103a1..c4b243f50c76d660b18922035c4f51ec8b89e177 100644 (file)
@@ -1779,6 +1779,9 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 }
 
 static const struct ib_device_ops hfi1_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_HFI1,
+
        .alloc_hw_stats = alloc_hw_stats,
        .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
        .get_dev_fw_str = hfi1_get_dev_fw_str,
@@ -1829,7 +1832,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
         */
        if (!ib_hfi1_sys_image_guid)
                ib_hfi1_sys_image_guid = ibdev->node_guid;
-       ibdev->owner = THIS_MODULE;
        ibdev->phys_port_cnt = dd->num_pports;
        ibdev->dev.parent = &dd->pcidev->dev;
 
@@ -1923,7 +1925,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
        rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
                                    &ib_hfi1_attr_group);
 
-       ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
+       ret = rvt_register_device(&dd->verbs_dev.rdi);
        if (ret)
                goto err_verbs_txreq;
 
index 61cda7d00627d298312a53d1d2c7190fb3ac6e31..8bf847bcd8d3267d3581224d5a20d94126955d57 100644 (file)
@@ -8,25 +8,24 @@ config INFINIBAND_HNS
          is used in Hisilicon Hip06 and more further ICT SoC based on
          platform device.
 
-         To compile this driver as a module, choose M here: the module
-         will be called hns-roce.
+         To compile HIP06 or HIP08 driver as module, choose M here.
 
 config INFINIBAND_HNS_HIP06
-       tristate "Hisilicon Hip06 Family RoCE support"
+       bool "Hisilicon Hip06 Family RoCE support"
        depends on INFINIBAND_HNS && HNS && HNS_DSAF && HNS_ENET
        ---help---
          RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip06 and
          Hip07 SoC. These RoCE engines are platform devices.
 
-         To compile this driver as a module, choose M here: the module
-         will be called hns-roce-hw-v1.
+         To compile this driver, choose Y here: if INFINIBAND_HNS is m, this
+         module will be called hns-roce-hw-v1
 
 config INFINIBAND_HNS_HIP08
-       tristate "Hisilicon Hip08 Family RoCE support"
+       bool "Hisilicon Hip08 Family RoCE support"
        depends on INFINIBAND_HNS && PCI && HNS3
        ---help---
          RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC.
          The RoCE engine is a PCI device.
 
-         To compile this driver as a module, choose M here: the module
-         will be called hns-roce-hw-v2.
+         To compile this driver, choose Y here: if INFINIBAND_HNS is m, this
+         module will be called hns-roce-hw-v2.
index f22d9922cfee738e2694bbb022ca8b6300c7d3a4..e105945b94a11e4d1cd2e360e613335c846a3ea9 100644 (file)
@@ -5,11 +5,16 @@
 
 ccflags-y :=  -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
 
-obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o
 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
        hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
        hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o
-obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o
-hns-roce-hw-v1-objs := hns_roce_hw_v1.o
-obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
-hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o
+
+ifdef CONFIG_INFINIBAND_HNS_HIP06
+hns-roce-hw-v1-objs := hns_roce_hw_v1.o $(hns-roce-objs)
+obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v1.o
+endif
+
+ifdef CONFIG_INFINIBAND_HNS_HIP08
+hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o $(hns-roce-objs)
+obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v2.o
+endif
index dac058d3df5314b30e977355aedb5fe0ad27dd3f..8c063c598d2af648eba9c643dfb3e849e700a946 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/platform_device.h>
 #include <linux/vmalloc.h>
 #include "hns_roce_device.h"
+#include <rdma/ib_umem.h>
 
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj)
 {
@@ -67,7 +68,6 @@ void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj,
 {
        hns_roce_bitmap_free_range(bitmap, obj, 1, rr);
 }
-EXPORT_SYMBOL_GPL(hns_roce_bitmap_free);
 
 int hns_roce_bitmap_alloc_range(struct hns_roce_bitmap *bitmap, int cnt,
                                int align, unsigned long *obj)
@@ -174,7 +174,6 @@ void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
                kfree(buf->page_list);
        }
 }
-EXPORT_SYMBOL_GPL(hns_roce_buf_free);
 
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
                       struct hns_roce_buf *buf, u32 page_shift)
@@ -238,6 +237,104 @@ err_free:
        return -ENOMEM;
 }
 
+int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+                          int buf_cnt, int start, struct hns_roce_buf *buf)
+{
+       int i, end;
+       int total;
+
+       end = start + buf_cnt;
+       if (end > buf->npages) {
+               dev_err(hr_dev->dev,
+                       "invalid kmem region,offset %d,buf_cnt %d,total %d!\n",
+                       start, buf_cnt, buf->npages);
+               return -EINVAL;
+       }
+
+       total = 0;
+       for (i = start; i < end; i++)
+               if (buf->nbufs == 1)
+                       bufs[total++] = buf->direct.map +
+                                       ((dma_addr_t)i << buf->page_shift);
+               else
+                       bufs[total++] = buf->page_list[i].map;
+
+       return total;
+}
+
+int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+                          int buf_cnt, int start, struct ib_umem *umem,
+                          int page_shift)
+{
+       struct ib_block_iter biter;
+       int total = 0;
+       int idx = 0;
+       u64 addr;
+
+       if (page_shift < PAGE_SHIFT) {
+               dev_err(hr_dev->dev, "invalid page shift %d!\n", page_shift);
+               return -EINVAL;
+       }
+
+       /* convert system page cnt to hw page cnt */
+       rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
+                           1 << page_shift) {
+               addr = rdma_block_iter_dma_address(&biter);
+               if (idx >= start) {
+                       bufs[total++] = addr;
+                       if (total >= buf_cnt)
+                               goto done;
+               }
+               idx++;
+       }
+
+done:
+       return total;
+}
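
The umem path above leans on the core's block iterator: rdma_for_each_block() walks the umem scatterlist in aligned chunks of 1 << page_shift, so the loop emits one DMA address per hardware page even when hardware pages are larger than system pages. A minimal sketch of the same iterator follows, shown only to illustrate its shape; the helper name is hypothetical and not part of this patch.

/* Illustrative only: count how many hardware pages of size (1 << page_shift)
 * an ib_umem spans, using the same block iterator as hns_roce_get_umem_bufs().
 */
static int example_count_hw_pages(struct ib_umem *umem, int page_shift)
{
	struct ib_block_iter biter;
	int n = 0;

	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
			    1 << page_shift)
		n++;

	return n;
}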
+
+void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
+                             int offset, int buf_cnt)
+{
+       if (hopnum == HNS_ROCE_HOP_NUM_0)
+               region->hopnum = 0;
+       else
+               region->hopnum = hopnum;
+
+       region->offset = offset;
+       region->count = buf_cnt;
+}
+
+void hns_roce_free_buf_list(dma_addr_t **bufs, int region_cnt)
+{
+       int i;
+
+       for (i = 0; i < region_cnt; i++) {
+               kfree(bufs[i]);
+               bufs[i] = NULL;
+       }
+}
+
+int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
+                           dma_addr_t **bufs, int region_cnt)
+{
+       struct hns_roce_buf_region *r;
+       int i;
+
+       for (i = 0; i < region_cnt; i++) {
+               r = &regions[i];
+               bufs[i] = kcalloc(r->count, sizeof(dma_addr_t), GFP_KERNEL);
+               if (!bufs[i])
+                       goto err_alloc;
+       }
+
+       return 0;
+
+err_alloc:
+       hns_roce_free_buf_list(bufs, i);
+
+       return -ENOMEM;
+}
+
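
The region helpers added in this hunk are meant to be used as a small pipeline: describe each WQE region, allocate a temporary dma_addr_t array per region, fill it from a kernel buffer or a umem, then hand the arrays to the new mtr code. The sketch below strings them together in that order; the two-region SQ/RQ split, the page counts, and the function name are illustrative assumptions, not code from this patch.

/* Illustrative only: wire up two hypothetical WQE regions (SQ, RQ) and
 * collect their page DMA addresses with the helpers added above.
 */
static int example_map_wqe_regions(struct hns_roce_dev *hr_dev,
				   struct hns_roce_qp *hr_qp,
				   struct ib_umem *umem, int page_shift)
{
	struct hns_roce_buf_region *regions = hr_qp->regions;
	dma_addr_t *buf_list[HNS_ROCE_WQE_REGION_MAX] = { NULL };
	int sq_pages = 8, rq_pages = 8;	/* assumed region sizes in pages */
	int got, ret;

	/* region 0: SQ pages at offset 0, region 1: RQ pages right after */
	hns_roce_init_buf_region(&regions[0], hr_dev->caps.wqe_sq_hop_num,
				 0, sq_pages);
	hns_roce_init_buf_region(&regions[1], hr_dev->caps.wqe_rq_hop_num,
				 sq_pages, rq_pages);
	hr_qp->region_cnt = 2;

	ret = hns_roce_alloc_buf_list(regions, buf_list, hr_qp->region_cnt);
	if (ret)
		return ret;

	/* user QP: pull hw-page-sized DMA addresses out of the umem;
	 * a kernel QP would call hns_roce_get_kmem_bufs() instead.
	 */
	got = hns_roce_get_umem_bufs(hr_dev, buf_list[0], sq_pages, 0,
				     umem, page_shift);
	if (got < sq_pages)
		ret = -ENOBUFS;

	/* on success buf_list[] would now be handed to hns_roce_mtr_attach();
	 * the temporary page lists are freed either way.
	 */
	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
	return ret;
}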
 void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev)
 {
        if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
index 2acf946d02e577e3db8b6fbf3d365d756f5a84ed..0cd09bf4d7eaeb64c1df5229ccd43950bb6859ad 100644 (file)
@@ -103,7 +103,6 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status,
        context->out_param = out_param;
        complete(&context->done);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cmd_event);
 
 /* this should be called with "use_events" */
 static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,
@@ -162,7 +161,7 @@ static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,
                                  u64 out_param, unsigned long in_modifier,
                                  u8 op_modifier, u16 op, unsigned long timeout)
 {
-       int ret = 0;
+       int ret;
 
        down(&hr_dev->cmd.event_sem);
        ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
@@ -204,7 +203,6 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);
 
 int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
 {
@@ -291,7 +289,6 @@ struct hns_roce_cmd_mailbox
 
        return mailbox;
 }
-EXPORT_SYMBOL_GPL(hns_roce_alloc_cmd_mailbox);
 
 void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev,
                               struct hns_roce_cmd_mailbox *mailbox)
@@ -302,4 +299,3 @@ void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev,
        dma_pool_free(hr_dev->cmd.pool, mailbox->buf, mailbox->dma);
        kfree(mailbox);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_cmd_mailbox);
index 9caf35061721bee439c9adeeec2057dd24e3f76d..4e50c22a2da443e17a70697c92260d7518662a98 100644 (file)
@@ -205,7 +205,6 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
        hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
        hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_cq);
 
 static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
                                   struct ib_udata *udata,
@@ -235,8 +234,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
                                        &buf->hr_mtt);
        } else {
                ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem),
-                               (*umem)->page_shift,
-                               &buf->hr_mtt);
+                                       PAGE_SHIFT, &buf->hr_mtt);
        }
        if (ret)
                goto err_buf;
@@ -300,15 +298,15 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev,
                          &buf->hr_buf);
 }
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-                                   const struct ib_cq_init_attr *attr,
-                                   struct ib_udata *udata)
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+                         const struct ib_cq_init_attr *attr,
+                         struct ib_udata *udata)
 {
-       struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
+       struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
        struct device *dev = hr_dev->dev;
        struct hns_roce_ib_create_cq ucmd;
        struct hns_roce_ib_create_cq_resp resp = {};
-       struct hns_roce_cq *hr_cq = NULL;
+       struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
        struct hns_roce_uar *uar = NULL;
        int vector = attr->comp_vector;
        int cq_entries = attr->cqe;
@@ -319,13 +317,9 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
        if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
                dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
                        cq_entries, hr_dev->caps.max_cqes);
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
-       hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL);
-       if (!hr_cq)
-               return ERR_PTR(-ENOMEM);
-
        if (hr_dev->caps.min_cqes)
                cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
 
@@ -416,7 +410,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
                        goto err_cqc;
        }
 
-       return &hr_cq->ib_cq;
+       return 0;
 
 err_cqc:
        hns_roce_free_cq(hr_dev, hr_cq);
@@ -428,9 +422,8 @@ err_dbmap:
 
 err_mtt:
        hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-       if (udata)
-               ib_umem_release(hr_cq->umem);
-       else
+       ib_umem_release(hr_cq->umem);
+       if (!udata)
                hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
                                        hr_cq->ib_cq.cqe);
 
@@ -439,47 +432,37 @@ err_db:
                hns_roce_free_db(hr_dev, &hr_cq->db);
 
 err_cq:
-       kfree(hr_cq);
-       return ERR_PTR(ret);
+       return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
        struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
-       int ret = 0;
 
        if (hr_dev->hw->destroy_cq) {
-               ret = hr_dev->hw->destroy_cq(ib_cq, udata);
-       } else {
-               hns_roce_free_cq(hr_dev, hr_cq);
-               hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-
-               if (udata) {
-                       ib_umem_release(hr_cq->umem);
-
-                       if (hr_cq->db_en == 1)
-                               hns_roce_db_unmap_user(
-                                       rdma_udata_to_drv_context(
-                                               udata,
-                                               struct hns_roce_ucontext,
-                                               ibucontext),
-                                       &hr_cq->db);
-               } else {
-                       /* Free the buff of stored cq */
-                       hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
-                                               ib_cq->cqe);
-                       if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
-                               hns_roce_free_db(hr_dev, &hr_cq->db);
-               }
-
-               kfree(hr_cq);
+               hr_dev->hw->destroy_cq(ib_cq, udata);
+               return;
        }
 
-       return ret;
+       hns_roce_free_cq(hr_dev, hr_cq);
+       hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
+
+       ib_umem_release(hr_cq->umem);
+       if (udata) {
+               if (hr_cq->db_en == 1)
+                       hns_roce_db_unmap_user(rdma_udata_to_drv_context(
+                                                      udata,
+                                                      struct hns_roce_ucontext,
+                                                      ibucontext),
+                                              &hr_cq->db);
+       } else {
+               /* Free the buff of stored cq */
+               hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe);
+               if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+                       hns_roce_free_db(hr_dev, &hr_cq->db);
+       }
 }
-EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq);
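
The signature changes above track the RDMA core's move to caller-allocated CQ objects in this cycle: the core allocates struct hns_roce_cq before calling the driver, so create_cq now returns an int and destroy_cq returns void and no longer frees the object. A hedged excerpt of how a driver typically advertises its CQ size for that model is below; the ops table shown is illustrative and not part of this hunk (the actual registration lives in hns_roce_main.c).

/* Illustrative excerpt: with caller-allocated CQs the driver only reports its
 * per-CQ structure size; the core allocates the object before .create_cq and
 * frees it after .destroy_cq returns.
 */
static const struct ib_device_ops example_cq_ops = {
	.create_cq = hns_roce_ib_create_cq,	/* now int (*)(struct ib_cq *, ...) */
	.destroy_cq = hns_roce_ib_destroy_cq,	/* now void (*)(struct ib_cq *, ...) */
	INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq),
};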
 
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
 {
@@ -495,7 +478,6 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
        ++cq->arm_sn;
        cq->comp(cq);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cq_completion);
 
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
 {
@@ -517,7 +499,6 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
        if (atomic_dec_and_test(&cq->refcount))
                complete(&cq->free);
 }
-EXPORT_SYMBOL_GPL(hns_roce_cq_event);
 
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
 {
index 0c6c1fe87705c49f04d105a0129e452ad1fe94e6..627aa46ef683b8ec4467acc1404033f21822f627 100644 (file)
@@ -51,7 +51,6 @@ out:
 
        return ret;
 }
-EXPORT_SYMBOL(hns_roce_db_map_user);
 
 void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
                            struct hns_roce_db *db)
@@ -67,7 +66,6 @@ void hns_roce_db_unmap_user(struct hns_roce_ucontext *context,
 
        mutex_unlock(&context->page_mutex);
 }
-EXPORT_SYMBOL(hns_roce_db_unmap_user);
 
 static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir(
                                        struct device *dma_device)
@@ -78,7 +76,8 @@ static struct hns_roce_db_pgdir *hns_roce_alloc_db_pgdir(
        if (!pgdir)
                return NULL;
 
-       bitmap_fill(pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2);
+       bitmap_fill(pgdir->order1,
+                   HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
        pgdir->bits[0] = pgdir->order0;
        pgdir->bits[1] = pgdir->order1;
        pgdir->page = dma_alloc_coherent(dma_device, PAGE_SIZE,
@@ -116,7 +115,7 @@ found:
        db->u.pgdir     = pgdir;
        db->index       = i;
        db->db_record   = pgdir->page + db->index;
-       db->dma         = pgdir->db_dma  + db->index * 4;
+       db->dma         = pgdir->db_dma  + db->index * HNS_ROCE_DB_UNIT_SIZE;
        db->order       = order;
 
        return 0;
@@ -150,7 +149,6 @@ out:
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_alloc_db);
 
 void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
 {
@@ -170,7 +168,8 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
        i >>= o;
        set_bit(i, db->u.pgdir->bits[o]);
 
-       if (bitmap_full(db->u.pgdir->order1, HNS_ROCE_DB_PER_PAGE / 2)) {
+       if (bitmap_full(db->u.pgdir->order1,
+                       HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT)) {
                dma_free_coherent(hr_dev->dev, PAGE_SIZE, db->u.pgdir->page,
                                  db->u.pgdir->db_dma);
                list_del(&db->u.pgdir->list);
@@ -179,4 +178,3 @@ void hns_roce_free_db(struct hns_roce_dev *hr_dev, struct hns_roce_db *db)
 
        mutex_unlock(&hr_dev->pgdir_mutex);
 }
-EXPORT_SYMBOL_GPL(hns_roce_free_db);
index 563cf39df6d56df3d73d238b8dc560ecccbdaa33..a548b28aab639ae09536982c3f875acf4b9faefa 100644 (file)
 
 #define DRV_NAME "hns_roce"
 
+/* hip08 is a pci device, and it includes two versions according to its pci revision id */
+#define PCI_REVISION_ID_HIP08_A                        0x20
+#define PCI_REVISION_ID_HIP08_B                        0x21
+
 #define HNS_ROCE_HW_VER1       ('h' << 24 | 'i' << 16 | '0' << 8 | '6')
 
-#define MAC_ADDR_OCTET_NUM                     6
 #define HNS_ROCE_MAX_MSG_LEN                   0x80000000
 
 #define HNS_ROCE_ALOGN_UP(a, b) ((((a) + (b) - 1) / (b)) * (b))
 
 #define HNS_ROCE_BA_SIZE                       (32 * 4096)
 
+#define BA_BYTE_LEN                            8
+
+#define BITS_PER_BYTE                          8
+
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MIN_CQE_NUM                   0x40
 #define HNS_ROCE_MIN_WQE_NUM                   0x20
@@ -55,6 +62,7 @@
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MAX_INNER_MTPT_NUM            0x7
 #define HNS_ROCE_MAX_MTPT_PBL_NUM              0x100000
+#define HNS_ROCE_MAX_SGE_NUM                   2
 
 #define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS       20
 #define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT  \
@@ -64,6 +72,9 @@
 
 #define HNS_ROCE_MAX_IRQ_NUM                   128
 
+#define HNS_ROCE_SGE_IN_WQE                    2
+#define HNS_ROCE_SGE_SHIFT                     4
+
 #define EQ_ENABLE                              1
 #define EQ_DISABLE                             0
 
@@ -81,6 +92,7 @@
 #define HNS_ROCE_MAX_PORTS                     6
 #define HNS_ROCE_MAX_GID_NUM                   16
 #define HNS_ROCE_GID_SIZE                      16
+#define HNS_ROCE_SGE_SIZE                      16
 
 #define HNS_ROCE_HOP_NUM_0                     0xff
 
 #define PAGES_SHIFT_24                         24
 #define PAGES_SHIFT_32                         32
 
+#define HNS_ROCE_PCI_BAR_NUM                   2
+
 #define HNS_ROCE_IDX_QUE_ENTRY_SZ              4
 #define SRQ_DB_REG                             0x230
 
@@ -213,6 +227,9 @@ enum hns_roce_mtt_type {
        MTT_TYPE_IDX
 };
 
+#define HNS_ROCE_DB_TYPE_COUNT                 2
+#define HNS_ROCE_DB_UNIT_SIZE                  4
+
 enum {
        HNS_ROCE_DB_PER_PAGE = PAGE_SIZE / 4
 };
@@ -324,6 +341,29 @@ struct hns_roce_mtt {
        enum hns_roce_mtt_type  mtt_type;
 };
 
+struct hns_roce_buf_region {
+       int offset; /* page offset */
+       u32 count; /* page count */
+       int hopnum; /* addressing hop num */
+};
+
+#define HNS_ROCE_MAX_BT_REGION 3
+#define HNS_ROCE_MAX_BT_LEVEL  3
+struct hns_roce_hem_list {
+       struct list_head root_bt;
+       /* link all bt dma mem by hop config */
+       struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL];
+       struct list_head btm_bt; /* link all bottom bt in @mid_bt */
+       dma_addr_t root_ba; /* pointer to the root ba table */
+       int bt_pg_shift;
+};
+
+/* memory translate region */
+struct hns_roce_mtr {
+       struct hns_roce_hem_list hem_list;
+       int buf_pg_shift;
+};
+
 struct hns_roce_mw {
        struct ib_mw            ibmw;
        u32                     pdn;
@@ -413,8 +453,8 @@ struct hns_roce_buf {
 struct hns_roce_db_pgdir {
        struct list_head        list;
        DECLARE_BITMAP(order0, HNS_ROCE_DB_PER_PAGE);
-       DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / 2);
-       unsigned long           *bits[2];
+       DECLARE_BITMAP(order1, HNS_ROCE_DB_PER_PAGE / HNS_ROCE_DB_TYPE_COUNT);
+       unsigned long           *bits[HNS_ROCE_DB_TYPE_COUNT];
        u32                     *page;
        dma_addr_t              db_dma;
 };
@@ -472,7 +512,7 @@ struct hns_roce_idx_que {
        u32                             buf_size;
        struct ib_umem                  *umem;
        struct hns_roce_mtt             mtt;
-       u64                             *bitmap;
+       unsigned long                   *bitmap;
 };
 
 struct hns_roce_srq {
@@ -535,7 +575,7 @@ struct hns_roce_av {
        u8          hop_limit;
        __le32      sl_tclass_flowlabel;
        u8          dgid[HNS_ROCE_GID_SIZE];
-       u8          mac[6];
+       u8          mac[ETH_ALEN];
        __le16      vlan;
        bool        vlan_en;
 };
@@ -620,6 +660,14 @@ struct hns_roce_qp {
 
        struct ib_umem          *umem;
        struct hns_roce_mtt     mtt;
+       struct hns_roce_mtr     mtr;
+
+       /* this define must be less than HNS_ROCE_MAX_BT_REGION */
+#define HNS_ROCE_WQE_REGION_MAX         3
+       struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX];
+       int                     region_cnt;
+       int                     wqe_bt_pg_shift;
+
        u32                     buff_size;
        struct mutex            mutex;
        u8                      port;
@@ -830,6 +878,9 @@ struct hns_roce_caps {
        u32             mtt_ba_pg_sz;
        u32             mtt_buf_pg_sz;
        u32             mtt_hop_num;
+       u32             wqe_sq_hop_num;
+       u32             wqe_sge_hop_num;
+       u32             wqe_rq_hop_num;
        u32             sccc_ba_pg_sz;
        u32             sccc_buf_pg_sz;
        u32             sccc_hop_num;
@@ -921,7 +972,7 @@ struct hns_roce_hw {
        int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
        int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
                        struct ib_udata *udata);
-       int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
+       void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
        int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
        int (*init_eq)(struct hns_roce_dev *hr_dev);
        void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
@@ -940,6 +991,16 @@ struct hns_roce_hw {
        const struct ib_device_ops *hns_roce_dev_srq_ops;
 };
 
+enum hns_phy_state {
+       HNS_ROCE_PHY_SLEEP              = 1,
+       HNS_ROCE_PHY_POLLING            = 2,
+       HNS_ROCE_PHY_DISABLED           = 3,
+       HNS_ROCE_PHY_TRAINING           = 4,
+       HNS_ROCE_PHY_LINKUP             = 5,
+       HNS_ROCE_PHY_LINKERR            = 6,
+       HNS_ROCE_PHY_TEST               = 7
+};
+
 struct hns_roce_dev {
        struct ib_device        ib_dev;
        struct platform_device  *pdev;
@@ -962,7 +1023,7 @@ struct hns_roce_dev {
        struct hns_roce_caps    caps;
        struct xarray           qp_table_xa;
 
-       unsigned char   dev_addr[HNS_ROCE_MAX_PORTS][MAC_ADDR_OCTET_NUM];
+       unsigned char   dev_addr[HNS_ROCE_MAX_PORTS][ETH_ALEN];
        u64                     sys_image_guid;
        u32                     vendor_id;
        u32                     vendor_part_id;
@@ -1084,6 +1145,19 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev,
 int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev,
                           struct hns_roce_mtt *mtt, struct hns_roce_buf *buf);
 
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+                      int buf_pg_shift);
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+                       dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+                       int region_cnt);
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+                         struct hns_roce_mtr *mtr);
+
+/* hns roce hw needs the current block and the next block addr from mtt */
+#define MTT_MIN_COUNT   2
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+                     int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr);
+
 int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev);
@@ -1148,6 +1222,18 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
                               struct hns_roce_mtt *mtt, struct ib_umem *umem);
 
+void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
+                             int offset, int buf_cnt);
+int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
+                           dma_addr_t **bufs, int count);
+void hns_roce_free_buf_list(dma_addr_t **bufs, int count);
+
+int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+                          int buf_cnt, int start, struct hns_roce_buf *buf);
+int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
+                          int buf_cnt, int start, struct ib_umem *umem,
+                          int page_shift);
+
 int hns_roce_create_srq(struct ib_srq *srq,
                        struct ib_srq_init_attr *srq_init_attr,
                        struct ib_udata *udata);
@@ -1178,11 +1264,11 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
 __be32 send_ieth(const struct ib_send_wr *wr);
 int to_hr_qp_type(int qp_type);
 
-struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
-                                   const struct ib_cq_init_attr *attr,
-                                   struct ib_udata *udata);
+int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
+                         const struct ib_cq_init_attr *attr,
+                         struct ib_udata *udata);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
 int hns_roce_db_map_user(struct hns_roce_ucontext *context,
index 8e29dbb5b5fbc3bd883384915e76155201572848..f4da5bd2884fd14f7994abb50887a43d54dd3531 100644 (file)
@@ -56,7 +56,6 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 
        return false;
 }
-EXPORT_SYMBOL_GPL(hns_roce_check_whether_mhop);
 
 static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx,
                            u32 bt_chunk_num)
@@ -165,7 +164,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
                                             + PAGE_SHIFT);
                mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz
                                             + PAGE_SHIFT);
-               mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+               mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
                mhop->hop_num = hr_dev->caps.mtt_hop_num;
                break;
        case HEM_TYPE_CQE:
@@ -173,7 +172,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
                                             + PAGE_SHIFT);
                mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz
                                             + PAGE_SHIFT);
-               mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+               mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
                mhop->hop_num = hr_dev->caps.cqe_hop_num;
                break;
        case HEM_TYPE_SRQWQE:
@@ -181,7 +180,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
                                            + PAGE_SHIFT);
                mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
                                            + PAGE_SHIFT);
-               mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+               mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
                mhop->hop_num = hr_dev->caps.srqwqe_hop_num;
                break;
        case HEM_TYPE_IDX:
@@ -189,7 +188,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
                                       + PAGE_SHIFT);
                mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
                                       + PAGE_SHIFT);
-               mhop->ba_l0_num = mhop->bt_chunk_size / 8;
+               mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
                mhop->hop_num = hr_dev->caps.idx_hop_num;
                break;
        default:
@@ -206,7 +205,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
         * MTT/CQE alloc hem for bt pages.
         */
        bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num);
-       chunk_ba_num = mhop->bt_chunk_size / 8;
+       chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
        chunk_size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size :
                              mhop->bt_chunk_size;
        table_idx = (*obj & (table->num_obj - 1)) /
@@ -234,7 +233,6 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(hns_roce_calc_hem_mhop);
 
 static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
                                               int npages,
@@ -376,18 +374,19 @@ static int hns_roce_set_hem(struct hns_roce_dev *hr_dev,
 
                bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
 
-               end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
-               while (1) {
-                       if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
-                               if (!(time_before(jiffies, end))) {
-                                       dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
-                                       spin_unlock_irqrestore(lock, flags);
-                                       return -EBUSY;
-                               }
-                       } else {
+               end = HW_SYNC_TIMEOUT_MSECS;
+               while (end) {
+                       if (!(readl(bt_cmd) >> BT_CMD_SYNC_SHIFT))
                                break;
-                       }
+
                        mdelay(HW_SYNC_SLEEP_TIME_INTERVAL);
+                       end -= HW_SYNC_SLEEP_TIME_INTERVAL;
+               }
+
+               if (end <= 0) {
+                       dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
+                       spin_unlock_irqrestore(lock, flags);
+                       return -EBUSY;
                }
 
                bt_cmd_l = (u32)bt_ba;
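
The rework above drops the jiffies deadline in favour of a fixed countdown of mdelay() steps, presumably because this poll runs in atomic context (the caller holds a spinlock with interrupts disabled), where sleeping is not allowed and busy-waiting against the tick is awkward; the matching msleep()-to-mdelay() change in hns_roce_hw_v1.c further down follows the same pattern. A generic, self-contained version of the countdown poll might look like this; the helper and its parameters are assumptions for illustration, not driver code.

/* Hedged sketch of the countdown-poll pattern: poll a busy bit with a bounded
 * number of fixed delays instead of comparing against jiffies.
 */
static int example_wait_ready(void __iomem *reg, u32 busy_shift,
			      unsigned int timeout_ms, unsigned int step_ms)
{
	long remaining = timeout_ms;

	while (remaining > 0) {
		if (!(readl(reg) >> busy_shift))
			return 0;	/* hardware finished the previous command */
		mdelay(step_ms);	/* busy-wait: safe in atomic context */
		remaining -= step_ms;
	}

	return -EBUSY;
}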
@@ -435,7 +434,7 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
        buf_chunk_size = mhop.buf_chunk_size;
        bt_chunk_size = mhop.bt_chunk_size;
        hop_num = mhop.hop_num;
-       chunk_ba_num = bt_chunk_size / 8;
+       chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
 
        bt_num = hns_roce_get_bt_num(table->type, hop_num);
        switch (bt_num) {
@@ -620,7 +619,6 @@ out:
        mutex_unlock(&table->mutex);
        return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_table_get);
 
 static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
                                    struct hns_roce_hem_table *table,
@@ -645,7 +643,7 @@ static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
 
        bt_chunk_size = mhop.bt_chunk_size;
        hop_num = mhop.hop_num;
-       chunk_ba_num = bt_chunk_size / 8;
+       chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
 
        bt_num = hns_roce_get_bt_num(table->type, hop_num);
        switch (bt_num) {
@@ -763,7 +761,6 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,
 
        mutex_unlock(&table->mutex);
 }
-EXPORT_SYMBOL_GPL(hns_roce_table_put);
 
 void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
                          struct hns_roce_hem_table *table,
@@ -799,7 +796,7 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
                i = mhop.l0_idx;
                j = mhop.l1_idx;
                if (mhop.hop_num == 2)
-                       hem_idx = i * (mhop.bt_chunk_size / 8) + j;
+                       hem_idx = i * (mhop.bt_chunk_size / BA_BYTE_LEN) + j;
                else if (mhop.hop_num == 1 ||
                         mhop.hop_num == HNS_ROCE_HOP_NUM_0)
                        hem_idx = i;
@@ -836,7 +833,6 @@ out:
        mutex_unlock(&table->mutex);
        return addr;
 }
-EXPORT_SYMBOL_GPL(hns_roce_table_find);
 
 int hns_roce_table_get_range(struct hns_roce_dev *hr_dev,
                             struct hns_roce_hem_table *table,
@@ -999,7 +995,7 @@ int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
                }
                obj_per_chunk = buf_chunk_size / obj_size;
                num_hem = (nobj + obj_per_chunk - 1) / obj_per_chunk;
-               bt_chunk_num = bt_chunk_size / 8;
+               bt_chunk_num = bt_chunk_size / BA_BYTE_LEN;
                if (type >= HEM_TYPE_MTT)
                        num_bt_l0 = bt_chunk_num;
 
@@ -1156,3 +1152,463 @@ void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev)
                                           &hr_dev->mr_table.mtt_cqe_table);
        hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table);
 }
+
+struct roce_hem_item {
+       struct list_head list; /* link all hems in the same bt level */
+       struct list_head sibling; /* link all hems in last hop for mtt */
+       void *addr;
+       dma_addr_t dma_addr;
+       size_t count; /* max ba numbers */
+       int start; /* start buf offset in this hem */
+       int end; /* end buf offset in this hem */
+};
+
+static struct roce_hem_item *hem_list_alloc_item(struct hns_roce_dev *hr_dev,
+                                                  int start, int end,
+                                                  int count, bool exist_bt,
+                                                  int bt_level)
+{
+       struct roce_hem_item *hem;
+
+       hem = kzalloc(sizeof(*hem), GFP_KERNEL);
+       if (!hem)
+               return NULL;
+
+       if (exist_bt) {
+               hem->addr = dma_alloc_coherent(hr_dev->dev,
+                                                  count * BA_BYTE_LEN,
+                                                  &hem->dma_addr, GFP_KERNEL);
+               if (!hem->addr) {
+                       kfree(hem);
+                       return NULL;
+               }
+       }
+
+       hem->count = count;
+       hem->start = start;
+       hem->end = end;
+       INIT_LIST_HEAD(&hem->list);
+       INIT_LIST_HEAD(&hem->sibling);
+
+       return hem;
+}
+
+static void hem_list_free_item(struct hns_roce_dev *hr_dev,
+                              struct roce_hem_item *hem, bool exist_bt)
+{
+       if (exist_bt)
+               dma_free_coherent(hr_dev->dev, hem->count * BA_BYTE_LEN,
+                                 hem->addr, hem->dma_addr);
+       kfree(hem);
+}
+
+static void hem_list_free_all(struct hns_roce_dev *hr_dev,
+                             struct list_head *head, bool exist_bt)
+{
+       struct roce_hem_item *hem, *temp_hem;
+
+       list_for_each_entry_safe(hem, temp_hem, head, list) {
+               list_del(&hem->list);
+               hem_list_free_item(hr_dev, hem, exist_bt);
+       }
+}
+
+static void hem_list_link_bt(struct hns_roce_dev *hr_dev, void *base_addr,
+                            u64 table_addr)
+{
+       *(u64 *)(base_addr) = table_addr;
+}
+
+/* assign L0 table address to hem from root bt */
+static void hem_list_assign_bt(struct hns_roce_dev *hr_dev,
+                              struct roce_hem_item *hem, void *cpu_addr,
+                              u64 phy_addr)
+{
+       hem->addr = cpu_addr;
+       hem->dma_addr = (dma_addr_t)phy_addr;
+}
+
+static inline bool hem_list_page_is_in_range(struct roce_hem_item *hem,
+                                            int offset)
+{
+       return (hem->start <= offset && offset <= hem->end);
+}
+
+static struct roce_hem_item *hem_list_search_item(struct list_head *ba_list,
+                                                   int page_offset)
+{
+       struct roce_hem_item *hem, *temp_hem;
+       struct roce_hem_item *found = NULL;
+
+       list_for_each_entry_safe(hem, temp_hem, ba_list, list) {
+               if (hem_list_page_is_in_range(hem, page_offset)) {
+                       found = hem;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+static bool hem_list_is_bottom_bt(int hopnum, int bt_level)
+{
+       /*
+        * hopnum    base address table levels
+        * 0            L0(buf)
+        * 1            L0 -> buf
+        * 2            L0 -> L1 -> buf
+        * 3            L0 -> L1 -> L2 -> buf
+        */
+       return bt_level >= (hopnum ? hopnum - 1 : hopnum);
+}
+
+/**
+ * calc the number of base address entries
+ * @hopnum: num of multihop addressing
+ * @bt_level: base address table level
+ * @unit: ba entries per bt page
+ */
+static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit)
+{
+       u32 step;
+       int max;
+       int i;
+
+       if (hopnum <= bt_level)
+               return 0;
+       /*
+        * hopnum  bt_level   range
+        * 1          0       unit
+        * ------------
+        * 2          0       unit * unit
+        * 2          1       unit
+        * ------------
+        * 3          0       unit * unit * unit
+        * 3          1       unit * unit
+        * 3          2       unit
+        */
+       step = 1;
+       max = hopnum - bt_level;
+       for (i = 0; i < max; i++)
+               step = step * unit;
+
+       return step;
+}
+
+/**
+ * calc the root ba entries which could cover all regions
+ * @regions: buf region array
+ * @region_cnt: array size of @regions
+ * @unit: ba entries per bt page
+ */
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+                                  int region_cnt, int unit)
+{
+       struct hns_roce_buf_region *r;
+       int total = 0;
+       int step;
+       int i;
+
+       for (i = 0; i < region_cnt; i++) {
+               r = (struct hns_roce_buf_region *)&regions[i];
+               if (r->hopnum > 1) {
+                       step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+                       if (step > 0)
+                               total += (r->count + step - 1) / step;
+               } else {
+                       total += r->count;
+               }
+       }
+
+       return total;
+}
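
Both helpers above reduce to simple arithmetic: one BT page at bt_level covers unit^(hopnum - bt_level) buffer pages, and the root table then needs roughly count / unit^(hopnum - 1) entries per multi-hop region (or count entries when hopnum is 0 or 1). The standalone user-space check below reproduces those numbers for an assumed 4 KB BT page and 8-byte BAs; it is a worked example, not driver code, and the region sizes are hypothetical.

/* Standalone arithmetic check (user-space C): unit = 4096 / 8 = 512 BAs per BT page */
#include <stdio.h>

static unsigned int ba_range(int hopnum, int bt_level, int unit)
{
	unsigned int step = 1;

	if (hopnum <= bt_level)
		return 0;
	while (bt_level++ < hopnum)
		step *= unit;
	return step;
}

int main(void)
{
	int unit = 4096 / 8;	/* BAs per BT page */
	int sq_pages = 65536;	/* hypothetical SQ region, hopnum 2 */
	int rq_pages = 65536;	/* hypothetical RQ region, hopnum 2 */
	int root;

	/* buffer pages covered by one bottom (L1) BT page */
	printf("hop2 L1 covers %u pages\n", ba_range(2, 1, unit));	/* 512 */
	printf("hop3 L1 covers %u pages\n", ba_range(3, 1, unit));	/* 262144 */

	/* the root table needs ceil(count / range) entries per 2-hop region */
	root = (sq_pages + ba_range(2, 1, unit) - 1) / ba_range(2, 1, unit) +
	       (rq_pages + ba_range(2, 1, unit) - 1) / ba_range(2, 1, unit);
	printf("root BA entries for SQ+RQ: %d\n", root);		/* 256 */
	return 0;
}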
+
+static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev,
+                                const struct hns_roce_buf_region *r, int unit,
+                                int offset, struct list_head *mid_bt,
+                                struct list_head *btm_bt)
+{
+       struct roce_hem_item *hem_ptrs[HNS_ROCE_MAX_BT_LEVEL] = { NULL };
+       struct list_head temp_list[HNS_ROCE_MAX_BT_LEVEL];
+       struct roce_hem_item *cur, *pre;
+       const int hopnum = r->hopnum;
+       int start_aligned;
+       int distance;
+       int ret = 0;
+       int max_ofs;
+       int level;
+       u32 step;
+       int end;
+
+       if (hopnum <= 1)
+               return 0;
+
+       if (hopnum > HNS_ROCE_MAX_BT_LEVEL) {
+               dev_err(hr_dev->dev, "invalid hopnum %d!\n", hopnum);
+               return -EINVAL;
+       }
+
+       if (offset < r->offset) {
+               dev_err(hr_dev->dev, "invalid offset %d,min %d!\n",
+                       offset, r->offset);
+               return -EINVAL;
+       }
+
+       distance = offset - r->offset;
+       max_ofs = r->offset + r->count - 1;
+       for (level = 0; level < hopnum; level++)
+               INIT_LIST_HEAD(&temp_list[level]);
+
+       /* configure the L1 bt through the bottom bt and link each to its parent */
+       for (level = 1; level < hopnum; level++) {
+               cur = hem_list_search_item(&mid_bt[level], offset);
+               if (cur) {
+                       hem_ptrs[level] = cur;
+                       continue;
+               }
+
+               step = hem_list_calc_ba_range(hopnum, level, unit);
+               if (step < 1) {
+                       ret = -EINVAL;
+                       goto err_exit;
+               }
+
+               start_aligned = (distance / step) * step + r->offset;
+               end = min_t(int, start_aligned + step - 1, max_ofs);
+               cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit,
+                                         true, level);
+               if (!cur) {
+                       ret = -ENOMEM;
+                       goto err_exit;
+               }
+               hem_ptrs[level] = cur;
+               list_add(&cur->list, &temp_list[level]);
+               if (hem_list_is_bottom_bt(hopnum, level))
+                       list_add(&cur->sibling, &temp_list[0]);
+
+               /* link bt to parent bt */
+               if (level > 1) {
+                       pre = hem_ptrs[level - 1];
+                       step = (cur->start - pre->start) / step * BA_BYTE_LEN;
+                       hem_list_link_bt(hr_dev, pre->addr + step,
+                                        cur->dma_addr);
+               }
+       }
+
+       list_splice(&temp_list[0], btm_bt);
+       for (level = 1; level < hopnum; level++)
+               list_splice(&temp_list[level], &mid_bt[level]);
+
+       return 0;
+
+err_exit:
+       for (level = 1; level < hopnum; level++)
+               hem_list_free_all(hr_dev, &temp_list[level], true);
+
+       return ret;
+}
+
+static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
+                                 struct hns_roce_hem_list *hem_list, int unit,
+                                 const struct hns_roce_buf_region *regions,
+                                 int region_cnt)
+{
+       struct roce_hem_item *hem, *temp_hem, *root_hem;
+       struct list_head temp_list[HNS_ROCE_MAX_BT_REGION];
+       const struct hns_roce_buf_region *r;
+       struct list_head temp_root;
+       struct list_head temp_btm;
+       void *cpu_base;
+       u64 phy_base;
+       int ret = 0;
+       int offset;
+       int total;
+       int step;
+       int i;
+
+       r = &regions[0];
+       root_hem = hem_list_search_item(&hem_list->root_bt, r->offset);
+       if (root_hem)
+               return 0;
+
+       INIT_LIST_HEAD(&temp_root);
+       total = r->offset;
+       /* point to the last region */
+       r = &regions[region_cnt - 1];
+       root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1,
+                                      unit, true, 0);
+       if (!root_hem)
+               return -ENOMEM;
+       list_add(&root_hem->list, &temp_root);
+
+       hem_list->root_ba = root_hem->dma_addr;
+
+       INIT_LIST_HEAD(&temp_btm);
+       for (i = 0; i < region_cnt; i++)
+               INIT_LIST_HEAD(&temp_list[i]);
+
+       total = 0;
+       for (i = 0; i < region_cnt && total < unit; i++) {
+               r = &regions[i];
+               if (!r->count)
+                       continue;
+
+               /* all regions' mid[x][0] share the root_bt's trunk */
+               cpu_base = root_hem->addr + total * BA_BYTE_LEN;
+               phy_base = root_hem->dma_addr + total * BA_BYTE_LEN;
+
+               /* if hopnum is 0 or 1, cut a new fake hem from the root bt
+                * whose address is shared with all regions.
+                */
+               if (hem_list_is_bottom_bt(r->hopnum, 0)) {
+                       hem = hem_list_alloc_item(hr_dev, r->offset,
+                                                 r->offset + r->count - 1,
+                                                 r->count, false, 0);
+                       if (!hem) {
+                               ret = -ENOMEM;
+                               goto err_exit;
+                       }
+                       hem_list_assign_bt(hr_dev, hem, cpu_base, phy_base);
+                       list_add(&hem->list, &temp_list[i]);
+                       list_add(&hem->sibling, &temp_btm);
+                       total += r->count;
+               } else {
+                       step = hem_list_calc_ba_range(r->hopnum, 1, unit);
+                       if (step < 1) {
+                               ret = -EINVAL;
+                               goto err_exit;
+                       }
+                       /* if a mid bt exists, link L1 to L0 */
+                       list_for_each_entry_safe(hem, temp_hem,
+                                         &hem_list->mid_bt[i][1], list) {
+                               offset = hem->start / step * BA_BYTE_LEN;
+                               hem_list_link_bt(hr_dev, cpu_base + offset,
+                                                hem->dma_addr);
+                               total++;
+                       }
+               }
+       }
+
+       list_splice(&temp_btm, &hem_list->btm_bt);
+       list_splice(&temp_root, &hem_list->root_bt);
+       for (i = 0; i < region_cnt; i++)
+               list_splice(&temp_list[i], &hem_list->mid_bt[i][0]);
+
+       return 0;
+
+err_exit:
+       for (i = 0; i < region_cnt; i++)
+               hem_list_free_all(hr_dev, &temp_list[i], false);
+
+       hem_list_free_all(hr_dev, &temp_root, true);
+
+       return ret;
+}
+
+/* construct the base address tables and link them by the addressing hop config */
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+                             struct hns_roce_hem_list *hem_list,
+                             const struct hns_roce_buf_region *regions,
+                             int region_cnt)
+{
+       const struct hns_roce_buf_region *r;
+       int ofs, end;
+       int ret = 0;
+       int unit;
+       int i;
+
+       if (region_cnt > HNS_ROCE_MAX_BT_REGION) {
+               dev_err(hr_dev->dev, "invalid region region_cnt %d!\n",
+                       region_cnt);
+               return -EINVAL;
+       }
+
+       unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN;
+       for (i = 0; i < region_cnt; i++) {
+               r = &regions[i];
+               if (!r->count)
+                       continue;
+
+               end = r->offset + r->count;
+               for (ofs = r->offset; ofs < end; ofs += unit) {
+                       ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs,
+                                                   hem_list->mid_bt[i],
+                                                   &hem_list->btm_bt);
+                       if (ret) {
+                               dev_err(hr_dev->dev,
+                                       "alloc hem trunk fail ret=%d!\n", ret);
+                               goto err_alloc;
+                       }
+               }
+       }
+
+       ret = hem_list_alloc_root_bt(hr_dev, hem_list, unit, regions,
+                                    region_cnt);
+       if (ret)
+               dev_err(hr_dev->dev, "alloc hem root fail ret=%d!\n", ret);
+       else
+               return 0;
+
+err_alloc:
+       hns_roce_hem_list_release(hr_dev, hem_list);
+
+       return ret;
+}
+
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+                              struct hns_roce_hem_list *hem_list)
+{
+       int i, j;
+
+       for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+               for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+                       hem_list_free_all(hr_dev, &hem_list->mid_bt[i][j],
+                                         j != 0);
+
+       hem_list_free_all(hr_dev, &hem_list->root_bt, true);
+       INIT_LIST_HEAD(&hem_list->btm_bt);
+       hem_list->root_ba = 0;
+}
+
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+                           int bt_page_order)
+{
+       int i, j;
+
+       INIT_LIST_HEAD(&hem_list->root_bt);
+       INIT_LIST_HEAD(&hem_list->btm_bt);
+       for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
+               for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
+                       INIT_LIST_HEAD(&hem_list->mid_bt[i][j]);
+
+       hem_list->bt_pg_shift = bt_page_order;
+}
+
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+                                struct hns_roce_hem_list *hem_list,
+                                int offset, int *mtt_cnt, u64 *phy_addr)
+{
+       struct list_head *head = &hem_list->btm_bt;
+       struct roce_hem_item *hem, *temp_hem;
+       void *cpu_base = NULL;
+       u64 phy_base = 0;
+       int nr = 0;
+
+       list_for_each_entry_safe(hem, temp_hem, head, sibling) {
+               if (hem_list_page_is_in_range(hem, offset)) {
+                       nr = offset - hem->start;
+                       cpu_base = hem->addr + nr * BA_BYTE_LEN;
+                       phy_base = hem->dma_addr + nr * BA_BYTE_LEN;
+                       nr = hem->end + 1 - offset;
+                       break;
+               }
+       }
+
+       if (mtt_cnt)
+               *mtt_cnt = nr;
+
+       if (phy_addr)
+               *phy_addr = phy_base;
+
+       return cpu_base;
+}
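
Taken together, the new hem_list functions are the multi-hop base-address-table API that the mtr layer builds on: initialise with a BT page order, request tables for a set of buffer regions, look up MTT slots by buffer-page offset, and release everything on teardown. A minimal sketch of that lifecycle follows, using only functions declared in this patch; the wrapper name and the trimmed error handling are illustrative.

/* Illustrative lifecycle of the hem_list API added above. */
static int example_hem_list_lifecycle(struct hns_roce_dev *hr_dev,
				      struct hns_roce_buf_region *regions,
				      int region_cnt, int bt_pg_shift)
{
	struct hns_roce_hem_list hem_list;
	u64 first_ba = 0;
	int mtt_cnt = 0;
	int ret;

	hns_roce_hem_list_init(&hem_list, bt_pg_shift);

	/* build the root/mid/bottom BTs for every region in one go */
	ret = hns_roce_hem_list_request(hr_dev, &hem_list, regions, region_cnt);
	if (ret)
		return ret;

	/* look up the MTT slot for buffer page 0 of the first region */
	if (!hns_roce_hem_list_find_mtt(hr_dev, &hem_list, regions[0].offset,
					&mtt_cnt, &first_ba))
		ret = -ENOENT;

	hns_roce_hem_list_release(hr_dev, &hem_list);
	return ret;
}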
index d9d668992e4944d800dfee47fd9c2bbf0e831824..f1ccb8f35fe59f44a202056a1c0ff730f67799c2 100644 (file)
@@ -34,8 +34,8 @@
 #ifndef _HNS_ROCE_HEM_H
 #define _HNS_ROCE_HEM_H
 
-#define HW_SYNC_TIMEOUT_MSECS          500
 #define HW_SYNC_SLEEP_TIME_INTERVAL    20
+#define HW_SYNC_TIMEOUT_MSECS           (25 * HW_SYNC_SLEEP_TIME_INTERVAL)
 #define BT_CMD_SYNC_SHIFT              31
 
 enum {
@@ -133,6 +133,20 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
                           struct hns_roce_hem_mhop *mhop);
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type);
 
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
+                           int bt_page_order);
+int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
+                                  int region_cnt, int unit);
+int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
+                             struct hns_roce_hem_list *hem_list,
+                             const struct hns_roce_buf_region *regions,
+                             int region_cnt);
+void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
+                              struct hns_roce_hem_list *hem_list);
+void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
+                                struct hns_roce_hem_list *hem_list,
+                                int offset, int *mtt_cnt, u64 *phy_addr);
+
 static inline void hns_roce_hem_first(struct hns_roce_hem *hem,
                                      struct hns_roce_hem_iter *iter)
 {
index 3afd3e9330e7891e377bf320c9709228717d7aa4..81e6dedb1e022c81990ee7e01bc80a0a7b9ec3f9 100644 (file)
@@ -717,7 +717,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
        union ib_gid dgid;
        u64 subnet_prefix;
        int attr_mask = 0;
-       int ret = -ENOMEM;
+       int ret;
        int i, j;
        u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 };
        u8 phy_port;
@@ -730,10 +730,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
        /* Reserved cq for loop qp */
        cq_init_attr.cqe                = HNS_ROCE_MIN_WQE_NUM * 2;
        cq_init_attr.comp_vector        = 0;
-       cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL);
-       if (IS_ERR(cq)) {
-               dev_err(dev, "Create cq for reserved loop qp failed!");
+
+       ibdev = &hr_dev->ib_dev;
+       cq = rdma_zalloc_drv_obj(ibdev, ib_cq);
+       if (!cq)
                return -ENOMEM;
+
+       ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL);
+       if (ret) {
+               dev_err(dev, "Create cq for reserved loop qp failed!");
+               goto alloc_cq_failed;
        }
        free_mr->mr_free_cq = to_hr_cq(cq);
        free_mr->mr_free_cq->ib_cq.device               = &hr_dev->ib_dev;
@@ -743,7 +749,6 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
        free_mr->mr_free_cq->ib_cq.cq_context           = NULL;
        atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0);
 
-       ibdev = &hr_dev->ib_dev;
        pd = rdma_zalloc_drv_obj(ibdev, ib_pd);
        if (!pd)
                goto alloc_mem_failed;
@@ -818,7 +823,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
                attr.dest_qp_num        = hr_qp->qpn;
                memcpy(rdma_ah_retrieve_dmac(&attr.ah_attr),
                       hr_dev->dev_addr[port],
-                      MAC_ADDR_OCTET_NUM);
+                      ETH_ALEN);
 
                memcpy(&dgid.raw, &subnet_prefix, sizeof(u64));
                memcpy(&dgid.raw[8], hr_dev->dev_addr[port], 3);
@@ -865,9 +870,9 @@ alloc_pd_failed:
        kfree(pd);
 
 alloc_mem_failed:
-       if (hns_roce_ib_destroy_cq(cq, NULL))
-               dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
-
+       hns_roce_ib_destroy_cq(cq, NULL);
+alloc_cq_failed:
+       kfree(cq);
        return ret;
 }
 
@@ -894,10 +899,8 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
                                i, ret);
        }
 
-       ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
-       if (ret)
-               dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
-
+       hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
+       kfree(&free_mr->mr_free_cq->ib_cq);
        hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
        kfree(&free_mr->mr_free_pd->ibpd);
 }
@@ -966,8 +969,7 @@ static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev)
        struct hns_roce_free_mr *free_mr;
        struct hns_roce_v1_priv *priv;
        struct completion comp;
-       unsigned long end =
-         msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies;
+       unsigned long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS;
 
        priv = (struct hns_roce_v1_priv *)hr_dev->priv;
        free_mr = &priv->free_mr;
@@ -987,10 +989,11 @@ static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev)
 
        queue_work(free_mr->free_mr_wq, &(lp_qp_work->work));
 
-       while (time_before_eq(jiffies, end)) {
+       while (end) {
                if (try_wait_for_completion(&comp))
                        return 0;
                msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE);
+               end -= HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE;
        }
 
        lp_qp_work->comp_flag = 0;
@@ -1104,8 +1107,7 @@ static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev,
        struct hns_roce_free_mr *free_mr;
        struct hns_roce_v1_priv *priv;
        struct completion comp;
-       unsigned long end =
-               msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies;
+       unsigned long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS;
        unsigned long start = jiffies;
        int npages;
        int ret = 0;
@@ -1135,10 +1137,11 @@ static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev,
 
        queue_work(free_mr->free_mr_wq, &(mr_work->work));
 
-       while (time_before_eq(jiffies, end)) {
+       while (end) {
                if (try_wait_for_completion(&comp))
                        goto free_mr;
                msleep(HNS_ROCE_V1_FREE_MR_WAIT_VALUE);
+               end -= HNS_ROCE_V1_FREE_MR_WAIT_VALUE;
        }
 
        mr_work->comp_flag = 0;
@@ -1161,8 +1164,7 @@ free_mr:
        hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
                             key_to_hw_index(mr->key), 0);
 
-       if (mr->umem)
-               ib_umem_release(mr->umem);
+       ib_umem_release(mr->umem);
 
        kfree(mr);
 
@@ -1557,6 +1559,7 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
        caps->reserved_mrws     = 1;
        caps->reserved_uars     = 0;
        caps->reserved_cqs      = 0;
+       caps->reserved_qps      = 12; /* 2 SQPs per port, 6 ports, 12 in total */
        caps->chunk_sz          = HNS_ROCE_V1_TABLE_CHUNK_SIZE;
 
        for (i = 0; i < caps->num_ports; i++)
@@ -1742,11 +1745,14 @@ static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port,
                               int gid_index, const union ib_gid *gid,
                               const struct ib_gid_attr *attr)
 {
+       unsigned long flags;
        u32 *p = NULL;
        u8 gid_idx = 0;
 
        gid_idx = hns_get_gid_index(hr_dev, port, gid_index);
 
+       spin_lock_irqsave(&hr_dev->iboe.lock, flags);
+
        p = (u32 *)&gid->raw[0];
        roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_L_0_REG +
                       (HNS_ROCE_V1_GID_NUM * gid_idx));
@@ -1763,6 +1769,8 @@ static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port,
        roce_raw_write(*p, hr_dev->reg_base + ROCEE_PORT_GID_H_0_REG +
                       (HNS_ROCE_V1_GID_NUM * gid_idx));
 
+       spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
+
        return 0;
 }
 
@@ -2458,10 +2466,10 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev,
 
        bt_cmd = hr_dev->reg_base + ROCEE_BT_CMD_H_REG;
 
-       end = msecs_to_jiffies(HW_SYNC_TIMEOUT_MSECS) + jiffies;
+       end = HW_SYNC_TIMEOUT_MSECS;
        while (1) {
                if (readl(bt_cmd) >> BT_CMD_SYNC_SHIFT) {
-                       if (!(time_before(jiffies, end))) {
+                       if (!end) {
                                dev_err(dev, "Write bt_cmd err,hw_sync is not zero.\n");
                                spin_unlock_irqrestore(&hr_dev->bt_cmd_lock,
                                        flags);
@@ -2470,7 +2478,8 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev,
                } else {
                        break;
                }
-               msleep(HW_SYNC_SLEEP_TIME_INTERVAL);
+               mdelay(HW_SYNC_SLEEP_TIME_INTERVAL);
+               end -= HW_SYNC_SLEEP_TIME_INTERVAL;
        }
 
        bt_cmd_val[0] = (__le32)bt_ba;
@@ -3633,9 +3642,8 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
        hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
 
-       if (udata)
-               ib_umem_release(hr_qp->umem);
-       else {
+       ib_umem_release(hr_qp->umem);
+       if (!udata) {
                kfree(hr_qp->sq.wrid);
                kfree(hr_qp->rq.wrid);
 
@@ -3649,7 +3657,7 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
        return 0;
 }
 
-static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
        struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
@@ -3658,7 +3666,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
        u32 cqe_cnt_cur;
        u32 cq_buf_size;
        int wait_time = 0;
-       int ret = 0;
 
        hns_roce_free_cq(hr_dev, hr_cq);
 
@@ -3680,7 +3687,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
                if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) {
                        dev_warn(dev, "Destroy cq 0x%lx timeout!\n",
                                hr_cq->cqn);
-                       ret = -ETIMEDOUT;
                        break;
                }
                wait_time++;
@@ -3688,17 +3694,12 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
        hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
 
-       if (ibcq->uobject)
-               ib_umem_release(hr_cq->umem);
-       else {
+       ib_umem_release(hr_cq->umem);
+       if (!udata) {
                /* Free the buff of stored cq */
                cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz;
                hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf);
        }
-
-       kfree(hr_cq);
-
-       return ret;
 }
 
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
@@ -3902,7 +3903,8 @@ static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev,
                 */
                dma_rmb();
 
-               dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe,
+               dev_dbg(dev, "aeqe = %pK, aeqe->asyn.event_type = 0x%lx\n",
+                       aeqe,
                        roce_get_field(aeqe->asyn,
                                       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
                                       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S));
@@ -4265,7 +4267,6 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev,
                }
 
                eq->buf_list[i].map = tmp_dma_addr;
-               memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE);
        }
        eq->cons_index = 0;
        roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
index b5392cb5b20f2d472c64b77e6d10f24dbd3fe58e..b76e3beeafb8f5e296b01145312ce111b1b632f4 100644 (file)
@@ -1098,7 +1098,7 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
        if (ret == CMD_RST_PRC_SUCCESS)
                return 0;
        if (ret == CMD_RST_PRC_EBUSY)
-               return ret;
+               return -EBUSY;
 
        ret = __hns_roce_cmq_send(hr_dev, desc, num);
        if (ret) {
@@ -1106,7 +1106,7 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
                if (retval == CMD_RST_PRC_SUCCESS)
                        return 0;
                else if (retval == CMD_RST_PRC_EBUSY)
-                       return retval;
+                       return -EBUSY;
        }
 
        return ret;
@@ -1130,6 +1130,45 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
        return 0;
 }
 
+static void hns_roce_function_clear(struct hns_roce_dev *hr_dev)
+{
+       struct hns_roce_func_clear *resp;
+       struct hns_roce_cmq_desc desc;
+       unsigned long end;
+       int ret;
+
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR, false);
+       resp = (struct hns_roce_func_clear *)desc.data;
+
+       ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+       if (ret) {
+               dev_err(hr_dev->dev, "Func clear write failed, ret = %d.\n",
+                        ret);
+               return;
+       }
+
+       msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL);
+       end = HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS;
+       while (end) {
+               msleep(HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT);
+               end -= HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT;
+
+               hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_FUNC_CLEAR,
+                                             true);
+
+               ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+               if (ret)
+                       continue;
+
+               if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) {
+                       hr_dev->is_reset = true;
+                       return;
+               }
+       }
+
+       dev_err(hr_dev->dev, "Func clear fail.\n");
+}
+
 static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_query_fw_info *resp;
@@ -1574,7 +1613,10 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
        caps->mtt_ba_pg_sz      = 0;
        caps->mtt_buf_pg_sz     = 0;
        caps->mtt_hop_num       = HNS_ROCE_MTT_HOP_NUM;
-       caps->cqe_ba_pg_sz      = 0;
+       caps->wqe_sq_hop_num    = 2;
+       caps->wqe_sge_hop_num   = 1;
+       caps->wqe_rq_hop_num    = 2;
+       caps->cqe_ba_pg_sz      = 6;
        caps->cqe_buf_pg_sz     = 0;
        caps->cqe_hop_num       = HNS_ROCE_CQE_HOP_NUM;
        caps->srqwqe_ba_pg_sz   = 0;
@@ -1774,7 +1816,6 @@ static int hns_roce_init_link_table(struct hns_roce_dev *hr_dev,
                        goto err_alloc_buf_failed;
 
                link_tbl->pg_list[i].map = t;
-               memset(link_tbl->pg_list[i].buf, 0, buf_chk_sz);
 
                entry[i].blk_ba0 = (t >> 12) & 0xffffffff;
                roce_set_field(entry[i].blk_ba1_nxt_ptr,
@@ -1891,6 +1932,9 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_v2_priv *priv = hr_dev->priv;
 
+       if (hr_dev->pci_dev->revision == 0x21)
+               hns_roce_function_clear(hr_dev);
+
        hns_roce_free_link_table(hr_dev, &priv->tpq);
        hns_roce_free_link_table(hr_dev, &priv->tsq);
 }
@@ -1974,7 +2018,7 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
                                unsigned long timeout)
 {
        struct device *dev = hr_dev->dev;
-       unsigned long end = 0;
+       unsigned long end;
        u32 status;
 
        end = msecs_to_jiffies(timeout) + jiffies;
@@ -2340,15 +2384,10 @@ static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
 
 static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
 {
-       u32 bitmap_num;
-       int bit_num;
-
        /* always called with interrupts disabled. */
        spin_lock(&srq->lock);
 
-       bitmap_num = wqe_index / (sizeof(u64) * 8);
-       bit_num = wqe_index % (sizeof(u64) * 8);
-       srq->idx_que.bitmap[bitmap_num] |= (1ULL << bit_num);
+       bitmap_clear(srq->idx_que.bitmap, wqe_index, 1);
        srq->tail++;
 
        spin_unlock(&srq->lock);
@@ -2977,7 +3016,7 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev,
 {
        struct device *dev = hr_dev->dev;
        struct hns_roce_cmd_mailbox *mailbox;
-       int ret = 0;
+       int ret;
        u16 op = 0xff;
 
        if (!hns_roce_check_whether_mhop(hr_dev, table->type))
@@ -3026,7 +3065,6 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev,
 }
 
 static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev,
-                                struct hns_roce_mtt *mtt,
                                 enum ib_qp_state cur_state,
                                 enum ib_qp_state new_state,
                                 struct hns_roce_v2_qp_context *context,
@@ -3426,7 +3464,9 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
        else
                roce_set_field(context->byte_4_sqpn_tst,
                               V2_QPC_BYTE_4_SGE_SHIFT_M,
-                              V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ?
+                              V2_QPC_BYTE_4_SGE_SHIFT_S,
+                              hr_qp->sq.max_gs >
+                              HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
                               ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
 
        roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
@@ -3520,6 +3560,31 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
        }
 }
 
+static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev,
+                                  struct hns_roce_qp *hr_qp, int mtt_cnt,
+                                  u32 page_size)
+{
+       struct device *dev = hr_dev->dev;
+
+       if (hr_qp->rq.wqe_cnt < 1)
+               return true;
+
+       if (mtt_cnt < 1) {
+               dev_err(dev, "qp(0x%lx) rqwqe buf ba find failed\n",
+                       hr_qp->qpn);
+               return false;
+       }
+
+       if (mtt_cnt < MTT_MIN_COUNT &&
+               (hr_qp->rq.offset + page_size) < hr_qp->buff_size) {
+               dev_err(dev, "qp(0x%lx) next rqwqe buf ba find failed\n",
+                       hr_qp->qpn);
+               return false;
+       }
+
+       return true;
+}
+
 static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
                                 const struct ib_qp_attr *attr, int attr_mask,
                                 struct hns_roce_v2_qp_context *context,
@@ -3529,25 +3594,27 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
        struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
        struct device *dev = hr_dev->dev;
+       u64 mtts[MTT_MIN_COUNT] = { 0 };
        dma_addr_t dma_handle_3;
        dma_addr_t dma_handle_2;
-       dma_addr_t dma_handle;
+       u64 wqe_sge_ba;
        u32 page_size;
        u8 port_num;
        u64 *mtts_3;
        u64 *mtts_2;
-       u64 *mtts;
+       int count;
        u8 *dmac;
        u8 *smac;
        int port;
 
        /* Search qp buf's mtts */
-       mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-                                  hr_qp->mtt.first_seg, &dma_handle);
-       if (!mtts) {
-               dev_err(dev, "qp buf pa find failed\n");
-               return -EINVAL;
-       }
+       page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+       count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+                                 hr_qp->rq.offset / page_size, mtts,
+                                 MTT_MIN_COUNT, &wqe_sge_ba);
+       if (!ibqp->srq)
+               if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size))
+                       return -EINVAL;
 
        /* Search IRRL's mtts */
        mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
@@ -3571,7 +3638,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        }
 
        dmac = (u8 *)attr->ah_attr.roce.dmac;
-       context->wqe_sge_ba = (u32)(dma_handle >> 3);
+       context->wqe_sge_ba = (u32)(wqe_sge_ba >> 3);
        qpc_mask->wqe_sge_ba = 0;
 
        /*
@@ -3581,22 +3648,23 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
         * 0 at the same time, else set them to 0x1.
         */
        roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M,
-                      V2_QPC_BYTE_12_WQE_SGE_BA_S, dma_handle >> (32 + 3));
+                      V2_QPC_BYTE_12_WQE_SGE_BA_S, wqe_sge_ba >> (32 + 3));
        roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_WQE_SGE_BA_M,
                       V2_QPC_BYTE_12_WQE_SGE_BA_S, 0);
 
        roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
                       V2_QPC_BYTE_12_SQ_HOP_NUM_S,
-                      hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ?
-                      0 : hr_dev->caps.mtt_hop_num);
+                      hr_dev->caps.wqe_sq_hop_num == HNS_ROCE_HOP_NUM_0 ?
+                      0 : hr_dev->caps.wqe_sq_hop_num);
        roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
                       V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0);
 
        roce_set_field(context->byte_20_smac_sgid_idx,
                       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
                       V2_QPC_BYTE_20_SGE_HOP_NUM_S,
-                      ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-                      hr_dev->caps.mtt_hop_num : 0);
+                      ((ibqp->qp_type == IB_QPT_GSI) ||
+                      hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+                      hr_dev->caps.wqe_sge_hop_num : 0);
        roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
                       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
                       V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0);
@@ -3604,8 +3672,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        roce_set_field(context->byte_20_smac_sgid_idx,
                       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
                       V2_QPC_BYTE_20_RQ_HOP_NUM_S,
-                      hr_dev->caps.mtt_hop_num == HNS_ROCE_HOP_NUM_0 ?
-                      0 : hr_dev->caps.mtt_hop_num);
+                      hr_dev->caps.wqe_rq_hop_num == HNS_ROCE_HOP_NUM_0 ?
+                      0 : hr_dev->caps.wqe_rq_hop_num);
        roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
                       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
                       V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0);
@@ -3613,7 +3681,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        roce_set_field(context->byte_16_buf_ba_pg_sz,
                       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
                       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S,
-                      hr_dev->caps.mtt_ba_pg_sz + PG_SHIFT_OFFSET);
+                      hr_qp->wqe_bt_pg_shift + PG_SHIFT_OFFSET);
        roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz,
                       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
                       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0);
@@ -3626,29 +3694,24 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
                       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M,
                       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0);
 
-       page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-       context->rq_cur_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size]
-                                   >> PAGE_ADDR_SHIFT);
+       context->rq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
        qpc_mask->rq_cur_blk_addr = 0;
 
        roce_set_field(context->byte_92_srq_info,
                       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
                       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S,
-                      mtts[hr_qp->rq.offset / page_size]
-                      >> (32 + PAGE_ADDR_SHIFT));
+                      mtts[0] >> (32 + PAGE_ADDR_SHIFT));
        roce_set_field(qpc_mask->byte_92_srq_info,
                       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
                       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0);
 
-       context->rq_nxt_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size + 1]
-                                   >> PAGE_ADDR_SHIFT);
+       context->rq_nxt_blk_addr = (u32)(mtts[1] >> PAGE_ADDR_SHIFT);
        qpc_mask->rq_nxt_blk_addr = 0;
 
        roce_set_field(context->byte_104_rq_sge,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S,
-                      mtts[hr_qp->rq.offset / page_size + 1]
-                      >> (32 + PAGE_ADDR_SHIFT));
+                      mtts[1] >> (32 + PAGE_ADDR_SHIFT));
        roce_set_field(qpc_mask->byte_104_rq_sge,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0);
@@ -3708,13 +3771,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
                       V2_QPC_BYTE_20_SGID_IDX_M,
                       V2_QPC_BYTE_20_SGID_IDX_S, 0);
-       memcpy(&(context->dmac), dmac, 4);
+       memcpy(&(context->dmac), dmac, sizeof(u32));
        roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
                       V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4])));
        qpc_mask->dmac = 0;
        roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
                       V2_QPC_BYTE_52_DMAC_S, 0);
 
+       /* mtu * (2^LP_PKTN_INI) should not be bigger than one message length, 64KB */
        roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
                       V2_QPC_BYTE_56_LP_PKTN_INI_S, 4);
        roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
@@ -3756,6 +3820,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
                       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
 
+       /* the RoCE engine sends 2^lp_sgen_ini segments at a time */
        roce_set_field(context->byte_168_irrl_idx,
                       V2_QPC_BYTE_168_LP_SGEN_INI_M,
                       V2_QPC_BYTE_168_LP_SGEN_INI_S, 3);
@@ -3774,18 +3839,30 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
        struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
        struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
        struct device *dev = hr_dev->dev;
-       dma_addr_t dma_handle;
+       u64 sge_cur_blk = 0;
+       u64 sq_cur_blk = 0;
        u32 page_size;
-       u64 *mtts;
+       int count;
 
        /* Search qp buf's mtts */
-       mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-                                  hr_qp->mtt.first_seg, &dma_handle);
-       if (!mtts) {
-               dev_err(dev, "qp buf pa find failed\n");
+       count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
+       if (count < 1) {
+               dev_err(dev, "qp(0x%lx) buf pa find failed\n", hr_qp->qpn);
                return -EINVAL;
        }
 
+       if (hr_qp->sge.offset) {
+               page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+               count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+                                         hr_qp->sge.offset / page_size,
+                                         &sge_cur_blk, 1, NULL);
+               if (count < 1) {
+                       dev_err(dev, "qp(0x%lx) sge pa find failed\n",
+                               hr_qp->qpn);
+                       return -EINVAL;
+               }
+       }
+
        /* Not support alternate path and path migration */
        if ((attr_mask & IB_QP_ALT_PATH) ||
            (attr_mask & IB_QP_PATH_MIG_STATE)) {
@@ -3799,37 +3876,37 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
         * we should set all bits of the relevant fields in context mask to
         * 0 at the same time, else set them to 0x1.
         */
-       context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
+       context->sq_cur_blk_addr = (u32)(sq_cur_blk >> PAGE_ADDR_SHIFT);
        roce_set_field(context->byte_168_irrl_idx,
                       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
                       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S,
-                      mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+                      sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
        qpc_mask->sq_cur_blk_addr = 0;
        roce_set_field(qpc_mask->byte_168_irrl_idx,
                       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
                       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0);
 
-       page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-       context->sq_cur_sge_blk_addr =
-                      ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-                                     ((u32)(mtts[hr_qp->sge.offset / page_size]
-                                     >> PAGE_ADDR_SHIFT)) : 0;
+       context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) ||
+                      hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+                      ((u32)(sge_cur_blk >>
+                      PAGE_ADDR_SHIFT)) : 0;
        roce_set_field(context->byte_184_irrl_idx,
                       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
                       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S,
-                      ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
-                      (mtts[hr_qp->sge.offset / page_size] >>
+                      ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs >
+                      HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
+                      (sge_cur_blk >>
                       (32 + PAGE_ADDR_SHIFT)) : 0);
        qpc_mask->sq_cur_sge_blk_addr = 0;
        roce_set_field(qpc_mask->byte_184_irrl_idx,
                       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
                       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0);
 
-       context->rx_sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
+       context->rx_sq_cur_blk_addr = (u32)(sq_cur_blk >> PAGE_ADDR_SHIFT);
        roce_set_field(context->byte_232_irrl_sge,
                       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
                       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S,
-                      mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+                      sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
        qpc_mask->rx_sq_cur_blk_addr = 0;
        roce_set_field(qpc_mask->byte_232_irrl_sge,
                       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
@@ -4144,7 +4221,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                roce_set_field(context->byte_224_retry_msg,
                               V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
                               V2_QPC_BYTE_224_RETRY_MSG_PSN_S,
-                              attr->sq_psn >> 16);
+                              attr->sq_psn >> V2_QPC_BYTE_220_RETRY_MSG_PSN_S);
                roce_set_field(qpc_mask->byte_224_retry_msg,
                               V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
                               V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0);
@@ -4230,7 +4307,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                       V2_QPC_BYTE_60_QP_ST_S, 0);
 
        /* SW pass context to HW */
-       ret = hns_roce_v2_qp_modify(hr_dev, &hr_qp->mtt, cur_state, new_state,
+       ret = hns_roce_v2_qp_modify(hr_dev, cur_state, new_state,
                                    context, hr_qp);
        if (ret) {
                dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret);
@@ -4374,11 +4451,12 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                                                  V2_QPC_BYTE_56_DQPN_M,
                                                  V2_QPC_BYTE_56_DQPN_S);
        qp_attr->qp_access_flags = ((roce_get_bit(context->byte_76_srqn_op_en,
-                                                 V2_QPC_BYTE_76_RRE_S)) << 2) |
-                                  ((roce_get_bit(context->byte_76_srqn_op_en,
-                                                 V2_QPC_BYTE_76_RWE_S)) << 1) |
-                                  ((roce_get_bit(context->byte_76_srqn_op_en,
-                                                 V2_QPC_BYTE_76_ATE_S)) << 3);
+                                   V2_QPC_BYTE_76_RRE_S)) << V2_QP_RWE_S) |
+                                   ((roce_get_bit(context->byte_76_srqn_op_en,
+                                   V2_QPC_BYTE_76_RWE_S)) << V2_QP_RRE_S) |
+                                   ((roce_get_bit(context->byte_76_srqn_op_en,
+                                   V2_QPC_BYTE_76_ATE_S)) << V2_QP_ATE_S);
+
        if (hr_qp->ibqp.qp_type == IB_QPT_RC ||
            hr_qp->ibqp.qp_type == IB_QPT_UC) {
                struct ib_global_route *grh =
@@ -4487,7 +4565,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
            (hr_qp->ibqp.qp_type == IB_QPT_UD))
                hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
 
-       hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
+       hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
 
        if (udata) {
                struct hns_roce_ucontext *context =
@@ -4501,7 +4579,6 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 
                if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
                        hns_roce_db_unmap_user(context, &hr_qp->rdb);
-               ib_umem_release(hr_qp->umem);
        } else {
                kfree(hr_qp->sq.wrid);
                kfree(hr_qp->rq.wrid);
@@ -4509,6 +4586,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
                if (hr_qp->rq.wqe_cnt)
                        hns_roce_free_db(hr_dev, &hr_qp->rdb);
        }
+       ib_umem_release(hr_qp->umem);
 
        if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
             hr_qp->rq.wqe_cnt) {
@@ -4682,7 +4760,6 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
                dev_warn(dev, "Path migration failed.\n");
                break;
        case HNS_ROCE_EVENT_TYPE_COMM_EST:
-               dev_info(dev, "Communication established.\n");
                break;
        case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
                dev_warn(dev, "Send queue drained.\n");
@@ -5151,8 +5228,8 @@ static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev,
                        dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
                                          eq->l1_dma[i]);
 
-                       for (j = 0; j < bt_chk_sz / 8; j++) {
-                               idx = i * (bt_chk_sz / 8) + j;
+                       for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+                               idx = i * (bt_chk_sz / BA_BYTE_LEN) + j;
                                if ((i == eq->l0_last_num - 1)
                                     && j == eq->l1_last_num - 1) {
                                        eqe_alloc = (buf_chk_sz / eq->eqe_size)
@@ -5368,9 +5445,9 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
        buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
        bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
 
-       ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1)
-                 / buf_chk_sz;
-       bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8);
+       ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size),
+                             buf_chk_sz);
+       bt_num = DIV_ROUND_UP(ba_num, bt_chk_sz / BA_BYTE_LEN);
 
        /* hop_num = 0 */
        if (mhop_num == HNS_ROCE_HOP_NUM_0) {
@@ -5387,8 +5464,6 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
                eq->cur_eqe_ba = eq->l0_dma;
                eq->nxt_eqe_ba = 0;
 
-               memset(eq->bt_l0, 0, eq->entries * eq->eqe_size);
-
                return 0;
        }
 
@@ -5415,12 +5490,12 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
                goto err_dma_alloc_l0;
 
        if (mhop_num == 1) {
-               if (ba_num > (bt_chk_sz / 8))
+               if (ba_num > (bt_chk_sz / BA_BYTE_LEN))
                        dev_err(dev, "ba_num %d is too large for 1 hop\n",
                                ba_num);
 
                /* alloc buf */
-               for (i = 0; i < bt_chk_sz / 8; i++) {
+               for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
                        if (eq_buf_cnt + 1 < ba_num) {
                                size = buf_chk_sz;
                        } else {
@@ -5444,7 +5519,7 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
 
        } else if (mhop_num == 2) {
                /* alloc L1 BT and buf */
-               for (i = 0; i < bt_chk_sz / 8; i++) {
+               for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
                        eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz,
                                                          &(eq->l1_dma[i]),
                                                          GFP_KERNEL);
@@ -5452,8 +5527,8 @@ static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
                                goto err_dma_alloc_l1;
                        *(eq->bt_l0 + i) = eq->l1_dma[i];
 
-                       for (j = 0; j < bt_chk_sz / 8; j++) {
-                               idx = i * bt_chk_sz / 8 + j;
+                       for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+                               idx = i * bt_chk_sz / BA_BYTE_LEN + j;
                                if (eq_buf_cnt + 1 < ba_num) {
                                        size = buf_chk_sz;
                                } else {
@@ -5498,8 +5573,8 @@ err_dma_alloc_l1:
                dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
                                  eq->l1_dma[i]);
 
-               for (j = 0; j < bt_chk_sz / 8; j++) {
-                       idx = i * bt_chk_sz / 8 + j;
+               for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
+                       idx = i * bt_chk_sz / BA_BYTE_LEN + j;
                        dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
                                          eq->buf_dma[idx]);
                }
@@ -5522,11 +5597,11 @@ err_dma_alloc_buf:
                        dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
                                          eq->l1_dma[i]);
 
-                       for (j = 0; j < bt_chk_sz / 8; j++) {
+                       for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
                                if (i == record_i && j >= record_j)
                                        break;
 
-                               idx = i * bt_chk_sz / 8 + j;
+                               idx = i * bt_chk_sz / BA_BYTE_LEN + j;
                                dma_free_coherent(dev, buf_chk_sz,
                                                  eq->buf[idx],
                                                  eq->buf_dma[idx]);
@@ -5972,18 +6047,19 @@ out:
        return ret;
 }
 
-static int find_empty_entry(struct hns_roce_idx_que *idx_que)
+static int find_empty_entry(struct hns_roce_idx_que *idx_que,
+                           unsigned long size)
 {
-       int bit_num;
-       int i;
+       int wqe_idx;
 
-       /* bitmap[i] is set zero if all bits are allocated */
-       for (i = 0; idx_que->bitmap[i] == 0; ++i)
-               ;
-       bit_num = ffs(idx_que->bitmap[i]);
-       idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1));
+       if (unlikely(bitmap_full(idx_que->bitmap, size)))
+               return -ENOSPC;
+
+       wqe_idx = find_first_zero_bit(idx_que->bitmap, size);
 
-       return i * sizeof(u64) * 8 + (bit_num - 1);
+       bitmap_set(idx_que->bitmap, wqe_idx, 1);
+
+       return wqe_idx;
 }
 
 static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
@@ -6029,7 +6105,13 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
                        break;
                }
 
-               wqe_idx = find_empty_entry(&srq->idx_que);
+               wqe_idx = find_empty_entry(&srq->idx_que, srq->max);
+               if (wqe_idx < 0) {
+                       ret = -ENOMEM;
+                       *bad_wr = wr;
+                       break;
+               }
+
                fill_idx_queue(&srq->idx_que, ind, wqe_idx);
                wqe = get_srq_wqe(srq, wqe_idx);
                dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
@@ -6041,9 +6123,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
                }
 
                if (i < srq->max_gs) {
-                       dseg->len = 0;
-                       dseg->lkey = cpu_to_le32(0x100);
-                       dseg->addr = 0;
+                       dseg[i].len = 0;
+                       dseg[i].lkey = cpu_to_le32(0x100);
+                       dseg[i].addr = 0;
                }
 
                srq->wrid[wqe_idx] = wr->wr_id;
@@ -6059,7 +6141,8 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
                 */
                wmb();
 
-               srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << 24 | srq->srqn;
+               srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S |
+                               (srq->srqn & V2_DB_BYTE_4_TAG_M);
                srq_db.parameter = srq->head;
 
                hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
@@ -6301,6 +6384,7 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
        if (!hr_dev)
                return 0;
 
+       hr_dev->is_reset = true;
        hr_dev->active = false;
        hr_dev->dis_db = true;
 
index edfdbe2ce0db1913bc1d6a216044ef68c924e38f..478f5a5b7aa1da0481bafe7b915f6c2ccfa9328c 100644 (file)
@@ -54,7 +54,7 @@
 #define HNS_ROCE_V2_MAX_CQ_NUM                 0x100000
 #define HNS_ROCE_V2_MAX_CQC_TIMER_NUM          0x100
 #define HNS_ROCE_V2_MAX_SRQ_NUM                        0x100000
-#define HNS_ROCE_V2_MAX_CQE_NUM                        0x10000
+#define HNS_ROCE_V2_MAX_CQE_NUM                        0x400000
 #define HNS_ROCE_V2_MAX_SRQWQE_NUM             0x8000
 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM             0x100
 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM             0xff
@@ -241,6 +241,7 @@ enum hns_roce_opcode_type {
        HNS_ROCE_OPC_POST_MB                            = 0x8504,
        HNS_ROCE_OPC_QUERY_MB_ST                        = 0x8505,
        HNS_ROCE_OPC_CFG_BT_ATTR                        = 0x8506,
+       HNS_ROCE_OPC_FUNC_CLEAR                         = 0x8508,
        HNS_ROCE_OPC_CLR_SCCC                           = 0x8509,
        HNS_ROCE_OPC_QUERY_SCCC                         = 0x850a,
        HNS_ROCE_OPC_RESET_SCCC                         = 0x850b,
@@ -886,6 +887,10 @@ struct hns_roce_v2_qp_context {
 #define        V2_QPC_BYTE_256_SQ_FLUSH_IDX_S 16
 #define V2_QPC_BYTE_256_SQ_FLUSH_IDX_M GENMASK(31, 16)
 
+#define        V2_QP_RWE_S 1 /* rdma write enable */
+#define        V2_QP_RRE_S 2 /* rdma read enable */
+#define        V2_QP_ATE_S 3 /* rdma atomic enable */
+
 struct hns_roce_v2_cqe {
        __le32  byte_4;
        union {
@@ -1226,6 +1231,22 @@ struct hns_roce_query_fw_info {
        __le32 rsv[5];
 };
 
+struct hns_roce_func_clear {
+       __le32 rst_funcid_en;
+       __le32 func_done;
+       __le32 rsv[4];
+};
+
+#define FUNC_CLEAR_RST_FUN_DONE_S 0
+/* Each physical function manages up to 248 virtual functions;
+ * it takes up to 100ms for each function to execute clear;
+ * if an abnormal reset occurs, it is executed twice at most;
+ * so it takes up to 249 * 2 * 100ms.
+ */
+#define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS   (249 * 2 * 100)
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL      40
+#define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_FAIL_WAIT     20
+
 struct hns_roce_cfg_llm_a {
        __le32 base_addr_l;
        __le32 base_addr_h;
index 8da5f18bf820beef2413bbf74437f28f8d6741ce..1e4ba48f56136e2f4a2c9371ecdb13e33894f8eb 100644 (file)
@@ -57,17 +57,16 @@ int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index)
 {
        return gid_index * hr_dev->caps.num_ports + port;
 }
-EXPORT_SYMBOL_GPL(hns_get_gid_index);
 
 static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr)
 {
        u8 phy_port;
        u32 i = 0;
 
-       if (!memcmp(hr_dev->dev_addr[port], addr, MAC_ADDR_OCTET_NUM))
+       if (!memcmp(hr_dev->dev_addr[port], addr, ETH_ALEN))
                return 0;
 
-       for (i = 0; i < MAC_ADDR_OCTET_NUM; i++)
+       for (i = 0; i < ETH_ALEN; i++)
                hr_dev->dev_addr[port][i] = addr[i];
 
        phy_port = hr_dev->iboe.phy_port[port];
@@ -78,18 +77,13 @@ static int hns_roce_add_gid(const struct ib_gid_attr *attr, void **context)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
        u8 port = attr->port_num - 1;
-       unsigned long flags;
        int ret;
 
        if (port >= hr_dev->caps.num_ports)
                return -EINVAL;
 
-       spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
        ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &attr->gid, attr);
 
-       spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
        return ret;
 }
 
@@ -98,18 +92,13 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
        struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
        struct ib_gid_attr zattr = { };
        u8 port = attr->port_num - 1;
-       unsigned long flags;
        int ret;
 
        if (port >= hr_dev->caps.num_ports)
                return -EINVAL;
 
-       spin_lock_irqsave(&hr_dev->iboe.lock, flags);
-
        ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &zgid, &zattr);
 
-       spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-
        return ret;
 }
 
@@ -272,7 +261,8 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
        props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
        props->state = (netif_running(net_dev) && netif_carrier_ok(net_dev)) ?
                        IB_PORT_ACTIVE : IB_PORT_DOWN;
-       props->phys_state = (props->state == IB_PORT_ACTIVE) ? 5 : 3;
+       props->phys_state = (props->state == IB_PORT_ACTIVE) ?
+                            HNS_ROCE_PHY_LINKUP : HNS_ROCE_PHY_DISABLED;
 
        spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
 
@@ -319,7 +309,7 @@ static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask,
 static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
                                   struct ib_udata *udata)
 {
-       int ret = 0;
+       int ret;
        struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
        struct hns_roce_ib_alloc_ucontext_resp resp = {};
        struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
@@ -423,6 +413,11 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
 }
 
 static const struct ib_device_ops hns_roce_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_HNS,
+       .uverbs_abi_ver = 1,
+       .uverbs_no_driver_id_binding = 1,
+
        .add_gid = hns_roce_add_gid,
        .alloc_pd = hns_roce_alloc_pd,
        .alloc_ucontext = hns_roce_alloc_ucontext,
@@ -451,6 +446,7 @@ static const struct ib_device_ops hns_roce_dev_ops = {
        .reg_user_mr = hns_roce_reg_user_mr,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq),
        INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
 };
@@ -489,14 +485,12 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 
        ib_dev = &hr_dev->ib_dev;
 
-       ib_dev->owner                   = THIS_MODULE;
        ib_dev->node_type               = RDMA_NODE_IB_CA;
        ib_dev->dev.parent              = dev;
 
        ib_dev->phys_port_cnt           = hr_dev->caps.num_ports;
        ib_dev->local_dma_lkey          = hr_dev->caps.reserved_lkey;
        ib_dev->num_comp_vectors        = hr_dev->caps.num_comp_vectors;
-       ib_dev->uverbs_abi_ver          = 1;
        ib_dev->uverbs_cmd_mask         =
                (1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) |
                (1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) |
@@ -545,7 +539,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
                ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops);
        }
 
-       ib_dev->driver_id = RDMA_DRIVER_HNS;
        ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
        ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
        for (i = 0; i < hr_dev->caps.num_ports; i++) {
@@ -980,7 +973,6 @@ error_failed_cmq_init:
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(hns_roce_init);
 
 void hns_roce_exit(struct hns_roce_dev *hr_dev)
 {
@@ -1001,7 +993,6 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev)
        if (hr_dev->hw->reset)
                hr_dev->hw->reset(hr_dev, false);
 }
-EXPORT_SYMBOL_GPL(hns_roce_exit);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Wei Hu <xavier.huwei@huawei.com>");
index 6110ec408626237006650c81cf57f8e6faa7ba3c..549e1a38dfe0be1ff41a78650695c6495205a605 100644 (file)
@@ -47,7 +47,6 @@ unsigned long key_to_hw_index(u32 key)
 {
        return (key << 24) | (key >> 8);
 }
-EXPORT_SYMBOL_GPL(key_to_hw_index);
 
 static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev,
                              struct hns_roce_cmd_mailbox *mailbox,
@@ -66,7 +65,6 @@ int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
                                 mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT,
                                 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
-EXPORT_SYMBOL_GPL(hns_roce_hw2sw_mpt);
 
 static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order,
                                unsigned long *seg)
@@ -293,7 +291,6 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt)
                break;
        }
 }
-EXPORT_SYMBOL_GPL(hns_roce_mtt_cleanup);
 
 static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
                               struct hns_roce_mr *mr, int err_loop_index,
@@ -314,11 +311,11 @@ static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
                        dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
                                          mr->pbl_l1_dma_addr[i]);
 
-                       for (j = 0; j < pbl_bt_sz / 8; j++) {
+                       for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
                                if (i == loop_i && j >= loop_j)
                                        break;
 
-                               bt_idx = i * pbl_bt_sz / 8 + j;
+                               bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
                                dma_free_coherent(dev, pbl_bt_sz,
                                                  mr->pbl_bt_l2[bt_idx],
                                                  mr->pbl_l2_dma_addr[bt_idx]);
@@ -329,8 +326,8 @@ static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
                        dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
                                          mr->pbl_l1_dma_addr[i]);
 
-                       for (j = 0; j < pbl_bt_sz / 8; j++) {
-                               bt_idx = i * pbl_bt_sz / 8 + j;
+                       for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
+                               bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
                                dma_free_coherent(dev, pbl_bt_sz,
                                                  mr->pbl_bt_l2[bt_idx],
                                                  mr->pbl_l2_dma_addr[bt_idx]);
@@ -533,7 +530,7 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova,
 {
        struct device *dev = hr_dev->dev;
        unsigned long index = 0;
-       int ret = 0;
+       int ret;
 
        /* Allocate a key for mr from mr_table */
        ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
@@ -559,7 +556,8 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova,
                mr->pbl_l0_dma_addr = 0;
        } else {
                if (!hr_dev->caps.pbl_hop_num) {
-                       mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
+                       mr->pbl_buf = dma_alloc_coherent(dev,
+                                                        npages * BA_BYTE_LEN,
                                                         &(mr->pbl_dma_addr),
                                                         GFP_KERNEL);
                        if (!mr->pbl_buf)
@@ -590,9 +588,8 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev,
        if (mhop_num == HNS_ROCE_HOP_NUM_0)
                return;
 
-       /* hop_num = 1 */
        if (mhop_num == 1) {
-               dma_free_coherent(dev, (unsigned int)(npages * 8),
+               dma_free_coherent(dev, (unsigned int)(npages * BA_BYTE_LEN),
                                  mr->pbl_buf, mr->pbl_dma_addr);
                return;
        }
@@ -603,12 +600,13 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev,
        if (mhop_num == 2) {
                for (i = 0; i < mr->l0_chunk_last_num; i++) {
                        if (i == mr->l0_chunk_last_num - 1) {
-                               npages_allocated = i * (pbl_bt_sz / 8);
+                               npages_allocated =
+                                               i * (pbl_bt_sz / BA_BYTE_LEN);
 
                                dma_free_coherent(dev,
-                                             (npages - npages_allocated) * 8,
-                                             mr->pbl_bt_l1[i],
-                                             mr->pbl_l1_dma_addr[i]);
+                                     (npages - npages_allocated) * BA_BYTE_LEN,
+                                      mr->pbl_bt_l1[i],
+                                      mr->pbl_l1_dma_addr[i]);
 
                                break;
                        }
@@ -621,16 +619,17 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev,
                        dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
                                          mr->pbl_l1_dma_addr[i]);
 
-                       for (j = 0; j < pbl_bt_sz / 8; j++) {
-                               bt_idx = i * (pbl_bt_sz / 8) + j;
+                       for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
+                               bt_idx = i * (pbl_bt_sz / BA_BYTE_LEN) + j;
 
                                if ((i == mr->l0_chunk_last_num - 1)
                                    && j == mr->l1_chunk_last_num - 1) {
                                        npages_allocated = bt_idx *
-                                                          (pbl_bt_sz / 8);
+                                                     (pbl_bt_sz / BA_BYTE_LEN);
 
                                        dma_free_coherent(dev,
-                                             (npages - npages_allocated) * 8,
+                                             (npages - npages_allocated) *
+                                             BA_BYTE_LEN,
                                              mr->pbl_bt_l2[bt_idx],
                                              mr->pbl_l2_dma_addr[bt_idx]);
 
@@ -675,7 +674,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev,
                        npages = ib_umem_page_count(mr->umem);
 
                if (!hr_dev->caps.pbl_hop_num)
-                       dma_free_coherent(dev, (unsigned int)(npages * 8),
+                       dma_free_coherent(dev,
+                                         (unsigned int)(npages * BA_BYTE_LEN),
                                          mr->pbl_buf, mr->pbl_dma_addr);
                else
                        hns_roce_mhop_free(hr_dev, mr);
@@ -1059,6 +1059,7 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
        for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
                page_addr = sg_page_iter_dma_address(&sg_iter);
                if (!hr_dev->caps.pbl_hop_num) {
+                       /* for hip06, page addr is aligned to 4K */
                        mr->pbl_buf[i++] = page_addr >> 12;
                } else if (hr_dev->caps.pbl_hop_num == 1) {
                        mr->pbl_buf[i++] = page_addr;
@@ -1069,7 +1070,7 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
                                mr->pbl_bt_l2[i][j] = page_addr;
 
                        j++;
-                       if (j >= (pbl_bt_sz / 8)) {
+                       if (j >= (pbl_bt_sz / BA_BYTE_LEN)) {
                                i++;
                                j = 0;
                        }
@@ -1117,7 +1118,8 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        } else {
                u64 pbl_size = 1;
 
-               bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / 8;
+               bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) /
+                         BA_BYTE_LEN;
                for (i = 0; i < hr_dev->caps.pbl_hop_num; i++)
                        pbl_size *= bt_size;
                if (n > pbl_size) {
@@ -1293,9 +1295,7 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
        } else {
                hns_roce_mr_free(hr_dev, mr);
 
-               if (mr->umem)
-                       ib_umem_release(mr->umem);
-
+               ib_umem_release(mr->umem);
                kfree(mr);
        }
 
@@ -1491,3 +1491,119 @@ int hns_roce_dealloc_mw(struct ib_mw *ibmw)
 
        return 0;
 }
+
+void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
+                      int buf_pg_shift)
+{
+       hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift);
+       mtr->buf_pg_shift = buf_pg_shift;
+}
+
+void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
+                         struct hns_roce_mtr *mtr)
+{
+       hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+}
+
+static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
+                             struct hns_roce_mtr *mtr, dma_addr_t *bufs,
+                             struct hns_roce_buf_region *r)
+{
+       int offset;
+       int count;
+       int npage;
+       u64 *mtts;
+       int end;
+       int i;
+
+       offset = r->offset;
+       end = offset + r->count;
+       npage = 0;
+       while (offset < end) {
+               mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+                                                 offset, &count, NULL);
+               if (!mtts)
+                       return -ENOBUFS;
+
+               /* Save page addr, low 12 bits : 0 */
+               for (i = 0; i < count; i++) {
+                       if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+                               mtts[i] = cpu_to_le64(bufs[npage] >>
+                                                       PAGE_ADDR_SHIFT);
+                       else
+                               mtts[i] = cpu_to_le64(bufs[npage]);
+
+                       npage++;
+               }
+               offset += count;
+       }
+
+       return 0;
+}
+
+int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+                       dma_addr_t **bufs, struct hns_roce_buf_region *regions,
+                       int region_cnt)
+{
+       struct hns_roce_buf_region *r;
+       int ret;
+       int i;
+
+       ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions,
+                                       region_cnt);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < region_cnt; i++) {
+               r = &regions[i];
+               ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r);
+               if (ret) {
+                       dev_err(hr_dev->dev,
+                               "write mtr[%d/%d] err %d,offset=%d.\n",
+                               i, region_cnt, ret,  r->offset);
+                       goto err_write;
+               }
+       }
+
+       return 0;
+
+err_write:
+       hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+
+       return ret;
+}
+
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+                     int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
+{
+       u64 *mtts = mtt_buf;
+       int mtt_count;
+       int total = 0;
+       u64 *addr;
+       int npage;
+       int left;
+
+       if (mtts == NULL || mtt_max < 1)
+               goto done;
+
+       left = mtt_max;
+       while (left > 0) {
+               mtt_count = 0;
+               addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+                                                 offset + total,
+                                                 &mtt_count, NULL);
+               if (!addr || !mtt_count)
+                       goto done;
+
+               npage = min(mtt_count, left);
+               memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
+               left -= npage;
+               total += npage;
+       }
+
+done:
+       if (base_addr)
+               *base_addr = mtr->hem_list.root_ba;
+
+       return total;
+}
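The tail of this file introduces the hns_roce_mtr_*() helpers consumed by the QP hunks earlier: hns_roce_mtr_attach() writes each buffer region's page addresses into the hem_list, and hns_roce_mtr_find() copies up to mtt_max block addresses starting at a page offset into the caller's array, returning how many were found (and, optionally, the root base address). That is why modify_qp_init_to_rtr() asks for MTT_MIN_COUNT entries and reads mtts[0]/mtts[1] as the current and next RQ blocks. The sketch below is a flat user-space model of that lookup, assuming MTT_MIN_COUNT is 2 and using a made-up page_table in place of the hem_list:

#include <stdint.h>
#include <stdio.h>

#define MTT_MIN_COUNT 2 /* current + next block, as the RTR path requests */

/* Hypothetical flat stand-in for the mtr/hem_list: one address per buffer page. */
static uint64_t page_table[8] = {
        0x10000, 0x11000, 0x12000, 0x13000, 0x14000, 0x15000, 0x16000, 0x17000,
};

/* Model of hns_roce_mtr_find(): copy up to 'max' block addresses starting at
 * page 'offset' into 'out' and return how many were found. */
static int mtr_find(int offset, uint64_t *out, int max)
{
        int total = 0;

        while (total < max && offset + total < 8) {
                out[total] = page_table[offset + total];
                total++;
        }
        return total;
}

int main(void)
{
        uint64_t mtts[MTT_MIN_COUNT] = { 0 };
        int rq_offset_pages = 4; /* like hr_qp->rq.offset / page_size */
        int count = mtr_find(rq_offset_pages, mtts, MTT_MIN_COUNT);

        if (count < 1) { /* mirrors the error checks in the RTR/RTS paths */
                fprintf(stderr, "rq wqe buf ba find failed\n");
                return 1;
        }
        printf("rq_cur_blk=%#lx rq_nxt_blk=%#lx\n",
               (unsigned long)mtts[0], (unsigned long)mtts[1]);
        return 0;
}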
index 813401384d789aeacf11681af9571c76531c2154..912b89b4da345470aa5699021fddb8a8861b0a9d 100644 (file)
@@ -83,18 +83,16 @@ int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(hns_roce_alloc_pd);
 
 void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
 }
-EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd);
 
 int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar)
 {
        struct resource *res;
-       int ret = 0;
+       int ret;
 
        /* Using bitmap to manager UAR index */
        ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx);
index 8db2817a249e94a6e3d246b955c605f8153b0068..e0424029b058b1d145209f2adaccda99ac2334c2 100644 (file)
@@ -64,7 +64,6 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
        if (atomic_dec_and_test(&qp->refcount))
                complete(&qp->free);
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_event);
 
 static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp,
                                 enum hns_roce_event type)
@@ -139,7 +138,6 @@ enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state)
                return HNS_ROCE_QP_NUM_STATE;
        }
 }
-EXPORT_SYMBOL_GPL(to_hns_roce_state);
 
 static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
                                 struct hns_roce_qp *hr_qp)
@@ -242,7 +240,6 @@ void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
        __xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
        xa_unlock_irqrestore(xa, flags);
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_remove);
 
 void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
@@ -257,22 +254,19 @@ void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
                        hns_roce_table_put(hr_dev, &qp_table->trrl_table,
                                           hr_qp->qpn);
                hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
-               hns_roce_table_put(hr_dev, &qp_table->qp_table, hr_qp->qpn);
        }
 }
-EXPORT_SYMBOL_GPL(hns_roce_qp_free);
 
 void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
                               int cnt)
 {
        struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-       if (base_qpn < SQP_NUM)
+       if (base_qpn < hr_dev->caps.reserved_qps)
                return;
 
        hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR);
 }
-EXPORT_SYMBOL_GPL(hns_roce_release_range_qp);
 
 static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
                                struct ib_qp_cap *cap, bool is_user, int has_rq,
@@ -392,8 +386,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
                                             hr_qp->sq.wqe_shift), PAGE_SIZE);
        } else {
                page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-               hr_qp->sge.sge_cnt =
-                      max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num);
+               hr_qp->sge.sge_cnt = ex_sge_num ?
+                  max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0;
                hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
                                             hr_qp->rq.wqe_shift), page_size) +
                                   HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<
@@ -422,6 +416,91 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
        return 0;
 }
 
+static int split_wqe_buf_region(struct hns_roce_dev *hr_dev,
+                               struct hns_roce_qp *hr_qp,
+                               struct hns_roce_buf_region *regions,
+                               int region_max, int page_shift)
+{
+       int page_size = 1 << page_shift;
+       bool is_extend_sge;
+       int region_cnt = 0;
+       int buf_size;
+       int buf_cnt;
+
+       if (hr_qp->buff_size < 1 || region_max < 1)
+               return region_cnt;
+
+       if (hr_qp->sge.sge_cnt > 0)
+               is_extend_sge = true;
+       else
+               is_extend_sge = false;
+
+       /* sq region */
+       if (is_extend_sge)
+               buf_size = hr_qp->sge.offset - hr_qp->sq.offset;
+       else
+               buf_size = hr_qp->rq.offset - hr_qp->sq.offset;
+
+       if (buf_size > 0 && region_cnt < region_max) {
+               buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+               hns_roce_init_buf_region(&regions[region_cnt],
+                                        hr_dev->caps.wqe_sq_hop_num,
+                                        hr_qp->sq.offset / page_size,
+                                        buf_cnt);
+               region_cnt++;
+       }
+
+       /* sge region */
+       if (is_extend_sge) {
+               buf_size = hr_qp->rq.offset - hr_qp->sge.offset;
+               if (buf_size > 0 && region_cnt < region_max) {
+                       buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+                       hns_roce_init_buf_region(&regions[region_cnt],
+                                                hr_dev->caps.wqe_sge_hop_num,
+                                                hr_qp->sge.offset / page_size,
+                                                buf_cnt);
+                       region_cnt++;
+               }
+       }
+
+       /* rq region */
+       buf_size = hr_qp->buff_size - hr_qp->rq.offset;
+       if (buf_size > 0) {
+               buf_cnt = DIV_ROUND_UP(buf_size, page_size);
+               hns_roce_init_buf_region(&regions[region_cnt],
+                                        hr_dev->caps.wqe_rq_hop_num,
+                                        hr_qp->rq.offset / page_size,
+                                        buf_cnt);
+               region_cnt++;
+       }
+
+       return region_cnt;
+}
+
+static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev,
+                                 struct hns_roce_buf_region *regions,
+                                 int region_cnt)
+{
+       int bt_pg_shift;
+       int ba_num;
+       int ret;
+
+       bt_pg_shift = PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz;
+
+       /* all root ba entries must be in one bt page */
+       do {
+               ba_num = (1 << bt_pg_shift) / BA_BYTE_LEN;
+               ret = hns_roce_hem_list_calc_root_ba(regions, region_cnt,
+                                                    ba_num);
+               if (ret <= ba_num)
+                       break;
+
+               bt_pg_shift++;
+       } while (ret > ba_num);
+
+       return bt_pg_shift - PAGE_SHIFT;
+}
+
 static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
                                       struct ib_qp_cap *cap,
                                       struct hns_roce_qp *hr_qp)
@@ -534,15 +613,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                                     struct ib_udata *udata, unsigned long sqpn,
                                     struct hns_roce_qp *hr_qp)
 {
+       dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { 0 };
        struct device *dev = hr_dev->dev;
        struct hns_roce_ib_create_qp ucmd;
        struct hns_roce_ib_create_qp_resp resp = {};
        struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
                udata, struct hns_roce_ucontext, ibucontext);
+       struct hns_roce_buf_region *r;
        unsigned long qpn = 0;
-       int ret = 0;
        u32 page_shift;
-       u32 npages;
+       int buf_count;
+       int ret;
        int i;
 
        mutex_init(&hr_qp->mutex);
@@ -596,6 +677,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                                init_attr->cap.max_recv_sge];
        }
 
+       page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
        if (udata) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
                        dev_err(dev, "ib_copy_from_udata error for create qp\n");
@@ -617,32 +699,28 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                        ret = PTR_ERR(hr_qp->umem);
                        goto err_rq_sge_list;
                }
-
-               hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
-               page_shift = PAGE_SHIFT;
-               if (hr_dev->caps.mtt_buf_pg_sz) {
-                       npages = (ib_umem_page_count(hr_qp->umem) +
-                                 (1 << hr_dev->caps.mtt_buf_pg_sz) - 1) /
-                                (1 << hr_dev->caps.mtt_buf_pg_sz);
-                       page_shift += hr_dev->caps.mtt_buf_pg_sz;
-                       ret = hns_roce_mtt_init(hr_dev, npages,
-                                   page_shift,
-                                   &hr_qp->mtt);
-               } else {
-                       ret = hns_roce_mtt_init(hr_dev,
-                                               ib_umem_page_count(hr_qp->umem),
-                                               page_shift, &hr_qp->mtt);
-               }
+               hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
+                               hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
+                               page_shift);
+               ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
+                                             hr_qp->region_cnt);
                if (ret) {
-                       dev_err(dev, "hns_roce_mtt_init error for create qp\n");
-                       goto err_buf;
+                       dev_err(dev, "alloc buf_list error for create qp\n");
+                       goto err_alloc_list;
                }
 
-               ret = hns_roce_ib_umem_write_mtt(hr_dev, &hr_qp->mtt,
-                                                hr_qp->umem);
-               if (ret) {
-                       dev_err(dev, "hns_roce_ib_umem_write_mtt error for create qp\n");
-                       goto err_mtt;
+               for (i = 0; i < hr_qp->region_cnt; i++) {
+                       r = &hr_qp->regions[i];
+                       buf_count = hns_roce_get_umem_bufs(hr_dev,
+                                       buf_list[i], r->count, r->offset,
+                                       hr_qp->umem, page_shift);
+                       if (buf_count != r->count) {
+                               dev_err(dev,
+                                       "get umem buf err, expect %d, ret %d.\n",
+                                       r->count, buf_count);
+                               ret = -ENOBUFS;
+                               goto err_get_bufs;
+                       }
                }
 
                if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
@@ -653,7 +731,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                                                   &hr_qp->sdb);
                        if (ret) {
                                dev_err(dev, "sq record doorbell map failed!\n");
-                               goto err_mtt;
+                               goto err_get_bufs;
                        }
 
                        /* indicate kernel supports sq record db */
@@ -715,7 +793,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                }
 
                /* Allocate QP buf */
-               page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
                if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size,
                                       (1 << page_shift) * 2,
                                       &hr_qp->hr_buf, page_shift)) {
@@ -723,21 +800,28 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                        ret = -ENOMEM;
                        goto err_db;
                }
-
-               hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
-               /* Write MTT */
-               ret = hns_roce_mtt_init(hr_dev, hr_qp->hr_buf.npages,
-                                       hr_qp->hr_buf.page_shift, &hr_qp->mtt);
+               hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
+                               hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
+                               page_shift);
+               ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
+                                             hr_qp->region_cnt);
                if (ret) {
-                       dev_err(dev, "hns_roce_mtt_init error for kernel create qp\n");
-                       goto err_buf;
+                       dev_err(dev, "alloc buf_list error for create qp!\n");
+                       goto err_alloc_list;
                }
 
-               ret = hns_roce_buf_write_mtt(hr_dev, &hr_qp->mtt,
-                                            &hr_qp->hr_buf);
-               if (ret) {
-                       dev_err(dev, "hns_roce_buf_write_mtt error for kernel create qp\n");
-                       goto err_mtt;
+               for (i = 0; i < hr_qp->region_cnt; i++) {
+                       r = &hr_qp->regions[i];
+                       buf_count = hns_roce_get_kmem_bufs(hr_dev,
+                                       buf_list[i], r->count, r->offset,
+                                       &hr_qp->hr_buf);
+                       if (buf_count != r->count) {
+                               dev_err(dev,
+                                       "get kmem buf err, expect %d, ret %d.\n",
+                                       r->count, buf_count);
+                               ret = -ENOBUFS;
+                               goto err_get_bufs;
+                       }
                }
 
                hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
@@ -761,6 +845,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
                }
        }
 
+       hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
+                                                       hr_qp->region_cnt);
+       hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
+                         page_shift);
+       ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list,
+                                 hr_qp->regions, hr_qp->region_cnt);
+       if (ret) {
+               dev_err(dev, "mtr attach error for create qp\n");
+               goto err_mtr;
+       }
+
        if (init_attr->qp_type == IB_QPT_GSI &&
            hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
                /* In v1 engine, GSI QP context in RoCE engine's register */
@@ -796,6 +891,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
        }
 
        hr_qp->event = hns_roce_ib_qp_event;
+       hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
 
        return 0;
 
@@ -810,6 +906,9 @@ err_qpn:
        if (!sqpn)
                hns_roce_release_range_qp(hr_dev, qpn, 1);
 
+err_mtr:
+       hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+
 err_wrid:
        if (udata) {
                if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
@@ -829,14 +928,13 @@ err_sq_dbmap:
                    hns_roce_qp_has_sq(init_attr))
                        hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
 
-err_mtt:
-       hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
+err_get_bufs:
+       hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
 
-err_buf:
-       if (hr_qp->umem)
-               ib_umem_release(hr_qp->umem);
-       else
+err_alloc_list:
+       if (!hr_qp->umem)
                hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
+       ib_umem_release(hr_qp->umem);
 
 err_db:
        if (!udata && hns_roce_qp_has_rq(init_attr) &&
@@ -923,7 +1021,6 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 
        return &hr_qp->ibqp;
 }
-EXPORT_SYMBOL_GPL(hns_roce_create_qp);
 
 int to_hr_qp_type(int qp_type)
 {
@@ -942,7 +1039,6 @@ int to_hr_qp_type(int qp_type)
 
        return transport_type;
 }
-EXPORT_SYMBOL_GPL(to_hr_qp_type);
 
 int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                       int attr_mask, struct ib_udata *udata)
@@ -1062,7 +1158,6 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq)
                spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
        }
 }
-EXPORT_SYMBOL_GPL(hns_roce_lock_cqs);
 
 void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
                         struct hns_roce_cq *recv_cq) __releases(&send_cq->lock)
@@ -1079,7 +1174,6 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
                spin_unlock_irq(&recv_cq->lock);
        }
 }
-EXPORT_SYMBOL_GPL(hns_roce_unlock_cqs);
 
 static void *get_wqe(struct hns_roce_qp *hr_qp, int offset)
 {
@@ -1091,20 +1185,17 @@ void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n)
 {
        return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift));
 }
-EXPORT_SYMBOL_GPL(get_recv_wqe);
 
 void *get_send_wqe(struct hns_roce_qp *hr_qp, int n)
 {
        return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift));
 }
-EXPORT_SYMBOL_GPL(get_send_wqe);
 
 void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n)
 {
        return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset +
                                        (n << hr_qp->sge.sge_shift));
 }
-EXPORT_SYMBOL_GPL(get_send_extend_sge);
 
 bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
                          struct ib_cq *ib_cq)
@@ -1123,7 +1214,6 @@ bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
 
        return cur + nreq >= hr_wq->max_post;
 }
-EXPORT_SYMBOL_GPL(hns_roce_wq_overflow);
 
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
 {
@@ -1135,11 +1225,7 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
        mutex_init(&qp_table->scc_mutex);
        xa_init(&hr_dev->qp_table_xa);
 
-       /* In hw v1, a port include two SQP, six ports total 12 */
-       if (hr_dev->caps.max_sq_sg <= 2)
-               reserved_from_bot = SQP_NUM;
-       else
-               reserved_from_bot = hr_dev->caps.reserved_qps;
+       reserved_from_bot = hr_dev->caps.reserved_qps;
 
        ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps,
                                   hr_dev->caps.num_qps - 1, reserved_from_bot,
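
The hns_roce qp.c changes above replace the single-MTT WQE mapping with a region-based MTR scheme: the buffer is split into SQ, extended-SGE and RQ regions, each sized in pages with DIV_ROUND_UP(), and calc_wqe_bt_page_shift() grows the base-address-table page until every root BA entry fits in a single page. The following is a minimal userspace sketch of that sizing logic (demo constants and names, not driver code):

/*
 * Userspace demo: regions are measured in pages with DIV_ROUND_UP(), and
 * the BT page shift grows until one BT page can hold every root
 * base-address entry.  All constants are assumptions for the demo.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DEMO_PAGE_SHIFT		12	/* assume 4K pages */
#define DEMO_BA_BYTE_LEN	8	/* one base-address entry is 8 bytes */

static int demo_calc_bt_page_shift(int root_ba_entries)
{
	int bt_pg_shift = DEMO_PAGE_SHIFT;

	/* grow the BT page until it holds every root BA entry */
	while ((1 << bt_pg_shift) / DEMO_BA_BYTE_LEN < root_ba_entries)
		bt_pg_shift++;

	return bt_pg_shift - DEMO_PAGE_SHIFT;	/* extra shift beyond PAGE_SHIFT */
}

int main(void)
{
	unsigned long page_size = 1UL << DEMO_PAGE_SHIFT;
	unsigned long sq_bytes = 64 * 1024, rq_bytes = 256 * 1024;
	int sq_pages = DIV_ROUND_UP(sq_bytes, page_size);
	int rq_pages = DIV_ROUND_UP(rq_bytes, page_size);

	printf("sq %d pages, rq %d pages, extra bt shift %d\n",
	       sq_pages, rq_pages,
	       demo_calc_bt_page_shift(sq_pages + rq_pages));
	return 0;
}

The per-region page counts and offsets are computed up front because the same region array is later handed to hns_roce_mtr_attach() in hns_roce_create_qp_common().
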
index b3421b1f21e0354ddce8bae155c43bb9d38a862f..38bb548eaa6d8047b2b996c5396bc20f1da51c2d 100644 (file)
@@ -30,7 +30,6 @@ void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type)
        if (atomic_dec_and_test(&srq->refcount))
                complete(&srq->free);
 }
-EXPORT_SYMBOL_GPL(hns_roce_srq_event);
 
 static void hns_roce_ib_srq_event(struct hns_roce_srq *srq,
                                  enum hns_roce_event event_type)
@@ -181,28 +180,19 @@ static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
        struct hns_roce_idx_que *idx_que = &srq->idx_que;
-       u32 bitmap_num;
-       int i;
 
-       bitmap_num = HNS_ROCE_ALOGN_UP(srq->max, 8 * sizeof(u64));
-
-       idx_que->bitmap = kcalloc(1, bitmap_num / 8, GFP_KERNEL);
+       idx_que->bitmap = bitmap_zalloc(srq->max, GFP_KERNEL);
        if (!idx_que->bitmap)
                return -ENOMEM;
 
-       bitmap_num = bitmap_num / (8 * sizeof(u64));
-
        idx_que->buf_size = srq->idx_que.buf_size;
 
        if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2,
                               &idx_que->idx_buf, page_shift)) {
-               kfree(idx_que->bitmap);
+               bitmap_free(idx_que->bitmap);
                return -ENOMEM;
        }
 
-       for (i = 0; i < bitmap_num; i++)
-               idx_que->bitmap[i] = ~(0UL);
-
        return 0;
 }
 
@@ -264,8 +254,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
                } else
                        ret = hns_roce_mtt_init(hr_dev,
                                                ib_umem_page_count(srq->umem),
-                                               srq->umem->page_shift,
-                                               &srq->mtt);
+                                               PAGE_SHIFT, &srq->mtt);
                if (ret)
                        goto err_buf;
 
@@ -291,10 +280,9 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
                        ret = hns_roce_mtt_init(hr_dev, npages,
                                                page_shift, &srq->idx_que.mtt);
                } else {
-                       ret = hns_roce_mtt_init(hr_dev,
-                                      ib_umem_page_count(srq->idx_que.umem),
-                                      srq->idx_que.umem->page_shift,
-                                      &srq->idx_que.mtt);
+                       ret = hns_roce_mtt_init(
+                               hr_dev, ib_umem_page_count(srq->idx_que.umem),
+                               PAGE_SHIFT, &srq->idx_que.mtt);
                }
 
                if (ret) {
@@ -391,21 +379,19 @@ err_idx_buf:
        hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
 
 err_idx_mtt:
-       if (udata)
-               ib_umem_release(srq->idx_que.umem);
+       ib_umem_release(srq->idx_que.umem);
 
 err_create_idx:
        hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
                          &srq->idx_que.idx_buf);
-       kfree(srq->idx_que.bitmap);
+       bitmap_free(srq->idx_que.bitmap);
 
 err_srq_mtt:
        hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
 
 err_buf:
-       if (udata)
-               ib_umem_release(srq->umem);
-       else
+       ib_umem_release(srq->umem);
+       if (!udata)
                hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
 
        return ret;
@@ -419,15 +405,15 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
        hns_roce_srq_free(hr_dev, srq);
        hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
 
-       if (ibsrq->uobject) {
+       if (udata) {
                hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-               ib_umem_release(srq->idx_que.umem);
-               ib_umem_release(srq->umem);
        } else {
                kvfree(srq->wrid);
                hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
                                  &srq->buf);
        }
+       ib_umem_release(srq->idx_que.umem);
+       ib_umem_release(srq->umem);
 }
 
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
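
In hns_roce_create_idx_que() the hand-rolled kcalloc() bitmap, sized in u64 words and then filled with ~0UL, is replaced by bitmap_zalloc()/bitmap_free(), which take a size in bits. A rough userspace analogue of that allocation pattern, with illustrative names only:

#include <stdio.h>
#include <stdlib.h>

/* Userspace analogue of bitmap_zalloc(): allocate a zero-filled bitmap
 * sized in bits instead of hand-computing the byte count. */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long *demo_bitmap_zalloc(unsigned int nbits)
{
	return calloc(BITS_TO_LONGS(nbits), sizeof(unsigned long));
}

static void demo_set_bit(unsigned long *map, unsigned int bit)
{
	map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

int main(void)
{
	unsigned int srq_max = 100;	/* assumed queue depth */
	unsigned long *bitmap = demo_bitmap_zalloc(srq_max);

	if (!bitmap)
		return 1;
	demo_set_bit(bitmap, 7);	/* mark index entry 7 as in use */
	printf("word0 = %#lx\n", bitmap[0]);
	free(bitmap);			/* bitmap_free() in the kernel */
	return 0;
}
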
index 700a5d06b60cab92052001035582440c3a4ff5c4..2d6a378e85609fe0f51f94cb16695b3cba5332b3 100644 (file)
@@ -4279,11 +4279,11 @@ static void i40iw_qhash_ctrl(struct i40iw_device *iwdev,
        /* if not found then add a child listener if interface is going up */
        if (!ifup)
                return;
-       child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC);
+       child_listen_node = kmemdup(parent_listen_node,
+                       sizeof(*child_listen_node), GFP_ATOMIC);
        if (!child_listen_node)
                return;
        node_allocated = true;
-       memcpy(child_listen_node, parent_listen_node, sizeof(*child_listen_node));
 
        memcpy(child_listen_node->loc_addr, ipaddr,  ipv4 ? 4 : 16);
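
The i40iw_qhash_ctrl() hunk folds kzalloc() followed by memcpy() of the parent listen node into a single kmemdup() call. A userspace sketch of the same duplicate-and-copy pattern (demo types only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace analogue of kmemdup(): one call that allocates and copies. */
static void *demo_memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}

struct listen_node {
	int port;
	char addr[16];
};

int main(void)
{
	struct listen_node parent = { .port = 1234, .addr = "192.168.0.1" };
	struct listen_node *child = demo_memdup(&parent, sizeof(parent));

	if (!child)
		return 1;
	printf("child listens on %s:%d\n", child->addr, child->port);
	free(child);
	return 0;
}
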
 
index 5689d742bafb8cc3e15060dd946d998a260a844e..d169a8031375c21dd02df8782f82216f78005df5 100644 (file)
@@ -772,6 +772,8 @@ static int i40iw_query_qp(struct ib_qp *ibqp,
        struct i40iw_qp *iwqp = to_iwqp(ibqp);
        struct i40iw_sc_qp *qp = &iwqp->sc_qp;
 
+       attr->qp_state = iwqp->ibqp_state;
+       attr->cur_qp_state = attr->qp_state;
        attr->qp_access_flags = 0;
        attr->cap.max_send_wr = qp->qp_uk.sq_size;
        attr->cap.max_recv_wr = qp->qp_uk.rq_size;
@@ -1064,44 +1066,38 @@ void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq)
  * @ib_cq: cq pointer
  * @udata: user data or NULL for kernel object
  */
-static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct i40iw_cq *iwcq;
        struct i40iw_device *iwdev;
        struct i40iw_sc_cq *cq;
 
-       if (!ib_cq) {
-               i40iw_pr_err("ib_cq == NULL\n");
-               return 0;
-       }
-
        iwcq = to_iwcq(ib_cq);
        iwdev = to_iwdev(ib_cq->device);
        cq = &iwcq->sc_cq;
        i40iw_cq_wq_destroy(iwdev, cq);
        cq_free_resources(iwdev, iwcq);
-       kfree(iwcq);
        i40iw_rem_devusecount(iwdev);
-       return 0;
 }
 
 /**
  * i40iw_create_cq - create cq
- * @ibdev: device pointer from stack
+ * @ibcq: CQ allocated
  * @attr: attributes for cq
  * @udata: user data
  */
-static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
-                                    const struct ib_cq_init_attr *attr,
-                                    struct ib_udata *udata)
+static int i40iw_create_cq(struct ib_cq *ibcq,
+                          const struct ib_cq_init_attr *attr,
+                          struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        struct i40iw_device *iwdev = to_iwdev(ibdev);
-       struct i40iw_cq *iwcq;
+       struct i40iw_cq *iwcq = to_iwcq(ibcq);
        struct i40iw_pbl *iwpbl;
        u32 cq_num = 0;
        struct i40iw_sc_cq *cq;
        struct i40iw_sc_dev *dev = &iwdev->sc_dev;
-       struct i40iw_cq_init_info info;
+       struct i40iw_cq_init_info info = {};
        enum i40iw_status_code status;
        struct i40iw_cqp_request *cqp_request;
        struct cqp_commands_info *cqp_info;
@@ -1111,22 +1107,16 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
        int entries = attr->cqe;
 
        if (iwdev->closing)
-               return ERR_PTR(-ENODEV);
+               return -ENODEV;
 
        if (entries > iwdev->max_cqe)
-               return ERR_PTR(-EINVAL);
-
-       iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL);
-       if (!iwcq)
-               return ERR_PTR(-ENOMEM);
-
-       memset(&info, 0, sizeof(info));
+               return -EINVAL;
 
        err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs,
                                        iwdev->max_cq, &cq_num,
                                        &iwdev->next_cq);
        if (err_code)
-               goto error;
+               return err_code;
 
        cq = &iwcq->sc_cq;
        cq->back_cq = (void *)iwcq;
@@ -1233,15 +1223,13 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
        }
 
        i40iw_add_devusecount(iwdev);
-       return (struct ib_cq *)iwcq;
+       return 0;
 
 cq_destroy:
        i40iw_cq_wq_destroy(iwdev, cq);
 cq_free_resources:
        cq_free_resources(iwdev, iwcq);
-error:
-       kfree(iwcq);
-       return ERR_PTR(err_code);
+       return err_code;
 }
 
 /**
@@ -2018,8 +2006,7 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
        struct cqp_commands_info *cqp_info;
        u32 stag_idx;
 
-       if (iwmr->region)
-               ib_umem_release(iwmr->region);
+       ib_umem_release(iwmr->region);
 
        if (iwmr->type != IW_MEMREG_TYPE_MEM) {
                /* region is released. only test for userness. */
@@ -2655,6 +2642,11 @@ static int i40iw_query_pkey(struct ib_device *ibdev,
 }
 
 static const struct ib_device_ops i40iw_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_I40IW,
+       /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */
+       .uverbs_abi_ver = I40IW_ABI_VER,
+
        .alloc_hw_stats = i40iw_alloc_hw_stats,
        .alloc_mr = i40iw_alloc_mr,
        .alloc_pd = i40iw_alloc_pd,
@@ -2694,6 +2686,7 @@ static const struct ib_device_ops i40iw_dev_ops = {
        .reg_user_mr = i40iw_reg_user_mr,
        .req_notify_cq = i40iw_req_notify_cq,
        INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext),
 };
 
@@ -2712,7 +2705,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
                i40iw_pr_err("iwdev == NULL\n");
                return NULL;
        }
-       iwibdev->ibdev.owner = THIS_MODULE;
        iwdev->iwibdev = iwibdev;
        iwibdev->iwdev = iwdev;
 
@@ -2771,9 +2763,6 @@ void i40iw_port_ibevent(struct i40iw_device *iwdev)
  */
 void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
 {
-       if (!iwibdev)
-               return;
-
        ib_unregister_device(&iwibdev->ibdev);
        wait_event_timeout(iwibdev->iwdev->close_wq,
                           !atomic64_read(&iwibdev->iwdev->use_count),
@@ -2795,7 +2784,6 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
                return -ENOMEM;
        iwibdev = iwdev->iwibdev;
        rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
-       iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
        ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
        if (ret)
                goto error;
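
The i40iw_verbs.c conversion moves CQ allocation into the RDMA core: INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq) tells the core how much to allocate, create_cq receives a pre-allocated ib_cq and returns a plain int, and destroy_cq returns void because the core frees the object. The sketch below shows the embed-and-container_of model this relies on; all names are illustrative, not the real rdma-core API.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ib_cq_demo {
	int cqe;
};

struct drv_cq_demo {
	int hw_cqn;
	struct ib_cq_demo ibcq;		/* embedded core-visible object */
};

static int drv_create_cq(struct ib_cq_demo *ibcq, int entries)
{
	struct drv_cq_demo *cq = container_of(ibcq, struct drv_cq_demo, ibcq);

	cq->hw_cqn = 7;			/* pretend HW assigned a CQ number */
	ibcq->cqe = entries - 1;
	return 0;			/* errors are plain ints, not ERR_PTR() */
}

int main(void)
{
	struct drv_cq_demo cq = { 0 };	/* the "core" provides the memory */

	if (drv_create_cq(&cq.ibcq, 256))
		return 1;
	printf("cqn %d, cqe %d\n", cq.hw_cqn, cq.ibcq.cqe);
	return 0;
}
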
index 022a0b4ea452ad28923846ccf7ef3a80ebd92ae2..a7d238d312f06ed8664456f8e8886b36d2ff8a8b 100644 (file)
@@ -172,14 +172,14 @@ err_buf:
 }
 
 #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata)
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
        int vector = attr->comp_vector;
        struct mlx4_ib_dev *dev = to_mdev(ibdev);
-       struct mlx4_ib_cq *cq;
+       struct mlx4_ib_cq *cq = to_mcq(ibcq);
        struct mlx4_uar *uar;
        void *buf_addr;
        int err;
@@ -187,14 +187,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
                udata, struct mlx4_ib_ucontext, ibucontext);
 
        if (entries < 1 || entries > dev->dev->caps.max_cqes)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
-               return ERR_PTR(-EINVAL);
-
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        entries      = roundup_pow_of_two(entries + 1);
        cq->ibcq.cqe = entries - 1;
@@ -269,7 +265,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
                        goto err_cq_free;
                }
 
-       return &cq->ibcq;
+       return 0;
 
 err_cq_free:
        mlx4_cq_free(dev->dev, &cq->mcq);
@@ -281,19 +277,15 @@ err_dbmap:
 err_mtt:
        mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
 
-       if (udata)
-               ib_umem_release(cq->umem);
-       else
+       ib_umem_release(cq->umem);
+       if (!udata)
                mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
 
 err_db:
        if (!udata)
                mlx4_db_free(dev->dev, &cq->db);
-
 err_cq:
-       kfree(cq);
-
-       return ERR_PTR(err);
+       return err;
 }
 
 static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
@@ -475,18 +467,15 @@ err_buf:
        kfree(cq->resize_buf);
        cq->resize_buf = NULL;
 
-       if (cq->resize_umem) {
-               ib_umem_release(cq->resize_umem);
-               cq->resize_umem = NULL;
-       }
-
+       ib_umem_release(cq->resize_umem);
+       cq->resize_umem = NULL;
 out:
        mutex_unlock(&cq->resize_mutex);
 
        return err;
 }
 
-int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(cq->device);
        struct mlx4_ib_cq *mcq = to_mcq(cq);
@@ -501,15 +490,11 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
                                struct mlx4_ib_ucontext,
                                ibucontext),
                        &mcq->db);
-               ib_umem_release(mcq->umem);
        } else {
                mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
                mlx4_db_free(dev->dev, &mcq->db);
        }
-
-       kfree(mcq);
-
-       return 0;
+       ib_umem_release(mcq->umem);
 }
 
 static void dump_cqe(void *cqe)
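
Several error and destroy paths in this series (mlx4/cq.c above, and the hns_roce qp/srq hunks earlier) drop the if (udata) / if (umem) branches and call ib_umem_release() unconditionally, relying on it being a no-op for NULL. A tiny sketch of that cleanup convention (demo helper, not the real API):

#include <stdio.h>
#include <stdlib.h>

struct demo_umem {
	size_t length;
};

/* Like ib_umem_release() or kfree(): NULL is a no-op, so callers need
 * no branch on whether a user mapping exists. */
static void demo_umem_release(struct demo_umem *umem)
{
	if (!umem)
		return;
	free(umem);
}

int main(void)
{
	struct demo_umem *user_buf = NULL;		/* kernel-side CQ: no umem */
	struct demo_umem *mapped = malloc(sizeof(*mapped));

	if (mapped)
		mapped->length = 4096;

	demo_umem_release(user_buf);	/* no-op, no branch needed */
	demo_umem_release(mapped);	/* real release */
	printf("cleanup done\n");
	return 0;
}
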
index 25d09d53b51c429915d7e6f26b5981dc2d461896..8790101facb7b5676d90d77439f2151c48f2d362 100644 (file)
@@ -1089,7 +1089,8 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
        if (!dev->ib_active)
                return -EAGAIN;
 
-       if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+       if (ibdev->ops.uverbs_abi_ver ==
+           MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
                resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
                resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
                resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
@@ -1111,7 +1112,7 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx,
        INIT_LIST_HEAD(&context->wqn_ranges_list);
        mutex_init(&context->wqn_ranges_mutex);
 
-       if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+       if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
                err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
        else
                err = ib_copy_to_udata(udata, &resp, sizeof(resp));
@@ -2509,6 +2510,10 @@ static void get_fw_ver_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mlx4_ib_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_MLX4,
+       .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION,
+
        .add_gid = mlx4_ib_add_gid,
        .alloc_mr = mlx4_ib_alloc_mr,
        .alloc_pd = mlx4_ib_alloc_pd,
@@ -2560,6 +2565,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
        .resize_cq = mlx4_ib_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
@@ -2642,7 +2648,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        ibdev->dev = dev;
        ibdev->bond_next_port   = 0;
 
-       ibdev->ib_dev.owner             = THIS_MODULE;
        ibdev->ib_dev.node_type         = RDMA_NODE_IB_CA;
        ibdev->ib_dev.local_dma_lkey    = dev->caps.reserved_lkey;
        ibdev->num_ports                = num_ports;
@@ -2651,11 +2656,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        ibdev->ib_dev.num_comp_vectors  = dev->caps.num_comp_vectors;
        ibdev->ib_dev.dev.parent        = &dev->persist->pdev->dev;
 
-       if (dev->caps.userspace_caps)
-               ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
-       else
-               ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
-
        ibdev->ib_dev.uverbs_cmd_mask   =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -2729,6 +2729,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops);
        }
 
+       if (!dev->caps.userspace_caps)
+               ibdev->ib_dev.ops.uverbs_abi_ver =
+                       MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
        mlx4_ib_alloc_eqs(dev, ibdev);
 
        spin_lock_init(&iboe->lock);
@@ -2839,7 +2843,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                goto err_steer_free_bitmap;
 
        rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
-       ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
        if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
                goto err_diag_counters;
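
mlx4_ib_add() no longer sets owner, driver_id and uverbs_abi_ver on the ib_device directly; they now live in the const mlx4_ib_dev_ops table, and the legacy no-dev-caps ABI version is patched into the per-device copy afterwards. A generic sketch of that const-ops-table-with-override pattern (hypothetical mini framework, not the ib_core API):

#include <stdio.h>

struct demo_ops {
	int abi_ver;
	const char *owner;
	int (*query)(void);
};

struct demo_device {
	struct demo_ops ops;
};

static int demo_query(void) { return 42; }

/* Identity constants sit next to the callbacks in one const table. */
static const struct demo_ops driver_ops = {
	.abi_ver = 4,
	.owner = "demo_module",
	.query = demo_query,
};

static void demo_set_device_ops(struct demo_device *dev, const struct demo_ops *ops)
{
	dev->ops = *ops;		/* the core copies the whole table */
}

int main(void)
{
	struct demo_device dev;
	int legacy = 1;			/* pretend the HW lacks userspace caps */

	demo_set_device_ops(&dev, &driver_ops);
	if (legacy)
		dev.ops.abi_ver = 3;	/* per-device override, as mlx4 does */

	printf("abi %d, query %d\n", dev.ops.abi_ver, dev.ops.query());
	return 0;
}
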
 
index 26897102057d2739b71fe68a85be6a38f2222ceb..eb53bb4c0c91cc0e7a7af87e3a3f6f4b2767f446 100644 (file)
@@ -743,10 +743,9 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
-struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata);
-int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata);
+void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
@@ -907,7 +906,7 @@ void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
                                struct ib_wq_init_attr *init_attr,
                                struct ib_udata *udata);
-int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+void mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
                      u32 wq_attr_mask, struct ib_udata *udata);
 
index 355205a285441dd31f4ae0ddcab49215db289a5a..753479285ce92a5156314a028e02867154ea4e9c 100644 (file)
@@ -258,7 +258,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
                                       int *num_of_mtts)
 {
        u64 block_shift = MLX4_MAX_MTT_SHIFT;
-       u64 min_shift = umem->page_shift;
+       u64 min_shift = PAGE_SHIFT;
        u64 last_block_aligned_end = 0;
        u64 current_block_start = 0;
        u64 first_block_start = 0;
@@ -295,8 +295,8 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
                         * in access to the wrong data.
                         */
                        misalignment_bits =
-                       (start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL)))
-                       ^ current_block_start;
+                               (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^
+                               current_block_start;
                        block_shift = min(alignment_of(misalignment_bits),
                                          block_shift);
                }
@@ -368,8 +368,7 @@ end:
 }
 
 static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
-                                       u64 length, u64 virt_addr,
-                                       int access_flags)
+                                       u64 length, int access_flags)
 {
        /*
         * Force registering the memory as writable if the underlying pages
@@ -415,8 +414,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        if (!mr)
                return ERR_PTR(-ENOMEM);
 
-       mr->umem =
-               mlx4_get_umem_mr(udata, start, length, virt_addr, access_flags);
+       mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags);
        if (IS_ERR(mr->umem)) {
                err = PTR_ERR(mr->umem);
                goto err_free;
@@ -505,7 +503,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 
                mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
                ib_umem_release(mmr->umem);
-               mmr->umem = mlx4_get_umem_mr(udata, start, length, virt_addr,
+               mmr->umem = mlx4_get_umem_mr(udata, start, length,
                                             mr_access_flags);
                if (IS_ERR(mmr->umem)) {
                        err = PTR_ERR(mmr->umem);
@@ -514,7 +512,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
                        goto release_mpt_entry;
                }
                n = ib_umem_page_count(mmr->umem);
-               shift = mmr->umem->page_shift;
+               shift = PAGE_SHIFT;
 
                err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
                                              virt_addr, length, n, shift,
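
With umems now tracked at PAGE_SIZE granularity, mlx4_ib_umem_calc_optimal_mtt_size() derives the misalignment mask from PAGE_SIZE rather than umem->page_shift, and mlx4_get_umem_mr() drops its unused virt_addr argument. The snippet below illustrates how a misalignment value limits the usable MTT block shift (standalone demo arithmetic, not driver code):

#include <stdio.h>

/* Order of the lowest set bit; an all-zero mask places no extra limit. */
static unsigned int lowest_bit_order(unsigned long long v, unsigned int max)
{
	unsigned int n = 0;

	if (!v)
		return max;
	while (!(v & 1ULL)) {
		v >>= 1;
		n++;
	}
	return n;
}

int main(void)
{
	const unsigned int page_shift = 12;		/* assume 4K pages */
	const unsigned int max_block_shift = 31;	/* cf. MLX4_MAX_MTT_SHIFT */
	unsigned long long start_va = 0x7f0012345000ULL;
	unsigned long long block_start = 0x9a000ULL;	/* demo DMA block start */
	unsigned long long mis;
	unsigned int block_shift;

	mis = (start_va & ~((1ULL << page_shift) - 1)) ^ block_start;
	block_shift = lowest_bit_order(mis, max_block_shift);

	printf("largest usable MTT block is 2^%u bytes\n", block_shift);
	return 0;
}
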
index 5221c0794d1d0a88ce5ccff5be3b6b10c0363947..82aff2f2fdc23b1b9615c910a29a80e707db3cc1 100644 (file)
@@ -1207,10 +1207,9 @@ err_mtt:
        mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 err_buf:
-       if (qp->umem)
-               ib_umem_release(qp->umem);
-       else
+       if (!qp->umem)
                mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+       ib_umem_release(qp->umem);
 
 err_db:
        if (!udata && qp_has_rq(init_attr))
@@ -1421,7 +1420,6 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 
                        mlx4_ib_db_unmap_user(mcontext, &qp->db);
                }
-               ib_umem_release(qp->umem);
        } else {
                kvfree(qp->sq.wrid);
                kvfree(qp->rq.wrid);
@@ -1432,6 +1430,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
                if (qp->rq.wqe_cnt)
                        mlx4_db_free(dev->dev, &qp->db);
        }
+       ib_umem_release(qp->umem);
 
        del_gid_entries(qp);
 }
@@ -4248,7 +4247,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
        return err;
 }
 
-int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
+void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
        struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
@@ -4259,8 +4258,6 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
        destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
 
        kfree(qp);
-
-       return 0;
 }
 
 struct ib_rwq_ind_table
index 4bf2946b9759e81139e66b412e49c4e697d24b22..848db7264cc9548f0eb1e02c10e29a04732806ca 100644 (file)
@@ -115,7 +115,7 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq,
                        return PTR_ERR(srq->umem);
 
                err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
-                                   srq->umem->page_shift, &srq->mtt);
+                                   PAGE_SHIFT, &srq->mtt);
                if (err)
                        goto err_buf;
 
@@ -204,10 +204,9 @@ err_mtt:
        mlx4_mtt_cleanup(dev->dev, &srq->mtt);
 
 err_buf:
-       if (srq->umem)
-               ib_umem_release(srq->umem);
-       else
+       if (!srq->umem)
                mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+       ib_umem_release(srq->umem);
 
 err_db:
        if (!udata)
@@ -275,13 +274,13 @@ void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
                                struct mlx4_ib_ucontext,
                                ibucontext),
                        &msrq->db);
-               ib_umem_release(msrq->umem);
        } else {
                kvfree(msrq->wrid);
                mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
                              &msrq->buf);
                mlx4_db_free(dev->dev, &msrq->db);
        }
+       ib_umem_release(msrq->umem);
 }
 
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
index 4efbbd2fce0cdfb6793a4694d85660e8380d0286..45f48cde6b9d548e06e88859da52ed413d4bc2ae 100644 (file)
@@ -884,15 +884,15 @@ static void notify_soft_wc_handler(struct work_struct *work)
        cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
 }
 
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata)
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
        int vector = attr->comp_vector;
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       struct mlx5_ib_cq *cq = to_mcq(ibcq);
        u32 out[MLX5_ST_SZ_DW(create_cq_out)];
-       struct mlx5_ib_cq *cq;
        int uninitialized_var(index);
        int uninitialized_var(inlen);
        u32 *cqb = NULL;
@@ -904,18 +904,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
        if (entries < 0 ||
            (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (check_cq_create_flags(attr->flags))
-               return ERR_PTR(-EOPNOTSUPP);
+               return -EOPNOTSUPP;
 
        entries = roundup_pow_of_two(entries + 1);
        if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
-               return ERR_PTR(-EINVAL);
-
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        cq->ibcq.cqe = entries - 1;
        mutex_init(&cq->resize_mutex);
@@ -930,13 +926,13 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
                err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
                                     &index, &inlen);
                if (err)
-                       goto err_create;
+                       return err;
        } else {
                cqe_size = cache_line_size() == 128 ? 128 : 64;
                err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
                                       &index, &inlen);
                if (err)
-                       goto err_create;
+                       return err;
 
                INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
        }
@@ -981,7 +977,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
 
        kvfree(cqb);
-       return &cq->ibcq;
+       return 0;
 
 err_cmd:
        mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
@@ -992,14 +988,10 @@ err_cqb:
                destroy_cq_user(cq, udata);
        else
                destroy_cq_kernel(dev, cq);
-
-err_create:
-       kfree(cq);
-
-       return ERR_PTR(err);
+       return err;
 }
 
-int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(cq->device);
        struct mlx5_ib_cq *mcq = to_mcq(cq);
@@ -1009,10 +1001,6 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
                destroy_cq_user(mcq, udata);
        else
                destroy_cq_kernel(dev, mcq);
-
-       kfree(mcq);
-
-       return 0;
 }
 
 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
@@ -1138,11 +1126,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
        return 0;
 }
 
-static void un_resize_user(struct mlx5_ib_cq *cq)
-{
-       ib_umem_release(cq->resize_umem);
-}
-
 static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
                         int entries, int cqe_size)
 {
@@ -1165,12 +1148,6 @@ ex:
        return err;
 }
 
-static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
-{
-       free_cq_buf(dev, cq->resize_buf);
-       cq->resize_buf = NULL;
-}
-
 static int copy_resize_cqes(struct mlx5_ib_cq *cq)
 {
        struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
@@ -1351,10 +1328,11 @@ ex_alloc:
        kvfree(in);
 
 ex_resize:
-       if (udata)
-               un_resize_user(cq);
-       else
-               un_resize_kernel(dev, cq);
+       ib_umem_release(cq->resize_umem);
+       if (!udata) {
+               free_cq_buf(dev, cq->resize_buf);
+               cq->resize_buf = NULL;
+       }
 ex:
        mutex_unlock(&cq->resize_mutex);
        return err;
index 931f587dfb8fda45476babceeee2b3be450ffaec..ec4370f9938127f94d70533f2ebf8cc9f44b48a2 100644 (file)
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
+#include <linux/xarray.h>
 
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
+static void dispatch_event_fd(struct list_head *fd_list, const void *data);
+
 enum devx_obj_flags {
        DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
        DEVX_OBJ_FLAGS_DCT = 1 << 1,
+       DEVX_OBJ_FLAGS_CQ = 1 << 2,
 };
 
 struct devx_async_data {
@@ -33,9 +37,61 @@ struct devx_async_data {
        struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
 };
 
+struct devx_async_event_data {
+       struct list_head list; /* headed in ev_file->event_list */
+       struct mlx5_ib_uapi_devx_async_event_hdr hdr;
+};
+
+/* first level XA value data structure */
+struct devx_event {
+       struct xarray object_ids; /* second XA level, Key = object id */
+       struct list_head unaffiliated_list;
+};
+
+/* second level XA value data structure */
+struct devx_obj_event {
+       struct rcu_head rcu;
+       struct list_head obj_sub_list;
+};
+
+struct devx_event_subscription {
+       struct list_head file_list; /* headed in ev_file->
+                                    * subscribed_events_list
+                                    */
+       struct list_head xa_list; /* headed in devx_event->unaffiliated_list or
+                                  * devx_obj_event->obj_sub_list
+                                  */
+       struct list_head obj_list; /* headed in devx_object */
+       struct list_head event_list; /* headed in ev_file->event_list or in
+                                     * temp list via subscription
+                                     */
+
+       u8 is_cleaned:1;
+       u32 xa_key_level1;
+       u32 xa_key_level2;
+       struct rcu_head rcu;
+       u64 cookie;
+       struct devx_async_event_file *ev_file;
+       struct file *filp; /* Upon hot unplug we need direct access to the file */
+       struct eventfd_ctx *eventfd;
+};
+
+struct devx_async_event_file {
+       struct ib_uobject uobj;
+       /* Head of events that are subscribed to this FD */
+       struct list_head subscribed_events_list;
+       spinlock_t lock;
+       wait_queue_head_t poll_wait;
+       struct list_head event_list;
+       struct mlx5_ib_dev *dev;
+       u8 omit_data:1;
+       u8 is_overflow_err:1;
+       u8 is_destroyed:1;
+};
+
 #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
 struct devx_obj {
-       struct mlx5_core_dev    *mdev;
+       struct mlx5_ib_dev      *ib_dev;
        u64                     obj_id;
        u32                     dinlen; /* destroy inbox length */
        u32                     dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
@@ -43,7 +99,9 @@ struct devx_obj {
        union {
                struct mlx5_ib_devx_mr  devx_mr;
                struct mlx5_core_dct    core_dct;
+               struct mlx5_core_cq     core_cq;
        };
+       struct list_head event_sub; /* holds devx_event_subscription entries */
 };
 
 struct devx_umem {
@@ -149,6 +207,127 @@ bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
        return false;
 }
 
+static bool is_legacy_unaffiliated_event_num(u16 event_num)
+{
+       switch (event_num) {
+       case MLX5_EVENT_TYPE_PORT_CHANGE:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool is_legacy_obj_event_num(u16 event_num)
+{
+       switch (event_num) {
+       case MLX5_EVENT_TYPE_PATH_MIG:
+       case MLX5_EVENT_TYPE_COMM_EST:
+       case MLX5_EVENT_TYPE_SQ_DRAINED:
+       case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+       case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+       case MLX5_EVENT_TYPE_CQ_ERROR:
+       case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+       case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+       case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+       case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+       case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+       case MLX5_EVENT_TYPE_DCT_DRAINED:
+       case MLX5_EVENT_TYPE_COMP:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static u16 get_legacy_obj_type(u16 opcode)
+{
+       switch (opcode) {
+       case MLX5_CMD_OP_CREATE_RQ:
+               return MLX5_EVENT_QUEUE_TYPE_RQ;
+       case MLX5_CMD_OP_CREATE_QP:
+               return MLX5_EVENT_QUEUE_TYPE_QP;
+       case MLX5_CMD_OP_CREATE_SQ:
+               return MLX5_EVENT_QUEUE_TYPE_SQ;
+       case MLX5_CMD_OP_CREATE_DCT:
+               return MLX5_EVENT_QUEUE_TYPE_DCT;
+       default:
+               return 0;
+       }
+}
+
+static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num)
+{
+       u16 opcode;
+
+       opcode = (obj->obj_id >> 32) & 0xffff;
+
+       if (is_legacy_obj_event_num(event_num))
+               return get_legacy_obj_type(opcode);
+
+       switch (opcode) {
+       case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
+               return (obj->obj_id >> 48);
+       case MLX5_CMD_OP_CREATE_RQ:
+               return MLX5_OBJ_TYPE_RQ;
+       case MLX5_CMD_OP_CREATE_QP:
+               return MLX5_OBJ_TYPE_QP;
+       case MLX5_CMD_OP_CREATE_SQ:
+               return MLX5_OBJ_TYPE_SQ;
+       case MLX5_CMD_OP_CREATE_DCT:
+               return MLX5_OBJ_TYPE_DCT;
+       case MLX5_CMD_OP_CREATE_TIR:
+               return MLX5_OBJ_TYPE_TIR;
+       case MLX5_CMD_OP_CREATE_TIS:
+               return MLX5_OBJ_TYPE_TIS;
+       case MLX5_CMD_OP_CREATE_PSV:
+               return MLX5_OBJ_TYPE_PSV;
+       case MLX5_OBJ_TYPE_MKEY:
+               return MLX5_OBJ_TYPE_MKEY;
+       case MLX5_CMD_OP_CREATE_RMP:
+               return MLX5_OBJ_TYPE_RMP;
+       case MLX5_CMD_OP_CREATE_XRC_SRQ:
+               return MLX5_OBJ_TYPE_XRC_SRQ;
+       case MLX5_CMD_OP_CREATE_XRQ:
+               return MLX5_OBJ_TYPE_XRQ;
+       case MLX5_CMD_OP_CREATE_RQT:
+               return MLX5_OBJ_TYPE_RQT;
+       case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+               return MLX5_OBJ_TYPE_FLOW_COUNTER;
+       case MLX5_CMD_OP_CREATE_CQ:
+               return MLX5_OBJ_TYPE_CQ;
+       default:
+               return 0;
+       }
+}
+
+static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe)
+{
+       switch (event_type) {
+       case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+       case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+       case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+       case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+       case MLX5_EVENT_TYPE_PATH_MIG:
+       case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+       case MLX5_EVENT_TYPE_COMM_EST:
+       case MLX5_EVENT_TYPE_SQ_DRAINED:
+       case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+       case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+               return eqe->data.qp_srq.type;
+       case MLX5_EVENT_TYPE_CQ_ERROR:
+               return 0;
+       case MLX5_EVENT_TYPE_DCT_DRAINED:
+               return MLX5_EVENT_QUEUE_TYPE_DCT;
+       default:
+               return MLX5_GET(affiliated_event_header, &eqe->data, obj_type);
+       }
+}
+
+static u32 get_dec_obj_id(u64 obj_id)
+{
+       return (obj_id & 0xffffffff);
+}
+
 /*
  * As the obj_id in the firmware is not globally unique the object type
  * must be considered upon checking for a valid object id.
@@ -715,12 +894,16 @@ static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in)
 
        return c->devx_uid;
 }
-static bool devx_is_general_cmd(void *in)
+
+static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev)
 {
        u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);
 
-       if (opcode >= MLX5_CMD_OP_GENERAL_START &&
-           opcode < MLX5_CMD_OP_GENERAL_END)
+       /* Pass all cmds for vhca_tunnel as general, tracking is done in FW */
+       if ((MLX5_CAP_GEN_64(dev->mdev, vhca_tunnel_commands) &&
+            MLX5_GET(general_obj_in_cmd_hdr, in, vhca_tunnel_id)) ||
+           (opcode >= MLX5_CMD_OP_GENERAL_START &&
+            opcode < MLX5_CMD_OP_GENERAL_END))
                return true;
 
        switch (opcode) {
@@ -846,7 +1029,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)(
                return uid;
 
        /* Only a whitelist of some general HCA commands is allowed for this method. */
-       if (!devx_is_general_cmd(cmd_in))
+       if (!devx_is_general_cmd(cmd_in, dev))
                return -EINVAL;
 
        cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -1111,33 +1294,72 @@ static void devx_free_indirect_mkey(struct rcu_head *rcu)
  */
 static void devx_cleanup_mkey(struct devx_obj *obj)
 {
-       xa_erase(&obj->mdev->priv.mkey_table,
+       xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
                 mlx5_base_mkey(obj->devx_mr.mmkey.key));
 }
 
+static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
+                                     struct devx_event_subscription *sub)
+{
+       struct devx_event *event;
+       struct devx_obj_event *xa_val_level2;
+
+       if (sub->is_cleaned)
+               return;
+
+       sub->is_cleaned = 1;
+       list_del_rcu(&sub->xa_list);
+
+       if (list_empty(&sub->obj_list))
+               return;
+
+       list_del_rcu(&sub->obj_list);
+       /* check whether key level 1 for this obj_sub_list is empty */
+       event = xa_load(&dev->devx_event_table.event_xa,
+                       sub->xa_key_level1);
+       WARN_ON(!event);
+
+       xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2);
+       if (list_empty(&xa_val_level2->obj_sub_list)) {
+               xa_erase(&event->object_ids,
+                        sub->xa_key_level2);
+               kfree_rcu(xa_val_level2, rcu);
+       }
+}
+
 static int devx_obj_cleanup(struct ib_uobject *uobject,
                            enum rdma_remove_reason why,
                            struct uverbs_attr_bundle *attrs)
 {
        u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+       struct mlx5_devx_event_table *devx_event_table;
        struct devx_obj *obj = uobject->object;
+       struct devx_event_subscription *sub_entry, *tmp;
+       struct mlx5_ib_dev *dev;
        int ret;
 
+       dev = mlx5_udata_to_mdev(&attrs->driver_udata);
        if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
                devx_cleanup_mkey(obj);
 
        if (obj->flags & DEVX_OBJ_FLAGS_DCT)
-               ret = mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+               ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+       else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
+               ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
        else
-               ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
-                                   sizeof(out));
+               ret = mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox,
+                                   obj->dinlen, out, sizeof(out));
        if (ib_is_destroy_retryable(ret, why, uobject))
                return ret;
 
-       if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
-               struct mlx5_ib_dev *dev =
-                       mlx5_udata_to_mdev(&attrs->driver_udata);
+       devx_event_table = &dev->devx_event_table;
+
+       mutex_lock(&devx_event_table->event_xa_lock);
+       list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list)
+               devx_cleanup_subscription(dev, sub_entry);
+       mutex_unlock(&devx_event_table->event_xa_lock);
 
+       if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
                call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
                          devx_free_indirect_mkey);
                return ret;
@@ -1147,6 +1369,29 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
        return ret;
 }
 
+static void devx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
+{
+       struct devx_obj *obj = container_of(mcq, struct devx_obj, core_cq);
+       struct mlx5_devx_event_table *table;
+       struct devx_event *event;
+       struct devx_obj_event *obj_event;
+       u32 obj_id = mcq->cqn;
+
+       table = &obj->ib_dev->devx_event_table;
+       rcu_read_lock();
+       event = xa_load(&table->event_xa, MLX5_EVENT_TYPE_COMP);
+       if (!event)
+               goto out;
+
+       obj_event = xa_load(&event->object_ids, obj_id);
+       if (!obj_event)
+               goto out;
+
+       dispatch_event_fd(&obj_event->obj_sub_list, eqe);
+out:
+       rcu_read_unlock();
+}
+
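
devx_cq_comp() above performs a two-level lookup: the event XArray is keyed by event type, the nested object_ids XArray by object id (here the CQN), and every subscription on the resulting list is handed the event. A conceptual sketch of that dispatch path, using plain arrays in place of RCU-protected XArrays (all names illustrative):

#include <stdio.h>

#define DEMO_EVENTS	4
#define DEMO_OBJS	8

struct demo_sub {
	const char *name;
	struct demo_sub *next;
};

/* level 1: event type, level 2: object id, value: subscriber list */
static struct demo_sub *table[DEMO_EVENTS][DEMO_OBJS];

static void demo_dispatch(unsigned int event, unsigned int obj_id)
{
	struct demo_sub *sub;

	if (event >= DEMO_EVENTS || obj_id >= DEMO_OBJS)
		return;
	for (sub = table[event][obj_id]; sub; sub = sub->next)
		printf("deliver event %u/obj %u to %s\n", event, obj_id, sub->name);
}

int main(void)
{
	static struct demo_sub fd0 = { .name = "event-fd-0" };

	table[1][3] = &fd0;	/* subscribe fd0 to (event 1, object 3) */
	demo_dispatch(1, 3);	/* completion arrives for object 3 */
	demo_dispatch(1, 5);	/* no subscriber: silently ignored */
	return 0;
}
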
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
        struct uverbs_attr_bundle *attrs)
 {
@@ -1169,6 +1414,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
        u32 obj_id;
        u16 opcode;
 
+       if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+               return -EINVAL;
+
        uid = devx_get_uid(c, cmd_in);
        if (uid < 0)
                return uid;
@@ -1198,6 +1446,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
                err = mlx5_core_create_dct(dev->mdev, &obj->core_dct,
                                           cmd_in, cmd_in_len,
                                           cmd_out, cmd_out_len);
+       } else if (opcode == MLX5_CMD_OP_CREATE_CQ) {
+               obj->flags |= DEVX_OBJ_FLAGS_CQ;
+               obj->core_cq.comp = devx_cq_comp;
+               err = mlx5_core_create_cq(dev->mdev, &obj->core_cq,
+                                         cmd_in, cmd_in_len, cmd_out,
+                                         cmd_out_len);
        } else {
                err = mlx5_cmd_exec(dev->mdev, cmd_in,
                                    cmd_in_len,
@@ -1208,7 +1462,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
                goto obj_free;
 
        uobj->object = obj;
-       obj->mdev = dev->mdev;
+       INIT_LIST_HEAD(&obj->event_sub);
+       obj->ib_dev = dev;
        devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen,
                                   &obj_id);
        WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
@@ -1235,9 +1490,11 @@ err_copy:
                devx_cleanup_mkey(obj);
 obj_destroy:
        if (obj->flags & DEVX_OBJ_FLAGS_DCT)
-               mlx5_core_destroy_dct(obj->mdev, &obj->core_dct);
+               mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+       else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
+               mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
        else
-               mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out,
+               mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, obj->dinlen, out,
                              sizeof(out));
 obj_free:
        kfree(obj);
@@ -1259,6 +1516,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
        int err;
        int uid;
 
+       if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+               return -EINVAL;
+
        uid = devx_get_uid(c, cmd_in);
        if (uid < 0)
                return uid;
@@ -1301,6 +1561,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
        int uid;
        struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device);
 
+       if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+               return -EINVAL;
+
        uid = devx_get_uid(c, cmd_in);
        if (uid < 0)
                return uid;
@@ -1365,6 +1628,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
        return 0;
 }
 
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)(
+       struct uverbs_attr_bundle *attrs)
+{
+       struct ib_uobject *uobj = uverbs_attr_get_uobject(
+               attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE);
+       struct devx_async_event_file *ev_file;
+       struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+               &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+       struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+       u32 flags;
+       int err;
+
+       err = uverbs_get_flags32(&flags, attrs,
+               MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+               MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA);
+
+       if (err)
+               return err;
+
+       ev_file = container_of(uobj, struct devx_async_event_file,
+                              uobj);
+       spin_lock_init(&ev_file->lock);
+       INIT_LIST_HEAD(&ev_file->event_list);
+       init_waitqueue_head(&ev_file->poll_wait);
+       if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA)
+               ev_file->omit_data = 1;
+       INIT_LIST_HEAD(&ev_file->subscribed_events_list);
+       ev_file->dev = dev;
+       get_device(&dev->ib_dev.dev);
+       return 0;
+}
+
 static void devx_query_callback(int status, struct mlx5_async_work *context)
 {
        struct devx_async_data *async_data =
@@ -1406,6 +1701,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
        struct devx_async_cmd_event_file *ev_file;
        struct devx_async_data *async_data;
 
+       if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id))
+               return -EINVAL;
+
        uid = devx_get_uid(c, cmd_in);
        if (uid < 0)
                return uid;
@@ -1474,6 +1772,331 @@ sub_bytes:
        return err;
 }
 
+static void
+subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table,
+                          u32 key_level1,
+                          bool is_level2,
+                          u32 key_level2)
+{
+       struct devx_event *event;
+       struct devx_obj_event *xa_val_level2;
+
+       /* Level 1 is valid for future use, no need to free */
+       if (!is_level2)
+               return;
+
+       event = xa_load(&devx_event_table->event_xa, key_level1);
+       WARN_ON(!event);
+
+       xa_val_level2 = xa_load(&event->object_ids,
+                               key_level2);
+       if (list_empty(&xa_val_level2->obj_sub_list)) {
+               xa_erase(&event->object_ids,
+                        key_level2);
+               kfree_rcu(xa_val_level2, rcu);
+       }
+}
+
+static int
+subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table,
+                        u32 key_level1,
+                        bool is_level2,
+                        u32 key_level2)
+{
+       struct devx_obj_event *obj_event;
+       struct devx_event *event;
+       int err;
+
+       event = xa_load(&devx_event_table->event_xa, key_level1);
+       if (!event) {
+               event = kzalloc(sizeof(*event), GFP_KERNEL);
+               if (!event)
+                       return -ENOMEM;
+
+               INIT_LIST_HEAD(&event->unaffiliated_list);
+               xa_init(&event->object_ids);
+
+               err = xa_insert(&devx_event_table->event_xa,
+                               key_level1,
+                               event,
+                               GFP_KERNEL);
+               if (err) {
+                       kfree(event);
+                       return err;
+               }
+       }
+
+       if (!is_level2)
+               return 0;
+
+       obj_event = xa_load(&event->object_ids, key_level2);
+       if (!obj_event) {
+               obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL);
+               if (!obj_event)
+                       /* Level 1 is valid for future use, no need to free */
+                       return -ENOMEM;
+
+               err = xa_insert(&event->object_ids,
+                               key_level2,
+                               obj_event,
+                               GFP_KERNEL);
+               if (err) {
+                       kfree(obj_event);
+                       return err;
+               }
+               INIT_LIST_HEAD(&obj_event->obj_sub_list);
+       }
+
+       return 0;
+}
+
+static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list,
+                                  struct devx_obj *obj)
+{
+       int i;
+
+       for (i = 0; i < num_events; i++) {
+               if (obj) {
+                       if (!is_legacy_obj_event_num(event_type_num_list[i]))
+                               return false;
+               } else if (!is_legacy_unaffiliated_event_num(
+                               event_type_num_list[i])) {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+#define MAX_SUPP_EVENT_NUM 255
+static bool is_valid_events(struct mlx5_core_dev *dev,
+                           int num_events, u16 *event_type_num_list,
+                           struct devx_obj *obj)
+{
+       __be64 *aff_events;
+       __be64 *unaff_events;
+       int mask_entry;
+       int mask_bit;
+       int i;
+
+       if (MLX5_CAP_GEN(dev, event_cap)) {
+               aff_events = MLX5_CAP_DEV_EVENT(dev,
+                                               user_affiliated_events);
+               unaff_events = MLX5_CAP_DEV_EVENT(dev,
+                                                 user_unaffiliated_events);
+       } else {
+               return is_valid_events_legacy(num_events, event_type_num_list,
+                                             obj);
+       }
+
+       for (i = 0; i < num_events; i++) {
+               if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM)
+                       return false;
+
+               mask_entry = event_type_num_list[i] / 64;
+               mask_bit = event_type_num_list[i] % 64;
+
+               if (obj) {
+                       /* CQ completion */
+                       if (event_type_num_list[i] == 0)
+                               continue;
+
+                       if (!(be64_to_cpu(aff_events[mask_entry]) &
+                                       (1ull << mask_bit)))
+                               return false;
+
+                       continue;
+               }
+
+               if (!(be64_to_cpu(unaff_events[mask_entry]) &
+                               (1ull << mask_bit)))
+                       return false;
+       }
+
+       return true;
+}
+
+#define MAX_NUM_EVENTS 16
+static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
+       struct uverbs_attr_bundle *attrs)
+{
+       struct ib_uobject *devx_uobj = uverbs_attr_get_uobject(
+                               attrs,
+                               MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE);
+       struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
+               &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+       struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
+       struct ib_uobject *fd_uobj;
+       struct devx_obj *obj = NULL;
+       struct devx_async_event_file *ev_file;
+       struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table;
+       u16 *event_type_num_list;
+       struct devx_event_subscription *event_sub, *tmp_sub;
+       struct list_head sub_list;
+       int redirect_fd;
+       bool use_eventfd = false;
+       int num_events;
+       int num_alloc_xa_entries = 0;
+       u16 obj_type = 0;
+       u64 cookie = 0;
+       u32 obj_id = 0;
+       int err;
+       int i;
+
+       if (!c->devx_uid)
+               return -EINVAL;
+
+       if (!IS_ERR(devx_uobj)) {
+               obj = (struct devx_obj *)devx_uobj->object;
+               if (obj)
+                       obj_id = get_dec_obj_id(obj->obj_id);
+       }
+
+       fd_uobj = uverbs_attr_get_uobject(attrs,
+                               MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE);
+       if (IS_ERR(fd_uobj))
+               return PTR_ERR(fd_uobj);
+
+       ev_file = container_of(fd_uobj, struct devx_async_event_file,
+                              uobj);
+
+       if (uverbs_attr_is_valid(attrs,
+                                MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) {
+               err = uverbs_copy_from(&redirect_fd, attrs,
+                              MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM);
+               if (err)
+                       return err;
+
+               use_eventfd = true;
+       }
+
+       if (uverbs_attr_is_valid(attrs,
+                                MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) {
+               if (use_eventfd)
+                       return -EINVAL;
+
+               err = uverbs_copy_from(&cookie, attrs,
+                               MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE);
+               if (err)
+                       return err;
+       }
+
+       num_events = uverbs_attr_ptr_get_array_size(
+               attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+               sizeof(u16));
+
+       if (num_events < 0)
+               return num_events;
+
+       if (num_events > MAX_NUM_EVENTS)
+               return -EINVAL;
+
+       event_type_num_list = uverbs_attr_get_alloced_ptr(attrs,
+                       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST);
+
+       if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj))
+               return -EINVAL;
+
+       INIT_LIST_HEAD(&sub_list);
+
+       /* Serialize concurrent subscriptions to the same XA entries so that
+        * all of them can succeed
+        */
+       mutex_lock(&devx_event_table->event_xa_lock);
+       for (i = 0; i < num_events; i++) {
+               u32 key_level1;
+
+               if (obj)
+                       obj_type = get_dec_obj_type(obj,
+                                                   event_type_num_list[i]);
+               key_level1 = event_type_num_list[i] | obj_type << 16;
+
+               err = subscribe_event_xa_alloc(devx_event_table,
+                                              key_level1,
+                                              obj,
+                                              obj_id);
+               if (err)
+                       goto err;
+
+               num_alloc_xa_entries++;
+               event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL);
+               if (!event_sub) {
+                       err = -ENOMEM;
+                       goto err;
+               }
+
+               list_add_tail(&event_sub->event_list, &sub_list);
+               if (use_eventfd) {
+                       event_sub->eventfd =
+                               eventfd_ctx_fdget(redirect_fd);
+
+                       if (IS_ERR(event_sub->eventfd)) {
+                               err = PTR_ERR(event_sub->eventfd);
+                               event_sub->eventfd = NULL;
+                               goto err;
+                       }
+               }
+
+               event_sub->cookie = cookie;
+               event_sub->ev_file = ev_file;
+               event_sub->filp = fd_uobj->object;
+               /* May be needed when cleaning up the devx object/subscription */
+               event_sub->xa_key_level1 = key_level1;
+               event_sub->xa_key_level2 = obj_id;
+               INIT_LIST_HEAD(&event_sub->obj_list);
+       }
+
+       /* Once all the allocations and the XA data insertions were done we
+        * can go ahead and add all the subscriptions to the relevant lists
+        * without concern of a failure.
+        */
+       list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+               struct devx_event *event;
+               struct devx_obj_event *obj_event;
+
+               list_del_init(&event_sub->event_list);
+
+               spin_lock_irq(&ev_file->lock);
+               list_add_tail_rcu(&event_sub->file_list,
+                                 &ev_file->subscribed_events_list);
+               spin_unlock_irq(&ev_file->lock);
+
+               event = xa_load(&devx_event_table->event_xa,
+                               event_sub->xa_key_level1);
+               WARN_ON(!event);
+
+               if (!obj) {
+                       list_add_tail_rcu(&event_sub->xa_list,
+                                         &event->unaffiliated_list);
+                       continue;
+               }
+
+               obj_event = xa_load(&event->object_ids, obj_id);
+               WARN_ON(!obj_event);
+               list_add_tail_rcu(&event_sub->xa_list,
+                                 &obj_event->obj_sub_list);
+               list_add_tail_rcu(&event_sub->obj_list,
+                                 &obj->event_sub);
+       }
+
+       mutex_unlock(&devx_event_table->event_xa_lock);
+       return 0;
+
+err:
+       list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) {
+               list_del(&event_sub->event_list);
+
+               subscribe_event_xa_dealloc(devx_event_table,
+                                          event_sub->xa_key_level1,
+                                          obj,
+                                          obj_id);
+
+               if (event_sub->eventfd)
+                       eventfd_ctx_put(event_sub->eventfd);
+
+               kfree(event_sub);
+       }
+
+       mutex_unlock(&devx_event_table->event_xa_lock);
+       return err;
+}
+
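The subscription handler above keys its two-level XArray with the event type in the low 16 bits of the level-1 key and the decoded object type in the next 16 bits; the level-2 key is simply the object id. A minimal standalone sketch of that key packing, with purely illustrative names and values, is:

/*
 * Standalone sketch (not driver code) of the level-1 key composition used
 * by MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT above.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t make_key_level1(uint16_t event_type, uint16_t obj_type)
{
	return (uint32_t)event_type | ((uint32_t)obj_type << 16);
}

int main(void)
{
	uint32_t key = make_key_level1(0x13 /* event */, 0x27 /* obj type */);

	assert((key & 0xffff) == 0x13);	/* event type recovered */
	assert((key >> 16) == 0x27);	/* object type recovered */
	return 0;
}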
 static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
                         struct uverbs_attr_bundle *attrs,
                         struct devx_umem *obj)
@@ -1621,6 +2244,203 @@ static int devx_umem_cleanup(struct ib_uobject *uobject,
        return 0;
 }
 
+static bool is_unaffiliated_event(struct mlx5_core_dev *dev,
+                                 unsigned long event_type)
+{
+       __be64 *unaff_events;
+       int mask_entry;
+       int mask_bit;
+
+       if (!MLX5_CAP_GEN(dev, event_cap))
+               return is_legacy_unaffiliated_event_num(event_type);
+
+       unaff_events = MLX5_CAP_DEV_EVENT(dev,
+                                         user_unaffiliated_events);
+       WARN_ON(event_type > MAX_SUPP_EVENT_NUM);
+
+       mask_entry = event_type / 64;
+       mask_bit = event_type % 64;
+
+       if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit)))
+               return false;
+
+       return true;
+}
+
+static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data)
+{
+       struct mlx5_eqe *eqe = data;
+       u32 obj_id = 0;
+
+       switch (event_type) {
+       case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+       case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+       case MLX5_EVENT_TYPE_PATH_MIG:
+       case MLX5_EVENT_TYPE_COMM_EST:
+       case MLX5_EVENT_TYPE_SQ_DRAINED:
+       case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+       case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+       case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+       case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+       case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+               obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+               break;
+       case MLX5_EVENT_TYPE_DCT_DRAINED:
+               obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+               break;
+       case MLX5_EVENT_TYPE_CQ_ERROR:
+               obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+               break;
+       default:
+               obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id);
+               break;
+       }
+
+       return obj_id;
+}
+
+static int deliver_event(struct devx_event_subscription *event_sub,
+                        const void *data)
+{
+       struct devx_async_event_file *ev_file;
+       struct devx_async_event_data *event_data;
+       unsigned long flags;
+
+       ev_file = event_sub->ev_file;
+
+       if (ev_file->omit_data) {
+               spin_lock_irqsave(&ev_file->lock, flags);
+               if (!list_empty(&event_sub->event_list)) {
+                       spin_unlock_irqrestore(&ev_file->lock, flags);
+                       return 0;
+               }
+
+               list_add_tail(&event_sub->event_list, &ev_file->event_list);
+               spin_unlock_irqrestore(&ev_file->lock, flags);
+               wake_up_interruptible(&ev_file->poll_wait);
+               return 0;
+       }
+
+       event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe),
+                            GFP_ATOMIC);
+       if (!event_data) {
+               spin_lock_irqsave(&ev_file->lock, flags);
+               ev_file->is_overflow_err = 1;
+               spin_unlock_irqrestore(&ev_file->lock, flags);
+               return -ENOMEM;
+       }
+
+       event_data->hdr.cookie = event_sub->cookie;
+       memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
+
+       spin_lock_irqsave(&ev_file->lock, flags);
+       list_add_tail(&event_data->list, &ev_file->event_list);
+       spin_unlock_irqrestore(&ev_file->lock, flags);
+       wake_up_interruptible(&ev_file->poll_wait);
+
+       return 0;
+}
+
+static void dispatch_event_fd(struct list_head *fd_list,
+                             const void *data)
+{
+       struct devx_event_subscription *item;
+
+       list_for_each_entry_rcu(item, fd_list, xa_list) {
+               if (!get_file_rcu(item->filp))
+                       continue;
+
+               if (item->eventfd) {
+                       eventfd_signal(item->eventfd, 1);
+                       fput(item->filp);
+                       continue;
+               }
+
+               deliver_event(item, data);
+               fput(item->filp);
+       }
+}
+
+static int devx_event_notifier(struct notifier_block *nb,
+                              unsigned long event_type, void *data)
+{
+       struct mlx5_devx_event_table *table;
+       struct mlx5_ib_dev *dev;
+       struct devx_event *event;
+       struct devx_obj_event *obj_event;
+       u16 obj_type = 0;
+       bool is_unaffiliated;
+       u32 obj_id;
+
+       /* Explicitly filter out kernel events which may occur frequently */
+       if (event_type == MLX5_EVENT_TYPE_CMD ||
+           event_type == MLX5_EVENT_TYPE_PAGE_REQUEST)
+               return NOTIFY_OK;
+
+       table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb);
+       dev = container_of(table, struct mlx5_ib_dev, devx_event_table);
+       is_unaffiliated = is_unaffiliated_event(dev->mdev, event_type);
+
+       if (!is_unaffiliated)
+               obj_type = get_event_obj_type(event_type, data);
+
+       rcu_read_lock();
+       event = xa_load(&table->event_xa, event_type | (obj_type << 16));
+       if (!event) {
+               rcu_read_unlock();
+               return NOTIFY_DONE;
+       }
+
+       if (is_unaffiliated) {
+               dispatch_event_fd(&event->unaffiliated_list, data);
+               rcu_read_unlock();
+               return NOTIFY_OK;
+       }
+
+       obj_id = devx_get_obj_id_from_event(event_type, data);
+       obj_event = xa_load(&event->object_ids, obj_id);
+       if (!obj_event) {
+               rcu_read_unlock();
+               return NOTIFY_DONE;
+       }
+
+       dispatch_event_fd(&obj_event->obj_sub_list, data);
+
+       rcu_read_unlock();
+       return NOTIFY_OK;
+}
+
+void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
+{
+       struct mlx5_devx_event_table *table = &dev->devx_event_table;
+
+       xa_init(&table->event_xa);
+       mutex_init(&table->event_xa_lock);
+       MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
+       mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+}
+
+void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
+{
+       struct mlx5_devx_event_table *table = &dev->devx_event_table;
+       struct devx_event_subscription *sub, *tmp;
+       struct devx_event *event;
+       void *entry;
+       unsigned long id;
+
+       mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
+       mutex_lock(&dev->devx_event_table.event_xa_lock);
+       xa_for_each(&table->event_xa, id, entry) {
+               event = entry;
+               list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
+                                        xa_list)
+                       devx_cleanup_subscription(dev, sub);
+               kfree(entry);
+       }
+       mutex_unlock(&dev->devx_event_table.event_xa_lock);
+       xa_destroy(&table->event_xa);
+}
+
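Both is_valid_events() and is_unaffiliated_event() above test the same device capability layout: an array of big-endian 64-bit words in which event type N maps to bit (N % 64) of word (N / 64). A small userspace sketch of that check, assuming only the layout described here, is:

/*
 * Standalone illustration (not driver code) of the big-endian event mask
 * lookup performed by is_valid_events()/is_unaffiliated_event() above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t be64_to_host(uint64_t be)
{
	const uint8_t *p = (const uint8_t *)&be;
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)		/* assemble MSB first */
		v = (v << 8) | p[i];
	return v;
}

static bool event_allowed(const uint64_t *be_mask, unsigned int type)
{
	unsigned int entry = type / 64;
	unsigned int bit = type % 64;

	return be64_to_host(be_mask[entry]) & (1ull << bit);
}

int main(void)
{
	uint64_t mask[4] = { 0 };
	uint8_t *raw = (uint8_t *)&mask[1];

	raw[7] = 0x02;	/* big-endian word 1, bit 1 -> event type 65 */
	printf("type 65 allowed: %d\n", event_allowed(mask, 65));
	printf("type 66 allowed: %d\n", event_allowed(mask, 66));
	return 0;
}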
 static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
                                         size_t count, loff_t *pos)
 {
@@ -1719,6 +2539,149 @@ static const struct file_operations devx_async_cmd_event_fops = {
        .llseek  = no_llseek,
 };
 
+static ssize_t devx_async_event_read(struct file *filp, char __user *buf,
+                                    size_t count, loff_t *pos)
+{
+       struct devx_async_event_file *ev_file = filp->private_data;
+       struct devx_event_subscription *event_sub;
+       struct devx_async_event_data *uninitialized_var(event);
+       int ret = 0;
+       size_t eventsz;
+       bool omit_data;
+       void *event_data;
+
+       omit_data = ev_file->omit_data;
+
+       spin_lock_irq(&ev_file->lock);
+
+       if (ev_file->is_overflow_err) {
+               ev_file->is_overflow_err = 0;
+               spin_unlock_irq(&ev_file->lock);
+               return -EOVERFLOW;
+       }
+
+       if (ev_file->is_destroyed) {
+               spin_unlock_irq(&ev_file->lock);
+               return -EIO;
+       }
+
+       while (list_empty(&ev_file->event_list)) {
+               spin_unlock_irq(&ev_file->lock);
+
+               if (filp->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               if (wait_event_interruptible(ev_file->poll_wait,
+                           (!list_empty(&ev_file->event_list) ||
+                            ev_file->is_destroyed))) {
+                       return -ERESTARTSYS;
+               }
+
+               spin_lock_irq(&ev_file->lock);
+               if (ev_file->is_destroyed) {
+                       spin_unlock_irq(&ev_file->lock);
+                       return -EIO;
+               }
+       }
+
+       if (omit_data) {
+               event_sub = list_first_entry(&ev_file->event_list,
+                                       struct devx_event_subscription,
+                                       event_list);
+               eventsz = sizeof(event_sub->cookie);
+               event_data = &event_sub->cookie;
+       } else {
+               event = list_first_entry(&ev_file->event_list,
+                                     struct devx_async_event_data, list);
+               eventsz = sizeof(struct mlx5_eqe) +
+                       sizeof(struct mlx5_ib_uapi_devx_async_event_hdr);
+               event_data = &event->hdr;
+       }
+
+       if (eventsz > count) {
+               spin_unlock_irq(&ev_file->lock);
+               return -EINVAL;
+       }
+
+       if (omit_data)
+               list_del_init(&event_sub->event_list);
+       else
+               list_del(&event->list);
+
+       spin_unlock_irq(&ev_file->lock);
+
+       if (copy_to_user(buf, event_data, eventsz))
+               /* This points to an application issue, not a kernel concern */
+               ret = -EFAULT;
+       else
+               ret = eventsz;
+
+       if (!omit_data)
+               kfree(event);
+       return ret;
+}
+
+static __poll_t devx_async_event_poll(struct file *filp,
+                                     struct poll_table_struct *wait)
+{
+       struct devx_async_event_file *ev_file = filp->private_data;
+       __poll_t pollflags = 0;
+
+       poll_wait(filp, &ev_file->poll_wait, wait);
+
+       spin_lock_irq(&ev_file->lock);
+       if (ev_file->is_destroyed)
+               pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+       else if (!list_empty(&ev_file->event_list))
+               pollflags = EPOLLIN | EPOLLRDNORM;
+       spin_unlock_irq(&ev_file->lock);
+
+       return pollflags;
+}
+
+static int devx_async_event_close(struct inode *inode, struct file *filp)
+{
+       struct devx_async_event_file *ev_file = filp->private_data;
+       struct devx_event_subscription *event_sub, *event_sub_tmp;
+       struct devx_async_event_data *entry, *tmp;
+
+       mutex_lock(&ev_file->dev->devx_event_table.event_xa_lock);
+       /* delete the subscriptions which are related to this FD */
+       list_for_each_entry_safe(event_sub, event_sub_tmp,
+                                &ev_file->subscribed_events_list, file_list) {
+               devx_cleanup_subscription(ev_file->dev, event_sub);
+               if (event_sub->eventfd)
+                       eventfd_ctx_put(event_sub->eventfd);
+
+               list_del_rcu(&event_sub->file_list);
+               /* subscription may not be used by the read API any more */
+               kfree_rcu(event_sub, rcu);
+       }
+
+       mutex_unlock(&ev_file->dev->devx_event_table.event_xa_lock);
+
+       /* free the pending events allocation */
+       if (!ev_file->omit_data) {
+               spin_lock_irq(&ev_file->lock);
+               list_for_each_entry_safe(entry, tmp,
+                                        &ev_file->event_list, list)
+                       kfree(entry); /* read can't come any more */
+               spin_unlock_irq(&ev_file->lock);
+       }
+
+       uverbs_close_fd(filp);
+       put_device(&ev_file->dev->ib_dev.dev);
+       return 0;
+}
+
+static const struct file_operations devx_async_event_fops = {
+       .owner   = THIS_MODULE,
+       .read    = devx_async_event_read,
+       .poll    = devx_async_event_poll,
+       .release = devx_async_event_close,
+       .llseek  = no_llseek,
+};
+
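A hedged userspace-side sketch of draining the event channel served by devx_async_event_read() above: with the OMIT_DATA flag each read returns only the 8-byte subscription cookie, otherwise it returns a struct mlx5_ib_uapi_devx_async_event_hdr followed by the raw EQE. How the fd is obtained (the ASYNC_EVENT_FD_ALLOC method) is assumed to happen elsewhere, and error handling is elided.

/* Illustrative consumer loop; relies only on the read() semantics above. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

void drain_events(int fd, int omit_data)
{
	unsigned char buf[512];

	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n <= 0)		/* EAGAIN, or EIO once destroyed */
			break;

		if (omit_data) {	/* payload is only the cookie */
			uint64_t cookie;

			memcpy(&cookie, buf, sizeof(cookie));
			printf("event on cookie 0x%llx\n",
			       (unsigned long long)cookie);
		} else {		/* cookie header + raw mlx5_eqe */
			printf("event, %zd bytes including header\n", n);
		}
	}
}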
 static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
                                                   enum rdma_remove_reason why)
 {
@@ -1738,6 +2701,21 @@ static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
        return 0;
 };
 
+static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
+                                           enum rdma_remove_reason why)
+{
+       struct devx_async_event_file *ev_file =
+               container_of(uobj, struct devx_async_event_file,
+                            uobj);
+
+       spin_lock_irq(&ev_file->lock);
+       ev_file->is_destroyed = 1;
+       spin_unlock_irq(&ev_file->lock);
+
+       wake_up_interruptible(&ev_file->poll_wait);
+       return 0;
+};
+
 DECLARE_UVERBS_NAMED_METHOD(
        MLX5_IB_METHOD_DEVX_UMEM_REG,
        UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE,
@@ -1869,10 +2847,32 @@ DECLARE_UVERBS_NAMED_METHOD(
                UVERBS_ATTR_TYPE(u64),
                UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
+       UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE,
+               MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+               UVERBS_ACCESS_READ,
+               UA_MANDATORY),
+       UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE,
+               MLX5_IB_OBJECT_DEVX_OBJ,
+               UVERBS_ACCESS_READ,
+               UA_OPTIONAL),
+       UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+               UVERBS_ATTR_MIN_SIZE(sizeof(u16)),
+               UA_MANDATORY,
+               UA_ALLOC_AND_COPY),
+       UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE,
+               UVERBS_ATTR_TYPE(u64),
+               UA_OPTIONAL),
+       UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM,
+               UVERBS_ATTR_TYPE(u32),
+               UA_OPTIONAL));
+
 DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX,
                              &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER),
                              &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR),
-                             &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN));
+                             &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN),
+                             &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ,
                            UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup),
@@ -1903,6 +2903,24 @@ DECLARE_UVERBS_NAMED_OBJECT(
                             O_RDONLY),
        &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
 
+DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC,
+       UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE,
+                       MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+                       UVERBS_ACCESS_NEW,
+                       UA_MANDATORY),
+       UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+                       enum mlx5_ib_uapi_devx_create_event_channel_flags,
+                       UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+       MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+       UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
+                            devx_hot_unplug_async_event_file,
+                            &devx_async_event_fops, "[devx_async_event]",
+                            O_RDONLY),
+       &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
+
 static bool devx_is_supported(struct ib_device *device)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
@@ -1923,5 +2941,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = {
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
                MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
                UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
+       UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+               MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
+               UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)),
        {},
 };
index 6c529e6f3a0131ae5f26d8c4a2493e74b18cc68a..348c1df69cdc64ac729c93787a575a3c960372dd 100644 (file)
@@ -200,19 +200,33 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt,
                             vl_15_dropped);
 }
 
-static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num,
+static int process_pma_cmd(struct mlx5_ib_dev *dev, u8 port_num,
                           const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
-       int err;
+       struct mlx5_core_dev *mdev;
+       bool native_port = true;
+       u8 mdev_port_num;
        void *out_cnt;
+       int err;
 
+       mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
+       if (!mdev) {
+               /* Failed to get the native port, likely because the 2nd port
+                * is still unaffiliated. In such a case default to the 1st
+                * port and the attached PF device.
+                */
+               native_port = false;
+               mdev = dev->mdev;
+               mdev_port_num = 1;
+       }
        /* Declaring support of extended counters */
        if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) {
                struct ib_class_port_info cpi = {};
 
                cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
                memcpy((out_mad->data + 40), &cpi, sizeof(cpi));
-               return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+               err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+               goto done;
        }
 
        if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) {
@@ -221,11 +235,13 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num,
                int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out);
 
                out_cnt = kvzalloc(sz, GFP_KERNEL);
-               if (!out_cnt)
-                       return IB_MAD_RESULT_FAILURE;
+               if (!out_cnt) {
+                       err = IB_MAD_RESULT_FAILURE;
+                       goto done;
+               }
 
                err = mlx5_core_query_vport_counter(mdev, 0, 0,
-                                                   port_num, out_cnt, sz);
+                                                   mdev_port_num, out_cnt, sz);
                if (!err)
                        pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
        } else {
@@ -234,20 +250,23 @@ static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num,
                int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
 
                out_cnt = kvzalloc(sz, GFP_KERNEL);
-               if (!out_cnt)
-                       return IB_MAD_RESULT_FAILURE;
+               if (!out_cnt) {
+                       err = IB_MAD_RESULT_FAILURE;
+                       goto done;
+               }
 
-               err = mlx5_core_query_ib_ppcnt(mdev, port_num,
+               err = mlx5_core_query_ib_ppcnt(mdev, mdev_port_num,
                                               out_cnt, sz);
                if (!err)
                        pma_cnt_assign(pma_cnt, out_cnt);
-               }
-
+       }
        kvfree(out_cnt);
-       if (err)
-               return IB_MAD_RESULT_FAILURE;
-
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+       err = err ? IB_MAD_RESULT_FAILURE :
+                   IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+done:
+       if (native_port)
+               mlx5_ib_put_native_port_mdev(dev, port_num);
+       return err;
 }
 
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -259,8 +278,6 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        const struct ib_mad *in_mad = (const struct ib_mad *)in;
        struct ib_mad *out_mad = (struct ib_mad *)out;
-       struct mlx5_core_dev *mdev;
-       u8 mdev_port_num;
        int ret;
 
        if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
@@ -269,19 +286,14 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 
        memset(out_mad->data, 0, sizeof(out_mad->data));
 
-       mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
-       if (!mdev)
-               return IB_MAD_RESULT_FAILURE;
-
-       if (MLX5_CAP_GEN(mdev, vport_counters) &&
+       if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
            in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
            in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) {
-               ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad);
+               ret = process_pma_cmd(dev, port_num, in_mad, out_mad);
        } else {
                ret =  process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
                                   in_mad, out_mad);
        }
-       mlx5_ib_put_native_port_mdev(dev, port_num);
        return ret;
 }
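The reworked process_pma_cmd() follows an acquire-or-fallback pattern: it takes a reference on the native port's mdev, falls back to port 1 of the PF when the second port is still unaffiliated, and releases the reference only on the paths that actually took it. A self-contained, purely illustrative sketch of that pattern (all names made up) is:

#include <stdbool.h>
#include <stdio.h>

struct resource {
	const char *name;
	int refs;
};

static struct resource native_port = { "native-port-mdev", 0 };
static struct resource pf_port1    = { "pf-port-1-mdev", 0 };

static struct resource *try_get_native(int port)
{
	if (port != 1)
		return NULL;	/* pretend the 2nd port is still unaffiliated */
	native_port.refs++;
	return &native_port;
}

static void put_native(struct resource *r)
{
	r->refs--;
}

static void do_query(int port)
{
	struct resource *r = try_get_native(port);
	bool native = true;

	if (!r) {		/* fall back; no reference was taken */
		native = false;
		r = &pf_port1;
	}

	printf("port %d: querying counters via %s\n", port, r->name);

	if (native)		/* release only what was acquired */
		put_native(r);
}

int main(void)
{
	do_query(1);
	do_query(2);
	return 0;
}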
 
index ba312bf59c7ab14817b327c9bb693aa4135cae4d..c2a5780cb394e64b013fbeff00e78eb7e3361287 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/fs.h>
+#include <linux/mlx5/eswitch.h>
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
@@ -888,7 +889,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        }
        props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
        if (MLX5_CAP_GEN(mdev, sho)) {
-               props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
+               props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
                /* At this stage no support for signature handover */
                props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
                                      IB_PROT_T10DIF_TYPE_2 |
@@ -1008,6 +1009,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        props->max_srq_sge         = max_rq_sg - 1;
        props->max_fast_reg_page_list_len =
                1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
+       props->max_pi_fast_reg_page_list_len =
+               props->max_fast_reg_page_list_len / 2;
        get_atomic_caps_qp(dev, props);
        props->masked_atomic_cap   = IB_ATOMIC_NONE;
        props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
@@ -1043,15 +1046,19 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        }
 
        if (MLX5_CAP_GEN(mdev, tag_matching)) {
-               props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
                props->tm_caps.max_num_tags =
                        (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
-               props->tm_caps.flags = IB_TM_CAP_RC;
                props->tm_caps.max_ops =
                        1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
                props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
        }
 
+       if (MLX5_CAP_GEN(mdev, tag_matching) &&
+           MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
+               props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
+               props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
+       }
+
        if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
                props->cq_caps.max_cq_moderation_count =
                                                MLX5_MAX_CQ_COUNT;
@@ -3257,11 +3264,14 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
        int max_table_size;
        int num_entries;
        int num_groups;
+       bool esw_encap;
        u32 flags = 0;
        int priority;
 
        max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                                       log_max_ft_size));
+       esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+               DEVLINK_ESWITCH_ENCAP_MODE_NONE;
        if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
                enum mlx5_flow_namespace_type fn_type;
 
@@ -3274,10 +3284,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
                if (ft_type == MLX5_IB_FT_RX) {
                        fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
                        prio = &dev->flow_db->prios[priority];
-                       if (!dev->is_rep &&
+                       if (!dev->is_rep && !esw_encap &&
                            MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-                       if (!dev->is_rep &&
+                       if (!dev->is_rep && !esw_encap &&
                            MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                        reformat_l3_tunnel_to_l2))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
@@ -3287,7 +3297,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
                                                              log_max_ft_size));
                        fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
                        prio = &dev->flow_db->egress_prios[priority];
-                       if (!dev->is_rep &&
+                       if (!dev->is_rep && !esw_encap &&
                            MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
                }
@@ -3923,6 +3933,7 @@ _get_flow_table(struct mlx5_ib_dev *dev,
        struct mlx5_flow_namespace *ns = NULL;
        struct mlx5_ib_flow_prio *prio = NULL;
        int max_table_size = 0;
+       bool esw_encap;
        u32 flags = 0;
        int priority;
 
@@ -3931,22 +3942,30 @@ _get_flow_table(struct mlx5_ib_dev *dev,
        else
                priority = ib_prio_to_core_prio(fs_matcher->priority, false);
 
+       esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+               DEVLINK_ESWITCH_ENCAP_MODE_NONE;
        if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
                max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                        log_max_ft_size));
-               if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
+               if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
                        flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
                if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-                                             reformat_l3_tunnel_to_l2))
+                                             reformat_l3_tunnel_to_l2) &&
+                   !esw_encap)
                        flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
        } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
                max_table_size = BIT(
                        MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
-               if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
+               if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
                        flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
        } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
                max_table_size = BIT(
                        MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
+               if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
+                       flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+               if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
+                   esw_encap)
+                       flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
                priority = FDB_BYPASS_PATH;
        }
 
@@ -4711,7 +4730,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
        int err = -ENOMEM;
        struct ib_udata uhw = {.inlen = 0, .outlen = 0};
 
-       pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
+       pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
        if (!pprops)
                goto out;
 
@@ -4725,7 +4744,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
                goto out;
        }
 
-       memset(pprops, 0, sizeof(*pprops));
        err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
        if (err) {
                mlx5_ib_warn(dev, "query_port %d failed %d\n",
@@ -4926,18 +4944,19 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        if (ret)
                goto error0;
 
-       devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL);
-       if (IS_ERR(devr->c0)) {
-               ret = PTR_ERR(devr->c0);
+       devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
+       if (!devr->c0) {
+               ret = -ENOMEM;
                goto error1;
        }
-       devr->c0->device        = &dev->ib_dev;
-       devr->c0->uobject       = NULL;
-       devr->c0->comp_handler  = NULL;
-       devr->c0->event_handler = NULL;
-       devr->c0->cq_context    = NULL;
+
+       devr->c0->device = &dev->ib_dev;
        atomic_set(&devr->c0->usecnt, 0);
 
+       ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
+       if (ret)
+               goto err_create_cq;
+
        devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
        if (IS_ERR(devr->x0)) {
                ret = PTR_ERR(devr->x0);
@@ -5029,6 +5048,8 @@ error3:
        mlx5_ib_dealloc_xrcd(devr->x0, NULL);
 error2:
        mlx5_ib_destroy_cq(devr->c0, NULL);
+err_create_cq:
+       kfree(devr->c0);
 error1:
        mlx5_ib_dealloc_pd(devr->p0, NULL);
 error0:
@@ -5047,6 +5068,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
        mlx5_ib_dealloc_xrcd(devr->x0, NULL);
        mlx5_ib_dealloc_xrcd(devr->x1, NULL);
        mlx5_ib_destroy_cq(devr->c0, NULL);
+       kfree(devr->c0);
        mlx5_ib_dealloc_pd(devr->p0, NULL);
        kfree(devr->p0);
 
@@ -5459,7 +5481,8 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
 
 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
                                    struct mlx5_ib_port *port,
-                                   struct rdma_hw_stats *stats)
+                                   struct rdma_hw_stats *stats,
+                                   u16 set_id)
 {
        int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
        void *out;
@@ -5470,9 +5493,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
        if (!out)
                return -ENOMEM;
 
-       ret = mlx5_core_query_q_counter(mdev,
-                                       port->cnts.set_id, 0,
-                                       out, outlen);
+       ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
        if (ret)
                goto free;
 
@@ -5532,7 +5553,8 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
                       port->cnts.num_ext_ppcnt_counters;
 
        /* q_counters are per IB device, query the master mdev */
-       ret = mlx5_ib_query_q_counters(dev->mdev, port, stats);
+       ret = mlx5_ib_query_q_counters(dev->mdev, port, stats,
+                                      port->cnts.set_id);
        if (ret)
                return ret;
 
@@ -5568,6 +5590,68 @@ done:
        return num_counters;
 }
 
+static struct rdma_hw_stats *
+mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
+{
+       struct mlx5_ib_dev *dev = to_mdev(counter->device);
+       struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+
+       /* Q counters are at the beginning of all counters */
+       return rdma_alloc_hw_stats_struct(port->cnts.names,
+                                         port->cnts.num_q_counters,
+                                         RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
+{
+       struct mlx5_ib_dev *dev = to_mdev(counter->device);
+       struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+
+       return mlx5_ib_query_q_counters(dev->mdev, port,
+                                       counter->stats, counter->id);
+}
+
+static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+                                  struct ib_qp *qp)
+{
+       struct mlx5_ib_dev *dev = to_mdev(qp->device);
+       u16 cnt_set_id = 0;
+       int err;
+
+       if (!counter->id) {
+               err = mlx5_cmd_alloc_q_counter(dev->mdev,
+                                              &cnt_set_id,
+                                              MLX5_SHARED_RESOURCE_UID);
+               if (err)
+                       return err;
+               counter->id = cnt_set_id;
+       }
+
+       err = mlx5_ib_qp_set_counter(qp, counter);
+       if (err)
+               goto fail_set_counter;
+
+       return 0;
+
+fail_set_counter:
+       if (cnt_set_id) {
+               mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
+               counter->id = 0;
+       }
+
+       return err;
+}
+
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+{
+       return mlx5_ib_qp_set_counter(qp, NULL);
+}
+
+static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
+{
+       struct mlx5_ib_dev *dev = to_mdev(counter->device);
+
+       return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
+}
+
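mlx5_ib_counter_bind_qp() above allocates the hardware Q counter set lazily on the first bind and reuses it for later binds to the same rdma_counter; on failure it only rolls back an allocation made in the same call. A standalone sketch of that lazy-allocation pattern (this is not the rdma_counter API, and all names are illustrative) is:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct counter {
	uint16_t id;	/* 0 means "no hardware counter set yet" */
};

static uint16_t next_set_id = 1;

static int alloc_hw_counter(uint16_t *set_id)
{
	*set_id = next_set_id++;
	return 0;
}

static void dealloc_hw_counter(uint16_t set_id)
{
	printf("freed hw counter set %u\n", set_id);
}

static int qp_set_counter(int qpn, struct counter *c)
{
	printf("qp %d now reports into set %u\n", qpn, c ? c->id : 0);
	return 0;	/* pretend the firmware command succeeded */
}

static int bind_qp(struct counter *c, int qpn)
{
	bool allocated = false;
	int err;

	if (!c->id) {			/* first bind: allocate lazily */
		err = alloc_hw_counter(&c->id);
		if (err)
			return err;
		allocated = true;
	}

	err = qp_set_counter(qpn, c);
	if (err && allocated) {		/* undo only our own allocation */
		dealloc_hw_counter(c->id);
		c->id = 0;
	}
	return err;
}

int main(void)
{
	struct counter c = { 0 };

	bind_qp(&c, 7);
	bind_qp(&c, 8);		/* reuses the same counter set */
	return 0;
}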
 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
                                 enum rdma_netdev_t type,
                                 struct rdma_netdev_alloc_params *params)
@@ -6079,7 +6163,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
        if (mlx5_use_mad_ifc(dev))
                get_ext_port_caps(dev);
 
-       dev->ib_dev.owner               = THIS_MODULE;
        dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
        dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
        dev->ib_dev.phys_port_cnt       = dev->num_ports;
@@ -6159,8 +6242,13 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 }
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_MLX5,
+       .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
+
        .add_gid = mlx5_ib_add_gid,
        .alloc_mr = mlx5_ib_alloc_mr,
+       .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
        .alloc_pd = mlx5_ib_alloc_pd,
        .alloc_ucontext = mlx5_ib_alloc_ucontext,
        .attach_mcast = mlx5_ib_mcg_attach,
@@ -6190,6 +6278,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
        .get_dma_mr = mlx5_ib_get_dma_mr,
        .get_link_layer = mlx5_ib_port_link_layer,
        .map_mr_sg = mlx5_ib_map_mr_sg,
+       .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
        .mmap = mlx5_ib_mmap,
        .modify_cq = mlx5_ib_modify_cq,
        .modify_device = mlx5_ib_modify_device,
@@ -6214,6 +6303,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
        .resize_cq = mlx5_ib_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
@@ -6256,7 +6346,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
        struct mlx5_core_dev *mdev = dev->mdev;
        int err;
 
-       dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
        dev->ib_dev.uverbs_cmd_mask     =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -6325,7 +6414,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
        if (mlx5_accel_ipsec_device_caps(dev->mdev) &
            MLX5_ACCEL_IPSEC_CAP_DEVICE)
                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
-       dev->ib_dev.driver_id = RDMA_DRIVER_MLX5;
        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
 
        if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
@@ -6340,6 +6428,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
             MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
                mutex_init(&dev->lb.mutex);
 
+       dev->ib_dev.use_cq_dim = true;
+
        return 0;
 }
 
@@ -6487,6 +6577,11 @@ static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
 static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
        .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
        .get_hw_stats = mlx5_ib_get_hw_stats,
+       .counter_bind_qp = mlx5_ib_counter_bind_qp,
+       .counter_unbind_qp = mlx5_ib_counter_unbind_qp,
+       .counter_dealloc = mlx5_ib_counter_dealloc,
+       .counter_alloc_stats = mlx5_ib_counter_alloc_stats,
+       .counter_update_stats = mlx5_ib_counter_update_stats,
 };
 
 static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
@@ -6607,15 +6702,19 @@ static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
        int uid;
 
        uid = mlx5_ib_devx_create(dev, false);
-       if (uid > 0)
+       if (uid > 0) {
                dev->devx_whitelist_uid = uid;
+               mlx5_ib_devx_init_event_table(dev);
+       }
 
        return 0;
 }
 static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
 {
-       if (dev->devx_whitelist_uid)
+       if (dev->devx_whitelist_uid) {
+               mlx5_ib_devx_cleanup_event_table(dev);
                mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
+       }
 }
 
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
index 9f90be296ee0f7f48e413249c86b0051c8ed78a8..fe1a76d8531cee8b5d7e347c54fdbc1c51caaac3 100644 (file)
@@ -55,9 +55,10 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
        int i = 0;
        struct scatterlist *sg;
        int entry;
-       unsigned long page_shift = umem->page_shift;
 
        if (umem->is_odp) {
+               unsigned int page_shift = to_ib_umem_odp(umem)->page_shift;
+
                *ncont = ib_umem_page_count(umem);
                *count = *ncont << (page_shift - PAGE_SHIFT);
                *shift = page_shift;
@@ -67,15 +68,15 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
                return;
        }
 
-       addr = addr >> page_shift;
+       addr = addr >> PAGE_SHIFT;
        tmp = (unsigned long)addr;
        m = find_first_bit(&tmp, BITS_PER_LONG);
        if (max_page_shift)
-               m = min_t(unsigned long, max_page_shift - page_shift, m);
+               m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
 
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               len = sg_dma_len(sg) >> page_shift;
-               pfn = sg_dma_address(sg) >> page_shift;
+               len = sg_dma_len(sg) >> PAGE_SHIFT;
+               pfn = sg_dma_address(sg) >> PAGE_SHIFT;
                if (base + p != pfn) {
                        /* If either the offset or the new
                         * base are unaligned update m
@@ -107,7 +108,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 
                *ncont = 0;
        }
-       *shift = page_shift + m;
+       *shift = PAGE_SHIFT + m;
        *count = i;
 }
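With umem->page_shift gone, mlx5_ib_cont_pages() above derives the extra order "m" from the alignment of the starting address: find_first_bit() on the page-frame number bounds how many low page-index bits are guaranteed zero. A tiny standalone illustration of that bound (PAGE_SHIFT assumed to be 12; names are illustrative only):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned int max_extra_order(uint64_t addr)
{
	uint64_t pfn = addr >> PAGE_SHIFT;

	if (!pfn)			/* address 0 imposes no limit here */
		return 63;
	return (unsigned int)__builtin_ctzll(pfn);	/* lowest set bit */
}

int main(void)
{
	printf("%u\n", max_extra_order(0x200000));	/* 2 MiB aligned -> 9 */
	printf("%u\n", max_extra_order(0x201000));	/* 4 KiB aligned -> 0 */
	return 0;
}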
 
@@ -140,8 +141,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
                            int page_shift, size_t offset, size_t num_pages,
                            __be64 *pas, int access_flags)
 {
-       unsigned long umem_page_shift = umem->page_shift;
-       int shift = page_shift - umem_page_shift;
+       int shift = page_shift - PAGE_SHIFT;
        int mask = (1 << shift) - 1;
        int i, k, idx;
        u64 cur = 0;
@@ -165,7 +165,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 
        i = 0;
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               len = sg_dma_len(sg) >> umem_page_shift;
+               len = sg_dma_len(sg) >> PAGE_SHIFT;
                base = sg_dma_address(sg);
 
                /* Skip elements below offset */
@@ -184,7 +184,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
 
                for (; k < len; k++) {
                        if (!(i & mask)) {
-                               cur = base + (k << umem_page_shift);
+                               cur = base + (k << PAGE_SHIFT);
                                cur |= access_flags;
                                idx = (i >> shift) - offset;
 
index ee73dc122d28e476979674c2a065ca44c963f321..c482f19958b39754555e8c74cb31906a95a1992a 100644 (file)
@@ -431,9 +431,6 @@ struct mlx5_ib_qp {
 
        int                     create_type;
 
-       /* Store signature errors */
-       bool                    signature_en;
-
        struct list_head        qps_list;
        struct list_head        cq_recv_list;
        struct list_head        cq_send_list;
@@ -442,6 +439,10 @@ struct mlx5_ib_qp {
        u32                     flags_en;
        /* storage for qp sub type when core qp type is IB_QPT_DRIVER */
        enum ib_qp_type         qp_sub_type;
+       /* A flag to indicate that a new counter has been configured
+        * but has not yet taken effect
+        */
+       u32                     counter_pending;
 };
 
 struct mlx5_ib_cq_buf {
@@ -587,6 +588,9 @@ struct mlx5_ib_mr {
        void                    *descs;
        dma_addr_t              desc_map;
        int                     ndescs;
+       int                     data_length;
+       int                     meta_ndescs;
+       int                     meta_length;
        int                     max_descs;
        int                     desc_size;
        int                     access_mode;
@@ -605,6 +609,13 @@ struct mlx5_ib_mr {
        int                     access_flags; /* Needed for rereg MR */
 
        struct mlx5_ib_mr      *parent;
+       /* Needed for IB_MR_TYPE_INTEGRITY */
+       struct mlx5_ib_mr      *pi_mr;
+       struct mlx5_ib_mr      *klm_mr;
+       struct mlx5_ib_mr      *mtt_mr;
+       u64                     data_iova;
+       u64                     pi_iova;
+
        atomic_t                num_leaf_free;
        wait_queue_head_t       q_leaf_free;
        struct mlx5_async_work  cb_work;
@@ -929,6 +940,13 @@ struct mlx5_ib_pf_eq {
        mempool_t *pool;
 };
 
+struct mlx5_devx_event_table {
+       struct mlx5_nb devx_nb;
+       /* serialize updating the event_xa */
+       struct mutex event_xa_lock;
+       struct xarray event_xa;
+};
+
 struct mlx5_ib_dev {
        struct ib_device                ib_dev;
        struct mlx5_core_dev            *mdev;
@@ -978,6 +996,7 @@ struct mlx5_ib_dev {
        u16                     devx_whitelist_uid;
        struct mlx5_srq_table   srq_table;
        struct mlx5_async_ctx   async_ctx;
+       struct mlx5_devx_event_table devx_event_table;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1115,10 +1134,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
                             int buflen, size_t *bc);
 int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
                              void *buffer, int buflen, size_t *bc);
-struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
-                               const struct ib_cq_init_attr *attr,
-                               struct ib_udata *udata);
-int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                     struct ib_udata *udata);
+void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
@@ -1148,8 +1166,15 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
                               u32 max_num_sg, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+                                        u32 max_num_sg,
+                                        u32 max_num_meta_sg);
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset);
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+                        int data_sg_nents, unsigned int *data_sg_offset,
+                        struct scatterlist *meta_sg, int meta_sg_nents,
+                        unsigned int *meta_sg_offset);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
                        const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                        const struct ib_mad_hdr *in, size_t in_mad_size,
@@ -1201,7 +1226,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
                                struct ib_wq_init_attr *init_attr,
                                struct ib_udata *udata);
-int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
                      u32 wq_attr_mask, struct ib_udata *udata);
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
@@ -1311,6 +1336,8 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
 void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
+void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev);
+void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev);
 const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void);
 extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
@@ -1328,6 +1355,8 @@ static inline int
 mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
                           bool is_user) { return -EOPNOTSUPP; }
 static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
+static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {}
+static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {}
 static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id,
                                             int *dest_type)
 {
@@ -1443,4 +1472,6 @@ void mlx5_ib_put_xlt_emergency_page(void);
 int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
                        struct mlx5_bfreg_info *bfregi, u32 bfregn,
                        bool dyn_bfreg);
+
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
 #endif /* MLX5_IB_H */
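
The two prototypes added above, mlx5_ib_alloc_mr_integrity() and mlx5_ib_map_mr_sg_pi(), are the driver side of the new IB_MR_TYPE_INTEGRITY verbs in this series. A minimal consumer-side sketch of how they fit together, assuming the core wrappers ib_alloc_mr_integrity()/ib_map_mr_sg_pi() and the IB_WR_REG_MR_INTEGRITY opcode from the same cycle; the page-size argument, the helper name reg_integrity_mr and the access flags are illustrative assumptions, and the T10-PI domain setup is reduced to a comment:

	#include <rdma/ib_verbs.h>

	/* illustrative only: register one integrity MR over data + metadata SGs */
	static struct ib_mr *reg_integrity_mr(struct ib_pd *pd, struct ib_qp *qp,
					      struct scatterlist *data_sg, int data_nents,
					      struct scatterlist *meta_sg, int meta_nents)
	{
		const struct ib_send_wr *bad_wr;
		struct ib_reg_wr reg_wr = {};
		struct ib_mr *mr;
		int ret;

		mr = ib_alloc_mr_integrity(pd, data_nents, meta_nents);
		if (IS_ERR(mr))
			return mr;

		/* mlx5 picks a PA, MTT or KLM layout internally (see mr.c below) */
		ret = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
				      meta_sg, meta_nents, NULL, PAGE_SIZE);
		if (ret)
			goto out_dereg;

		/* mr->sig_attrs (the T10-PI mem/wire domains) must be filled in here */

		reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
		reg_wr.mr = mr;
		reg_wr.key = mr->rkey;
		reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
		ret = ib_post_send(qp, &reg_wr.wr, &bad_wr);
		if (ret)
			goto out_dereg;

		return mr;

	out_dereg:
		ib_dereg_mr(mr);
		return ERR_PTR(ret);
	}

On the send queue, the mlx5 handling of IB_WR_REG_MR_INTEGRITY (further down in qp.c) expands this single work request into the UMR and SET_PSV WQEs.
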
index 83b452d977d496c94f69114fe5ef4b0cb899fb8b..20ece6e0b2fcc1527b4018886b81d9f06633f0d6 100644 (file)
@@ -1507,10 +1507,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
        return 0;
 
 err:
-       if (mr->umem) {
-               ib_umem_release(mr->umem);
-               mr->umem = NULL;
-       }
+       ib_umem_release(mr->umem);
+       mr->umem = NULL;
+
        clean_mr(dev, mr);
        return err;
 }
@@ -1606,8 +1605,9 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
                synchronize_srcu(&dev->mr_srcu);
                /* Destroy all page mappings */
                if (umem_odp->page_list)
-                       mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem),
-                                                ib_umem_end(umem));
+                       mlx5_ib_invalidate_range(umem_odp,
+                                                ib_umem_start(umem_odp),
+                                                ib_umem_end(umem_odp));
                else
                        mlx5_ib_free_implicit_mr(mr);
                /*
@@ -1629,28 +1629,85 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
         * remove the DMA mapping.
         */
        mlx5_mr_cache_free(dev, mr);
-       if (umem) {
-               ib_umem_release(umem);
+       ib_umem_release(umem);
+       if (umem)
                atomic_sub(npages, &dev->mdev->priv.reg_pages);
-       }
+
        if (!mr->allocated_from_cache)
                kfree(mr);
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
-       dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
+       struct mlx5_ib_mr *mmr = to_mmr(ibmr);
+
+       if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
+               dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr);
+               dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
+       }
+
+       dereg_mr(to_mdev(ibmr->device), mmr);
+
        return 0;
 }
 
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                              u32 max_num_sg, struct ib_udata *udata)
+static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
+                                  int access_mode, int page_shift)
+{
+       void *mkc;
+
+       mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+       MLX5_SET(mkc, mkc, free, 1);
+       MLX5_SET(mkc, mkc, qpn, 0xffffff);
+       MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+       MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
+       MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+       MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+       MLX5_SET(mkc, mkc, umr_en, 1);
+       MLX5_SET(mkc, mkc, log_page_size, page_shift);
+}
+
+static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+                                 int ndescs, int desc_size, int page_shift,
+                                 int access_mode, u32 *in, int inlen)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int err;
+
+       mr->access_mode = access_mode;
+       mr->desc_size = desc_size;
+       mr->max_descs = ndescs;
+
+       err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
+       if (err)
+               return err;
+
+       mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
+
+       err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
+       if (err)
+               goto err_free_descs;
+
+       mr->mmkey.type = MLX5_MKEY_MR;
+       mr->ibmr.lkey = mr->mmkey.key;
+       mr->ibmr.rkey = mr->mmkey.key;
+
+       return 0;
+
+err_free_descs:
+       mlx5_free_priv_descs(mr);
+       return err;
+}
+
+static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
+                               u32 max_num_sg, u32 max_num_meta_sg,
+                               int desc_size, int access_mode)
+{
        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-       int ndescs = ALIGN(max_num_sg, 4);
+       int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
+       int page_shift = 0;
        struct mlx5_ib_mr *mr;
-       void *mkc;
        u32 *in;
        int err;
 
@@ -1658,99 +1715,168 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
        if (!mr)
                return ERR_PTR(-ENOMEM);
 
+       mr->ibmr.pd = pd;
+       mr->ibmr.device = pd->device;
+
        in = kzalloc(inlen, GFP_KERNEL);
        if (!in) {
                err = -ENOMEM;
                goto err_free;
        }
 
+       if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+               page_shift = PAGE_SHIFT;
+
+       err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
+                                    access_mode, in, inlen);
+       if (err)
+               goto err_free_in;
+
+       mr->umem = NULL;
+       kfree(in);
+
+       return mr;
+
+err_free_in:
+       kfree(in);
+err_free:
+       kfree(mr);
+       return ERR_PTR(err);
+}
+
+static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+                                   int ndescs, u32 *in, int inlen)
+{
+       return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
+                                     PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
+                                     inlen);
+}
+
+static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+                                   int ndescs, u32 *in, int inlen)
+{
+       return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
+                                     0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
+}
+
+static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
+                                     int max_num_sg, int max_num_meta_sg,
+                                     u32 *in, int inlen)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       u32 psv_index[2];
+       void *mkc;
+       int err;
+
+       mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
+       if (!mr->sig)
+               return -ENOMEM;
+
+       /* create mem & wire PSVs */
+       err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
+       if (err)
+               goto err_free_sig;
+
+       mr->sig->psv_memory.psv_idx = psv_index[0];
+       mr->sig->psv_wire.psv_idx = psv_index[1];
+
+       mr->sig->sig_status_checked = true;
+       mr->sig->sig_err_exists = false;
+       /* Next UMR, Arm SIGERR */
+       ++mr->sig->sigerr_count;
+       mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
+                                        sizeof(struct mlx5_klm),
+                                        MLX5_MKC_ACCESS_MODE_KLMS);
+       if (IS_ERR(mr->klm_mr)) {
+               err = PTR_ERR(mr->klm_mr);
+               goto err_destroy_psv;
+       }
+       mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
+                                        sizeof(struct mlx5_mtt),
+                                        MLX5_MKC_ACCESS_MODE_MTT);
+       if (IS_ERR(mr->mtt_mr)) {
+               err = PTR_ERR(mr->mtt_mr);
+               goto err_free_klm_mr;
+       }
+
+       /* Set bsf descriptors for mkey */
        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-       MLX5_SET(mkc, mkc, free, 1);
-       MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
-       MLX5_SET(mkc, mkc, qpn, 0xffffff);
-       MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+       MLX5_SET(mkc, mkc, bsf_en, 1);
+       MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
 
-       if (mr_type == IB_MR_TYPE_MEM_REG) {
-               mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-               MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
-               err = mlx5_alloc_priv_descs(pd->device, mr,
-                                           ndescs, sizeof(struct mlx5_mtt));
-               if (err)
-                       goto err_free_in;
+       err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
+                                    MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
+       if (err)
+               goto err_free_mtt_mr;
 
-               mr->desc_size = sizeof(struct mlx5_mtt);
-               mr->max_descs = ndescs;
-       } else if (mr_type == IB_MR_TYPE_SG_GAPS) {
-               mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
+       return 0;
 
-               err = mlx5_alloc_priv_descs(pd->device, mr,
-                                           ndescs, sizeof(struct mlx5_klm));
-               if (err)
-                       goto err_free_in;
-               mr->desc_size = sizeof(struct mlx5_klm);
-               mr->max_descs = ndescs;
-       } else if (mr_type == IB_MR_TYPE_SIGNATURE) {
-               u32 psv_index[2];
-
-               MLX5_SET(mkc, mkc, bsf_en, 1);
-               MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
-               mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
-               if (!mr->sig) {
-                       err = -ENOMEM;
-                       goto err_free_in;
-               }
+err_free_mtt_mr:
+       dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
+       mr->mtt_mr = NULL;
+err_free_klm_mr:
+       dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr);
+       mr->klm_mr = NULL;
+err_destroy_psv:
+       if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
+               mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+                            mr->sig->psv_memory.psv_idx);
+       if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
+               mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+                            mr->sig->psv_wire.psv_idx);
+err_free_sig:
+       kfree(mr->sig);
 
-               /* create mem & wire PSVs */
-               err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
-                                          2, psv_index);
-               if (err)
-                       goto err_free_sig;
+       return err;
+}
+
+static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
+                                       enum ib_mr_type mr_type, u32 max_num_sg,
+                                       u32 max_num_meta_sg)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+       int ndescs = ALIGN(max_num_sg, 4);
+       struct mlx5_ib_mr *mr;
+       u32 *in;
+       int err;
 
-               mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS;
-               mr->sig->psv_memory.psv_idx = psv_index[0];
-               mr->sig->psv_wire.psv_idx = psv_index[1];
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
 
-               mr->sig->sig_status_checked = true;
-               mr->sig->sig_err_exists = false;
-               /* Next UMR, Arm SIGERR */
-               ++mr->sig->sigerr_count;
-       } else {
+       in = kzalloc(inlen, GFP_KERNEL);
+       if (!in) {
+               err = -ENOMEM;
+               goto err_free;
+       }
+
+       mr->ibmr.device = pd->device;
+       mr->umem = NULL;
+
+       switch (mr_type) {
+       case IB_MR_TYPE_MEM_REG:
+               err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
+               break;
+       case IB_MR_TYPE_SG_GAPS:
+               err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
+               break;
+       case IB_MR_TYPE_INTEGRITY:
+               err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
+                                                max_num_meta_sg, in, inlen);
+               break;
+       default:
                mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
                err = -EINVAL;
-               goto err_free_in;
        }
 
-       MLX5_SET(mkc, mkc, access_mode_1_0, mr->access_mode & 0x3);
-       MLX5_SET(mkc, mkc, access_mode_4_2, (mr->access_mode >> 2) & 0x7);
-       MLX5_SET(mkc, mkc, umr_en, 1);
-
-       mr->ibmr.device = pd->device;
-       err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
        if (err)
-               goto err_destroy_psv;
+               goto err_free_in;
 
-       mr->mmkey.type = MLX5_MKEY_MR;
-       mr->ibmr.lkey = mr->mmkey.key;
-       mr->ibmr.rkey = mr->mmkey.key;
-       mr->umem = NULL;
        kfree(in);
 
        return &mr->ibmr;
 
-err_destroy_psv:
-       if (mr->sig) {
-               if (mlx5_core_destroy_psv(dev->mdev,
-                                         mr->sig->psv_memory.psv_idx))
-                       mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
-                                    mr->sig->psv_memory.psv_idx);
-               if (mlx5_core_destroy_psv(dev->mdev,
-                                         mr->sig->psv_wire.psv_idx))
-                       mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
-                                    mr->sig->psv_wire.psv_idx);
-       }
-       mlx5_free_priv_descs(mr);
-err_free_sig:
-       kfree(mr->sig);
 err_free_in:
        kfree(in);
 err_free:
@@ -1758,6 +1884,19 @@ err_free:
        return ERR_PTR(err);
 }
 
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata)
+{
+       return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
+}
+
+struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
+                                        u32 max_num_sg, u32 max_num_meta_sg)
+{
+       return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
+                                 max_num_meta_sg);
+}
+
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
                               struct ib_udata *udata)
 {
@@ -1886,17 +2025,54 @@ done:
        return ret;
 }
 
+static int
+mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+                       int data_sg_nents, unsigned int *data_sg_offset,
+                       struct scatterlist *meta_sg, int meta_sg_nents,
+                       unsigned int *meta_sg_offset)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       unsigned int sg_offset = 0;
+       int n = 0;
+
+       mr->meta_length = 0;
+       if (data_sg_nents == 1) {
+               n++;
+               mr->ndescs = 1;
+               if (data_sg_offset)
+                       sg_offset = *data_sg_offset;
+               mr->data_length = sg_dma_len(data_sg) - sg_offset;
+               mr->data_iova = sg_dma_address(data_sg) + sg_offset;
+               if (meta_sg_nents == 1) {
+                       n++;
+                       mr->meta_ndescs = 1;
+                       if (meta_sg_offset)
+                               sg_offset = *meta_sg_offset;
+                       else
+                               sg_offset = 0;
+                       mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
+                       mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
+               }
+               ibmr->length = mr->data_length + mr->meta_length;
+       }
+
+       return n;
+}
+
 static int
 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
                   struct scatterlist *sgl,
                   unsigned short sg_nents,
-                  unsigned int *sg_offset_p)
+                  unsigned int *sg_offset_p,
+                  struct scatterlist *meta_sgl,
+                  unsigned short meta_sg_nents,
+                  unsigned int *meta_sg_offset_p)
 {
        struct scatterlist *sg = sgl;
        struct mlx5_klm *klms = mr->descs;
        unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
        u32 lkey = mr->ibmr.pd->local_dma_lkey;
-       int i;
+       int i, j = 0;
 
        mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
        mr->ibmr.length = 0;
@@ -1911,12 +2087,36 @@ mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
 
                sg_offset = 0;
        }
-       mr->ndescs = i;
 
        if (sg_offset_p)
                *sg_offset_p = sg_offset;
 
-       return i;
+       mr->ndescs = i;
+       mr->data_length = mr->ibmr.length;
+
+       if (meta_sg_nents) {
+               sg = meta_sgl;
+               sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
+               for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
+                       if (unlikely(i + j >= mr->max_descs))
+                               break;
+                       klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
+                                                    sg_offset);
+                       klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
+                                                        sg_offset);
+                       klms[i + j].key = cpu_to_be32(lkey);
+                       mr->ibmr.length += sg_dma_len(sg) - sg_offset;
+
+                       sg_offset = 0;
+               }
+               if (meta_sg_offset_p)
+                       *meta_sg_offset_p = sg_offset;
+
+               mr->meta_ndescs = j;
+               mr->meta_length = mr->ibmr.length - mr->data_length;
+       }
+
+       return i + j;
 }
 
 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
@@ -1933,6 +2133,181 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
        return 0;
 }
 
+static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       __be64 *descs;
+
+       if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
+               return -ENOMEM;
+
+       descs = mr->descs;
+       descs[mr->ndescs + mr->meta_ndescs++] =
+               cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
+
+       return 0;
+}
+
+static int
+mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+                        int data_sg_nents, unsigned int *data_sg_offset,
+                        struct scatterlist *meta_sg, int meta_sg_nents,
+                        unsigned int *meta_sg_offset)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
+       int n;
+
+       pi_mr->ndescs = 0;
+       pi_mr->meta_ndescs = 0;
+       pi_mr->meta_length = 0;
+
+       ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
+                                  pi_mr->desc_size * pi_mr->max_descs,
+                                  DMA_TO_DEVICE);
+
+       pi_mr->ibmr.page_size = ibmr->page_size;
+       n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
+                          mlx5_set_page);
+       if (n != data_sg_nents)
+               return n;
+
+       pi_mr->data_iova = pi_mr->ibmr.iova;
+       pi_mr->data_length = pi_mr->ibmr.length;
+       pi_mr->ibmr.length = pi_mr->data_length;
+       ibmr->length = pi_mr->data_length;
+
+       if (meta_sg_nents) {
+               u64 page_mask = ~((u64)ibmr->page_size - 1);
+               u64 iova = pi_mr->data_iova;
+
+               n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
+                                   meta_sg_offset, mlx5_set_page_pi);
+
+               pi_mr->meta_length = pi_mr->ibmr.length;
+               /*
+                * The PI address for the HW is the offset of the metadata
+                * relative to the first data page address: it equals the
+                * first data page address + the size of the data pages +
+                * the metadata offset within the first metadata page.
+                */
+               pi_mr->pi_iova = (iova & page_mask) +
+                                pi_mr->ndescs * ibmr->page_size +
+                                (pi_mr->ibmr.iova & ~page_mask);
+               /*
+                * In order to use one MTT MR for data and metadata, we also
+                * register the gaps between the end of the data and the start
+                * of the metadata (the sig MR will verify that the HW accesses
+                * the right addresses). This mapping is safe because we use an
+                * internal mkey for the registration.
+                */
+               pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
+               pi_mr->ibmr.iova = iova;
+               ibmr->length += pi_mr->meta_length;
+       }
+
+       ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
+                                     pi_mr->desc_size * pi_mr->max_descs,
+                                     DMA_TO_DEVICE);
+
+       return n;
+}
+
+static int
+mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+                        int data_sg_nents, unsigned int *data_sg_offset,
+                        struct scatterlist *meta_sg, int meta_sg_nents,
+                        unsigned int *meta_sg_offset)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       struct mlx5_ib_mr *pi_mr = mr->klm_mr;
+       int n;
+
+       pi_mr->ndescs = 0;
+       pi_mr->meta_ndescs = 0;
+       pi_mr->meta_length = 0;
+
+       ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
+                                  pi_mr->desc_size * pi_mr->max_descs,
+                                  DMA_TO_DEVICE);
+
+       n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
+                              meta_sg, meta_sg_nents, meta_sg_offset);
+
+       ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
+                                     pi_mr->desc_size * pi_mr->max_descs,
+                                     DMA_TO_DEVICE);
+
+       /* This is a zero-based memory region */
+       pi_mr->data_iova = 0;
+       pi_mr->ibmr.iova = 0;
+       pi_mr->pi_iova = pi_mr->data_length;
+       ibmr->length = pi_mr->ibmr.length;
+
+       return n;
+}
+
+int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
+                        int data_sg_nents, unsigned int *data_sg_offset,
+                        struct scatterlist *meta_sg, int meta_sg_nents,
+                        unsigned int *meta_sg_offset)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       struct mlx5_ib_mr *pi_mr = NULL;
+       int n;
+
+       WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
+
+       mr->ndescs = 0;
+       mr->data_length = 0;
+       mr->data_iova = 0;
+       mr->meta_ndescs = 0;
+       mr->pi_iova = 0;
+       /*
+        * As a performance optimization, if possible, there is no need to
+        * perform a UMR operation to register the data/metadata buffers.
+        * First try to map the sg lists to PA descriptors with local_dma_lkey.
+        * Fall back to UMR only in case of a failure.
+        */
+       n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+                                   data_sg_offset, meta_sg, meta_sg_nents,
+                                   meta_sg_offset);
+       if (n == data_sg_nents + meta_sg_nents)
+               goto out;
+       /*
+        * As a performance optimization, if possible, there is no need to map
+        * the sg lists to KLM descriptors. First try to map the sg lists to MTT
+        * descriptors and fall back to KLM only in case of a failure.
+        * It's more efficient for the HW to work with MTT descriptors
+        * (especially under high load).
+        * Use KLM (indirect access) only if it's mandatory.
+        */
+       pi_mr = mr->mtt_mr;
+       n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+                                    data_sg_offset, meta_sg, meta_sg_nents,
+                                    meta_sg_offset);
+       if (n == data_sg_nents + meta_sg_nents)
+               goto out;
+
+       pi_mr = mr->klm_mr;
+       n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
+                                    data_sg_offset, meta_sg, meta_sg_nents,
+                                    meta_sg_offset);
+       if (unlikely(n != data_sg_nents + meta_sg_nents))
+               return -ENOMEM;
+
+out:
+       /* This is a zero-based memory region */
+       ibmr->iova = 0;
+       mr->pi_mr = pi_mr;
+       if (pi_mr)
+               ibmr->sig_attrs->meta_length = pi_mr->meta_length;
+       else
+               ibmr->sig_attrs->meta_length = mr->meta_length;
+
+       return 0;
+}
+
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset)
 {
@@ -1946,7 +2321,8 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                                   DMA_TO_DEVICE);
 
        if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-               n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
+               n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
+                                      NULL);
        else
                n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
                                mlx5_set_page);
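
Two things in mlx5_ib_map_mr_sg_pi() above are worth spelling out: the mapping is attempted in PA -> MTT -> KLM order (cheapest first, KLM only when indirect access is unavoidable), and on the MTT path the protection address handed to the HW is synthesized so that the metadata appears directly behind the data pages of the same mkey. A worked example of that pi_iova computation, with made-up addresses and a 4 KiB MR page size (none of these values come from the patch):

	/* Illustrative numbers only:
	 * data : iova 0x10000, 8 pages mapped -> pi_mr->ndescs == 8
	 * meta : mapped at 0x31200, i.e. 0x200 bytes into its first page
	 */
	u64 page_mask = ~((u64)0x1000 - 1);
	u64 pi_iova = (0x10000 & page_mask)	/* first data page address */
		    + 8 * 0x1000		/* size of the data pages  */
		    + (0x31200 & ~page_mask);	/* in-page metadata offset */
	/* pi_iova == 0x18200: the metadata is addressed right after the
	 * data pages inside the single MTT mapping of the pi_mr.
	 */
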
index 831c450b271aad438785bdb953c7ce6abe2f5a05..5b642d81e617dc61207f3a690df1e7b304203326 100644 (file)
@@ -150,7 +150,7 @@ static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
                if (!rb)
                        goto not_found;
                odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
-               if (ib_umem_start(&odp->umem) > start + length)
+               if (ib_umem_start(odp) > start + length)
                        goto not_found;
        }
 not_found:
@@ -200,7 +200,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 static void mr_leaf_free_action(struct work_struct *work)
 {
        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-       int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT;
+       int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
 
        mr->parent = NULL;
@@ -224,7 +224,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
                                    sizeof(struct mlx5_mtt)) - 1;
        u64 idx = 0, blk_start_idx = 0;
-       struct ib_umem *umem;
        int in_block = 0;
        u64 addr;
 
@@ -232,15 +231,14 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
                pr_err("invalidation called on NULL umem or non-ODP umem\n");
                return;
        }
-       umem = &umem_odp->umem;
 
        mr = umem_odp->private;
 
        if (!mr || !mr->ibmr.pd)
                return;
 
-       start = max_t(u64, ib_umem_start(umem), start);
-       end = min_t(u64, ib_umem_end(umem), end);
+       start = max_t(u64, ib_umem_start(umem_odp), start);
+       end = min_t(u64, ib_umem_end(umem_odp), end);
 
        /*
         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -249,8 +247,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
         * but they will write 0s as well, so no difference in the end result.
         */
 
-       for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
-               idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
+       for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
+               idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
                /*
                 * Strive to write the MTTs in chunks, but avoid overwriting
                 * non-existing MTTs. The heuristic here can be improved to
@@ -544,13 +542,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
                        void *cookie)
 {
        struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
-       struct ib_umem *umem = &umem_odp->umem;
 
        if (mr->parent != imr)
                return 0;
 
-       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
-                                   ib_umem_end(umem));
+       ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+                                   ib_umem_end(umem_odp));
 
        if (umem_odp->dying)
                return 0;
@@ -602,9 +599,9 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
        }
 
 next_mr:
-       size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);
+       size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);
 
-       page_shift = mr->umem->page_shift;
+       page_shift = odp->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
        access_mask = ODP_READ_ALLOWED_BIT;
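
The ODP rework above is mostly mechanical: ib_umem_start()/ib_umem_end() and page_shift are now taken from the ib_umem_odp itself rather than the embedded ib_umem. The clamping and MTT-index math in mlx5_ib_invalidate_range() is unchanged; a small illustration with invented addresses (page_shift 12), just to show which indexes get zapped:

	/* illustrative values only, mirroring the clamp in the driver */
	u64 umem_start = 0x7f0000200000ULL, umem_end = 0x7f0000300000ULL;
	u64 start = 0x7f00001ff000ULL, end = 0x7f0000205000ULL;

	start = max_t(u64, umem_start, start);	/* -> 0x7f0000200000 */
	end   = min_t(u64, umem_end, end);	/* -> 0x7f0000205000 */
	/* the loop then zaps MTT indexes 0..4:
	 * idx = (addr - umem_start) >> 12 for each addr in [start, end)
	 */
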
index 768c7e81f688169c49455e447de77d6a6f61a89e..2a97619ed6034d13e4091659dfdd4cb7b844db46 100644 (file)
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
@@ -442,9 +443,9 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr)
        }
 
        size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
+       if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN &&
            ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
-                       return MLX5_SIG_WQE_SIZE;
+               return MLX5_SIG_WQE_SIZE;
        else
                return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
 }
@@ -496,9 +497,6 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
                              sizeof(struct mlx5_wqe_inline_seg);
        attr->cap.max_inline_data = qp->max_inline_data;
 
-       if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
-               qp->signature_en = true;
-
        wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
        qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
        if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
@@ -790,8 +788,7 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                atomic_dec(&dev->delay_drop.rqs_cnt);
 
        mlx5_ib_db_unmap_user(context, &rwq->db);
-       if (rwq->umem)
-               ib_umem_release(rwq->umem);
+       ib_umem_release(rwq->umem);
 }
 
 static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
@@ -977,8 +974,7 @@ err_free:
        kvfree(*in);
 
 err_umem:
-       if (ubuffer->umem)
-               ib_umem_release(ubuffer->umem);
+       ib_umem_release(ubuffer->umem);
 
 err_bfreg:
        if (bfregn != MLX5_IB_INVALID_BFREG)
@@ -997,8 +993,7 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                        ibucontext);
 
        mlx5_ib_db_unmap_user(context, &qp->db);
-       if (base->ubuffer.umem)
-               ib_umem_release(base->ubuffer.umem);
+       ib_umem_release(base->ubuffer.umem);
 
        /*
         * Free only the BFREGs which are handled by the kernel.
@@ -1042,7 +1037,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
        void *qpc;
        int err;
 
-       if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
+       if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN |
                                        IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
                                        IB_QP_CREATE_IPOIB_UD_LSO |
                                        IB_QP_CREATE_NETIF_QP |
@@ -3386,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
        return tx_port_affinity;
 }
 
+static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
+                                   struct rdma_counter *counter)
+{
+       struct mlx5_ib_dev *dev = to_mdev(qp->device);
+       struct mlx5_ib_qp *mqp = to_mqp(qp);
+       struct mlx5_qp_context context = {};
+       struct mlx5_ib_port *mibport = NULL;
+       struct mlx5_ib_qp_base *base;
+       u32 set_id;
+
+       if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
+               return 0;
+
+       if (counter) {
+               set_id = counter->id;
+       } else {
+               mibport = &dev->port[mqp->port - 1];
+               set_id = mibport->cnts.set_id;
+       }
+
+       base = &mqp->trans_qp.base;
+       context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
+       context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
+       return mlx5_core_qp_modify(dev->mdev,
+                                  MLX5_CMD_OP_RTS2RTS_QP,
+                                  MLX5_QP_OPTPAR_COUNTER_SET_ID,
+                                  &context, &base->mqp);
+}
+
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
                               enum ib_qp_state cur_state,
@@ -3439,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        struct mlx5_ib_port *mibport = NULL;
        enum mlx5_qp_state mlx5_cur, mlx5_new;
        enum mlx5_qp_optpar optpar;
+       u32 set_id = 0;
        int mlx5_st;
        int err;
        u16 op;
@@ -3601,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                        port_num = 0;
 
                mibport = &dev->port[port_num];
+               if (ibqp->counter)
+                       set_id = ibqp->counter->id;
+               else
+                       set_id = mibport->cnts.set_id;
                context->qp_counter_set_usr_page |=
-                       cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
+                       cpu_to_be32(set_id << 24);
        }
 
        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -3630,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 
                raw_qp_param.operation = op;
                if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-                       raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id;
+                       raw_qp_param.rq_q_ctr_id = set_id;
                        raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
                }
 
@@ -3707,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                qp->db.db[MLX5_SND_DBR] = 0;
        }
 
+       if ((new_state == IB_QPS_RTS) && qp->counter_pending) {
+               err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter);
+               if (!err)
+                       qp->counter_pending = 0;
+       }
+
 out:
        kfree(context);
        return err;
@@ -4170,15 +4205,13 @@ static __be64 sig_mkey_mask(void)
 }
 
 static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
-                           struct mlx5_ib_mr *mr, bool umr_inline)
+                           struct mlx5_ib_mr *mr, u8 flags)
 {
-       int size = mr->ndescs * mr->desc_size;
+       int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
 
        memset(umr, 0, sizeof(*umr));
 
-       umr->flags = MLX5_UMR_CHECK_NOT_FREE;
-       if (umr_inline)
-               umr->flags |= MLX5_UMR_INLINE;
+       umr->flags = flags;
        umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
        umr->mkey_mask = frwr_mkey_mask();
 }
@@ -4305,7 +4338,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
                             struct mlx5_ib_mr *mr,
                             u32 key, int access)
 {
-       int ndescs = ALIGN(mr->ndescs, 8) >> 1;
+       int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
 
        memset(seg, 0, sizeof(*seg));
 
@@ -4356,7 +4389,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
                             struct mlx5_ib_mr *mr,
                             struct mlx5_ib_pd *pd)
 {
-       int bcount = mr->desc_size * mr->ndescs;
+       int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
 
        dseg->addr = cpu_to_be64(mr->desc_map);
        dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
@@ -4549,23 +4582,37 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr,
        return 0;
 }
 
-static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
-                               struct mlx5_ib_qp *qp, void **seg,
-                               int *size, void **cur_edge)
+static int set_sig_data_segment(const struct ib_send_wr *send_wr,
+                               struct ib_mr *sig_mr,
+                               struct ib_sig_attrs *sig_attrs,
+                               struct mlx5_ib_qp *qp, void **seg, int *size,
+                               void **cur_edge)
 {
-       struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
-       struct ib_mr *sig_mr = wr->sig_mr;
        struct mlx5_bsf *bsf;
-       u32 data_len = wr->wr.sg_list->length;
-       u32 data_key = wr->wr.sg_list->lkey;
-       u64 data_va = wr->wr.sg_list->addr;
+       u32 data_len;
+       u32 data_key;
+       u64 data_va;
+       u32 prot_len = 0;
+       u32 prot_key = 0;
+       u64 prot_va = 0;
+       bool prot = false;
        int ret;
        int wqe_size;
+       struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+       struct mlx5_ib_mr *pi_mr = mr->pi_mr;
+
+       data_len = pi_mr->data_length;
+       data_key = pi_mr->ibmr.lkey;
+       data_va = pi_mr->data_iova;
+       if (pi_mr->meta_ndescs) {
+               prot_len = pi_mr->meta_length;
+               prot_key = pi_mr->ibmr.lkey;
+               prot_va = pi_mr->pi_iova;
+               prot = true;
+       }
 
-       if (!wr->prot ||
-           (data_key == wr->prot->lkey &&
-            data_va == wr->prot->addr &&
-            data_len == wr->prot->length)) {
+       if (!prot || (data_key == prot_key && data_va == prot_va &&
+                     data_len == prot_len)) {
                /**
                 * Source domain doesn't contain signature information
                 * or data and protection are interleaved in memory.
@@ -4599,8 +4646,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
                struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
                struct mlx5_stride_block_entry *data_sentry;
                struct mlx5_stride_block_entry *prot_sentry;
-               u32 prot_key = wr->prot->lkey;
-               u64 prot_va = wr->prot->addr;
                u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
                int prot_size;
 
@@ -4650,17 +4695,15 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
 }
 
 static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
-                                const struct ib_sig_handover_wr *wr, u32 size,
-                                u32 length, u32 pdn)
+                                struct ib_mr *sig_mr, int access_flags,
+                                u32 size, u32 length, u32 pdn)
 {
-       struct ib_mr *sig_mr = wr->sig_mr;
        u32 sig_key = sig_mr->rkey;
        u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
 
        memset(seg, 0, sizeof(*seg));
 
-       seg->flags = get_umr_flags(wr->access_flags) |
-                                  MLX5_MKC_ACCESS_MODE_KLMS;
+       seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS;
        seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
        seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
                                    MLX5_MKEY_BSF_EN | pdn);
@@ -4680,49 +4723,50 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
        umr->mkey_mask = sig_mkey_mask();
 }
 
-
-static int set_sig_umr_wr(const struct ib_send_wr *send_wr,
-                         struct mlx5_ib_qp *qp, void **seg, int *size,
-                         void **cur_edge)
+static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
+                        struct mlx5_ib_qp *qp, void **seg, int *size,
+                        void **cur_edge)
 {
-       const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
-       struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
+       const struct ib_reg_wr *wr = reg_wr(send_wr);
+       struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
+       struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
+       struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
        u32 pdn = get_pd(qp)->pdn;
        u32 xlt_size;
        int region_len, ret;
 
-       if (unlikely(wr->wr.num_sge != 1) ||
-           unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
-           unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+       if (unlikely(send_wr->num_sge != 0) ||
+           unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
+           unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
            unlikely(!sig_mr->sig->sig_status_checked))
                return -EINVAL;
 
        /* length of the protected region, data + protection */
-       region_len = wr->wr.sg_list->length;
-       if (wr->prot &&
-           (wr->prot->lkey != wr->wr.sg_list->lkey  ||
-            wr->prot->addr != wr->wr.sg_list->addr  ||
-            wr->prot->length != wr->wr.sg_list->length))
-               region_len += wr->prot->length;
+       region_len = pi_mr->ibmr.length;
 
        /**
         * KLM octoword size - if protection was provided
         * then we use strided block format (3 octowords),
         * else we use single KLM (1 octoword)
         **/
-       xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm);
+       if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
+               xlt_size = 0x30;
+       else
+               xlt_size = sizeof(struct mlx5_klm);
 
        set_sig_umr_segment(*seg, xlt_size);
        *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
        *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
        handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-       set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn);
+       set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
+                            pdn);
        *seg += sizeof(struct mlx5_mkey_seg);
        *size += sizeof(struct mlx5_mkey_seg) / 16;
        handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
 
-       ret = set_sig_data_segment(wr, qp, seg, size, cur_edge);
+       ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
+                                  cur_edge);
        if (ret)
                return ret;
 
@@ -4759,12 +4803,14 @@ static int set_psv_wr(struct ib_sig_domain *domain,
 
 static int set_reg_wr(struct mlx5_ib_qp *qp,
                      const struct ib_reg_wr *wr,
-                     void **seg, int *size, void **cur_edge)
+                     void **seg, int *size, void **cur_edge,
+                     bool check_not_free)
 {
        struct mlx5_ib_mr *mr = to_mmr(wr->mr);
        struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
-       size_t mr_list_size = mr->ndescs * mr->desc_size;
+       int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
        bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
+       u8 flags = 0;
 
        if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
                mlx5_ib_warn(to_mdev(qp->ibqp.device),
@@ -4772,7 +4818,12 @@ static int set_reg_wr(struct mlx5_ib_qp *qp,
                return -EINVAL;
        }
 
-       set_reg_umr_seg(*seg, mr, umr_inline);
+       if (check_not_free)
+               flags |= MLX5_UMR_CHECK_NOT_FREE;
+       if (umr_inline)
+               flags |= MLX5_UMR_INLINE;
+
+       set_reg_umr_seg(*seg, mr, flags);
        *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
        *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
        handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
@@ -4898,8 +4949,12 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
        struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_core_dev *mdev = dev->mdev;
+       struct ib_reg_wr reg_pi_wr;
        struct mlx5_ib_qp *qp;
        struct mlx5_ib_mr *mr;
+       struct mlx5_ib_mr *pi_mr;
+       struct mlx5_ib_mr pa_pi_mr;
+       struct ib_sig_attrs *sig_attrs;
        struct mlx5_wqe_xrc_seg *xrc;
        struct mlx5_bf *bf;
        void *cur_edge;
@@ -4953,7 +5008,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                        goto out;
                }
 
-               if (wr->opcode == IB_WR_REG_MR) {
+               if (wr->opcode == IB_WR_REG_MR ||
+                   wr->opcode == IB_WR_REG_MR_INTEGRITY) {
                        fence = dev->umr_fence;
                        next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
                } else  {
@@ -5003,7 +5059,7 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                                qp->sq.wr_data[idx] = IB_WR_REG_MR;
                                ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
                                err = set_reg_wr(qp, reg_wr(wr), &seg, &size,
-                                                &cur_edge);
+                                                &cur_edge, true);
                                if (err) {
                                        *bad_wr = wr;
                                        goto out;
@@ -5011,26 +5067,82 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                                num_sge = 0;
                                break;
 
-                       case IB_WR_REG_SIG_MR:
-                               qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
-                               mr = to_mmr(sig_handover_wr(wr)->sig_mr);
-
+                       case IB_WR_REG_MR_INTEGRITY:
+                               qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY;
+
+                               mr = to_mmr(reg_wr(wr)->mr);
+                               pi_mr = mr->pi_mr;
+
+                               if (pi_mr) {
+                                       memset(&reg_pi_wr, 0,
+                                              sizeof(struct ib_reg_wr));
+
+                                       reg_pi_wr.mr = &pi_mr->ibmr;
+                                       reg_pi_wr.access = reg_wr(wr)->access;
+                                       reg_pi_wr.key = pi_mr->ibmr.rkey;
+
+                                       ctrl->imm = cpu_to_be32(reg_pi_wr.key);
+                                       /* UMR for data + prot registration */
+                                       err = set_reg_wr(qp, &reg_pi_wr, &seg,
+                                                        &size, &cur_edge,
+                                                        false);
+                                       if (err) {
+                                               *bad_wr = wr;
+                                               goto out;
+                                       }
+                                       finish_wqe(qp, ctrl, seg, size,
+                                                  cur_edge, idx, wr->wr_id,
+                                                  nreq, fence,
+                                                  MLX5_OPCODE_UMR);
+
+                                       err = begin_wqe(qp, &seg, &ctrl, wr,
+                                                       &idx, &size, &cur_edge,
+                                                       nreq);
+                                       if (err) {
+                                               mlx5_ib_warn(dev, "\n");
+                                               err = -ENOMEM;
+                                               *bad_wr = wr;
+                                               goto out;
+                                       }
+                               } else {
+                                       memset(&pa_pi_mr, 0,
+                                              sizeof(struct mlx5_ib_mr));
+                                       /* No UMR, use local_dma_lkey */
+                                       pa_pi_mr.ibmr.lkey =
+                                               mr->ibmr.pd->local_dma_lkey;
+
+                                       pa_pi_mr.ndescs = mr->ndescs;
+                                       pa_pi_mr.data_length = mr->data_length;
+                                       pa_pi_mr.data_iova = mr->data_iova;
+                                       if (mr->meta_ndescs) {
+                                               pa_pi_mr.meta_ndescs =
+                                                       mr->meta_ndescs;
+                                               pa_pi_mr.meta_length =
+                                                       mr->meta_length;
+                                               pa_pi_mr.pi_iova = mr->pi_iova;
+                                       }
+
+                                       pa_pi_mr.ibmr.length = mr->ibmr.length;
+                                       mr->pi_mr = &pa_pi_mr;
+                               }
                                ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
-                               err = set_sig_umr_wr(wr, qp, &seg, &size,
-                                                    &cur_edge);
+                               /* UMR for sig MR */
+                               err = set_pi_umr_wr(wr, qp, &seg, &size,
+                                                   &cur_edge);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }
-
                                finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
                                           wr->wr_id, nreq, fence,
                                           MLX5_OPCODE_UMR);
+
                                /*
                                 * SET_PSV WQEs are not signaled and solicited
                                 * on error
                                 */
+                               sig_attrs = mr->ibmr.sig_attrs;
                                err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
                                                  &size, &cur_edge, nreq, false,
                                                  true);
@@ -5040,19 +5152,18 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                                        *bad_wr = wr;
                                        goto out;
                                }
-
-                               err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem,
-                                                mr->sig->psv_memory.psv_idx, &seg,
-                                                &size);
+                               err = set_psv_wr(&sig_attrs->mem,
+                                                mr->sig->psv_memory.psv_idx,
+                                                &seg, &size);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }
-
                                finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-                                          wr->wr_id, nreq, fence,
+                                          wr->wr_id, nreq, next_fence,
                                           MLX5_OPCODE_SET_PSV);
+
                                err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
                                                  &size, &cur_edge, nreq, false,
                                                  true);
@@ -5062,20 +5173,20 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                                        *bad_wr = wr;
                                        goto out;
                                }
-
-                               err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire,
-                                                mr->sig->psv_wire.psv_idx, &seg,
-                                                &size);
+                               err = set_psv_wr(&sig_attrs->wire,
+                                                mr->sig->psv_wire.psv_idx,
+                                                &seg, &size);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }
-
                                finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-                                          wr->wr_id, nreq, fence,
+                                          wr->wr_id, nreq, next_fence,
                                           MLX5_OPCODE_SET_PSV);
-                               qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+
+                               qp->next_fence =
+                                       MLX5_FENCE_MODE_INITIATOR_SMALL;
                                num_sge = 0;
                                goto skip_psv;
 
@@ -6047,7 +6158,7 @@ err:
        return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
+void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(wq->device);
        struct mlx5_ib_rwq *rwq = to_mrwq(wq);
@@ -6055,8 +6166,6 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
        mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
        destroy_user_rq(dev, wq->pd, rwq, udata);
        kfree(rwq);
-
-       return 0;
 }
 
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
@@ -6367,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp)
 
        handle_drain_completion(cq, &rdrain, dev);
 }
+
+/**
+ * Bind a qp to a counter. If @counter is NULL then bind the qp to
+ * the default counter
+ */
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+{
+       struct mlx5_ib_qp *mqp = to_mqp(qp);
+       int err = 0;
+
+       mutex_lock(&mqp->mutex);
+       if (mqp->state == IB_QPS_RESET) {
+               qp->counter = counter;
+               goto out;
+       }
+
+       if (mqp->state == IB_QPS_RTS) {
+               err = __mlx5_ib_qp_set_counter(qp, counter);
+               if (!err)
+                       qp->counter = counter;
+
+               goto out;
+       }
+
+       mqp->counter_pending = 1;
+       qp->counter = counter;
+
+out:
+       mutex_unlock(&mqp->mutex);
+       return err;
+}
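
mlx5_ib_qp_set_counter() above records the counter while the QP is still in RESET, rebinds immediately when the QP is in RTS, and otherwise leaves counter_pending set so __mlx5_ib_modify_qp() applies it on the transition to RTS. The rebind itself is an RTS2RTS modify whose qp_counter_set_usr_page field carries the counter set id in its top byte; a small sketch of that packing, with an assumed set id of 0x2a and an assumed existing field value:

	__be32 usr_page = cpu_to_be32(0x00abcdef);	/* assumed existing value */
	u32 set_id = 0x2a;				/* assumed counter id     */

	usr_page &= cpu_to_be32(0xffffff);		/* keep bits 23:0          */
	usr_page |= cpu_to_be32(set_id << 24);		/* counter id in bits 31:24 */
	/* usr_page now encodes 0x2aabcdef; it is passed to mlx5_core_qp_modify()
	 * with MLX5_QP_OPTPAR_COUNTER_SET_ID, so no full QP state change is needed.
	 */
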
index aaf10dd5364d44c80b1e9767d0d2a36b8aec0280..aef1d274a14e4bb11418f54239bd5c30e1fefd9b 100644 (file)
@@ -214,8 +214,6 @@ int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
 
                dma_unmap_addr_set(&buf->direct, mapping, t);
 
-               memset(buf->direct.buf, 0, size);
-
                while (t & ((1 << shift) - 1)) {
                        --shift;
                        npages *= 2;
index 8ff0e90d756485764645de4259c9e6945402b68c..edccfd6e178f020f229789c9cc235ea850479bfd 100644 (file)
@@ -482,7 +482,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
 
        ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
        if (ret < 0) {
-               put_page(pages[0]);
+               put_user_page(pages[0]);
                goto out;
        }
 
@@ -490,7 +490,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
                                 mthca_uarc_virt(dev, uar, i));
        if (ret) {
                pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-               put_page(sg_page(&db_tab->page[i].mem));
+               put_user_page(sg_page(&db_tab->page[i].mem));
                goto out;
        }
 
@@ -556,7 +556,7 @@ void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
                if (db_tab->page[i].uvirt) {
                        mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
                        pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-                       put_page(sg_page(&db_tab->page[i].mem));
+                       put_user_page(sg_page(&db_tab->page[i].mem));
                }
        }
 
index 4f40dfedf9208149a992a4e7c511c29dea9a4bcb..23554d8bf2419441c63b5c2b062bd069225e42b8 100644 (file)
@@ -601,10 +601,11 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
        return 0;
 }
 
-static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
-                                    const struct ib_cq_init_attr *attr,
-                                    struct ib_udata *udata)
+static int mthca_create_cq(struct ib_cq *ibcq,
+                          const struct ib_cq_init_attr *attr,
+                          struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
        struct mthca_create_cq ucmd;
        struct mthca_cq *cq;
@@ -614,20 +615,20 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
                udata, struct mthca_ucontext, ibucontext);
 
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (udata) {
-               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
-                       return ERR_PTR(-EFAULT);
+               if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+                       return -EFAULT;
 
                err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
                                        context->db_tab, ucmd.set_db_index,
                                        ucmd.set_db_page);
                if (err)
-                       return ERR_PTR(err);
+                       return err;
 
                err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
                                        context->db_tab, ucmd.arm_db_index,
@@ -636,11 +637,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
                        goto err_unmap_set;
        }
 
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               err = -ENOMEM;
-               goto err_unmap_arm;
-       }
+       cq = to_mcq(ibcq);
 
        if (udata) {
                cq->buf.mr.ibmr.lkey = ucmd.lkey;
@@ -655,20 +652,17 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
                            udata ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
                            cq);
        if (err)
-               goto err_free;
+               goto err_unmap_arm;
 
        if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) {
                mthca_free_cq(to_mdev(ibdev), cq);
                err = -EFAULT;
-               goto err_free;
+               goto err_unmap_arm;
        }
 
        cq->resize_buf = NULL;
 
-       return &cq->ibcq;
-
-err_free:
-       kfree(cq);
+       return 0;
 
 err_unmap_arm:
        if (udata)
@@ -680,7 +674,7 @@ err_unmap_set:
                mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
                                    context->db_tab, ucmd.set_db_index);
 
-       return ERR_PTR(err);
+       return err;
 }
 
 static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
@@ -804,7 +798,7 @@ out:
        return ret;
 }
 
-static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        if (udata) {
                struct mthca_ucontext *context =
@@ -823,9 +817,6 @@ static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
                                    to_mcq(cq)->set_ci_db_index);
        }
        mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
-       kfree(cq);
-
-       return 0;
 }
 
 static inline u32 convert_access(int acc)
@@ -962,8 +953,7 @@ static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata)
        struct mthca_mr *mmr = to_mmr(mr);
 
        mthca_free_mr(to_mdev(mr->device), mmr);
-       if (mmr->umem)
-               ib_umem_release(mmr->umem);
+       ib_umem_release(mmr->umem);
        kfree(mmr);
 
        return 0;
@@ -1153,6 +1143,11 @@ static void get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops mthca_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_MTHCA,
+       .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION,
+       .uverbs_no_driver_id_binding = 1,
+
        .alloc_pd = mthca_alloc_pd,
        .alloc_ucontext = mthca_alloc_ucontext,
        .attach_mcast = mthca_multicast_attach,
@@ -1185,6 +1180,7 @@ static const struct ib_device_ops mthca_dev_ops = {
        .resize_cq = mthca_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
 };
@@ -1243,9 +1239,6 @@ int mthca_register_device(struct mthca_dev *dev)
        if (ret)
                return ret;
 
-       dev->ib_dev.owner                = THIS_MODULE;
-
-       dev->ib_dev.uverbs_abi_ver       = MTHCA_UVERBS_ABI_VERSION;
        dev->ib_dev.uverbs_cmd_mask      =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -1303,7 +1296,6 @@ int mthca_register_device(struct mthca_dev *dev)
        mutex_init(&dev->cap_mask_mutex);
 
        rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
-       dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA;
        ret = ib_register_device(&dev->ib_dev, "mthca%d");
        if (ret)
                return ret;
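
The mthca_provider.c changes above follow this cycle's boiler-plate hoisting: create_cq()/destroy_cq() no longer allocate or free the CQ struct themselves, the core does it based on INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq), and the driver recovers its private struct with to_mcq(), a container_of() wrapper. Static identity such as .owner, .driver_id and .uverbs_abi_ver likewise moves into ib_device_ops. The toy program below models only the allocation pattern, with simplified stand-in names rather than the rdma core API:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ib_cq { int cqe; };			/* stand-in core object */

struct drv_cq {					/* driver CQ embedding the core object */
	int hw_cqn;
	struct ib_cq ibcq;
};

/* "core" side: allocate the driver-declared size, return the embedded core object */
static struct ib_cq *core_alloc_cq(size_t obj_size, size_t ibcq_offset)
{
	char *obj = calloc(1, obj_size);

	return obj ? (struct ib_cq *)(obj + ibcq_offset) : NULL;
}

/* "driver" side: create_cq fills in a pre-allocated object and returns int */
static int drv_create_cq(struct ib_cq *ibcq, int entries)
{
	struct drv_cq *cq = container_of(ibcq, struct drv_cq, ibcq);

	if (entries < 1)
		return -1;		/* would be -EINVAL in the kernel */
	cq->hw_cqn = 42;		/* pretend hardware CQ number */
	ibcq->cqe = entries;
	return 0;
}

int main(void)
{
	struct ib_cq *ibcq = core_alloc_cq(sizeof(struct drv_cq),
					   offsetof(struct drv_cq, ibcq));

	if (!ibcq || drv_create_cq(ibcq, 64))
		return 1;
	printf("cqe=%d cqn=%d\n", ibcq->cqe,
	       container_of(ibcq, struct drv_cq, ibcq)->hw_cqn);
	free(container_of(ibcq, struct drv_cq, ibcq));
	return 0;
}
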
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
deleted file mode 100644 (file)
index 8245353..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config INFINIBAND_NES
-       tristate "NetEffect RNIC Driver"
-       depends on PCI && INET
-       select LIBCRC32C
-       ---help---
-         This is the RDMA Network Interface Card (RNIC) driver for
-         NetEffect Ethernet Cluster Server Adapters.
-
-config INFINIBAND_NES_DEBUG
-       bool "Verbose debugging output"
-       depends on INFINIBAND_NES
-       default n
-       ---help---
-         This option enables debug messages from the NetEffect RNIC
-         driver.  Select this if you are diagnosing a problem.
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile
deleted file mode 100644 (file)
index 239689a..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o
-
-iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
deleted file mode 100644 (file)
index 29b3247..0000000
+++ /dev/null
@@ -1,1211 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/if_arp.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <rdma/ib_smi.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-#include <rdma/iw_cm.h>
-
-#include "nes.h"
-
-#include <net/netevent.h>
-#include <net/neighbour.h>
-#include <linux/route.h>
-#include <net/ip_fib.h>
-
-MODULE_AUTHOR("NetEffect");
-MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-
-int interrupt_mod_interval = 0;
-
-/* Interoperability */
-int mpa_version = 1;
-module_param(mpa_version, int, 0644);
-MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)");
-
-/* Interoperability */
-int disable_mpa_crc = 0;
-module_param(disable_mpa_crc, int, 0644);
-MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC");
-
-unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU;
-module_param(nes_drv_opt, int, 0644);
-MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters");
-
-unsigned int nes_debug_level = 0;
-module_param_named(debug_level, nes_debug_level, uint, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug output level");
-
-unsigned int wqm_quanta = 0x10000;
-module_param(wqm_quanta, int, 0644);
-MODULE_PARM_DESC(wqm_quanta, "WQM quanta");
-
-static bool limit_maxrdreqsz;
-module_param(limit_maxrdreqsz, bool, 0644);
-MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes");
-
-LIST_HEAD(nes_adapter_list);
-static LIST_HEAD(nes_dev_list);
-
-atomic_t qps_destroyed;
-
-static unsigned int ee_flsh_adapter;
-static unsigned int sysfs_nonidx_addr;
-static unsigned int sysfs_idx_addr;
-
-static const struct pci_device_id nes_pci_table[] = {
-       { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), },
-       { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), },
-       {0}
-};
-
-MODULE_DEVICE_TABLE(pci, nes_pci_table);
-
-static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);
-static int nes_net_event(struct notifier_block *, unsigned long, void *);
-static int nes_notifiers_registered;
-
-
-static struct notifier_block nes_inetaddr_notifier = {
-       .notifier_call = nes_inetaddr_event
-};
-
-static struct notifier_block nes_net_notifier = {
-       .notifier_call = nes_net_event
-};
-
-/**
- * nes_inetaddr_event
- */
-static int nes_inetaddr_event(struct notifier_block *notifier,
-               unsigned long event, void *ptr)
-{
-       struct in_ifaddr *ifa = ptr;
-       struct net_device *event_netdev = ifa->ifa_dev->dev;
-       struct nes_device *nesdev;
-       struct net_device *netdev;
-       struct net_device *upper_dev;
-       struct nes_vnic *nesvnic;
-       unsigned int is_bonded;
-
-       nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n",
-                 &ifa->ifa_address, &ifa->ifa_mask);
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n",
-                               nesdev, nesdev->netdev[0]->name);
-               netdev = nesdev->netdev[0];
-               nesvnic = netdev_priv(netdev);
-               upper_dev = netdev_master_upper_dev_get(netdev);
-               is_bonded = netif_is_bond_slave(netdev) &&
-                           (upper_dev == event_netdev);
-               if ((netdev == event_netdev) || is_bonded) {
-                       if (nesvnic->rdma_enabled == 0) {
-                               nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since"
-                                               " RDMA is not enabled.\n",
-                                               netdev->name);
-                               return NOTIFY_OK;
-                       }
-                       /* we have ifa->ifa_address/mask here if we need it */
-                       switch (event) {
-                               case NETDEV_DOWN:
-                                       nes_debug(NES_DBG_NETDEV, "event:DOWN\n");
-                                       nes_write_indexed(nesdev,
-                                                       NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0);
-
-                                       nes_manage_arp_cache(netdev, netdev->dev_addr,
-                                                       ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE);
-                                       nesvnic->local_ipaddr = 0;
-                                       if (is_bonded)
-                                               continue;
-                                       else
-                                               return NOTIFY_OK;
-                                       break;
-                               case NETDEV_UP:
-                                       nes_debug(NES_DBG_NETDEV, "event:UP\n");
-
-                                       if (nesvnic->local_ipaddr != 0) {
-                                               nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n");
-                                               return NOTIFY_OK;
-                                       }
-                                       /* fall through */
-                               case NETDEV_CHANGEADDR:
-                                       /* Add the address to the IP table */
-                                       if (upper_dev) {
-                                               struct in_device *in;
-
-                                               rcu_read_lock();
-                                               in = __in_dev_get_rcu(upper_dev);
-                                               if (in) {
-                                                       struct in_ifaddr *ifa;
-
-                                                       ifa = rcu_dereference(in->ifa_list);
-                                                       if (ifa)
-                                                               nesvnic->local_ipaddr = ifa->ifa_address;
-                                               }
-                                               rcu_read_unlock();
-                                       } else {
-                                               nesvnic->local_ipaddr = ifa->ifa_address;
-                                       }
-
-                                       nes_write_indexed(nesdev,
-                                                       NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)),
-                                                       ntohl(nesvnic->local_ipaddr));
-                                       nes_manage_arp_cache(netdev, netdev->dev_addr,
-                                                       ntohl(nesvnic->local_ipaddr), NES_ARP_ADD);
-                                       if (is_bonded)
-                                               continue;
-                                       else
-                                               return NOTIFY_OK;
-                                       break;
-                               default:
-                                       break;
-                       }
-               }
-       }
-
-       return NOTIFY_DONE;
-}
-
-
-/**
- * nes_net_event
- */
-static int nes_net_event(struct notifier_block *notifier,
-               unsigned long event, void *ptr)
-{
-       struct neighbour *neigh = ptr;
-       struct nes_device *nesdev;
-       struct net_device *netdev;
-       struct nes_vnic *nesvnic;
-
-       switch (event) {
-               case NETEVENT_NEIGH_UPDATE:
-                       list_for_each_entry(nesdev, &nes_dev_list, list) {
-                               /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */
-                               netdev = nesdev->netdev[0];
-                               nesvnic = netdev_priv(netdev);
-                               if (netdev == neigh->dev) {
-                                       if (nesvnic->rdma_enabled == 0) {
-                                               nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n",
-                                                               netdev->name);
-                                       } else {
-                                               if (neigh->nud_state & NUD_VALID) {
-                                                       nes_manage_arp_cache(neigh->dev, neigh->ha,
-                                                                       ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD);
-                                               } else {
-                                                       nes_manage_arp_cache(neigh->dev, neigh->ha,
-                                                                       ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE);
-                                               }
-                                       }
-                                       return NOTIFY_OK;
-                               }
-                       }
-                       break;
-               default:
-                       nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event);
-                       break;
-       }
-
-       return NOTIFY_DONE;
-}
-
-
-/**
- * nes_add_ref
- */
-void nes_add_ref(struct ib_qp *ibqp)
-{
-       struct nes_qp *nesqp;
-
-       nesqp = to_nesqp(ibqp);
-       nes_debug(NES_DBG_QP, "Bumping refcount for QP%u.  Pre-inc value = %u\n",
-                       ibqp->qp_num, atomic_read(&nesqp->refcount));
-       atomic_inc(&nesqp->refcount);
-}
-
-static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-       unsigned long flags;
-       struct nes_qp *nesqp = cqp_request->cqp_callback_pointer;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       atomic_inc(&qps_destroyed);
-
-       /* Free the control structures */
-
-       if (nesqp->pbl_vbase) {
-               pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-                               nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               nesadapter->free_256pbl++;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
-               nesqp->pbl_vbase = NULL;
-
-       } else {
-               pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-                               nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
-       }
-       nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id);
-
-       nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL;
-       kfree(nesqp->allocated_buffer);
-
-}
-
-/**
- * nes_rem_ref
- */
-void nes_rem_ref(struct ib_qp *ibqp)
-{
-       u64 u64temp;
-       struct nes_qp *nesqp;
-       struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       u32 opcode;
-
-       nesqp = to_nesqp(ibqp);
-
-       if (atomic_read(&nesqp->refcount) == 0) {
-               printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n",
-                               __func__, ibqp->qp_num, nesqp->last_aeq);
-               BUG();
-       }
-
-       if (atomic_dec_and_test(&nesqp->refcount)) {
-               if (nesqp->pau_mode)
-                       nes_destroy_pau_qp(nesdev, nesqp);
-
-               /* Destroy the QP */
-               cqp_request = nes_get_cqp_request(nesdev);
-               if (cqp_request == NULL) {
-                       nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-                       return;
-               }
-               cqp_request->waiting = 0;
-               cqp_request->callback = 1;
-               cqp_request->cqp_callback = nes_cqp_rem_ref_callback;
-               cqp_request->cqp_callback_pointer = nesqp;
-               cqp_wqe = &cqp_request->cqp_wqe;
-
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-               opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP;
-
-               if (nesqp->hte_added) {
-                       opcode  |= NES_CQP_QP_DEL_HTE;
-                       nesqp->hte_added = 0;
-               }
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-               u64temp = (u64)nesqp->nesqp_context_pbase;
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-               nes_post_cqp_request(nesdev, cqp_request);
-       }
-}
-
-
-/**
- * nes_get_qp
- */
-struct ib_qp *nes_get_qp(struct ib_device *device, int qpn)
-{
-       struct nes_vnic *nesvnic = to_nesvnic(device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp)))
-               return NULL;
-
-       return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp;
-}
-
-
-/**
- * nes_print_macaddr
- */
-static void nes_print_macaddr(struct net_device *netdev)
-{
-       nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n",
-                 netdev->name, netdev->dev_addr, netdev->irq);
-}
-
-/**
- * nes_interrupt - handle interrupts
- */
-static irqreturn_t nes_interrupt(int irq, void *dev_id)
-{
-       struct nes_device *nesdev = (struct nes_device *)dev_id;
-       int handled = 0;
-       u32 int_mask;
-       u32 int_req;
-       u32 int_stat;
-       u32 intf_int_stat;
-       u32 timer_stat;
-
-       if (nesdev->msi_enabled) {
-               /* No need to read the interrupt pending register if msi is enabled */
-               handled = 1;
-       } else {
-               if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) {
-                       /* Master interrupt enable provides synchronization for kicking off bottom half
-                         when interrupt sharing is going on */
-                       int_mask = nes_read32(nesdev->regs + NES_INT_MASK);
-                       if (int_mask & 0x80000000) {
-                               /* Check interrupt status to see if this might be ours */
-                               int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
-                               int_req = nesdev->int_req;
-                               if (int_stat&int_req) {
-                                       /* if interesting CEQ or AEQ is pending, claim the interrupt */
-                                       if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) {
-                                               handled = 1;
-                                       } else {
-                                               if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) {
-                                                       /* Timer might be running but might be for another function */
-                                                       timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
-                                                       if ((timer_stat & nesdev->timer_int_req) != 0) {
-                                                               handled = 1;
-                                                       }
-                                               }
-                                               if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) &&
-                                                               (handled == 0)) {
-                                                       intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
-                                                       if ((intf_int_stat & nesdev->intf_int_req) != 0) {
-                                                               handled = 1;
-                                                       }
-                                               }
-                                       }
-                                       if (handled) {
-                                               nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000));
-                                               int_mask = nes_read32(nesdev->regs+NES_INT_MASK);
-                                               /* Save off the status to save an additional read */
-                                               nesdev->int_stat = int_stat;
-                                               nesdev->napi_isr_ran = 1;
-                                       }
-                               }
-                       }
-               } else {
-                       handled = nes_read32(nesdev->regs+NES_INT_PENDING);
-               }
-       }
-
-       if (handled) {
-
-               if (nes_napi_isr(nesdev) == 0) {
-                       tasklet_schedule(&nesdev->dpc_tasklet);
-
-               }
-               return IRQ_HANDLED;
-       } else {
-               return IRQ_NONE;
-       }
-}
-
-
-/**
- * nes_probe - Device initialization
- */
-static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
-{
-       struct net_device *netdev = NULL;
-       struct nes_device *nesdev = NULL;
-       int ret = 0;
-       void __iomem *mmio_regs = NULL;
-       u8 hw_rev;
-
-       printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n",
-                       DRV_VERSION, pci_name(pcidev));
-
-       ret = pci_enable_device(pcidev);
-       if (ret) {
-               printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev));
-               goto bail0;
-       }
-
-       nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n",
-                       (long unsigned int)pci_resource_start(pcidev, BAR_0),
-                       (long unsigned int)pci_resource_len(pcidev, BAR_0));
-       nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n",
-                       (long unsigned int)pci_resource_start(pcidev, BAR_1),
-                       (long unsigned int)pci_resource_len(pcidev, BAR_1));
-
-       /* Make sure PCI base addr are MMIO */
-       if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) ||
-                       !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) {
-               printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
-               ret = -ENODEV;
-               goto bail1;
-       }
-
-       /* Reserve PCI I/O and memory resources */
-       ret = pci_request_regions(pcidev, DRV_NAME);
-       if (ret) {
-               printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev));
-               goto bail1;
-       }
-
-       if ((sizeof(dma_addr_t) > 4)) {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "64b DMA mask configuration failed\n");
-                       goto bail2;
-               }
-               ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
-               if (ret) {
-                       printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n");
-                       goto bail2;
-               }
-       } else {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "32b DMA mask configuration failed\n");
-                       goto bail2;
-               }
-               ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
-               if (ret) {
-                       printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n");
-                       goto bail2;
-               }
-       }
-
-       pci_set_master(pcidev);
-
-       /* Allocate hardware structure */
-       nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL);
-       if (!nesdev) {
-               ret = -ENOMEM;
-               goto bail2;
-       }
-
-       nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev);
-       nesdev->pcidev = pcidev;
-       pci_set_drvdata(pcidev, nesdev);
-
-       pci_read_config_byte(pcidev, 0x0008, &hw_rev);
-       nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev);
-
-       spin_lock_init(&nesdev->indexed_regs_lock);
-
-       /* Remap the PCI registers in adapter BAR0 to kernel VA space */
-       mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0),
-                                   pci_resource_len(pcidev, BAR_0));
-       if (mmio_regs == NULL) {
-               printk(KERN_ERR PFX "Unable to remap BAR0\n");
-               ret = -EIO;
-               goto bail3;
-       }
-       nesdev->regs = mmio_regs;
-       nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs;
-
-       /* Ensure interrupts are disabled */
-       nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
-
-       if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) {
-               if (!pci_enable_msi(nesdev->pcidev)) {
-                       nesdev->msi_enabled = 1;
-                       nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n",
-                                       pci_name(pcidev));
-               } else {
-                       nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n",
-                                       pci_name(pcidev));
-               }
-       } else {
-               nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n",
-                               pci_name(pcidev));
-       }
-
-       nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0);
-       nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1);
-
-       /* Init the adapter */
-       nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev);
-       if (!nesdev->nesadapter) {
-               printk(KERN_ERR PFX "Unable to initialize adapter.\n");
-               ret = -ENOMEM;
-               goto bail5;
-       }
-       nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
-       nesdev->nesadapter->wqm_quanta = wqm_quanta;
-
-       /* nesdev->base_doorbell_index =
-                       nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */
-       nesdev->base_doorbell_index = 1;
-       nesdev->doorbell_start = nesdev->nesadapter->doorbell_start;
-       if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
-               switch (PCI_FUNC(nesdev->pcidev->devfn) %
-                       nesdev->nesadapter->port_count) {
-               case 1:
-                       nesdev->mac_index = 2;
-                       break;
-               case 2:
-                       nesdev->mac_index = 1;
-                       break;
-               case 3:
-                       nesdev->mac_index = 3;
-                       break;
-               case 0:
-               default:
-                       nesdev->mac_index = 0;
-               }
-       } else {
-               nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) %
-                                               nesdev->nesadapter->port_count;
-       }
-
-       if ((limit_maxrdreqsz ||
-            ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) &&
-             (hw_rev == NE020_REV1))) &&
-           (pcie_get_readrq(pcidev) > 256)) {
-               if (pcie_set_readrq(pcidev, 256))
-                       printk(KERN_ERR PFX "Unable to set max read request"
-                               " to 256 bytes\n");
-               else
-                       nes_debug(NES_DBG_INIT, "Max read request size set"
-                               " to 256 bytes\n");
-       }
-
-       tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev);
-
-       /* bring up the Control QP */
-       if (nes_init_cqp(nesdev)) {
-               ret = -ENODEV;
-               goto bail6;
-       }
-
-       /* Arm the CCQ */
-       nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                       PCI_FUNC(nesdev->pcidev->devfn));
-       nes_read32(nesdev->regs+NES_CQE_ALLOC);
-
-       /* Enable the interrupts */
-       nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) |
-                       (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
-       if (PCI_FUNC(nesdev->pcidev->devfn) < 4) {
-               nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24));
-       }
-
-       /* TODO: This really should be the first driver to load, not function 0 */
-       if (PCI_FUNC(nesdev->pcidev->devfn) == 0) {
-               /* pick up PCI and critical errors if the first driver to load */
-               nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR;
-               nesdev->int_req |= NES_INT_INTF;
-       } else {
-               nesdev->intf_int_req = 0;
-       }
-       nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0);
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0);
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265);
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804);
-
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790);
-
-       /* deal with both periodic and one_shot */
-       nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn);
-       nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req;
-       nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n",
-                       PCI_FUNC(nesdev->pcidev->devfn),
-                       nesdev->timer_int_req, nesdev->nesadapter->timer_int_req);
-
-       nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-
-       list_add_tail(&nesdev->list, &nes_dev_list);
-
-       /* Request an interrupt line for the driver */
-       ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
-                               pci_name(pcidev), pcidev->irq);
-               goto bail65;
-       }
-
-       nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-
-       if (nes_notifiers_registered == 0) {
-               register_inetaddr_notifier(&nes_inetaddr_notifier);
-               register_netevent_notifier(&nes_net_notifier);
-       }
-       nes_notifiers_registered++;
-
-       INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status);
-
-       /* Initialize network devices */
-       netdev = nes_netdev_init(nesdev, mmio_regs);
-       if (netdev == NULL) {
-               ret = -ENOMEM;
-               goto bail7;
-       }
-
-       /* Register network device */
-       ret = register_netdev(netdev);
-       if (ret) {
-               printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret);
-               nes_netdev_destroy(netdev);
-               goto bail7;
-       }
-
-       nes_print_macaddr(netdev);
-
-       nesdev->netdev_count++;
-       nesdev->nesadapter->netdev_count++;
-
-       printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n",
-                       pci_name(pcidev));
-       return 0;
-
-       bail7:
-       printk(KERN_ERR PFX "bail7\n");
-       while (nesdev->netdev_count > 0) {
-               nesdev->netdev_count--;
-               nesdev->nesadapter->netdev_count--;
-
-               unregister_netdev(nesdev->netdev[nesdev->netdev_count]);
-               nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]);
-       }
-
-       nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",
-                       nesdev->netdev_count, nesdev->nesadapter->netdev_count);
-
-       nes_notifiers_registered--;
-       if (nes_notifiers_registered == 0) {
-               unregister_netevent_notifier(&nes_net_notifier);
-               unregister_inetaddr_notifier(&nes_inetaddr_notifier);
-       }
-
-       list_del(&nesdev->list);
-       nes_destroy_cqp(nesdev);
-
-       bail65:
-       printk(KERN_ERR PFX "bail65\n");
-       free_irq(pcidev->irq, nesdev);
-       if (nesdev->msi_enabled) {
-               pci_disable_msi(pcidev);
-       }
-       bail6:
-       printk(KERN_ERR PFX "bail6\n");
-       tasklet_kill(&nesdev->dpc_tasklet);
-       /* Deallocate the Adapter Structure */
-       nes_destroy_adapter(nesdev->nesadapter);
-
-       bail5:
-       printk(KERN_ERR PFX "bail5\n");
-       iounmap(nesdev->regs);
-
-       bail3:
-       printk(KERN_ERR PFX "bail3\n");
-       kfree(nesdev);
-
-       bail2:
-       pci_release_regions(pcidev);
-
-       bail1:
-       pci_disable_device(pcidev);
-
-       bail0:
-       return ret;
-}
-
-
-/**
- * nes_remove - unload from kernel
- */
-static void nes_remove(struct pci_dev *pcidev)
-{
-       struct nes_device *nesdev = pci_get_drvdata(pcidev);
-       struct net_device *netdev;
-       int netdev_index = 0;
-       unsigned long flags;
-
-       if (nesdev->netdev_count) {
-               netdev = nesdev->netdev[netdev_index];
-               if (netdev) {
-                       netif_stop_queue(netdev);
-                       unregister_netdev(netdev);
-                       nes_netdev_destroy(netdev);
-
-                       nesdev->netdev[netdev_index] = NULL;
-                       nesdev->netdev_count--;
-                       nesdev->nesadapter->netdev_count--;
-               }
-       }
-
-       nes_notifiers_registered--;
-       if (nes_notifiers_registered == 0) {
-               unregister_netevent_notifier(&nes_net_notifier);
-               unregister_inetaddr_notifier(&nes_inetaddr_notifier);
-       }
-
-       list_del(&nesdev->list);
-       nes_destroy_cqp(nesdev);
-
-       free_irq(pcidev->irq, nesdev);
-       tasklet_kill(&nesdev->dpc_tasklet);
-
-       spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-       if (nesdev->link_recheck) {
-               spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-               cancel_delayed_work_sync(&nesdev->work);
-       } else {
-               spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-       }
-
-       /* Deallocate the Adapter Structure */
-       nes_destroy_adapter(nesdev->nesadapter);
-
-       if (nesdev->msi_enabled) {
-               pci_disable_msi(pcidev);
-       }
-
-       iounmap(nesdev->regs);
-       kfree(nesdev);
-
-       /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */
-       pci_release_regions(pcidev);
-       pci_disable_device(pcidev);
-       pci_set_drvdata(pcidev, NULL);
-}
-
-
-static ssize_t adapter_show(struct device_driver *ddp, char *buf)
-{
-       unsigned int  devfn = 0xffffffff;
-       unsigned char bus_number = 0xff;
-       unsigned int  i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       devfn = nesdev->pcidev->devfn;
-                       bus_number = nesdev->pcidev->bus->number;
-                       break;
-               }
-               i++;
-       }
-
-       return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn);
-}
-
-static ssize_t adapter_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-
-       ee_flsh_adapter = simple_strtoul(p, &p, 10);
-       return strnlen(buf, count);
-}
-
-static ssize_t eeprom_cmd_show(struct device_driver *ddp, char *buf)
-{
-       u32 eeprom_cmd = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND);
-                       break;
-               }
-               i++;
-       }
-       return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd);
-}
-
-static ssize_t eeprom_cmd_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t eeprom_data_show(struct device_driver *ddp, char *buf)
-{
-       u32 eeprom_data = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA);
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data);
-}
-
-static ssize_t eeprom_data_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write32(nesdev->regs + NES_EEPROM_DATA, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t flash_cmd_show(struct device_driver *ddp, char *buf)
-{
-       u32 flash_cmd = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND);
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd);
-}
-
-static ssize_t flash_cmd_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write32(nesdev->regs + NES_FLASH_COMMAND, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t flash_data_show(struct device_driver *ddp, char *buf)
-{
-       u32 flash_data = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA);
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data);
-}
-
-static ssize_t flash_data_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write32(nesdev->regs + NES_FLASH_DATA, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t nonidx_addr_show(struct device_driver *ddp, char *buf)
-{
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr);
-}
-
-static ssize_t nonidx_addr_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
-               sysfs_nonidx_addr = simple_strtoul(p, &p, 16);
-
-       return strnlen(buf, count);
-}
-
-static ssize_t nonidx_data_show(struct device_driver *ddp, char *buf)
-{
-       u32 nonidx_data = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr);
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data);
-}
-
-static ssize_t nonidx_data_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write32(nesdev->regs + sysfs_nonidx_addr, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t idx_addr_show(struct device_driver *ddp, char *buf)
-{
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr);
-}
-
-static ssize_t idx_addr_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
-               sysfs_idx_addr = simple_strtoul(p, &p, 16);
-
-       return strnlen(buf, count);
-}
-
-static ssize_t idx_data_show(struct device_driver *ddp, char *buf)
-{
-       u32 idx_data = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       idx_data = nes_read_indexed(nesdev, sysfs_idx_addr);
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data);
-}
-
-static ssize_t idx_data_store(struct device_driver *ddp,
-       const char *buf, size_t count)
-{
-       char *p = (char *)buf;
-       u32 val;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
-               val = simple_strtoul(p, &p, 16);
-               list_for_each_entry(nesdev, &nes_dev_list, list) {
-                       if (i == ee_flsh_adapter) {
-                               nes_write_indexed(nesdev, sysfs_idx_addr, val);
-                               break;
-                       }
-                       i++;
-               }
-       }
-       return strnlen(buf, count);
-}
-
-static ssize_t wqm_quanta_show(struct device_driver *ddp, char *buf)
-{
-       u32 wqm_quanta_value = 0xdead;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       wqm_quanta_value = nesdev->nesadapter->wqm_quanta;
-                       break;
-               }
-               i++;
-       }
-
-       return  snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value);
-}
-
-static ssize_t wqm_quanta_store(struct device_driver *ddp, const char *buf,
-                               size_t count)
-{
-       unsigned long wqm_quanta_value;
-       u32 wqm_config1;
-       u32 i = 0;
-       struct nes_device *nesdev;
-
-       if (kstrtoul(buf, 0, &wqm_quanta_value) < 0)
-               return -EINVAL;
-
-       list_for_each_entry(nesdev, &nes_dev_list, list) {
-               if (i == ee_flsh_adapter) {
-                       nesdev->nesadapter->wqm_quanta = wqm_quanta_value;
-                       wqm_config1 = nes_read_indexed(nesdev,
-                                               NES_IDX_WQM_CONFIG1);
-                       nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1,
-                                       ((wqm_quanta_value << 1) |
-                                       (wqm_config1 & 0x00000001)));
-                       break;
-               }
-               i++;
-       }
-       return strnlen(buf, count);
-}
-
-static DRIVER_ATTR_RW(adapter);
-static DRIVER_ATTR_RW(eeprom_cmd);
-static DRIVER_ATTR_RW(eeprom_data);
-static DRIVER_ATTR_RW(flash_cmd);
-static DRIVER_ATTR_RW(flash_data);
-static DRIVER_ATTR_RW(nonidx_addr);
-static DRIVER_ATTR_RW(nonidx_data);
-static DRIVER_ATTR_RW(idx_addr);
-static DRIVER_ATTR_RW(idx_data);
-static DRIVER_ATTR_RW(wqm_quanta);
-
-static struct attribute *nes_attrs[] = {
-       &driver_attr_adapter.attr,
-       &driver_attr_eeprom_cmd.attr,
-       &driver_attr_eeprom_data.attr,
-       &driver_attr_flash_cmd.attr,
-       &driver_attr_flash_data.attr,
-       &driver_attr_nonidx_addr.attr,
-       &driver_attr_nonidx_data.attr,
-       &driver_attr_idx_addr.attr,
-       &driver_attr_idx_data.attr,
-       &driver_attr_wqm_quanta.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(nes);
-
-static struct pci_driver nes_pci_driver = {
-       .name = DRV_NAME,
-       .id_table = nes_pci_table,
-       .probe = nes_probe,
-       .remove = nes_remove,
-       .groups = nes_groups,
-};
-
-
-/**
- * nes_init_module - module initialization entry point
- */
-static int __init nes_init_module(void)
-{
-       int retval;
-
-       retval = nes_cm_start();
-       if (retval) {
-               printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n");
-               return retval;
-       }
-       return pci_register_driver(&nes_pci_driver);
-}
-
-
-/**
- * nes_exit_module - module unload entry point
- */
-static void __exit nes_exit_module(void)
-{
-       nes_cm_stop();
-
-       pci_unregister_driver(&nes_pci_driver);
-}
-
-
-module_init(nes_init_module);
-module_exit(nes_exit_module);
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
deleted file mode 100644 (file)
index a895fe9..0000000
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __NES_H
-#define __NES_H
-
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <linux/crc32c.h>
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-#include <rdma/rdma_cm.h>
-#include <rdma/iw_cm.h>
-#include <rdma/rdma_netlink.h>
-#include <rdma/iw_portmap.h>
-
-#define NES_SEND_FIRST_WRITE
-
-#define QUEUE_DISCONNECTS
-
-#define DRV_NAME    "iw_nes"
-#define DRV_VERSION "1.5.0.1"
-#define PFX         DRV_NAME ": "
-
-/*
- * NetEffect PCI vendor id and NE010 PCI device id.
- */
-#ifndef PCI_VENDOR_ID_NETEFFECT        /* not in pci.ids yet */
-#define PCI_VENDOR_ID_NETEFFECT          0x1678
-#define PCI_DEVICE_ID_NETEFFECT_NE020    0x0100
-#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
-#endif
-
-#define NE020_REV   4
-#define NE020_REV1  5
-
-#define BAR_0       0
-#define BAR_1       2
-
-#define RX_BUF_SIZE             (1536 + 8)
-#define NES_REG0_SIZE           (4 * 1024)
-#define NES_TX_TIMEOUT          (6*HZ)
-#define NES_FIRST_QPN           64
-#define NES_SW_CONTEXT_ALIGN    1024
-
-#define NES_MAX_MTU            9000
-
-#define NES_NIC_MAX_NICS        16
-#define NES_MAX_ARP_TABLE_SIZE  4096
-
-#define NES_NIC_CEQ_SIZE        8
-/* NICs will be on a separate CQ */
-#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32)
-
-#define NES_MAX_PORT_COUNT 4
-
-#define MAX_DPC_ITERATIONS               128
-
-#define NES_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
-#define NES_DRV_OPT_DISABLE_MPA_CRC      0x00000002
-#define NES_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
-#define NES_DRV_OPT_DISABLE_INTF         0x00000008
-#define NES_DRV_OPT_ENABLE_MSI           0x00000010
-#define NES_DRV_OPT_DUAL_LOGICAL_PORT    0x00000020
-#define NES_DRV_OPT_SUPRESS_OPTION_BC    0x00000040
-#define NES_DRV_OPT_NO_INLINE_DATA       0x00000080
-#define NES_DRV_OPT_DISABLE_INT_MOD      0x00000100
-#define NES_DRV_OPT_DISABLE_VIRT_WQ      0x00000200
-#define NES_DRV_OPT_ENABLE_PAU           0x00000400
-
-#define NES_AEQ_EVENT_TIMEOUT         2500
-#define NES_DISCONNECT_EVENT_TIMEOUT  2000
-
-/* debug levels */
-/* must match userspace */
-#define NES_DBG_HW          0x00000001
-#define NES_DBG_INIT        0x00000002
-#define NES_DBG_ISR         0x00000004
-#define NES_DBG_PHY         0x00000008
-#define NES_DBG_NETDEV      0x00000010
-#define NES_DBG_CM          0x00000020
-#define NES_DBG_CM1         0x00000040
-#define NES_DBG_NIC_RX      0x00000080
-#define NES_DBG_NIC_TX      0x00000100
-#define NES_DBG_CQP         0x00000200
-#define NES_DBG_MMAP        0x00000400
-#define NES_DBG_MR          0x00000800
-#define NES_DBG_PD          0x00001000
-#define NES_DBG_CQ          0x00002000
-#define NES_DBG_QP          0x00004000
-#define NES_DBG_MOD_QP      0x00008000
-#define NES_DBG_AEQ         0x00010000
-#define NES_DBG_IW_RX       0x00020000
-#define NES_DBG_IW_TX       0x00040000
-#define NES_DBG_SHUTDOWN    0x00080000
-#define NES_DBG_PAU         0x00100000
-#define NES_DBG_NLMSG       0x00200000
-#define NES_DBG_RSVD1       0x10000000
-#define NES_DBG_RSVD2       0x20000000
-#define NES_DBG_RSVD3       0x40000000
-#define NES_DBG_RSVD4       0x80000000
-#define NES_DBG_ALL         0xffffffff
-
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-#define nes_debug(level, fmt, args...) \
-do { \
-       if (level & nes_debug_level) \
-               printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \
-} while (0)
-
-#define NES_EVENT_TIMEOUT   1200000
-#else
-#define nes_debug(level, fmt, args...) no_printk(fmt, ##args)
-
-#define NES_EVENT_TIMEOUT   100000
-#endif
-
-#include "nes_hw.h"
-#include "nes_verbs.h"
-#include "nes_context.h"
-#include <rdma/nes-abi.h>
-#include "nes_cm.h"
-#include "nes_mgt.h"
-
-extern int interrupt_mod_interval;
-extern int nes_if_count;
-extern int mpa_version;
-extern int disable_mpa_crc;
-extern unsigned int nes_drv_opt;
-extern unsigned int nes_debug_level;
-extern unsigned int wqm_quanta;
-extern struct list_head nes_adapter_list;
-
-extern atomic_t cm_connects;
-extern atomic_t cm_accepts;
-extern atomic_t cm_disconnects;
-extern atomic_t cm_closes;
-extern atomic_t cm_connecteds;
-extern atomic_t cm_connect_reqs;
-extern atomic_t cm_rejects;
-extern atomic_t mod_qp_timouts;
-extern atomic_t qps_created;
-extern atomic_t qps_destroyed;
-extern atomic_t sw_qps_destroyed;
-extern u32 mh_detected;
-extern u32 mh_pauses_sent;
-extern u32 cm_packets_sent;
-extern u32 cm_packets_bounced;
-extern u32 cm_packets_created;
-extern u32 cm_packets_received;
-extern u32 cm_packets_dropped;
-extern u32 cm_packets_retrans;
-extern atomic_t cm_listens_created;
-extern atomic_t cm_listens_destroyed;
-extern u32 cm_backlog_drops;
-extern atomic_t cm_loopbacks;
-extern atomic_t cm_nodes_created;
-extern atomic_t cm_nodes_destroyed;
-extern atomic_t cm_accel_dropped_pkts;
-extern atomic_t cm_resets_recvd;
-extern atomic_t pau_qps_created;
-extern atomic_t pau_qps_destroyed;
-
-extern u32 int_mod_timer_init;
-extern u32 int_mod_cq_depth_256;
-extern u32 int_mod_cq_depth_128;
-extern u32 int_mod_cq_depth_32;
-extern u32 int_mod_cq_depth_24;
-extern u32 int_mod_cq_depth_16;
-extern u32 int_mod_cq_depth_4;
-extern u32 int_mod_cq_depth_1;
-
-struct nes_device {
-       struct nes_adapter         *nesadapter;
-       void __iomem           *regs;
-       void __iomem           *index_reg;
-       struct pci_dev         *pcidev;
-       struct net_device      *netdev[NES_NIC_MAX_NICS];
-       u64                    link_status_interrupts;
-       struct tasklet_struct  dpc_tasklet;
-       spinlock_t             indexed_regs_lock;
-       unsigned long          csr_start;
-       unsigned long          doorbell_region;
-       unsigned long          doorbell_start;
-       unsigned long          mac_tx_errors;
-       unsigned long          mac_pause_frames_sent;
-       unsigned long          mac_pause_frames_received;
-       unsigned long          mac_rx_errors;
-       unsigned long          mac_rx_crc_errors;
-       unsigned long          mac_rx_symbol_err_frames;
-       unsigned long          mac_rx_jabber_frames;
-       unsigned long          mac_rx_oversized_frames;
-       unsigned long          mac_rx_short_frames;
-       unsigned long          port_rx_discards;
-       unsigned long          port_tx_discards;
-       unsigned int           mac_index;
-       unsigned int           nes_stack_start;
-
-       /* Control Structures */
-       void                   *cqp_vbase;
-       dma_addr_t             cqp_pbase;
-       u32                    cqp_mem_size;
-       u8                     ceq_index;
-       u8                     nic_ceq_index;
-       struct nes_hw_cqp      cqp;
-       struct nes_hw_cq       ccq;
-       struct list_head       cqp_avail_reqs;
-       struct list_head       cqp_pending_reqs;
-       struct nes_cqp_request *nes_cqp_requests;
-
-       u32                    int_req;
-       u32                    int_stat;
-       u32                    timer_int_req;
-       u32                    timer_only_int_count;
-       u32                    intf_int_req;
-       u32                    last_mac_tx_pauses;
-       u32                    last_used_chunks_tx;
-       struct list_head       list;
-
-       u16                    base_doorbell_index;
-       u16                    currcq_count;
-       u16                    deepcq_count;
-       u8                     iw_status;
-       u8                     msi_enabled;
-       u8                     netdev_count;
-       u8                     napi_isr_ran;
-       u8                     disable_rx_flow_control;
-       u8                     disable_tx_flow_control;
-
-       struct delayed_work    work;
-       u8                     link_recheck;
-};
-
-/* Receive skb private area - must fit in skb->cb area */
-struct nes_rskb_cb {
-       u64                    busaddr;
-       u32                    maplen;
-       u32                    seqnum;
-       u8                     *data_start;
-       struct nes_qp          *nesqp;
-};
-
-static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad)
-{
-       u32 crc_value;
-       crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad));
-
-       /*
-        * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc
-        * state in cpu order"), behavior of crc32c changes on
-        * big-endian platforms.  Our algorithm expects the previous
-        * behavior; otherwise we have RDMA connection establishment
-        * issue on big-endian.
-        */
-       return cpu_to_le32(crc_value);
-}
-
-static inline void
-set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value)
-{
-       wqe_words[index]     = cpu_to_le32((u32) value);
-       wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value));
-}
-
-static inline void
-set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value)
-{
-       wqe_words[index] = cpu_to_le32(value);
-}
-
-static inline void
-nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev)
-{
-       cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX]       = 0;
-       cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX]      = 0;
-       cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]   = 0;
-       cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]  = 0;
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0;
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX]       = 0;
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX]       = 0;
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX]        = 0;
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX]       = 0;
-}
-
-static inline void
-nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head)
-{
-       u32 value;
-       value = ((u32)((unsigned long) nesqp)) | head;
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX,
-                       (u32)(upper_32_bits((unsigned long)(nesqp))));
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value);
-}
-
-/* Read from memory-mapped device */
-static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index)
-{
-       unsigned long flags;
-       void __iomem *addr = nesdev->index_reg;
-       u32 value;
-
-       spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
-
-       writel(reg_index, addr);
-       value = readl((void __iomem *)addr + 4);
-
-       spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
-       return value;
-}
-
-static inline u32 nes_read32(const void __iomem *addr)
-{
-       return readl(addr);
-}
-
-static inline u16 nes_read16(const void __iomem *addr)
-{
-       return readw(addr);
-}
-
-static inline u8 nes_read8(const void __iomem *addr)
-{
-       return readb(addr);
-}
-
-/* Write to memory-mapped device */
-static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val)
-{
-       unsigned long flags;
-       void __iomem *addr = nesdev->index_reg;
-
-       spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
-
-       writel(reg_index, addr);
-       writel(val, (void __iomem *)addr + 4);
-
-       spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
-}
-
-static inline void nes_write32(void __iomem *addr, u32 val)
-{
-       writel(val, addr);
-}
-
-static inline void nes_write16(void __iomem *addr, u16 val)
-{
-       writew(val, addr);
-}
-
-static inline void nes_write8(void __iomem *addr, u8 val)
-{
-       writeb(val, addr);
-}
-
-enum nes_resource {
-       NES_RESOURCE_MW = 1,
-       NES_RESOURCE_FAST_MR,
-       NES_RESOURCE_PHYS_MR,
-       NES_RESOURCE_USER_MR,
-       NES_RESOURCE_PD,
-       NES_RESOURCE_QP,
-       NES_RESOURCE_CQ,
-       NES_RESOURCE_ARP
-};
-
-static inline int nes_alloc_resource(struct nes_adapter *nesadapter,
-               unsigned long *resource_array, u32 max_resources,
-               u32 *req_resource_num, u32 *next, enum nes_resource resource_type)
-{
-       unsigned long flags;
-       u32 resource_num;
-
-       spin_lock_irqsave(&nesadapter->resource_lock, flags);
-
-       resource_num = find_next_zero_bit(resource_array, max_resources, *next);
-       if (resource_num >= max_resources) {
-               resource_num = find_first_zero_bit(resource_array, max_resources);
-               if (resource_num >= max_resources) {
-                       printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type);
-                       spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-                       return -EMFILE;
-               }
-       }
-       set_bit(resource_num, resource_array);
-       *next = resource_num+1;
-       if (*next == max_resources) {
-               *next = 0;
-       }
-       spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-       *req_resource_num = resource_num;
-
-       return 0;
-}
-
-static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter,
-               unsigned long *resource_array, u32 resource_num)
-{
-       unsigned long flags;
-       int bit_is_set;
-
-       spin_lock_irqsave(&nesadapter->resource_lock, flags);
-
-       bit_is_set = test_bit(resource_num, resource_array);
-       nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n",
-                       resource_num, (bit_is_set ? "": " not"));
-       spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-
-       return bit_is_set;
-}
-
-static inline void nes_free_resource(struct nes_adapter *nesadapter,
-               unsigned long *resource_array, u32 resource_num)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&nesadapter->resource_lock, flags);
-       clear_bit(resource_num, resource_array);
-       spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-}
-
-static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev)
-{
-       return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic;
-}
-
-static inline struct nes_pd *to_nespd(struct ib_pd *ibpd)
-{
-       return container_of(ibpd, struct nes_pd, ibpd);
-}
-
-static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext)
-{
-       return container_of(ibucontext, struct nes_ucontext, ibucontext);
-}
-
-static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr)
-{
-       return container_of(ibmr, struct nes_mr, ibmr);
-}
-
-static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr)
-{
-       return container_of(ibfmr, struct nes_mr, ibfmr);
-}
-
-static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw)
-{
-       return container_of(ibmw, struct nes_mr, ibmw);
-}
-
-static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr)
-{
-       return container_of(nesmr, struct nes_fmr, nesmr);
-}
-
-static inline struct nes_cq *to_nescq(struct ib_cq *ibcq)
-{
-       return container_of(ibcq, struct nes_cq, ibcq);
-}
-
-static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct nes_qp, ibqp);
-}
-
-
-
-/* nes.c */
-void nes_add_ref(struct ib_qp *);
-void nes_rem_ref(struct ib_qp *);
-struct ib_qp *nes_get_qp(struct ib_device *, int);
-
-
-/* nes_hw.c */
-struct nes_adapter *nes_init_adapter(struct nes_device *, u8);
-void  nes_nic_init_timer_defaults(struct nes_device *, u8);
-void nes_destroy_adapter(struct nes_adapter *);
-int nes_init_cqp(struct nes_device *);
-int nes_init_phy(struct nes_device *);
-int nes_init_nic_qp(struct nes_device *, struct net_device *);
-void nes_destroy_nic_qp(struct nes_vnic *);
-int nes_napi_isr(struct nes_device *);
-void nes_dpc(unsigned long);
-void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
-void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *);
-int nes_destroy_cqp(struct nes_device *);
-int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
-void nes_recheck_link_status(struct work_struct *work);
-void nes_terminate_timeout(struct timer_list *t);
-
-/* nes_nic.c */
-struct net_device *nes_netdev_init(struct nes_device *, void __iomem *);
-void nes_netdev_destroy(struct net_device *);
-int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
-
-/* nes_cm.c */
-void *nes_cm_create(struct net_device *);
-int nes_cm_recv(struct sk_buff *, struct net_device *);
-void nes_update_arp(unsigned char *, u32, u32, u16, u16);
-void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32);
-void nes_sock_release(struct nes_qp *, unsigned long *);
-void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32);
-int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32);
-int nes_cm_disconn(struct nes_qp *);
-void nes_cm_disconn_worker(void *);
-
-/* nes_verbs.c */
-int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32);
-int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
-struct nes_ib_device *nes_init_ofa_device(struct net_device *);
-void  nes_port_ibevent(struct nes_vnic *nesvnic);
-void nes_destroy_ofa_device(struct nes_ib_device *);
-int nes_register_ofa_device(struct nes_ib_device *);
-
-/* nes_util.c */
-int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *);
-void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16);
-void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
-void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
-void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
-struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
-void nes_free_cqp_request(struct nes_device *nesdev,
-                         struct nes_cqp_request *cqp_request);
-void nes_put_cqp_request(struct nes_device *nesdev,
-                        struct nes_cqp_request *cqp_request);
-void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
-int nes_arp_table(struct nes_device *, u32, u8 *, u32);
-void nes_mh_fix(struct timer_list *t);
-void nes_clc(struct timer_list *t);
-void nes_dump_mem(unsigned int, void *, int);
-u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32);
-
-#endif /* __NES_H */
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
deleted file mode 100644 (file)
index 62bf986..0000000
+++ /dev/null
@@ -1,3992 +0,0 @@
-/*
- * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-
-#define TCPOPT_TIMESTAMP 8
-
-#include <linux/atomic.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/if_arp.h>
-#include <linux/if_vlan.h>
-#include <linux/notifier.h>
-#include <linux/net.h>
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/random.h>
-#include <linux/list.h>
-#include <linux/threads.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <net/arp.h>
-#include <net/neighbour.h>
-#include <net/route.h>
-#include <net/ip_fib.h>
-#include <net/secure_seq.h>
-#include <net/tcp.h>
-#include <linux/fcntl.h>
-
-#include "nes.h"
-
-u32 cm_packets_sent;
-u32 cm_packets_bounced;
-u32 cm_packets_dropped;
-u32 cm_packets_retrans;
-u32 cm_packets_created;
-u32 cm_packets_received;
-atomic_t cm_listens_created;
-atomic_t cm_listens_destroyed;
-u32 cm_backlog_drops;
-atomic_t cm_loopbacks;
-atomic_t cm_nodes_created;
-atomic_t cm_nodes_destroyed;
-atomic_t cm_accel_dropped_pkts;
-atomic_t cm_resets_recvd;
-
-static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *);
-static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *);
-static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *);
-static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *);
-static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *);
-static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
-static int mini_cm_dealloc_core(struct nes_cm_core *);
-static int mini_cm_get(struct nes_cm_core *);
-static int mini_cm_set(struct nes_cm_core *, u32, u32);
-
-static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8);
-static int add_ref_cm_node(struct nes_cm_node *);
-static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *);
-
-static int nes_cm_disconn_true(struct nes_qp *);
-static int nes_cm_post_event(struct nes_cm_event *event);
-static int nes_disconnect(struct nes_qp *nesqp, int abrupt);
-static void nes_disconnect_worker(struct work_struct *work);
-
-static int send_mpa_request(struct nes_cm_node *, struct sk_buff *);
-static int send_mpa_reject(struct nes_cm_node *);
-static int send_syn(struct nes_cm_node *, u32, struct sk_buff *);
-static int send_reset(struct nes_cm_node *, struct sk_buff *);
-static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb);
-static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb);
-static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
-
-static void active_open_err(struct nes_cm_node *, struct sk_buff *, int);
-static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int);
-static void cleanup_retrans_entry(struct nes_cm_node *);
-static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *);
-static void free_retrans_entry(struct nes_cm_node *cm_node);
-static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive);
-
-/* CM event handler functions */
-static void cm_event_connected(struct nes_cm_event *);
-static void cm_event_connect_error(struct nes_cm_event *);
-static void cm_event_reset(struct nes_cm_event *);
-static void cm_event_mpa_req(struct nes_cm_event *);
-static void cm_event_mpa_reject(struct nes_cm_event *);
-static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node);
-
-/* MPA build functions */
-static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8);
-static void build_mpa_v2(struct nes_cm_node *, void *, u8);
-static void build_mpa_v1(struct nes_cm_node *, void *, u8);
-static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **);
-
-static void print_core(struct nes_cm_core *core);
-static void record_ird_ord(struct nes_cm_node *, u16, u16);
-
-/* External CM API Interface */
-/* instance of function pointers for client API */
-/* set address of this instance to cm_core->cm_ops at cm_core alloc */
-static const struct nes_cm_ops nes_cm_api = {
-       .accelerated = mini_cm_accelerated,
-       .listen = mini_cm_listen,
-       .stop_listener = mini_cm_del_listen,
-       .connect = mini_cm_connect,
-       .close = mini_cm_close,
-       .accept = mini_cm_accept,
-       .reject = mini_cm_reject,
-       .recv_pkt = mini_cm_recv_pkt,
-       .destroy_cm_core = mini_cm_dealloc_core,
-       .get = mini_cm_get,
-       .set = mini_cm_set
-};
-
-static struct nes_cm_core *g_cm_core;
-
-atomic_t cm_connects;
-atomic_t cm_accepts;
-atomic_t cm_disconnects;
-atomic_t cm_closes;
-atomic_t cm_connecteds;
-atomic_t cm_connect_reqs;
-atomic_t cm_rejects;
-
-int nes_add_ref_cm_node(struct nes_cm_node *cm_node)
-{
-       return add_ref_cm_node(cm_node);
-}
-
-int nes_rem_ref_cm_node(struct nes_cm_node *cm_node)
-{
-       return rem_ref_cm_node(cm_node->cm_core, cm_node);
-}
-/**
- * create_event
- */
-static struct nes_cm_event *create_event(struct nes_cm_node *  cm_node,
-                                        enum nes_cm_event_type type)
-{
-       struct nes_cm_event *event;
-
-       if (!cm_node->cm_id)
-               return NULL;
-
-       /* allocate an empty event */
-       event = kzalloc(sizeof(*event), GFP_ATOMIC);
-
-       if (!event)
-               return NULL;
-
-       event->type = type;
-       event->cm_node = cm_node;
-       event->cm_info.rem_addr = cm_node->rem_addr;
-       event->cm_info.loc_addr = cm_node->loc_addr;
-       event->cm_info.rem_port = cm_node->rem_port;
-       event->cm_info.loc_port = cm_node->loc_port;
-       event->cm_info.cm_id = cm_node->cm_id;
-
-       nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, "
-                 "dst_addr=%08x[%x], src_addr=%08x[%x]\n",
-                 cm_node, event, type, event->cm_info.loc_addr,
-                 event->cm_info.loc_port, event->cm_info.rem_addr,
-                 event->cm_info.rem_port);
-
-       nes_cm_post_event(event);
-       return event;
-}
-
-
-/**
- * send_mpa_request
- */
-static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       u8 start_addr = 0;
-       u8 *start_ptr = &start_addr;
-       u8 **start_buff = &start_ptr;
-       u16 buff_len = 0;
-
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "skb set to NULL\n");
-               return -1;
-       }
-
-       /* send an MPA Request frame */
-       cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST);
-       form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK);
-
-       return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-}
-
-
-
-static int send_mpa_reject(struct nes_cm_node *cm_node)
-{
-       struct sk_buff *skb = NULL;
-       u8 start_addr = 0;
-       u8 *start_ptr = &start_addr;
-       u8 **start_buff = &start_ptr;
-       u16 buff_len = 0;
-       struct ietf_mpa_v1 *mpa_frame;
-
-       skb = dev_alloc_skb(MAX_CM_BUFFER);
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-               return -ENOMEM;
-       }
-
-       /* send an MPA reject frame */
-       cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY);
-       mpa_frame = (struct ietf_mpa_v1 *)*start_buff;
-       mpa_frame->flags |= IETF_MPA_FLAGS_REJECT;
-       form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN);
-
-       cm_node->state = NES_CM_STATE_FIN_WAIT1;
-       return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-}
-
-
-/**
- * recv_mpa - process a received TCP pkt, we are expecting an
- * IETF MPA frame
- */
-static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,
-                    u32 len)
-{
-       struct ietf_mpa_v1 *mpa_frame;
-       struct ietf_mpa_v2 *mpa_v2_frame;
-       struct ietf_rtr_msg *rtr_msg;
-       int mpa_hdr_len;
-       int priv_data_len;
-
-       *type = NES_MPA_REQUEST_ACCEPT;
-
-       /* assume req frame is in tcp data payload */
-       if (len < sizeof(struct ietf_mpa_v1)) {
-               nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
-               return -EINVAL;
-       }
-
-       /* points to the beginning of the frame, which could be MPA V1 or V2 */
-       mpa_frame = (struct ietf_mpa_v1 *)buffer;
-       mpa_hdr_len = sizeof(struct ietf_mpa_v1);
-       priv_data_len = ntohs(mpa_frame->priv_data_len);
-
-       /* make sure mpa private data len is less than 512 bytes */
-       if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) {
-               nes_debug(NES_DBG_CM, "The received Length of Private"
-                         " Data field exceeds 512 octets\n");
-               return -EINVAL;
-       }
-       /*
-        * make sure MPA receiver interoperate with the
-        * received MPA version and MPA key information
-        *
-        */
-       if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) {
-               nes_debug(NES_DBG_CM, "The received mpa version"
-                         " is not supported\n");
-               return -EINVAL;
-       }
-       /*
-       * backwards compatibility only
-       */
-       if (mpa_frame->rev > cm_node->mpa_frame_rev) {
-               nes_debug(NES_DBG_CM, "The received mpa version"
-                       " can not be interoperated\n");
-               return -EINVAL;
-       } else {
-               cm_node->mpa_frame_rev = mpa_frame->rev;
-       }
-
-       if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
-               if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) {
-                       nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
-                       return -EINVAL;
-               }
-       } else {
-               if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) {
-                       nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
-                       return -EINVAL;
-               }
-       }
-
-       if (priv_data_len + mpa_hdr_len != len) {
-               nes_debug(NES_DBG_CM, "The received ietf buffer was not right"
-                       " complete (%x + %x != %x)\n",
-                       priv_data_len, mpa_hdr_len, len);
-               return -EINVAL;
-       }
-       /* make sure it does not exceed the max size */
-       if (len > MAX_CM_BUFFER) {
-               nes_debug(NES_DBG_CM, "The received ietf buffer was too large"
-                       " (%x + %x != %x)\n",
-                       priv_data_len, mpa_hdr_len, len);
-               return -EINVAL;
-       }
-
-       cm_node->mpa_frame_size = priv_data_len;
-
-       switch (mpa_frame->rev) {
-       case IETF_MPA_V2: {
-               u16 ird_size;
-               u16 ord_size;
-               u16 rtr_ctrl_ird;
-               u16 rtr_ctrl_ord;
-
-               mpa_v2_frame = (struct ietf_mpa_v2 *)buffer;
-               mpa_hdr_len += IETF_RTR_MSG_SIZE;
-               cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE;
-               rtr_msg = &mpa_v2_frame->rtr_msg;
-
-               /* parse rtr message */
-               rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird);
-               rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord);
-               ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD;
-               ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD;
-
-               if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) {
-                       /* send reset */
-                       return -EINVAL;
-               }
-               if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD)
-                       cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD;
-
-               if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) {
-                       /* responder */
-                       if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
-                               /* we are still negotiating */
-                               if (ord_size > NES_MAX_IRD) {
-                                       cm_node->ird_size = NES_MAX_IRD;
-                               } else {
-                                       cm_node->ird_size = ord_size;
-                                       if (ord_size == 0 &&
-                                       (rtr_ctrl_ord & IETF_RDMA0_READ)) {
-                                               cm_node->ird_size = 1;
-                                               nes_debug(NES_DBG_CM,
-                                               "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n",
-                                                       __func__, ord_size);
-                                       }
-                               }
-                               if (ird_size > NES_MAX_ORD)
-                                       cm_node->ord_size = NES_MAX_ORD;
-                               else
-                                       cm_node->ord_size = ird_size;
-                       } else { /* initiator */
-                               if (ord_size > NES_MAX_IRD) {
-                                       nes_debug(NES_DBG_CM,
-                                       "%s: Unable to support the requested (ord =%u)\n",
-                                                       __func__, ord_size);
-                                       return -EINVAL;
-                               }
-                               cm_node->ird_size = ord_size;
-
-                               if (ird_size > NES_MAX_ORD) {
-                                       cm_node->ord_size = NES_MAX_ORD;
-                               } else {
-                                       if (ird_size == 0 &&
-                                       (rtr_ctrl_ord & IETF_RDMA0_READ)) {
-                                               nes_debug(NES_DBG_CM,
-                                               "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n",
-                                                       __func__, ird_size);
-                                               return -EINVAL;
-                                       } else {
-                                               cm_node->ord_size = ird_size;
-                                       }
-                               }
-                       }
-               }
-
-               if (rtr_ctrl_ord & IETF_RDMA0_READ) {
-                       cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-
-               } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) {
-                       cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;
-               } else {        /* Not supported RDMA0 operation */
-                       return -EINVAL;
-               }
-               break;
-       }
-       case IETF_MPA_V1:
-       default:
-               break;
-       }
-
-       /* copy entire MPA frame to our cm_node's frame */
-       memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size);
-
-       if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
-               *type = NES_MPA_REQUEST_REJECT;
-       return 0;
-}
-
-
-/**
- * form_cm_frame - get a free packet and build empty frame Use
- * node info to build.
- */
-static void form_cm_frame(struct sk_buff *skb,
-                         struct nes_cm_node *cm_node, void *options, u32 optionsize,
-                         void *data, u32 datasize, u8 flags)
-{
-       struct tcphdr *tcph;
-       struct iphdr *iph;
-       struct ethhdr *ethh;
-       u8 *buf;
-       u16 packetsize = sizeof(*iph);
-
-       packetsize += sizeof(*tcph);
-       packetsize += optionsize + datasize;
-
-       skb_trim(skb, 0);
-       memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph));
-
-       buf = skb_put(skb, packetsize + ETH_HLEN);
-
-       ethh = (struct ethhdr *)buf;
-       buf += ETH_HLEN;
-
-       iph = (struct iphdr *)buf;
-       buf += sizeof(*iph);
-       tcph = (struct tcphdr *)buf;
-       skb_reset_mac_header(skb);
-       skb_set_network_header(skb, ETH_HLEN);
-       skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph));
-       buf += sizeof(*tcph);
-
-       skb->ip_summed = CHECKSUM_PARTIAL;
-       if (!(cm_node->netdev->features & NETIF_F_IP_CSUM))
-               skb->ip_summed = CHECKSUM_NONE;
-       skb->protocol = htons(0x800);
-       skb->data_len = 0;
-       skb->mac_len = ETH_HLEN;
-
-       memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN);
-       memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN);
-       ethh->h_proto = htons(0x0800);
-
-       iph->version = IPVERSION;
-       iph->ihl = 5;           /* 5 * 4Byte words, IP headr len */
-       iph->tos = 0;
-       iph->tot_len = htons(packetsize);
-       iph->id = htons(++cm_node->tcp_cntxt.loc_id);
-
-       iph->frag_off = htons(0x4000);
-       iph->ttl = 0x40;
-       iph->protocol = 0x06;   /* IPPROTO_TCP */
-
-       iph->saddr = htonl(cm_node->loc_addr);
-       iph->daddr = htonl(cm_node->rem_addr);
-
-       tcph->source = htons(cm_node->loc_port);
-       tcph->dest = htons(cm_node->rem_port);
-       tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
-
-       if (flags & SET_ACK) {
-               cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
-               tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
-               tcph->ack = 1;
-       } else {
-               tcph->ack_seq = 0;
-       }
-
-       if (flags & SET_SYN) {
-               cm_node->tcp_cntxt.loc_seq_num++;
-               tcph->syn = 1;
-       } else {
-               cm_node->tcp_cntxt.loc_seq_num += datasize;
-       }
-
-       if (flags & SET_FIN) {
-               cm_node->tcp_cntxt.loc_seq_num++;
-               tcph->fin = 1;
-       }
-
-       if (flags & SET_RST)
-               tcph->rst = 1;
-
-       tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2);
-       tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
-       tcph->urg_ptr = 0;
-       if (optionsize)
-               memcpy(buf, options, optionsize);
-       buf += optionsize;
-       if (datasize)
-               memcpy(buf, data, datasize);
-
-       skb_shinfo(skb)->nr_frags = 0;
-       cm_packets_created++;
-}
-
-/**
- * print_core - dump a cm core
- */
-static void print_core(struct nes_cm_core *core)
-{
-       nes_debug(NES_DBG_CM, "---------------------------------------------\n");
-       nes_debug(NES_DBG_CM, "CM Core  -- (core = %p )\n", core);
-       if (!core)
-               return;
-       nes_debug(NES_DBG_CM, "---------------------------------------------\n");
-
-       nes_debug(NES_DBG_CM, "State         : %u \n", core->state);
-
-       nes_debug(NES_DBG_CM, "Listen Nodes  : %u \n", atomic_read(&core->listen_node_cnt));
-       nes_debug(NES_DBG_CM, "Active Nodes  : %u \n", atomic_read(&core->node_cnt));
-
-       nes_debug(NES_DBG_CM, "core          : %p \n", core);
-
-       nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");
-}
-
-static void record_ird_ord(struct nes_cm_node *cm_node,
-                                       u16 conn_ird, u16 conn_ord)
-{
-       if (conn_ird > NES_MAX_IRD)
-               conn_ird = NES_MAX_IRD;
-
-       if (conn_ord > NES_MAX_ORD)
-               conn_ord = NES_MAX_ORD;
-
-       cm_node->ird_size = conn_ird;
-       cm_node->ord_size = conn_ord;
-}
-
-/**
- * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame
- */
-static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff,
-                             u16 *buff_len, u8 *pci_mem, u8 mpa_key)
-{
-       int ret = 0;
-
-       *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0];
-
-       switch (cm_node->mpa_frame_rev) {
-       case IETF_MPA_V1:
-               *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg);
-               *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size;
-               build_mpa_v1(cm_node, *start_buff, mpa_key);
-               break;
-       case IETF_MPA_V2:
-               *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size;
-               build_mpa_v2(cm_node, *start_buff, mpa_key);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-       return ret;
-}
-
-/**
- * build_mpa_v2 - build a MPA V2 frame
- */
-static void build_mpa_v2(struct nes_cm_node *cm_node,
-                        void *start_addr, u8 mpa_key)
-{
-       struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
-       struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
-       u16 ctrl_ird;
-       u16 ctrl_ord;
-
-       /* initialize the upper 5 bytes of the frame */
-       build_mpa_v1(cm_node, start_addr, mpa_key);
-       mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */
-       mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);
-
-       /* initialize RTR msg */
-       if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-               ctrl_ird = IETF_NO_IRD_ORD;
-               ctrl_ord = IETF_NO_IRD_ORD;
-       } else {
-               ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD;
-               ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD;
-       }
-       ctrl_ird |= IETF_PEER_TO_PEER;
-
-       switch (mpa_key) {
-       case MPA_KEY_REQUEST:
-               ctrl_ord |= IETF_RDMA0_WRITE;
-               ctrl_ord |= IETF_RDMA0_READ;
-               break;
-       case MPA_KEY_REPLY:
-               switch (cm_node->send_rdma0_op) {
-               case SEND_RDMA_WRITE_ZERO:
-                       ctrl_ord |= IETF_RDMA0_WRITE;
-                       break;
-               case SEND_RDMA_READ_ZERO:
-                       ctrl_ord |= IETF_RDMA0_READ;
-                       break;
-               }
-       }
-       rtr_msg->ctrl_ird = htons(ctrl_ird);
-       rtr_msg->ctrl_ord = htons(ctrl_ord);
-}
-
-/**
- * build_mpa_v1 - build a MPA V1 frame
- */
-static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key)
-{
-       struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr;
-
-       switch (mpa_key) {
-       case MPA_KEY_REQUEST:
-               memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE);
-               break;
-       case MPA_KEY_REPLY:
-               memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
-               break;
-       }
-       mpa_frame->flags = IETF_MPA_FLAGS_CRC;
-       mpa_frame->rev = cm_node->mpa_frame_rev;
-       mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size);
-}
-
-static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr)
-{
-       u64 u64temp;
-       struct nes_qp *nesqp = *nesqp_addr;
-       struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0];
-
-       u64temp = (unsigned long)nesqp->nesuqp_addr;
-       u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
-       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp);
-
-       wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0;
-       wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0;
-
-       switch (cm_node->send_rdma0_op) {
-       case SEND_RDMA_WRITE_ZERO:
-               nes_debug(NES_DBG_CM, "Sending first write.\n");
-               wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-                       cpu_to_le32(NES_IWARP_SQ_OP_RDMAW);
-               wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
-               break;
-
-       case SEND_RDMA_READ_ZERO:
-       default:
-               if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO)
-                       WARN(1, "Unsupported RDMA0 len operation=%u\n",
-                            cm_node->send_rdma0_op);
-               nes_debug(NES_DBG_CM, "Sending first rdma operation.\n");
-               wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-                       cpu_to_le32(NES_IWARP_SQ_OP_RDMAR);
-               wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1;
-               wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1;
-               break;
-       }
-
-       if (nesqp->sq_kmapped) {
-               nesqp->sq_kmapped = 0;
-               kunmap(nesqp->page);
-       }
-
-       /*use the reserved spot on the WQ for the extra first WQE*/
-       nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
-                                                            NES_QPCONTEXT_ORDIRD_WRPDU |
-                                                            NES_QPCONTEXT_ORDIRD_ALSMM));
-       nesqp->skip_lsmm = 1;
-       nesqp->hwqp.sq_tail = 0;
-}
-
-/**
- * schedule_nes_timer
- * note - cm_node needs to be protected before calling this. Encase in:
- *                     rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node);
- */
-int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                      enum nes_timer_type type, int send_retrans,
-                      int close_when_complete)
-{
-       unsigned long flags;
-       struct nes_cm_core *cm_core = cm_node->cm_core;
-       struct nes_timer_entry *new_send;
-       int ret = 0;
-
-       new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
-       if (!new_send)
-               return -ENOMEM;
-
-       /* new_send->timetosend = currenttime */
-       new_send->retrycount = NES_DEFAULT_RETRYS;
-       new_send->retranscount = NES_DEFAULT_RETRANS;
-       new_send->skb = skb;
-       new_send->timetosend = jiffies;
-       new_send->type = type;
-       new_send->netdev = cm_node->netdev;
-       new_send->send_retrans = send_retrans;
-       new_send->close_when_complete = close_when_complete;
-
-       if (type == NES_TIMER_TYPE_CLOSE) {
-               new_send->timetosend += (HZ / 10);
-               if (cm_node->recv_entry) {
-                       kfree(new_send);
-                       WARN_ON(1);
-                       return -EINVAL;
-               }
-               cm_node->recv_entry = new_send;
-       }
-
-       if (type == NES_TIMER_TYPE_SEND) {
-               new_send->seq_num = ntohl(tcp_hdr(skb)->seq);
-               refcount_inc(&new_send->skb->users);
-               spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-               cm_node->send_entry = new_send;
-               add_ref_cm_node(cm_node);
-               spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-               new_send->timetosend = jiffies + NES_RETRY_TIMEOUT;
-
-               ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev);
-               if (ret != NETDEV_TX_OK) {
-                       nes_debug(NES_DBG_CM, "Error sending packet %p "
-                                 "(jiffies = %lu)\n", new_send, jiffies);
-                       new_send->timetosend = jiffies;
-                       ret = NETDEV_TX_OK;
-               } else {
-                       cm_packets_sent++;
-                       if (!send_retrans) {
-                               cleanup_retrans_entry(cm_node);
-                               if (close_when_complete)
-                                       rem_ref_cm_node(cm_core, cm_node);
-                               return ret;
-                       }
-               }
-       }
-
-       if (!timer_pending(&cm_core->tcp_timer))
-               mod_timer(&cm_core->tcp_timer, new_send->timetosend);
-
-       return ret;
-}
-
-static void nes_retrans_expired(struct nes_cm_node *cm_node)
-{
-       struct iw_cm_id *cm_id = cm_node->cm_id;
-       enum nes_cm_node_state state = cm_node->state;
-       cm_node->state = NES_CM_STATE_CLOSED;
-
-       switch (state) {
-       case NES_CM_STATE_SYN_RCVD:
-       case NES_CM_STATE_CLOSING:
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               break;
-       case NES_CM_STATE_LAST_ACK:
-       case NES_CM_STATE_FIN_WAIT1:
-               if (cm_node->cm_id)
-                       cm_id->rem_ref(cm_id);
-               send_reset(cm_node, NULL);
-               break;
-       default:
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, NULL);
-               create_event(cm_node, NES_CM_EVENT_ABORTED);
-       }
-}
-
-static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node)
-{
-       struct nes_timer_entry *recv_entry = cm_node->recv_entry;
-       struct iw_cm_id *cm_id = cm_node->cm_id;
-       struct nes_qp *nesqp;
-       unsigned long qplockflags;
-
-       if (!recv_entry)
-               return;
-       nesqp = (struct nes_qp *)recv_entry->skb;
-       if (nesqp) {
-               spin_lock_irqsave(&nesqp->lock, qplockflags);
-               if (nesqp->cm_id) {
-                       nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
-                                 "refcount = %d: HIT A "
-                                 "NES_TIMER_TYPE_CLOSE with something "
-                                 "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
-                                 atomic_read(&nesqp->refcount));
-                       nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-                       nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
-                       nesqp->ibqp_state = IB_QPS_ERR;
-                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                       nes_cm_disconn(nesqp);
-               } else {
-                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                       nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
-                                 "refcount = %d: HIT A "
-                                 "NES_TIMER_TYPE_CLOSE with nothing "
-                                 "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
-                                 atomic_read(&nesqp->refcount));
-               }
-       } else if (rem_node) {
-               /* TIME_WAIT state */
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-       }
-       if (cm_node->cm_id)
-               cm_id->rem_ref(cm_id);
-       kfree(recv_entry);
-       cm_node->recv_entry = NULL;
-}
-
-/**
- * nes_cm_timer_tick
- */
-static void nes_cm_timer_tick(struct timer_list *unused)
-{
-       unsigned long flags;
-       unsigned long nexttimeout = jiffies + NES_LONG_TIME;
-       struct nes_cm_node *cm_node;
-       struct nes_timer_entry *send_entry, *recv_entry;
-       struct list_head *list_core_temp;
-       struct list_head *list_node;
-       struct nes_cm_core *cm_core = g_cm_core;
-       u32 settimer = 0;
-       unsigned long timetosend;
-       int ret = NETDEV_TX_OK;
-
-       struct list_head timer_list;
-
-       INIT_LIST_HEAD(&timer_list);
-       spin_lock_irqsave(&cm_core->ht_lock, flags);
-
-       list_for_each_safe(list_node, list_core_temp,
-                          &cm_core->connected_nodes) {
-               cm_node = container_of(list_node, struct nes_cm_node, list);
-               if ((cm_node->recv_entry) || (cm_node->send_entry)) {
-                       add_ref_cm_node(cm_node);
-                       list_add(&cm_node->timer_entry, &timer_list);
-               }
-       }
-       spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-       list_for_each_safe(list_node, list_core_temp, &timer_list) {
-               cm_node = container_of(list_node, struct nes_cm_node,
-                                      timer_entry);
-               recv_entry = cm_node->recv_entry;
-
-               if (recv_entry) {
-                       if (time_after(recv_entry->timetosend, jiffies)) {
-                               if (nexttimeout > recv_entry->timetosend ||
-                                   !settimer) {
-                                       nexttimeout = recv_entry->timetosend;
-                                       settimer = 1;
-                               }
-                       } else {
-                               handle_recv_entry(cm_node, 1);
-                       }
-               }
-
-               spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-               do {
-                       send_entry = cm_node->send_entry;
-                       if (!send_entry)
-                               break;
-                       if (time_after(send_entry->timetosend, jiffies)) {
-                               if (cm_node->state != NES_CM_STATE_TSA) {
-                                       if ((nexttimeout >
-                                            send_entry->timetosend) ||
-                                           !settimer) {
-                                               nexttimeout =
-                                                       send_entry->timetosend;
-                                               settimer = 1;
-                                       }
-                               } else {
-                                       free_retrans_entry(cm_node);
-                               }
-                               break;
-                       }
-
-                       if ((cm_node->state == NES_CM_STATE_TSA) ||
-                           (cm_node->state == NES_CM_STATE_CLOSED)) {
-                               free_retrans_entry(cm_node);
-                               break;
-                       }
-
-                       if (!send_entry->retranscount ||
-                           !send_entry->retrycount) {
-                               cm_packets_dropped++;
-                               free_retrans_entry(cm_node);
-
-                               spin_unlock_irqrestore(
-                                       &cm_node->retrans_list_lock, flags);
-                               nes_retrans_expired(cm_node);
-                               cm_node->state = NES_CM_STATE_CLOSED;
-                               spin_lock_irqsave(&cm_node->retrans_list_lock,
-                                                 flags);
-                               break;
-                       }
-                       refcount_inc(&send_entry->skb->users);
-                       cm_packets_retrans++;
-                       nes_debug(NES_DBG_CM, "Retransmitting send_entry %p "
-                                 "for node %p, jiffies = %lu, time to send = "
-                                 "%lu, retranscount = %u, send_entry->seq_num = "
-                                 "0x%08X, cm_node->tcp_cntxt.rem_ack_num = "
-                                 "0x%08X\n", send_entry, cm_node, jiffies,
-                                 send_entry->timetosend,
-                                 send_entry->retranscount,
-                                 send_entry->seq_num,
-                                 cm_node->tcp_cntxt.rem_ack_num);
-
-                       spin_unlock_irqrestore(&cm_node->retrans_list_lock,
-                                              flags);
-                       ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev);
-                       spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-                       if (ret != NETDEV_TX_OK) {
-                               nes_debug(NES_DBG_CM, "rexmit failed for "
-                                         "node=%p\n", cm_node);
-                               cm_packets_bounced++;
-                               send_entry->retrycount--;
-                               nexttimeout = jiffies + NES_SHORT_TIME;
-                               settimer = 1;
-                               break;
-                       } else {
-                               cm_packets_sent++;
-                       }
-                       nes_debug(NES_DBG_CM, "Packet Sent: retrans count = "
-                                 "%u, retry count = %u.\n",
-                                 send_entry->retranscount,
-                                 send_entry->retrycount);
-                       if (send_entry->send_retrans) {
-                               send_entry->retranscount--;
-                               timetosend = (NES_RETRY_TIMEOUT <<
-                                             (NES_DEFAULT_RETRANS - send_entry->retranscount));
-
-                               send_entry->timetosend = jiffies +
-                                                        min(timetosend, NES_MAX_TIMEOUT);
-                               if (nexttimeout > send_entry->timetosend ||
-                                   !settimer) {
-                                       nexttimeout = send_entry->timetosend;
-                                       settimer = 1;
-                               }
-                       } else {
-                               int close_when_complete;
-                               close_when_complete =
-                                       send_entry->close_when_complete;
-                               nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n",
-                                         cm_node, cm_node->state);
-                               free_retrans_entry(cm_node);
-                               if (close_when_complete)
-                                       rem_ref_cm_node(cm_node->cm_core,
-                                                       cm_node);
-                       }
-               } while (0);
-
-               spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-       }
-
-       if (settimer) {
-               if (!timer_pending(&cm_core->tcp_timer))
-                       mod_timer(&cm_core->tcp_timer, nexttimeout);
-       }
-}
-
-
-/**
- * send_syn
- */
-static int send_syn(struct nes_cm_node *cm_node, u32 sendack,
-                   struct sk_buff *skb)
-{
-       int ret;
-       int flags = SET_SYN;
-       char optionsbuffer[sizeof(struct option_mss) +
-                          sizeof(struct option_windowscale) + sizeof(struct option_base) +
-                          TCP_OPTIONS_PADDING];
-
-       int optionssize = 0;
-       /* Sending MSS option */
-       union all_known_options *options;
-
-       if (!cm_node)
-               return -EINVAL;
-
-       options = (union all_known_options *)&optionsbuffer[optionssize];
-       options->as_mss.optionnum = OPTION_NUMBER_MSS;
-       options->as_mss.length = sizeof(struct option_mss);
-       options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
-       optionssize += sizeof(struct option_mss);
-
-       options = (union all_known_options *)&optionsbuffer[optionssize];
-       options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
-       options->as_windowscale.length = sizeof(struct option_windowscale);
-       options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
-       optionssize += sizeof(struct option_windowscale);
-
-       if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) {
-               options = (union all_known_options *)&optionsbuffer[optionssize];
-               options->as_base.optionnum = OPTION_NUMBER_WRITE0;
-               options->as_base.length = sizeof(struct option_base);
-               optionssize += sizeof(struct option_base);
-               /* we need the size to be a multiple of 4 */
-               options = (union all_known_options *)&optionsbuffer[optionssize];
-               options->as_end = 1;
-               optionssize += 1;
-               options = (union all_known_options *)&optionsbuffer[optionssize];
-               options->as_end = 1;
-               optionssize += 1;
-       }
-
-       options = (union all_known_options *)&optionsbuffer[optionssize];
-       options->as_end = OPTION_NUMBER_END;
-       optionssize += 1;
-
-       if (!skb)
-               skb = dev_alloc_skb(MAX_CM_BUFFER);
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-               return -1;
-       }
-
-       if (sendack)
-               flags |= SET_ACK;
-
-       form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
-       ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-
-       return ret;
-}
-
-
-/**
- * send_reset
- */
-static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       int ret;
-       int flags = SET_RST | SET_ACK;
-
-       if (!skb)
-               skb = dev_alloc_skb(MAX_CM_BUFFER);
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-               return -ENOMEM;
-       }
-
-       form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
-       ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);
-
-       return ret;
-}
-
-
-/**
- * send_ack
- */
-static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       int ret;
-
-       if (!skb)
-               skb = dev_alloc_skb(MAX_CM_BUFFER);
-
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-               return -1;
-       }
-
-       form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK);
-       ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0);
-
-       return ret;
-}
-
-
-/**
- * send_fin
- */
-static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       int ret;
-
-       /* if we weren't given a frame, allocate one */
-       if (!skb)
-               skb = dev_alloc_skb(MAX_CM_BUFFER);
-
-       if (!skb) {
-               nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
-               return -1;
-       }
-
-       form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN);
-       ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
-
-       return ret;
-}
-
-
-/**
- * find_node - find a cm node that matches the reference cm node
- */
-static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
-                                    u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr)
-{
-       unsigned long flags;
-       struct list_head *hte;
-       struct nes_cm_node *cm_node;
-
-       /* get a handle on the hte */
-       hte = &cm_core->connected_nodes;
-
-       /* walk list and find cm_node associated with this session ID */
-       spin_lock_irqsave(&cm_core->ht_lock, flags);
-       list_for_each_entry(cm_node, hte, list) {
-               /* compare quad, return node handle if a match */
-               nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n",
-                         cm_node->loc_addr, cm_node->loc_port,
-                         loc_addr, loc_port,
-                         cm_node->rem_addr, cm_node->rem_port,
-                         rem_addr, rem_port);
-               if ((cm_node->loc_addr == loc_addr) &&
-                   (cm_node->loc_port == loc_port) &&
-                   (cm_node->rem_addr == rem_addr) &&
-                   (cm_node->rem_port == rem_port)) {
-                       add_ref_cm_node(cm_node);
-                       spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-                       return cm_node;
-               }
-       }
-       spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-       /* no owner node */
-       return NULL;
-}
-
-
-/**
- * find_listener - find a cm node listening on this addr-port pair
- */
-static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,
-                                            nes_addr_t dst_addr, u16 dst_port,
-                                            enum nes_cm_listener_state listener_state)
-{
-       unsigned long flags;
-       struct nes_cm_listener *listen_node;
-       nes_addr_t listen_addr;
-       u16 listen_port;
-
-       /* walk list and find cm_node associated with this session ID */
-       spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-       list_for_each_entry(listen_node, &cm_core->listen_list.list, list) {
-               listen_addr = listen_node->loc_addr;
-               listen_port = listen_node->loc_port;
-
-               /* compare node pair, return node handle if a match */
-               if (((listen_addr == dst_addr) ||
-                    listen_addr == 0x00000000) &&
-                   (listen_port == dst_port) &&
-                   (listener_state & listen_node->listener_state)) {
-                       atomic_inc(&listen_node->ref_count);
-                       spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-                       return listen_node;
-               }
-       }
-       spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-       /* no listener */
-       return NULL;
-}
-
-/**
- * add_hte_node - add a cm node to the hash table
- */
-static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-       unsigned long flags;
-       struct list_head *hte;
-
-       if (!cm_node || !cm_core)
-               return -EINVAL;
-
-       nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n",
-                 cm_node);
-
-       spin_lock_irqsave(&cm_core->ht_lock, flags);
-
-       /* get a handle on the hash table element (list head for this slot) */
-       hte = &cm_core->connected_nodes;
-       list_add_tail(&cm_node->list, hte);
-       atomic_inc(&cm_core->ht_node_cnt);
-
-       spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-
-       return 0;
-}
-
-
-/**
- * mini_cm_dec_refcnt_listen
- */
-static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
-                                    struct nes_cm_listener *listener, int free_hanging_nodes)
-{
-       int ret = -EINVAL;
-       int err = 0;
-       unsigned long flags;
-       struct list_head *list_pos = NULL;
-       struct list_head *list_temp = NULL;
-       struct nes_cm_node *cm_node = NULL;
-       struct list_head reset_list;
-
-       nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, "
-                 "refcnt=%d\n", listener, free_hanging_nodes,
-                 atomic_read(&listener->ref_count));
-       /* free non-accelerated child nodes for this listener */
-       INIT_LIST_HEAD(&reset_list);
-       if (free_hanging_nodes) {
-               spin_lock_irqsave(&cm_core->ht_lock, flags);
-               list_for_each_safe(list_pos, list_temp,
-                                  &g_cm_core->connected_nodes) {
-                       cm_node = container_of(list_pos, struct nes_cm_node,
-                                              list);
-                       if ((cm_node->listener == listener) &&
-                           (!cm_node->accelerated)) {
-                               add_ref_cm_node(cm_node);
-                               list_add(&cm_node->reset_entry, &reset_list);
-                       }
-               }
-               spin_unlock_irqrestore(&cm_core->ht_lock, flags);
-       }
-
-       list_for_each_safe(list_pos, list_temp, &reset_list) {
-               cm_node = container_of(list_pos, struct nes_cm_node,
-                                      reset_entry);
-               {
-                       struct nes_cm_node *loopback = cm_node->loopbackpartner;
-                       enum nes_cm_node_state old_state;
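-                       /* node is already in a closing state: just drop the
-                        * reference taken when it was added to reset_list */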
-                       if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) {
-                               rem_ref_cm_node(cm_node->cm_core, cm_node);
-                       } else {
-                               if (!loopback) {
-                                       cleanup_retrans_entry(cm_node);
-                                       err = send_reset(cm_node, NULL);
-                                       if (err) {
-                                               cm_node->state =
-                                                       NES_CM_STATE_CLOSED;
-                                               WARN_ON(1);
-                                       } else {
-                                               old_state = cm_node->state;
-                                               cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-                                               if (old_state != NES_CM_STATE_MPAREQ_RCVD)
-                                                       rem_ref_cm_node(
-                                                               cm_node->cm_core,
-                                                               cm_node);
-                                       }
-                               } else {
-                                       struct nes_cm_event event;
-
-                                       event.cm_node = loopback;
-                                       event.cm_info.rem_addr =
-                                                       loopback->rem_addr;
-                                       event.cm_info.loc_addr =
-                                                       loopback->loc_addr;
-                                       event.cm_info.rem_port =
-                                                       loopback->rem_port;
-                                       event.cm_info.loc_port =
-                                                        loopback->loc_port;
-                                       event.cm_info.cm_id = loopback->cm_id;
-                                       add_ref_cm_node(loopback);
-                                       loopback->state = NES_CM_STATE_CLOSED;
-                                       cm_event_connect_error(&event);
-                                       cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-
-                                       rem_ref_cm_node(cm_node->cm_core,
-                                                        cm_node);
-
-                               }
-                       }
-               }
-       }
-
-       spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-       if (!atomic_dec_return(&listener->ref_count)) {
-               list_del(&listener->list);
-
-               /* decrement our listen node count */
-               atomic_dec(&cm_core->listen_node_cnt);
-
-               spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-
-               if (listener->nesvnic) {
-                       nes_manage_apbvt(listener->nesvnic,
-                               listener->loc_port,
-                               PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn),
-                               NES_MANAGE_APBVT_DEL);
-
-                       nes_debug(NES_DBG_NLMSG,
-                                       "Delete APBVT loc_port = %04X\n",
-                                       listener->loc_port);
-               }
-
-               nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener);
-
-               kfree(listener);
-               listener = NULL;
-               ret = 0;
-               atomic_inc(&cm_listens_destroyed);
-       } else {
-               spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-       }
-       if (listener) {
-               if (atomic_read(&listener->pend_accepts_cnt) > 0)
-                       nes_debug(NES_DBG_CM, "destroying listener (%p)"
-                                 " with non-zero pending accepts=%u\n",
-                                 listener, atomic_read(&listener->pend_accepts_cnt));
-       }
-
-       return ret;
-}
-
-
-/**
- * mini_cm_del_listen
- */
-static int mini_cm_del_listen(struct nes_cm_core *cm_core,
-                             struct nes_cm_listener *listener)
-{
-       listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE;
-       listener->cm_id = NULL; /* going to be destroyed pretty soon */
-       return mini_cm_dec_refcnt_listen(cm_core, listener, 1);
-}
-
-
-/**
- * mini_cm_accelerated
- */
-static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
-                                     struct nes_cm_node *cm_node)
-{
-       cm_node->accelerated = true;
-
-       if (cm_node->accept_pend) {
-               BUG_ON(!cm_node->listener);
-               atomic_dec(&cm_node->listener->pend_accepts_cnt);
-               cm_node->accept_pend = 0;
-               BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
-       }
-
-       if (!timer_pending(&cm_core->tcp_timer))
-               mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME));
-
-       return 0;
-}
-
-
-/**
- * nes_addr_resolve_neigh
- */
-static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex)
-{
-       struct rtable *rt;
-       struct neighbour *neigh;
-       int rc = arpindex;
-       struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-       __be32 dst_ipaddr = htonl(dst_ip);
-
-       rt = ip_route_output(&init_net, dst_ipaddr, nesvnic->local_ipaddr, 0, 0);
-       if (IS_ERR(rt)) {
-               printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n",
-                      __func__, dst_ip);
-               return rc;
-       }
-
-       neigh = dst_neigh_lookup(&rt->dst, &dst_ipaddr);
-
-       rcu_read_lock();
-       if (neigh) {
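-               /* neighbour is resolved: make sure the hardware ARP table
-                * holds its current MAC; otherwise trigger neighbour
-                * resolution below */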
-               if (neigh->nud_state & NUD_VALID) {
-                       nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X"
-                                 " is %pM, Gateway is 0x%08X \n", dst_ip,
-                                 neigh->ha, ntohl(rt->rt_gw4));
-
-                       if (arpindex >= 0) {
-                               if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) {
-                                       /* Mac address same as in nes_arp_table */
-                                       goto out;
-                               }
-
-                               nes_manage_arp_cache(nesvnic->netdev,
-                                                    nesadapter->arp_table[arpindex].mac_addr,
-                                                    dst_ip, NES_ARP_DELETE);
-                       }
-
-                       nes_manage_arp_cache(nesvnic->netdev, neigh->ha,
-                                            dst_ip, NES_ARP_ADD);
-                       rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL,
-                                          NES_ARP_RESOLVE);
-               } else {
-                       neigh_event_send(neigh, NULL);
-               }
-       }
-out:
-       rcu_read_unlock();
-
-       if (neigh)
-               neigh_release(neigh);
-
-       ip_rt_put(rt);
-       return rc;
-}
-
-/**
- * make_cm_node - create a new instance of a cm node
- */
-static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
-                                       struct nes_vnic *nesvnic, struct nes_cm_info *cm_info,
-                                       struct nes_cm_listener *listener)
-{
-       struct nes_cm_node *cm_node;
-       int oldarpindex = 0;
-       int arpindex = 0;
-       struct nes_device *nesdev;
-       struct nes_adapter *nesadapter;
-
-       /* create an hte and cm_node for this instance */
-       cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
-       if (!cm_node)
-               return NULL;
-
-       /* set our node specific transport info */
-       if (listener) {
-               cm_node->loc_addr = listener->loc_addr;
-               cm_node->loc_port = listener->loc_port;
-       } else {
-               cm_node->loc_addr = cm_info->loc_addr;
-               cm_node->loc_port = cm_info->loc_port;
-       }
-       cm_node->rem_addr = cm_info->rem_addr;
-       cm_node->rem_port = cm_info->rem_port;
-
-       cm_node->mpa_frame_rev = mpa_version;
-       cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
-       cm_node->mpav2_ird_ord = 0;
-       cm_node->ird_size = 0;
-       cm_node->ord_size = 0;
-
-       nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n",
-                 &cm_node->loc_addr, cm_node->loc_port,
-                 &cm_node->rem_addr, cm_node->rem_port);
-       cm_node->listener = listener;
-       if (listener)
-               cm_node->tos = listener->tos;
-       cm_node->netdev = nesvnic->netdev;
-       cm_node->cm_id = cm_info->cm_id;
-       memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN);
-
-       nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener,
-                 cm_node->cm_id);
-
-       spin_lock_init(&cm_node->retrans_list_lock);
-
-       cm_node->loopbackpartner = NULL;
-       atomic_set(&cm_node->ref_count, 1);
-       /* associate our parent CM core */
-       cm_node->cm_core = cm_core;
-       cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID;
-       cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
-       cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >>
-                                    NES_CM_DEFAULT_RCV_WND_SCALE;
-       cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr),
-                                                       htonl(cm_node->rem_addr),
-                                                       htons(cm_node->loc_port),
-                                                       htons(cm_node->rem_port));
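-       /* MSS: the netdev frame size less the IP, TCP, Ethernet and VLAN headers */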
-       cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) -
-                                sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN;
-       cm_node->tcp_cntxt.rcv_nxt = 0;
-       /* get a unique session ID; add thread_id to an upcounter to handle races */
-       atomic_inc(&cm_core->node_cnt);
-       cm_node->conn_type = cm_info->conn_type;
-       cm_node->apbvt_set = 0;
-       cm_node->accept_pend = 0;
-
-       cm_node->nesvnic = nesvnic;
-       /* get some device handles, for arp lookup */
-       nesdev = nesvnic->nesdev;
-       nesadapter = nesdev->nesadapter;
-
-       cm_node->loopbackpartner = NULL;
-
-       /* get the mac addr for the remote node */
-       oldarpindex = nes_arp_table(nesdev, cm_node->rem_addr,
-                                   NULL, NES_ARP_RESOLVE);
-       arpindex = nes_addr_resolve_neigh(nesvnic, cm_node->rem_addr,
-                                         oldarpindex);
-       if (arpindex < 0) {
-               kfree(cm_node);
-               return NULL;
-       }
-
-       /* copy the mac addr to node context */
-       memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN);
-       nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n",
-                 cm_node->rem_mac);
-
-       add_hte_node(cm_core, cm_node);
-       atomic_inc(&cm_nodes_created);
-
-       return cm_node;
-}
-
-
-/**
- * add_ref_cm_node - add a reference to a cm node
- */
-static int add_ref_cm_node(struct nes_cm_node *cm_node)
-{
-       atomic_inc(&cm_node->ref_count);
-       return 0;
-}
-
-
-/**
- * rem_ref_cm_node - drop a reference and destroy the cm node once the last
- * reference is released
- */
-static int rem_ref_cm_node(struct nes_cm_core *cm_core,
-                          struct nes_cm_node *cm_node)
-{
-       unsigned long flags;
-       struct nes_qp *nesqp;
-
-       if (!cm_node)
-               return -EINVAL;
-
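-       /* drop a reference; if it was not the last one there is nothing more
-        * to do, otherwise unlink the node from the connection list and tear
-        * it down */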
-       spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
-       if (atomic_dec_return(&cm_node->ref_count)) {
-               spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-               return 0;
-       }
-       list_del(&cm_node->list);
-       atomic_dec(&cm_core->ht_node_cnt);
-       spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
-
-       /* if the node is destroyed before connection was accelerated */
-       if (!cm_node->accelerated && cm_node->accept_pend) {
-               BUG_ON(!cm_node->listener);
-               atomic_dec(&cm_node->listener->pend_accepts_cnt);
-               BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
-       }
-       WARN_ON(cm_node->send_entry);
-       if (cm_node->recv_entry)
-               handle_recv_entry(cm_node, 0);
-       if (cm_node->listener) {
-               mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
-       } else {
-               if (cm_node->apbvt_set && cm_node->nesvnic) {
-                       nes_manage_apbvt(cm_node->nesvnic, cm_node->loc_port,
-                                        PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),
-                                        NES_MANAGE_APBVT_DEL);
-               }
-               nes_debug(NES_DBG_NLMSG, "Delete APBVT loc_port = %04X\n",
-                         cm_node->loc_port);
-       }
-
-       atomic_dec(&cm_core->node_cnt);
-       atomic_inc(&cm_nodes_destroyed);
-       nesqp = cm_node->nesqp;
-       if (nesqp) {
-               nesqp->cm_node = NULL;
-               nes_rem_ref(&nesqp->ibqp);
-               cm_node->nesqp = NULL;
-       }
-
-       kfree(cm_node);
-       return 0;
-}
-
-/**
- * process_options
- */
-static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc,
-                          u32 optionsize, u32 syn_packet)
-{
-       u32 tmp;
-       u32 offset = 0;
-       union all_known_options *all_options;
-       char got_mss_option = 0;
-
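-       /* walk the received TCP options; only MSS and window scale are honoured */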
-       while (offset < optionsize) {
-               all_options = (union all_known_options *)(optionsloc + offset);
-               switch (all_options->as_base.optionnum) {
-               case OPTION_NUMBER_END:
-                       offset = optionsize;
-                       break;
-               case OPTION_NUMBER_NONE:
-                       offset += 1;
-                       continue;
-               case OPTION_NUMBER_MSS:
-                       nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d "
-                                 "Size: %d\n", __func__,
-                                 all_options->as_mss.length, offset, optionsize);
-                       got_mss_option = 1;
-                       if (all_options->as_mss.length != 4) {
-                               return 1;
-                       } else {
-                               tmp = ntohs(all_options->as_mss.mss);
-                               if (tmp > 0 && tmp <
-                                   cm_node->tcp_cntxt.mss)
-                                       cm_node->tcp_cntxt.mss = tmp;
-                       }
-                       break;
-               case OPTION_NUMBER_WINDOW_SCALE:
-                       cm_node->tcp_cntxt.snd_wscale =
-                               all_options->as_windowscale.shiftcount;
-                       break;
-               default:
-                       nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n",
-                                 all_options->as_base.optionnum);
-                       break;
-               }
-               offset += all_options->as_base.length;
-       }
-       if ((!got_mss_option) && (syn_packet))
-               cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
-       return 0;
-}
-
-static void drop_packet(struct sk_buff *skb)
-{
-       atomic_inc(&cm_accel_dropped_pkts);
-       dev_kfree_skb_any(skb);
-}
-
-static void handle_fin_pkt(struct nes_cm_node *cm_node)
-{
-       nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
-                 "refcnt=%d\n", cm_node, cm_node->state,
-                 atomic_read(&cm_node->ref_count));
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_RCVD:
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_MPAREJ_RCVD:
-               cm_node->tcp_cntxt.rcv_nxt++;
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_LAST_ACK;
-               send_fin(cm_node, NULL);
-               break;
-       case NES_CM_STATE_MPAREQ_SENT:
-               create_event(cm_node, NES_CM_EVENT_ABORTED);
-               cm_node->tcp_cntxt.rcv_nxt++;
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSED;
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, NULL);
-               break;
-       case NES_CM_STATE_FIN_WAIT1:
-               cm_node->tcp_cntxt.rcv_nxt++;
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSING;
-               send_ack(cm_node, NULL);
-               /* Wait for the ACK as this is a simultaneous close;
-                * after the ACK is received, send nothing further and
-                * just remove the node. */
-               break;
-       case NES_CM_STATE_FIN_WAIT2:
-               cm_node->tcp_cntxt.rcv_nxt++;
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_TIME_WAIT;
-               send_ack(cm_node, NULL);
-               schedule_nes_timer(cm_node, NULL,  NES_TIMER_TYPE_CLOSE, 1, 0);
-               break;
-       case NES_CM_STATE_TIME_WAIT:
-               cm_node->tcp_cntxt.rcv_nxt++;
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSED;
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               break;
-       case NES_CM_STATE_TSA:
-       default:
-               nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n",
-                       cm_node, cm_node->state);
-               break;
-       }
-}
-
-
-static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-       struct tcphdr *tcph)
-{
-
-       int     reset = 0;      /* whether to send reset in case of err.. */
-       atomic_inc(&cm_resets_recvd);
-       nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u."
-                       " refcnt=%d\n", cm_node, cm_node->state,
-                       atomic_read(&cm_node->ref_count));
-       cleanup_retrans_entry(cm_node);
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_MPAREQ_SENT:
-               nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-                       "listener=%p state=%d\n", __func__, __LINE__, cm_node,
-                       cm_node->listener, cm_node->state);
-               switch (cm_node->mpa_frame_rev) {
-               case IETF_MPA_V2:
-                       cm_node->mpa_frame_rev = IETF_MPA_V1;
-                       /* send a SYN and go to the SYN-sent state */
-                       cm_node->state = NES_CM_STATE_SYN_SENT;
-                       if (send_syn(cm_node, 0, NULL)) {
-                               active_open_err(cm_node, skb, reset);
-                       }
-                       break;
-               case IETF_MPA_V1:
-               default:
-                       active_open_err(cm_node, skb, reset);
-                       break;
-               }
-               break;
-       case NES_CM_STATE_MPAREQ_RCVD:
-               atomic_inc(&cm_node->passive_state);
-               dev_kfree_skb_any(skb);
-               break;
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_SYN_RCVD:
-       case NES_CM_STATE_LISTENING:
-               nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
-               passive_open_err(cm_node, skb, reset);
-               break;
-       case NES_CM_STATE_TSA:
-               active_open_err(cm_node, skb, reset);
-               break;
-       case NES_CM_STATE_CLOSED:
-               drop_packet(skb);
-               break;
-       case NES_CM_STATE_FIN_WAIT2:
-       case NES_CM_STATE_FIN_WAIT1:
-       case NES_CM_STATE_LAST_ACK:
-               cm_node->cm_id->rem_ref(cm_node->cm_id);
-               /* fall through */
-       case NES_CM_STATE_TIME_WAIT:
-               cm_node->state = NES_CM_STATE_CLOSED;
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               drop_packet(skb);
-               break;
-       default:
-               drop_packet(skb);
-               break;
-       }
-}
-
-
-static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       int ret = 0;
-       int datasize = skb->len;
-       u8 *dataloc = skb->data;
-
-       enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN;
-       u32 res_type;
-
-       ret = parse_mpa(cm_node, dataloc, &res_type, datasize);
-       if (ret) {
-               nes_debug(NES_DBG_CM, "didn't like MPA Request\n");
-               if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) {
-                       nes_debug(NES_DBG_CM, "%s[%u] create abort for "
-                                 "cm_node=%p listener=%p state=%d\n", __func__,
-                                 __LINE__, cm_node, cm_node->listener,
-                                 cm_node->state);
-                       active_open_err(cm_node, skb, 1);
-               } else {
-                       passive_open_err(cm_node, skb, 1);
-               }
-               return;
-       }
-
-       switch (cm_node->state) {
-       case NES_CM_STATE_ESTABLISHED:
-               if (res_type == NES_MPA_REQUEST_REJECT)
-                       /* This is a passive open, so receiving an MPA reject
-                        * here is a serious error; a reject can only be
-                        * received on an active open. */
-                       WARN_ON(1);
-               cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
-               type = NES_CM_EVENT_MPA_REQ;
-               atomic_set(&cm_node->passive_state,
-                          NES_PASSIVE_STATE_INDICATED);
-               break;
-       case NES_CM_STATE_MPAREQ_SENT:
-               cleanup_retrans_entry(cm_node);
-               if (res_type == NES_MPA_REQUEST_REJECT) {
-                       type = NES_CM_EVENT_MPA_REJECT;
-                       cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
-               } else {
-                       type = NES_CM_EVENT_CONNECTED;
-                       cm_node->state = NES_CM_STATE_TSA;
-               }
-               send_ack(cm_node, NULL);
-               break;
-       default:
-               WARN_ON(1);
-               break;
-       }
-       dev_kfree_skb_any(skb);
-       create_event(cm_node, type);
-}
-
-static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb)
-{
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_MPAREQ_SENT:
-               nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-                         "listener=%p state=%d\n", __func__, __LINE__, cm_node,
-                         cm_node->listener, cm_node->state);
-               active_open_err(cm_node, skb, 1);
-               break;
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_SYN_RCVD:
-               passive_open_err(cm_node, skb, 1);
-               break;
-       case NES_CM_STATE_TSA:
-       default:
-               drop_packet(skb);
-       }
-}
-
-static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-                    struct sk_buff *skb)
-{
-       int err;
-
-       err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1;
-       if (err)
-               active_open_err(cm_node, skb, 1);
-
-       return err;
-}
-
-static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-                    struct sk_buff *skb)
-{
-       int err = 0;
-       u32 seq;
-       u32 ack_seq;
-       u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num;
-       u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt;
-       u32 rcv_wnd;
-
-       seq = ntohl(tcph->seq);
-       ack_seq = ntohl(tcph->ack_seq);
-       rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
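-       /* the ACK must match our current send sequence number and the segment
-        * must fall inside the advertised receive window */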
-       if (ack_seq != loc_seq_num)
-               err = 1;
-       else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd)))
-               err = 1;
-       if (err) {
-               nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
-                         "listener=%p state=%d\n", __func__, __LINE__, cm_node,
-                         cm_node->listener, cm_node->state);
-               indicate_pkt_err(cm_node, skb);
-               nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X "
-                         "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt,
-                         rcv_wnd);
-       }
-       return err;
-}
-
-/*
- * handle_syn_pkt() is for the passive node. The SYN packet is received when a
- * node is created with a listener, or it may come in as a retransmitted packet,
- * in which case it is simply dropped.
- */
-static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                          struct tcphdr *tcph)
-{
-       int ret;
-       u32 inc_sequence;
-       int optionsize;
-
-       optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-       skb_trim(skb, 0);
-       inc_sequence = ntohl(tcph->seq);
-
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_MPAREQ_SENT:
-               /* received a SYN on an active open connection */
-               active_open_err(cm_node, skb, 1);
-               break;
-       case NES_CM_STATE_LISTENING:
-               /* Passive OPEN */
-               if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
-                   cm_node->listener->backlog) {
-                       nes_debug(NES_DBG_CM, "drop syn due to backlog "
-                                 "pressure \n");
-                       cm_backlog_drops++;
-                       passive_open_err(cm_node, skb, 0);
-                       break;
-               }
-               ret = handle_tcp_options(cm_node, tcph, skb, optionsize,
-                                        1);
-               if (ret) {
-                       passive_open_err(cm_node, skb, 0);
-                       /* drop pkt */
-                       break;
-               }
-               cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-               BUG_ON(cm_node->send_entry);
-               cm_node->accept_pend = 1;
-               atomic_inc(&cm_node->listener->pend_accepts_cnt);
-
-               cm_node->state = NES_CM_STATE_SYN_RCVD;
-               send_syn(cm_node, 1, skb);
-               break;
-       case NES_CM_STATE_CLOSED:
-               cleanup_retrans_entry(cm_node);
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, skb);
-               break;
-       case NES_CM_STATE_TSA:
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_FIN_WAIT1:
-       case NES_CM_STATE_FIN_WAIT2:
-       case NES_CM_STATE_MPAREQ_RCVD:
-       case NES_CM_STATE_LAST_ACK:
-       case NES_CM_STATE_CLOSING:
-       case NES_CM_STATE_UNKNOWN:
-       default:
-               drop_packet(skb);
-               break;
-       }
-}
-
-static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                             struct tcphdr *tcph)
-{
-       int ret;
-       u32 inc_sequence;
-       int optionsize;
-
-       optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-       skb_trim(skb, 0);
-       inc_sequence = ntohl(tcph->seq);
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_SENT:
-               cleanup_retrans_entry(cm_node);
-               /* active open */
-               if (check_syn(cm_node, tcph, skb))
-                       return;
-               cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-               /* setup options */
-               ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0);
-               if (ret) {
-                       nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n",
-                                 cm_node);
-                       break;
-               }
-               cleanup_retrans_entry(cm_node);
-               cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
-               send_mpa_request(cm_node, skb);
-               cm_node->state = NES_CM_STATE_MPAREQ_SENT;
-               break;
-       case NES_CM_STATE_MPAREQ_RCVD:
-               /* passive open, so should not be here */
-               passive_open_err(cm_node, skb, 1);
-               break;
-       case NES_CM_STATE_LISTENING:
-               cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSED;
-               send_reset(cm_node, skb);
-               break;
-       case NES_CM_STATE_CLOSED:
-               cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
-               cleanup_retrans_entry(cm_node);
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, skb);
-               break;
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_FIN_WAIT1:
-       case NES_CM_STATE_FIN_WAIT2:
-       case NES_CM_STATE_LAST_ACK:
-       case NES_CM_STATE_TSA:
-       case NES_CM_STATE_CLOSING:
-       case NES_CM_STATE_UNKNOWN:
-       case NES_CM_STATE_MPAREQ_SENT:
-       default:
-               drop_packet(skb);
-               break;
-       }
-}
-
-static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                         struct tcphdr *tcph)
-{
-       int datasize = 0;
-       u32 inc_sequence;
-       int ret = 0;
-       int optionsize;
-
-       optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
-
-       if (check_seq(cm_node, tcph, skb))
-               return -EINVAL;
-
-       skb_pull(skb, tcph->doff << 2);
-       inc_sequence = ntohl(tcph->seq);
-       datasize = skb->len;
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_RCVD:
-               /* Passive OPEN */
-               cleanup_retrans_entry(cm_node);
-               ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1);
-               if (ret)
-                       break;
-               cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-               cm_node->state = NES_CM_STATE_ESTABLISHED;
-               if (datasize) {
-                       cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-                       handle_rcv_mpa(cm_node, skb);
-               } else { /* rcvd ACK only */
-                       dev_kfree_skb_any(skb);
-               }
-               break;
-       case NES_CM_STATE_ESTABLISHED:
-               /* Passive OPEN */
-               cleanup_retrans_entry(cm_node);
-               if (datasize) {
-                       cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-                       handle_rcv_mpa(cm_node, skb);
-               } else {
-                       drop_packet(skb);
-               }
-               break;
-       case NES_CM_STATE_MPAREQ_SENT:
-               cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
-               if (datasize) {
-                       cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
-                       handle_rcv_mpa(cm_node, skb);
-               } else { /* Could be just an ack pkt.. */
-                       dev_kfree_skb_any(skb);
-               }
-               break;
-       case NES_CM_STATE_LISTENING:
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSED;
-               send_reset(cm_node, skb);
-               break;
-       case NES_CM_STATE_CLOSED:
-               cleanup_retrans_entry(cm_node);
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, skb);
-               break;
-       case NES_CM_STATE_LAST_ACK:
-       case NES_CM_STATE_CLOSING:
-               cleanup_retrans_entry(cm_node);
-               cm_node->state = NES_CM_STATE_CLOSED;
-               cm_node->cm_id->rem_ref(cm_node->cm_id);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               drop_packet(skb);
-               break;
-       case NES_CM_STATE_FIN_WAIT1:
-               cleanup_retrans_entry(cm_node);
-               drop_packet(skb);
-               cm_node->state = NES_CM_STATE_FIN_WAIT2;
-               break;
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_FIN_WAIT2:
-       case NES_CM_STATE_TSA:
-       case NES_CM_STATE_MPAREQ_RCVD:
-       case NES_CM_STATE_UNKNOWN:
-       default:
-               cleanup_retrans_entry(cm_node);
-               drop_packet(skb);
-               break;
-       }
-       return ret;
-}
-
-
-
-static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph,
-                             struct sk_buff *skb, int optionsize, int passive)
-{
-       u8 *optionsloc = (u8 *)&tcph[1];
-
-       if (optionsize) {
-               if (process_options(cm_node, optionsloc, optionsize,
-                                   (u32)tcph->syn)) {
-                       nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n",
-                                 __func__, cm_node);
-                       if (passive)
-                               passive_open_err(cm_node, skb, 1);
-                       else
-                               active_open_err(cm_node, skb, 1);
-                       return 1;
-               }
-       }
-
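-       /* apply the peer's window scale to the advertised send window */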
-       cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
-                                    cm_node->tcp_cntxt.snd_wscale;
-
-       if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd)
-               cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
-       return 0;
-}
-
-/*
- * active_open_err() sends a reset if the flag is set and always generates an
- * ABORTED event.
- */
-static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                           int reset)
-{
-       cleanup_retrans_entry(cm_node);
-       if (reset) {
-               nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, "
-                         "state=%d\n", cm_node, cm_node->state);
-               add_ref_cm_node(cm_node);
-               send_reset(cm_node, skb);
-       } else {
-               dev_kfree_skb_any(skb);
-       }
-
-       cm_node->state = NES_CM_STATE_CLOSED;
-       create_event(cm_node, NES_CM_EVENT_ABORTED);
-}
-
-/*
- * passive_open_err() either sends a reset or frees the skb and drops the
- * reference on the cm_node.
- */
-static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                            int reset)
-{
-       cleanup_retrans_entry(cm_node);
-       cm_node->state = NES_CM_STATE_CLOSED;
-       if (reset) {
-               nes_debug(NES_DBG_CM, "passive_open_err sending RST for "
-                         "cm_node=%p state =%d\n", cm_node, cm_node->state);
-               send_reset(cm_node, skb);
-       } else {
-               dev_kfree_skb_any(skb);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-       }
-}
-
-/*
- * free_retrans_entry() assumes that the retrans_list_lock has been acquired
- * before it is called.
- */
-static void free_retrans_entry(struct nes_cm_node *cm_node)
-{
-       struct nes_timer_entry *send_entry;
-
-       send_entry = cm_node->send_entry;
-       if (send_entry) {
-               cm_node->send_entry = NULL;
-               dev_kfree_skb_any(send_entry->skb);
-               kfree(send_entry);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-       }
-}
-
-static void cleanup_retrans_entry(struct nes_cm_node *cm_node)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
-       free_retrans_entry(cm_node);
-       spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
-}
-
-/**
- * process_packet
- * Dispatches the skb to the handler for its packet type; the handlers consume
- * or free the skb, so the caller must not reference it afterwards.
- */
-static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
-                          struct nes_cm_core *cm_core)
-{
-       enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN;
-       struct tcphdr *tcph = tcp_hdr(skb);
-       u32 fin_set = 0;
-       int ret = 0;
-
-       skb_pull(skb, ip_hdr(skb)->ihl << 2);
-
-       nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
-                 "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn,
-                 tcph->ack, tcph->rst, tcph->fin);
-
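-       /* classify the segment by its TCP flags: RST first, then SYN/SYN-ACK,
-        * then plain ACK; the FIN flag is tracked separately */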
-       if (tcph->rst) {
-               pkt_type = NES_PKT_TYPE_RST;
-       } else if (tcph->syn) {
-               pkt_type = NES_PKT_TYPE_SYN;
-               if (tcph->ack)
-                       pkt_type = NES_PKT_TYPE_SYNACK;
-       } else if (tcph->ack) {
-               pkt_type = NES_PKT_TYPE_ACK;
-       }
-       if (tcph->fin)
-               fin_set = 1;
-
-       switch (pkt_type) {
-       case NES_PKT_TYPE_SYN:
-               handle_syn_pkt(cm_node, skb, tcph);
-               break;
-       case NES_PKT_TYPE_SYNACK:
-               handle_synack_pkt(cm_node, skb, tcph);
-               break;
-       case NES_PKT_TYPE_ACK:
-               ret = handle_ack_pkt(cm_node, skb, tcph);
-               if (fin_set && !ret)
-                       handle_fin_pkt(cm_node);
-               break;
-       case NES_PKT_TYPE_RST:
-               handle_rst_pkt(cm_node, skb, tcph);
-               break;
-       default:
-               if ((fin_set) && (!check_seq(cm_node, tcph, skb)))
-                       handle_fin_pkt(cm_node);
-               drop_packet(skb);
-               break;
-       }
-}
-
-/**
- * mini_cm_listen - create a listen node with params
- */
-static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
-                       struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
-{
-       struct nes_cm_listener *listener;
-       unsigned long flags;
-
-       nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
-                 cm_info->loc_addr, cm_info->loc_port);
-
-       /* cannot have multiple matching listeners */
-       listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port,
-                               NES_CM_LISTENER_EITHER_STATE);
-
-       if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
-               /* find_listener() already incremented the ref count */
-               atomic_dec(&listener->ref_count);
-               nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
-               return NULL;
-       }
-
-       if (!listener) {
-               /* create a CM listen node (1/2 node to compare incoming traffic to) */
-               listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
-               if (!listener)
-                       return NULL;
-
-               listener->loc_addr = cm_info->loc_addr;
-               listener->loc_port = cm_info->loc_port;
-               listener->reused_node = 0;
-
-               atomic_set(&listener->ref_count, 1);
-       }
-       /* passive case: find_listener() above already incremented the ref count */
-       else {
-               listener->reused_node = 1;
-       }
-
-       listener->cm_id = cm_info->cm_id;
-       atomic_set(&listener->pend_accepts_cnt, 0);
-       listener->cm_core = cm_core;
-       listener->nesvnic = nesvnic;
-       atomic_inc(&cm_core->node_cnt);
-
-       listener->conn_type = cm_info->conn_type;
-       listener->backlog = cm_info->backlog;
-       listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;
-
-       if (!listener->reused_node) {
-               spin_lock_irqsave(&cm_core->listen_list_lock, flags);
-               list_add(&listener->list, &cm_core->listen_list.list);
-               spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
-               atomic_inc(&cm_core->listen_node_cnt);
-       }
-
-       nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
-                 " listener = %p, backlog = %d, cm_id = %p.\n",
-                 cm_info->loc_addr, cm_info->loc_port,
-                 listener, listener->backlog, listener->cm_id);
-
-       return listener;
-}
-
-
-/**
- * mini_cm_connect - make a connection node with params
- */
-static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
-                                          struct nes_vnic *nesvnic, u16 private_data_len,
-                                          void *private_data, struct nes_cm_info *cm_info)
-{
-       int ret = 0;
-       struct nes_cm_node *cm_node;
-       struct nes_cm_listener *loopbackremotelistener;
-       struct nes_cm_node *loopbackremotenode;
-       struct nes_cm_info loopback_cm_info;
-       u8 *start_buff;
-
-       /* create a CM connection node */
-       cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
-       if (!cm_node)
-               return NULL;
-
-       /* set our node side to client (active) side */
-       cm_node->tcp_cntxt.client = 1;
-       cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
-
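-       /* loopback connection: both endpoints are local, so build the remote
-        * cm_node here instead of putting a SYN on the wire */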
-       if (cm_info->loc_addr == cm_info->rem_addr) {
-               loopbackremotelistener = find_listener(cm_core,
-                       cm_node->loc_addr, cm_node->rem_port,
-                       NES_CM_LISTENER_ACTIVE_STATE);
-               if (loopbackremotelistener == NULL) {
-                       create_event(cm_node, NES_CM_EVENT_ABORTED);
-               } else {
-                       loopback_cm_info = *cm_info;
-                       loopback_cm_info.loc_port = cm_info->rem_port;
-                       loopback_cm_info.rem_port = cm_info->loc_port;
-                       loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
-                       loopbackremotenode = make_cm_node(cm_core, nesvnic,
-                                                         &loopback_cm_info, loopbackremotelistener);
-                       if (!loopbackremotenode) {
-                               rem_ref_cm_node(cm_node->cm_core, cm_node);
-                               return NULL;
-                       }
-                       atomic_inc(&cm_loopbacks);
-                       loopbackremotenode->loopbackpartner = cm_node;
-                       loopbackremotenode->tcp_cntxt.rcv_wscale =
-                               NES_CM_DEFAULT_RCV_WND_SCALE;
-                       cm_node->loopbackpartner = loopbackremotenode;
-                       memcpy(loopbackremotenode->mpa_frame_buf, private_data,
-                              private_data_len);
-                       loopbackremotenode->mpa_frame_size = private_data_len;
-
-                       /* done handling this state; move the node to the TSA state */
-                       cm_node->state = NES_CM_STATE_TSA;
-                       cm_node->tcp_cntxt.rcv_nxt =
-                               loopbackremotenode->tcp_cntxt.loc_seq_num;
-                       loopbackremotenode->tcp_cntxt.rcv_nxt =
-                               cm_node->tcp_cntxt.loc_seq_num;
-                       cm_node->tcp_cntxt.max_snd_wnd =
-                               loopbackremotenode->tcp_cntxt.rcv_wnd;
-                       loopbackremotenode->tcp_cntxt.max_snd_wnd =
-                               cm_node->tcp_cntxt.rcv_wnd;
-                       cm_node->tcp_cntxt.snd_wnd =
-                               loopbackremotenode->tcp_cntxt.rcv_wnd;
-                       loopbackremotenode->tcp_cntxt.snd_wnd =
-                               cm_node->tcp_cntxt.rcv_wnd;
-                       cm_node->tcp_cntxt.snd_wscale =
-                               loopbackremotenode->tcp_cntxt.rcv_wscale;
-                       loopbackremotenode->tcp_cntxt.snd_wscale =
-                               cm_node->tcp_cntxt.rcv_wscale;
-                       loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD;
-                       create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
-               }
-               return cm_node;
-       }
-
-       start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
-       cm_node->mpa_frame_size = private_data_len;
-
-       memcpy(start_buff, private_data, private_data_len);
-
-       /* send a syn and goto syn sent state */
-       /* send a SYN and go to the SYN-sent state */
-       ret = send_syn(cm_node, 0, NULL);
-
-       if (ret) {
-               /* error sending the SYN; free up the cm_node struct */
-               nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest "
-                         "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n",
-                         cm_node->rem_addr, cm_node->rem_port, cm_node,
-                         cm_node->cm_id);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               cm_node = NULL;
-       }
-
-       if (cm_node) {
-               nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X,"
-                         "port=0x%04x, cm_node=%p, cm_id = %p.\n",
-                         cm_node->rem_addr, cm_node->rem_port, cm_node,
-                         cm_node->cm_id);
-       }
-
-       return cm_node;
-}
-
-
-/**
- * mini_cm_accept - accept a connection
- * This function is never called
- */
-static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-       return 0;
-}
-
-
-/**
- * mini_cm_reject - reject and teardown a connection
- */
-static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-       int ret = 0;
-       int err = 0;
-       int passive_state;
-       struct nes_cm_event event;
-       struct iw_cm_id *cm_id = cm_node->cm_id;
-       struct nes_cm_node *loopback = cm_node->loopbackpartner;
-
-       nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n",
-                 __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state);
-
-       if (cm_node->tcp_cntxt.client)
-               return ret;
-       cleanup_retrans_entry(cm_node);
-
-       if (!loopback) {
-               passive_state = atomic_add_return(1, &cm_node->passive_state);
-               if (passive_state == NES_SEND_RESET_EVENT) {
-                       cm_node->state = NES_CM_STATE_CLOSED;
-                       rem_ref_cm_node(cm_core, cm_node);
-               } else {
-                       if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
-                               rem_ref_cm_node(cm_core, cm_node);
-                       } else {
-                               ret = send_mpa_reject(cm_node);
-                               if (ret) {
-                                       cm_node->state = NES_CM_STATE_CLOSED;
-                                       err = send_reset(cm_node, NULL);
-                                       if (err)
-                                               WARN_ON(1);
-                               } else {
-                                       cm_id->add_ref(cm_id);
-                               }
-                       }
-               }
-       } else {
-               cm_node->cm_id = NULL;
-               if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
-                       rem_ref_cm_node(cm_core, cm_node);
-                       rem_ref_cm_node(cm_core, loopback);
-               } else {
-                       event.cm_node = loopback;
-                       event.cm_info.rem_addr = loopback->rem_addr;
-                       event.cm_info.loc_addr = loopback->loc_addr;
-                       event.cm_info.rem_port = loopback->rem_port;
-                       event.cm_info.loc_port = loopback->loc_port;
-                       event.cm_info.cm_id = loopback->cm_id;
-                       cm_event_mpa_reject(&event);
-                       rem_ref_cm_node(cm_core, cm_node);
-                       loopback->state = NES_CM_STATE_CLOSING;
-
-                       cm_id = loopback->cm_id;
-                       rem_ref_cm_node(cm_core, loopback);
-                       cm_id->rem_ref(cm_id);
-               }
-       }
-
-       return ret;
-}
-
-
-/**
- * mini_cm_close
- */
-static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
-{
-       int ret = 0;
-
-       if (!cm_core || !cm_node)
-               return -EINVAL;
-
-       switch (cm_node->state) {
-       case NES_CM_STATE_SYN_RCVD:
-       case NES_CM_STATE_SYN_SENT:
-       case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
-       case NES_CM_STATE_ESTABLISHED:
-       case NES_CM_STATE_ACCEPTING:
-       case NES_CM_STATE_MPAREQ_SENT:
-       case NES_CM_STATE_MPAREQ_RCVD:
-               cleanup_retrans_entry(cm_node);
-               send_reset(cm_node, NULL);
-               break;
-       case NES_CM_STATE_CLOSE_WAIT:
-               cm_node->state = NES_CM_STATE_LAST_ACK;
-               send_fin(cm_node, NULL);
-               break;
-       case NES_CM_STATE_FIN_WAIT1:
-       case NES_CM_STATE_FIN_WAIT2:
-       case NES_CM_STATE_LAST_ACK:
-       case NES_CM_STATE_TIME_WAIT:
-       case NES_CM_STATE_CLOSING:
-               ret = -1;
-               break;
-       case NES_CM_STATE_LISTENING:
-               cleanup_retrans_entry(cm_node);
-               send_reset(cm_node, NULL);
-               break;
-       case NES_CM_STATE_MPAREJ_RCVD:
-       case NES_CM_STATE_UNKNOWN:
-       case NES_CM_STATE_INITED:
-       case NES_CM_STATE_CLOSED:
-       case NES_CM_STATE_LISTENER_DESTROYED:
-               ret = rem_ref_cm_node(cm_core, cm_node);
-               break;
-       case NES_CM_STATE_TSA:
-               if (cm_node->send_entry)
-                       printk(KERN_ERR "ERROR Close got called from STATE_TSA "
-                              "send_entry=%p\n", cm_node->send_entry);
-               ret = rem_ref_cm_node(cm_core, cm_node);
-               break;
-       }
-       return ret;
-}
-
-
-/**
- * mini_cm_recv_pkt - receive an Ethernet packet and process it through the
- * CM node state machine
- */
-static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,
-                           struct nes_vnic *nesvnic, struct sk_buff *skb)
-{
-       struct nes_cm_node *cm_node = NULL;
-       struct nes_cm_listener *listener = NULL;
-       struct iphdr *iph;
-       struct tcphdr *tcph;
-       struct nes_cm_info nfo;
-       int skb_handled = 1;
-       __be32 tmp_daddr, tmp_saddr;
-
-       if (!skb)
-               return 0;
-       if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr))
-               return 0;
-
-       iph = (struct iphdr *)skb->data;
-       tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr));
-
-       nfo.loc_addr = ntohl(iph->daddr);
-       nfo.loc_port = ntohs(tcph->dest);
-       nfo.rem_addr = ntohl(iph->saddr);
-       nfo.rem_port = ntohs(tcph->source);
-
-       tmp_daddr = cpu_to_be32(iph->daddr);
-       tmp_saddr = cpu_to_be32(iph->saddr);
-
-       nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n",
-                 &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source);
-
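-       /* single-pass loop: used only so "break" can abandon packet handling early */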
-       do {
-               cm_node = find_node(cm_core,
-                                   nfo.rem_port, nfo.rem_addr,
-                                   nfo.loc_port, nfo.loc_addr);
-
-               if (!cm_node) {
-                       /* The only type of packet accepted is for */
-                       /* the PASSIVE open (SYN only) */
-                       if ((!tcph->syn) || (tcph->ack)) {
-                               skb_handled = 0;
-                               break;
-                       }
-                       listener = find_listener(cm_core, nfo.loc_addr,
-                                                nfo.loc_port,
-                                                NES_CM_LISTENER_ACTIVE_STATE);
-                       if (!listener) {
-                               nfo.cm_id = NULL;
-                               nfo.conn_type = 0;
-                               nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n");
-                               skb_handled = 0;
-                               break;
-                       }
-                       nfo.cm_id = listener->cm_id;
-                       nfo.conn_type = listener->conn_type;
-                       cm_node = make_cm_node(cm_core, nesvnic, &nfo,
-                                              listener);
-                       if (!cm_node) {
-                               nes_debug(NES_DBG_CM, "Unable to allocate "
-                                         "node\n");
-                               cm_packets_dropped++;
-                               atomic_dec(&listener->ref_count);
-                               dev_kfree_skb_any(skb);
-                               break;
-                       }
-                       if (!tcph->rst && !tcph->fin) {
-                               cm_node->state = NES_CM_STATE_LISTENING;
-                       } else {
-                               cm_packets_dropped++;
-                               rem_ref_cm_node(cm_core, cm_node);
-                               dev_kfree_skb_any(skb);
-                               break;
-                       }
-                       add_ref_cm_node(cm_node);
-               } else if (cm_node->state == NES_CM_STATE_TSA) {
-                       if (cm_node->nesqp->pau_mode)
-                               nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp);
-                       else {
-                               rem_ref_cm_node(cm_core, cm_node);
-                               atomic_inc(&cm_accel_dropped_pkts);
-                               dev_kfree_skb_any(skb);
-                       }
-                       break;
-               }
-               skb_reset_network_header(skb);
-               skb_set_transport_header(skb, sizeof(*tcph));
-               skb->len = ntohs(iph->tot_len);
-               process_packet(cm_node, skb, cm_core);
-               rem_ref_cm_node(cm_core, cm_node);
-       } while (0);
-       return skb_handled;
-}
-
-
-/**
- * nes_cm_alloc_core - allocate a top level instance of a cm core
- */
-static struct nes_cm_core *nes_cm_alloc_core(void)
-{
-       struct nes_cm_core *cm_core;
-
-       /* setup the CM core */
-       /* alloc top level core control structure */
-       cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL);
-       if (!cm_core)
-               return NULL;
-
-       INIT_LIST_HEAD(&cm_core->connected_nodes);
-       timer_setup(&cm_core->tcp_timer, nes_cm_timer_tick, 0);
-
-       cm_core->mtu = NES_CM_DEFAULT_MTU;
-       cm_core->state = NES_CM_STATE_INITED;
-       cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS;
-
-       atomic_set(&cm_core->events_posted, 0);
-
-       cm_core->api = &nes_cm_api;
-
-       spin_lock_init(&cm_core->ht_lock);
-       spin_lock_init(&cm_core->listen_list_lock);
-
-       INIT_LIST_HEAD(&cm_core->listen_list.list);
-
-       nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core);
-
-       nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n");
-       cm_core->event_wq = alloc_ordered_workqueue("nesewq", 0);
-       if (!cm_core->event_wq)
-               goto out_free_cmcore;
-       cm_core->post_event = nes_cm_post_event;
-       nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n");
-       cm_core->disconn_wq = alloc_ordered_workqueue("nesdwq", 0);
-       if (!cm_core->disconn_wq)
-               goto out_free_wq;
-
-       print_core(cm_core);
-       return cm_core;
-
-out_free_wq:
-       destroy_workqueue(cm_core->event_wq);
-out_free_cmcore:
-       kfree(cm_core);
-       return NULL;
-}
-
-
-/**
- * mini_cm_dealloc_core - deallocate a top level instance of a cm core
- */
-static int mini_cm_dealloc_core(struct nes_cm_core *cm_core)
-{
-       nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core);
-
-       if (!cm_core)
-               return -EINVAL;
-
-       barrier();
-
-       if (timer_pending(&cm_core->tcp_timer))
-               del_timer(&cm_core->tcp_timer);
-
-       destroy_workqueue(cm_core->event_wq);
-       destroy_workqueue(cm_core->disconn_wq);
-       nes_debug(NES_DBG_CM, "\n");
-       kfree(cm_core);
-
-       return 0;
-}
-
-
-/**
- * mini_cm_get
- */
-static int mini_cm_get(struct nes_cm_core *cm_core)
-{
-       return cm_core->state;
-}
-
-
-/**
- * mini_cm_set
- */
-static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
-{
-       int ret = 0;
-
-       switch (type) {
-       case NES_CM_SET_PKT_SIZE:
-               cm_core->mtu = value;
-               break;
-       case NES_CM_SET_FREE_PKT_Q_SIZE:
-               cm_core->free_tx_pkt_max = value;
-               break;
-       default:
-               /* unknown set option */
-               ret = -EINVAL;
-       }
-
-       return ret;
-}
-
-
-/**
- * nes_cm_init_tsa_conn - set up HW; MPA frames must have been
- * successfully exchanged when this is called
- */
-static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node)
-{
-       int ret = 0;
-
-       if (!nesqp)
-               return -EINVAL;
-
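-       /* copy the negotiated TCP state from the CM node into the hardware QP context */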
-       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 |
-                                                 NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG |
-                                                 NES_QPCONTEXT_MISC_DROS);
-
-       if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale)
-               nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE);
-
-       nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT);
-
-       nesqp->nesqp_context->misc2 |= cpu_to_le32(
-               cm_node->tos << NES_QPCONTEXT_MISC2_TOS_SHIFT);
-
-       nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16);
-
-       nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32(
-               (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT);
-
-       nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
-               (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) &
-               NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK);
-
-       nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
-               (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) &
-               NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK);
-
-       nesqp->nesqp_context->keepalive = cpu_to_le32(0x80);
-       nesqp->nesqp_context->ts_recent = 0;
-       nesqp->nesqp_context->ts_age = 0;
-       nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-       nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
-       nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-       nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
-                                                   cm_node->tcp_cntxt.rcv_wscale);
-       nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-       nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-       nesqp->nesqp_context->srtt = 0;
-       nesqp->nesqp_context->rttvar = cpu_to_le32(0x6);
-       nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000);
-       nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss);
-       nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
-       nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
-       nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
-
-       nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X,"
-                 " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n",
-                 nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
-                 le32_to_cpu(nesqp->nesqp_context->snd_nxt),
-                 cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale),
-                 le32_to_cpu(nesqp->nesqp_context->rcv_wnd),
-                 le32_to_cpu(nesqp->nesqp_context->misc));
-       nes_debug(NES_DBG_CM, "  snd_wnd  = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd));
-       nes_debug(NES_DBG_CM, "  snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd));
-       nes_debug(NES_DBG_CM, "  max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd));
-
-       nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n");
-       cm_node->state = NES_CM_STATE_TSA;
-
-       return ret;
-}
-
-
-/**
- * nes_cm_disconn
- */
-int nes_cm_disconn(struct nes_qp *nesqp)
-{
-       struct disconn_work *work;
-
-       work = kzalloc(sizeof *work, GFP_ATOMIC);
-       if (!work)
-               return -ENOMEM;  /* Timer will clean up */
-
-       nes_add_ref(&nesqp->ibqp);
-       work->nesqp = nesqp;
-       INIT_WORK(&work->work, nes_disconnect_worker);
-       queue_work(g_cm_core->disconn_wq, &work->work);
-       return 0;
-}
-
-
-/**
- * nes_disconnect_worker
- */
-static void nes_disconnect_worker(struct work_struct *work)
-{
-       struct disconn_work *dwork = container_of(work, struct disconn_work, work);
-       struct nes_qp *nesqp = dwork->nesqp;
-
-       kfree(dwork);
-       nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n",
-                 nesqp->last_aeq, nesqp->hwqp.qp_id);
-       nes_cm_disconn_true(nesqp);
-       nes_rem_ref(&nesqp->ibqp);
-}
-
-
-/**
- * nes_cm_disconn_true
- */
-static int nes_cm_disconn_true(struct nes_qp *nesqp)
-{
-       unsigned long flags;
-       int ret = 0;
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       struct nes_vnic *nesvnic;
-       u16 last_ae;
-       u8 original_hw_tcp_state;
-       u8 original_ibqp_state;
-       int disconn_status = 0;
-       int issue_disconn = 0;
-       int issue_close = 0;
-       int issue_flush = 0;
-       u32 flush_q = NES_CQP_FLUSH_RQ;
-       struct ib_event ibevent;
-
-       if (!nesqp) {
-               nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n");
-               return -1;
-       }
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-       cm_id = nesqp->cm_id;
-       /* make sure we haven't already closed this connection */
-       if (!cm_id) {
-               nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n",
-                         nesqp->hwqp.qp_id);
-               spin_unlock_irqrestore(&nesqp->lock, flags);
-               return -1;
-       }
-
-       nesvnic = to_nesvnic(nesqp->ibqp.device);
-       nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id);
-
-       original_hw_tcp_state = nesqp->hw_tcp_state;
-       original_ibqp_state = nesqp->ibqp_state;
-       last_ae = nesqp->last_aeq;
-
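-       /* with the QP lock held, decide whether to issue disconnect/close events and a WQE flush */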
-       if (nesqp->term_flags) {
-               issue_disconn = 1;
-               issue_close = 1;
-               nesqp->cm_id = NULL;
-               del_timer(&nesqp->terminate_timer);
-               if (nesqp->flush_issued == 0) {
-                       nesqp->flush_issued = 1;
-                       issue_flush = 1;
-               }
-       } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-                       ((original_ibqp_state == IB_QPS_RTS) &&
-                       (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-               issue_disconn = 1;
-               if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET)
-                       disconn_status = -ECONNRESET;
-       }
-
-       if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
-                (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) ||
-                (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) ||
-                (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-               issue_close = 1;
-               nesqp->cm_id = NULL;
-               if (nesqp->flush_issued == 0) {
-                       nesqp->flush_issued = 1;
-                       issue_flush = 1;
-               }
-       }
-
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-       if ((issue_flush) && (nesqp->destroyed == 0)) {
-               /* Flush the queue(s) */
-               if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE)
-                       flush_q |= NES_CQP_FLUSH_SQ;
-               flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1);
-
-               if (nesqp->term_flags) {
-                       ibevent.device = nesqp->ibqp.device;
-                       ibevent.event = nesqp->terminate_eventtype;
-                       ibevent.element.qp = &nesqp->ibqp;
-                       if (nesqp->ibqp.event_handler)
-                               nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
-               }
-       }
-
-       if ((cm_id) && (cm_id->event_handler)) {
-               if (issue_disconn) {
-                       atomic_inc(&cm_disconnects);
-                       cm_event.event = IW_CM_EVENT_DISCONNECT;
-                       cm_event.status = disconn_status;
-                       cm_event.local_addr = cm_id->m_local_addr;
-                       cm_event.remote_addr = cm_id->m_remote_addr;
-                       cm_event.private_data = NULL;
-                       cm_event.private_data_len = 0;
-
-                       nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event"
-                                 " for  QP%u, SQ Head = %u, SQ Tail = %u. "
-                                 "cm_id = %p, refcount = %u.\n",
-                                 nesqp->hwqp.qp_id, nesqp->hwqp.sq_head,
-                                 nesqp->hwqp.sq_tail, cm_id,
-                                 atomic_read(&nesqp->refcount));
-
-                       ret = cm_id->event_handler(cm_id, &cm_event);
-                       if (ret)
-                               nes_debug(NES_DBG_CM, "OFA CM event_handler "
-                                         "returned, ret=%d\n", ret);
-               }
-
-               if (issue_close) {
-                       atomic_inc(&cm_closes);
-                       nes_disconnect(nesqp, 1);
-
-                       cm_id->provider_data = nesqp;
-                       /* Send up the close complete event */
-                       cm_event.event = IW_CM_EVENT_CLOSE;
-                       cm_event.status = 0;
-                       cm_event.provider_data = cm_id->provider_data;
-                       cm_event.local_addr = cm_id->m_local_addr;
-                       cm_event.remote_addr = cm_id->m_remote_addr;
-                       cm_event.private_data = NULL;
-                       cm_event.private_data_len = 0;
-
-                       ret = cm_id->event_handler(cm_id, &cm_event);
-                       if (ret)
-                               nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-                       cm_id->rem_ref(cm_id);
-               }
-       }
-
-       return 0;
-}
-
-
-/**
- * nes_disconnect
- */
-static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
-{
-       int ret = 0;
-       struct nes_vnic *nesvnic;
-       struct nes_device *nesdev;
-       struct nes_ib_device *nesibdev;
-
-       nesvnic = to_nesvnic(nesqp->ibqp.device);
-       if (!nesvnic)
-               return -EINVAL;
-
-       nesdev = nesvnic->nesdev;
-       nesibdev = nesvnic->nesibdev;
-
-       nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-                       netdev_refcnt_read(nesvnic->netdev));
-
-       if (nesqp->active_conn) {
-
-               /* indicate this connection is NOT active */
-               nesqp->active_conn = 0;
-       } else {
-               /* Need to free the Last Streaming Mode Message */
-               if (nesqp->ietf_frame) {
-                       if (nesqp->lsmm_mr)
-                               nesibdev->ibdev.ops.dereg_mr(nesqp->lsmm_mr,
-                                                            NULL);
-                       pci_free_consistent(nesdev->pcidev,
-                                           nesqp->private_data_len + nesqp->ietf_frame_size,
-                                           nesqp->ietf_frame, nesqp->ietf_frame_pbase);
-               }
-       }
-
-       /* close the CM node down if it is still active */
-       if (nesqp->cm_node) {
-               nes_debug(NES_DBG_CM, "Call close API\n");
-
-               g_cm_core->api->close(g_cm_core, nesqp->cm_node);
-       }
-
-       return ret;
-}
-
-
-/**
- * nes_accept
- */
-int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-       u64 u64temp;
-       struct ib_qp *ibqp;
-       struct nes_qp *nesqp;
-       struct nes_vnic *nesvnic;
-       struct nes_device *nesdev;
-       struct nes_cm_node *cm_node;
-       struct nes_adapter *adapter;
-       struct ib_qp_attr attr;
-       struct iw_cm_event cm_event;
-       struct nes_hw_qp_wqe *wqe;
-       struct nes_v4_quad nes_quad;
-       u32 crc_value;
-       int ret;
-       int passive_state;
-       struct ib_mr *ibmr = NULL;
-       struct nes_pd *nespd;
-       u64 tagged_offset;
-       u8 mpa_frame_offset = 0;
-       struct ietf_mpa_v2 *mpa_v2_frame;
-       u8 start_addr = 0;
-       u8 *start_ptr = &start_addr;
-       u8 **start_buff = &start_ptr;
-       u16 buff_len = 0;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-       ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-
-       /* get all our handles */
-       nesqp = to_nesqp(ibqp);
-       nesvnic = to_nesvnic(nesqp->ibqp.device);
-       nesdev = nesvnic->nesdev;
-       adapter = nesdev->nesadapter;
-
-       cm_node = (struct nes_cm_node *)cm_id->provider_data;
-       nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p,"
-               " %s\n", cm_node, nesvnic, nesvnic->netdev,
-               nesvnic->netdev->name);
-
-       if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) {
-               if (cm_node->loopbackpartner)
-                       rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner);
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               return -EINVAL;
-       }
-
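-       /* bump passive_state; NES_SEND_RESET_EVENT means a reset raced in, so drop the node */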
-       passive_state = atomic_add_return(1, &cm_node->passive_state);
-       if (passive_state == NES_SEND_RESET_EVENT) {
-               rem_ref_cm_node(cm_node->cm_core, cm_node);
-               return -ECONNRESET;
-       }
-       /* associate the node with the QP */
-       nesqp->cm_node = (void *)cm_node;
-       cm_node->nesqp = nesqp;
-
-
-       nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n",
-               nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener);
-       atomic_inc(&cm_accepts);
-
-       nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
-                       netdev_refcnt_read(nesvnic->netdev));
-
-       nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2);
-       /* allocate the ietf frame and space for private data */
-       nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
-                                                nesqp->ietf_frame_size + conn_param->private_data_len,
-                                                &nesqp->ietf_frame_pbase);
-
-       if (!nesqp->ietf_frame) {
-               nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n");
-               return -ENOMEM;
-       }
-       mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame;
-
-       if (cm_node->mpa_frame_rev == IETF_MPA_V1)
-               mpa_frame_offset = 4;
-
-       if (cm_node->mpa_frame_rev == IETF_MPA_V1 ||
-                       cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
-               record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
-       }
-
-       memcpy(mpa_v2_frame->priv_data, conn_param->private_data,
-              conn_param->private_data_len);
-
-       cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY);
-       nesqp->private_data_len = conn_param->private_data_len;
-
-       /* setup our first outgoing iWarp send WQE (the IETF frame response) */
-       wqe = &nesqp->hwqp.sq_vbase[0];
-
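-       /* non-loopback: register the MPA reply as an MR and build the LSMM (last streaming mode message) WQE */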
-       if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) {
-               u64temp = (unsigned long)nesqp;
-               nespd = nesqp->nespd;
-               tagged_offset = (u64)(unsigned long)*start_buff;
-               ibmr = nes_reg_phys_mr(&nespd->ibpd,
-                               nesqp->ietf_frame_pbase + mpa_frame_offset,
-                               buff_len, IB_ACCESS_LOCAL_WRITE,
-                               &tagged_offset);
-               if (IS_ERR(ibmr)) {
-                       nes_debug(NES_DBG_CM, "Unable to register memory region"
-                                 " for LSMM for cm_node = %p\n",
-                                 cm_node);
-                       pci_free_consistent(nesdev->pcidev,
-                                           nesqp->private_data_len + nesqp->ietf_frame_size,
-                                           nesqp->ietf_frame, nesqp->ietf_frame_pbase);
-                       return PTR_ERR(ibmr);
-               }
-
-               ibmr->pd = &nespd->ibpd;
-               ibmr->device = nespd->ibpd.device;
-               nesqp->lsmm_mr = ibmr;
-
-               u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
-               set_wqe_64bit_value(wqe->wqe_words,
-                                   NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
-                                   u64temp);
-               wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
-                       cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING |
-                                   NES_IWARP_SQ_WQE_WRPDU);
-               wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] =
-                       cpu_to_le32(buff_len);
-               set_wqe_64bit_value(wqe->wqe_words,
-                                   NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
-                                   (u64)(unsigned long)(*start_buff));
-               wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] =
-                       cpu_to_le32(buff_len);
-               wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey;
-               if (nesqp->sq_kmapped) {
-                       nesqp->sq_kmapped = 0;
-                       kunmap(nesqp->page);
-               }
-
-               nesqp->nesqp_context->ird_ord_sizes |=
-                       cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
-                                   NES_QPCONTEXT_ORDIRD_WRPDU);
-       } else {
-               nesqp->nesqp_context->ird_ord_sizes |=
-                       cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU);
-       }
-       nesqp->skip_lsmm = 1;
-
-       /* Cache the cm_id in the qp */
-       nesqp->cm_id = cm_id;
-       cm_node->cm_id = cm_id;
-
-       /*  nesqp->cm_node = (void *)cm_id->provider_data; */
-       cm_id->provider_data = nesqp;
-       nesqp->active_conn = 0;
-
-       if (cm_node->state == NES_CM_STATE_TSA)
-               nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n",
-                         cm_node);
-
-       nes_cm_init_tsa_conn(nesqp, cm_node);
-
-       nesqp->nesqp_context->tcpPorts[0] =
-                               cpu_to_le16(cm_node->loc_port);
-       nesqp->nesqp_context->tcpPorts[1] =
-                               cpu_to_le16(cm_node->rem_port);
-
-       nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr);
-
-       nesqp->nesqp_context->misc2 |= cpu_to_le32(
-               (u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-               NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
-
-       nesqp->nesqp_context->arp_index_vlan |=
-               cpu_to_le32(nes_arp_table(nesdev,
-                                         le32_to_cpu(nesqp->nesqp_context->ip0), NULL,
-                                         NES_ARP_RESOLVE) << 16);
-
-       nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
-               jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
-
-       nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
-
-       nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
-               ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));
-       nesqp->nesqp_context->ird_ord_sizes |=
-               cpu_to_le32((u32)cm_node->ord_size);
-
-       memset(&nes_quad, 0, sizeof(nes_quad));
-       nes_quad.DstIpAdrIndex =
-               cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-       nes_quad.SrcIpadr = htonl(cm_node->rem_addr);
-       nes_quad.TcpPorts[0] = htons(cm_node->rem_port);
-       nes_quad.TcpPorts[1] = htons(cm_node->loc_port);
-
-       /* Produce hash key */
-       crc_value = get_crc_value(&nes_quad);
-       nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-       nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n",
-                 nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask);
-
-       nesqp->hte_index &= adapter->hte_index_mask;
-       nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-
-       cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
-
-       nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = "
-                 "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + "
-                 "private data length=%u.\n", nesqp->hwqp.qp_id,
-                 ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port),
-                 ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port),
-                 le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
-                 le32_to_cpu(nesqp->nesqp_context->snd_nxt),
-                 buff_len);
-
-       /* notify OF layer that accept event was successful */
-       cm_id->add_ref(cm_id);
-       nes_add_ref(&nesqp->ibqp);
-
-       cm_event.event = IW_CM_EVENT_ESTABLISHED;
-       cm_event.status = 0;
-       cm_event.provider_data = (void *)nesqp;
-       cm_event.local_addr = cm_id->m_local_addr;
-       cm_event.remote_addr = cm_id->m_remote_addr;
-       cm_event.private_data = NULL;
-       cm_event.private_data_len = 0;
-       cm_event.ird = cm_node->ird_size;
-       cm_event.ord = cm_node->ord_size;
-
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       attr.qp_state = IB_QPS_RTS;
-       nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-       if (cm_node->loopbackpartner) {
-               cm_node->loopbackpartner->mpa_frame_size =
-                       nesqp->private_data_len;
-               /* copy entire MPA frame to our cm_node's frame */
-               memcpy(cm_node->loopbackpartner->mpa_frame_buf,
-                      conn_param->private_data, conn_param->private_data_len);
-               create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED);
-       }
-       if (ret)
-               printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-                      "ret=%d\n", __func__, __LINE__, ret);
-
-       return 0;
-}
-
-
-/**
- * nes_reject
- */
-int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-       struct nes_cm_node *cm_node;
-       struct nes_cm_node *loopback;
-       struct nes_cm_core *cm_core;
-       u8 *start_buff;
-
-       atomic_inc(&cm_rejects);
-       cm_node = (struct nes_cm_node *)cm_id->provider_data;
-       loopback = cm_node->loopbackpartner;
-       cm_core = cm_node->cm_core;
-       cm_node->cm_id = cm_id;
-
-       if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER)
-               return -EINVAL;
-
-       if (loopback) {
-               memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len);
-               loopback->mpa_frame.priv_data_len = pdata_len;
-               loopback->mpa_frame_size = pdata_len;
-       } else {
-               start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
-               cm_node->mpa_frame_size = pdata_len;
-               memcpy(start_buff, pdata, pdata_len);
-       }
-       return cm_core->api->reject(cm_core, cm_node);
-}
-
-
-/**
- * nes_connect
- * setup and launch cm connect node
- */
-int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-       struct ib_qp *ibqp;
-       struct nes_qp *nesqp;
-       struct nes_vnic *nesvnic;
-       struct nes_device *nesdev;
-       struct nes_cm_node *cm_node;
-       struct nes_cm_info cm_info;
-       int apbvt_set = 0;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-       if (cm_id->remote_addr.ss_family != AF_INET)
-               return -ENOSYS;
-       ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-       nesqp = to_nesqp(ibqp);
-       if (!nesqp)
-               return -EINVAL;
-       nesvnic = to_nesvnic(nesqp->ibqp.device);
-       if (!nesvnic)
-               return -EINVAL;
-       nesdev = nesvnic->nesdev;
-       if (!nesdev)
-               return -EINVAL;
-
-       if (!laddr->sin_port || !raddr->sin_port)
-               return -EINVAL;
-
-       nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = "
-                 "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id,
-                 ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr),
-                 ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr),
-                 ntohs(laddr->sin_port));
-
-       atomic_inc(&cm_connects);
-       nesqp->active_conn = 1;
-
-       /* cache the cm_id in the qp */
-       nesqp->cm_id = cm_id;
-       cm_id->provider_data = nesqp;
-       nesqp->private_data_len = conn_param->private_data_len;
-
-       nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);
-       nes_debug(NES_DBG_CM, "mpa private data len =%u\n",
-                 conn_param->private_data_len);
-
-       /* set up the connection params for the node */
-       cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr);
-       cm_info.loc_port = ntohs(laddr->sin_port);
-       cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr);
-       cm_info.rem_port = ntohs(raddr->sin_port);
-       cm_info.cm_id = cm_id;
-       cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
-
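-       /* non-loopback connect: add the local TCP port to the adapter's APBVT table */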
-       if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) {
-               nes_manage_apbvt(nesvnic, cm_info.loc_port,
-                                PCI_FUNC(nesdev->pcidev->devfn),
-                                NES_MANAGE_APBVT_ADD);
-               apbvt_set = 1;
-       }
-
-       cm_id->add_ref(cm_id);
-
-       /* create a connect CM node connection */
-       cm_node = g_cm_core->api->connect(g_cm_core, nesvnic,
-                                         conn_param->private_data_len, (void *)conn_param->private_data,
-                                         &cm_info);
-       if (!cm_node) {
-               if (apbvt_set)
-                       nes_manage_apbvt(nesvnic, cm_info.loc_port,
-                                        PCI_FUNC(nesdev->pcidev->devfn),
-                                        NES_MANAGE_APBVT_DEL);
-
-               nes_debug(NES_DBG_NLMSG, "Delete loc_port = %04X\n",
-                         cm_info.loc_port);
-               cm_id->rem_ref(cm_id);
-               return -ENOMEM;
-       }
-
-       record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
-       if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
-                               cm_node->ord_size == 0)
-               cm_node->ord_size = 1;
-
-       cm_node->apbvt_set = apbvt_set;
-       cm_node->tos = cm_id->tos;
-       nesqp->cm_node = cm_node;
-       cm_node->nesqp = nesqp;
-       nes_add_ref(&nesqp->ibqp);
-
-       return 0;
-}
-
-
-/**
- * nes_create_listen
- */
-int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
-{
-       struct nes_vnic *nesvnic;
-       struct nes_cm_listener *cm_node;
-       struct nes_cm_info cm_info;
-       int err;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-
-       nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n",
-                 cm_id, ntohs(laddr->sin_port));
-
-       if (cm_id->m_local_addr.ss_family != AF_INET)
-               return -ENOSYS;
-       nesvnic = to_nesvnic(cm_id->device);
-       if (!nesvnic)
-               return -EINVAL;
-
-       nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
-                       nesvnic, nesvnic->netdev, nesvnic->netdev->name);
-
-       nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n",
-                       nesvnic->local_ipaddr, laddr->sin_addr.s_addr);
-
-       /* setup listen params in our api call struct */
-       cm_info.loc_addr = ntohl(nesvnic->local_ipaddr);
-       cm_info.loc_port = ntohs(laddr->sin_port);
-       cm_info.backlog = backlog;
-       cm_info.cm_id = cm_id;
-
-       cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
-
-       cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);
-       if (!cm_node) {
-               printk(KERN_ERR "%s[%u] Error returned from listen API call\n",
-                      __func__, __LINE__);
-               return -ENOMEM;
-       }
-
-       cm_id->provider_data = cm_node;
-       cm_node->tos = cm_id->tos;
-
-       if (!cm_node->reused_node) {
-               err = nes_manage_apbvt(nesvnic, cm_node->loc_port,
-                                      PCI_FUNC(nesvnic->nesdev->pcidev->devfn),
-                                      NES_MANAGE_APBVT_ADD);
-               if (err) {
-                       printk(KERN_ERR "nes_manage_apbvt call returned %d.\n",
-                              err);
-                       g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
-                       return err;
-               }
-               atomic_inc(&cm_listens_created);
-       }
-
-       cm_id->add_ref(cm_id);
-       cm_id->provider_data = (void *)cm_node;
-
-
-       return 0;
-}
-
-
-/**
- * nes_destroy_listen
- */
-int nes_destroy_listen(struct iw_cm_id *cm_id)
-{
-       if (cm_id->provider_data)
-               g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data);
-       else
-               nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n");
-
-       cm_id->rem_ref(cm_id);
-
-       return 0;
-}
-
-
-/**
- * nes_cm_recv
- */
-int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice)
-{
-       int rc = 0;
-
-       cm_packets_received++;
-       if ((g_cm_core) && (g_cm_core->api))
-               rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb);
-       else
-               nes_debug(NES_DBG_CM, "Unable to process packet for CM,"
-                         " CM is not set up properly.\n");
-
-       return rc;
-}
-
-
-/**
- * nes_cm_start
- * Start and init a cm core module
- */
-int nes_cm_start(void)
-{
-       nes_debug(NES_DBG_CM, "\n");
-       /* create the primary CM core, pass this handle to subsequent core inits */
-       g_cm_core = nes_cm_alloc_core();
-       if (g_cm_core)
-               return 0;
-       else
-               return -ENOMEM;
-}
-
-
-/**
- * nes_cm_stop
- * stop and dealloc all cm core instances
- */
-int nes_cm_stop(void)
-{
-       g_cm_core->api->destroy_cm_core(g_cm_core);
-       return 0;
-}
-
-
-/**
- * cm_event_connected
- * handle a connected event, setup QPs and HW
- */
-static void cm_event_connected(struct nes_cm_event *event)
-{
-       struct nes_qp *nesqp;
-       struct nes_vnic *nesvnic;
-       struct nes_device *nesdev;
-       struct nes_cm_node *cm_node;
-       struct nes_adapter *nesadapter;
-       struct ib_qp_attr attr;
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       struct nes_v4_quad nes_quad;
-       u32 crc_value;
-       int ret;
-       struct sockaddr_in *laddr;
-       struct sockaddr_in *raddr;
-       struct sockaddr_in *cm_event_laddr;
-
-       /* get all our handles */
-       cm_node = event->cm_node;
-       cm_id = cm_node->cm_id;
-       nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id);
-       nesqp = (struct nes_qp *)cm_id->provider_data;
-       nesvnic = to_nesvnic(nesqp->ibqp.device);
-       nesdev = nesvnic->nesdev;
-       nesadapter = nesdev->nesadapter;
-       laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-       raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-       cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr;
-
-       if (nesqp->destroyed)
-               return;
-       atomic_inc(&cm_connecteds);
-       nes_debug(NES_DBG_CM, "QP%u attempting to connect to  0x%08X:0x%04X on"
-                 " local port 0x%04X. jiffies = %lu.\n",
-                 nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr),
-                 ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies);
-
-       nes_cm_init_tsa_conn(nesqp, cm_node);
-
-       /* set the QP tsa context */
-       nesqp->nesqp_context->tcpPorts[0] =
-                       cpu_to_le16(cm_node->loc_port);
-       nesqp->nesqp_context->tcpPorts[1] =
-                       cpu_to_le16(cm_node->rem_port);
-       nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->rem_addr);
-
-       nesqp->nesqp_context->misc2 |= cpu_to_le32(
-                       (u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-                       NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
-       nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
-                       nes_arp_table(nesdev,
-                       le32_to_cpu(nesqp->nesqp_context->ip0),
-                       NULL, NES_ARP_RESOLVE) << 16);
-       nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
-                       jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
-       nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
-       nesqp->nesqp_context->ird_ord_sizes |=
-                       cpu_to_le32((u32)1 <<
-                       NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT);
-       nesqp->nesqp_context->ird_ord_sizes |=
-                       cpu_to_le32((u32)cm_node->ord_size);
-
-       /* Adjust tail for not having a LSMM */
-       /*nesqp->hwqp.sq_tail = 1;*/
-
-       build_rdma0_msg(cm_node, &nesqp);
-
-       nes_write32(nesdev->regs + NES_WQE_ALLOC,
-                   (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-
-       memset(&nes_quad, 0, sizeof(nes_quad));
-
-       nes_quad.DstIpAdrIndex =
-               cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-       nes_quad.SrcIpadr = htonl(cm_node->rem_addr);
-       nes_quad.TcpPorts[0] = htons(cm_node->rem_port);
-       nes_quad.TcpPorts[1] = htons(cm_node->loc_port);
-
-       /* Produce hash key */
-       crc_value = get_crc_value(&nes_quad);
-       nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-       nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n",
-                 nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
-
-       nesqp->hte_index &= nesadapter->hte_index_mask;
-       nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-
-       nesqp->ietf_frame = &cm_node->mpa_frame;
-       nesqp->private_data_len = (u8)cm_node->mpa_frame_size;
-       cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
-
-       /* notify OF layer we successfully created the requested connection */
-       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-       cm_event.status = 0;
-       cm_event.provider_data = cm_id->provider_data;
-       cm_event_laddr->sin_family = AF_INET;
-       cm_event_laddr->sin_port = laddr->sin_port;
-       cm_event.remote_addr = cm_id->m_remote_addr;
-
-       cm_event.private_data = (void *)event->cm_node->mpa_frame_buf;
-       cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size;
-       cm_event.ird = cm_node->ird_size;
-       cm_event.ord = cm_node->ord_size;
-
-       cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-       if (ret)
-               printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-                      "ret=%d\n", __func__, __LINE__, ret);
-       attr.qp_state = IB_QPS_RTS;
-       nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-
-       nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = "
-                 "%lu\n", nesqp->hwqp.qp_id, jiffies);
-
-       return;
-}
-
-
-/**
- * cm_event_connect_error
- */
-static void cm_event_connect_error(struct nes_cm_event *event)
-{
-       struct nes_qp *nesqp;
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       /* struct nes_cm_info cm_info; */
-       int ret;
-
-       if (!event->cm_node)
-               return;
-
-       cm_id = event->cm_node->cm_id;
-       if (!cm_id)
-               return;
-
-       nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id);
-       nesqp = cm_id->provider_data;
-
-       if (!nesqp)
-               return;
-
-       /* notify OF layer about this connection error event */
-       /* cm_id->rem_ref(cm_id); */
-       nesqp->cm_id = NULL;
-       cm_id->provider_data = NULL;
-       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-       cm_event.status = -ECONNRESET;
-       cm_event.provider_data = cm_id->provider_data;
-       cm_event.local_addr = cm_id->m_local_addr;
-       cm_event.remote_addr = cm_id->m_remote_addr;
-       cm_event.private_data = NULL;
-       cm_event.private_data_len = 0;
-
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-       {
-               struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-                                                    &cm_event.local_addr;
-               struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-                                                    &cm_event.remote_addr;
-               nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n",
-                         cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr);
-       }
-#endif
-
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-       if (ret)
-               printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
-                      "ret=%d\n", __func__, __LINE__, ret);
-       cm_id->rem_ref(cm_id);
-
-       rem_ref_cm_node(event->cm_node->cm_core, event->cm_node);
-       return;
-}
-
-
-/**
- * cm_event_reset
- */
-static void cm_event_reset(struct nes_cm_event *event)
-{
-       struct nes_qp *nesqp;
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       /* struct nes_cm_info cm_info; */
-       int ret;
-
-       if (!event->cm_node)
-               return;
-
-       if (!event->cm_node->cm_id)
-               return;
-
-       cm_id = event->cm_node->cm_id;
-
-       nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id);
-       nesqp = cm_id->provider_data;
-       if (!nesqp)
-               return;
-
-       nesqp->cm_id = NULL;
-       /* cm_id->provider_data = NULL; */
-       cm_event.event = IW_CM_EVENT_DISCONNECT;
-       cm_event.status = -ECONNRESET;
-       cm_event.provider_data = cm_id->provider_data;
-       cm_event.local_addr = cm_id->m_local_addr;
-       cm_event.remote_addr = cm_id->m_remote_addr;
-       cm_event.private_data = NULL;
-       cm_event.private_data_len = 0;
-
-       cm_id->add_ref(cm_id);
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       atomic_inc(&cm_closes);
-       cm_event.event = IW_CM_EVENT_CLOSE;
-       cm_event.status = 0;
-       cm_event.provider_data = cm_id->provider_data;
-       cm_event.local_addr = cm_id->m_local_addr;
-       cm_event.remote_addr = cm_id->m_remote_addr;
-       cm_event.private_data = NULL;
-       cm_event.private_data_len = 0;
-       nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node);
-       ret = cm_id->event_handler(cm_id, &cm_event);
-
-       nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
-
-
-       /* notify OF layer about this connection error event */
-       cm_id->rem_ref(cm_id);
-
-       return;
-}
-
-
-/**
- * cm_event_mpa_req
- */
-static void cm_event_mpa_req(struct nes_cm_event *event)
-{
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       int ret;
-       struct nes_cm_node *cm_node;
-       struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-                                            &cm_event.local_addr;
-       struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-                                            &cm_event.remote_addr;
-
-       cm_node = event->cm_node;
-       if (!cm_node)
-               return;
-       cm_id = cm_node->cm_id;
-
-       atomic_inc(&cm_connect_reqs);
-       nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
-                 cm_node, cm_id, jiffies);
-
-       cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
-       cm_event.status = 0;
-       cm_event.provider_data = (void *)cm_node;
-
-       cm_event_laddr->sin_family = AF_INET;
-       cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
-       cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-
-       cm_event_raddr->sin_family = AF_INET;
-       cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
-       cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
-       cm_event.private_data = cm_node->mpa_frame_buf;
-       cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
-       if (cm_node->mpa_frame_rev == IETF_MPA_V1) {
-               cm_event.ird = NES_MAX_IRD;
-               cm_event.ord = NES_MAX_ORD;
-       } else {
-               cm_event.ird = cm_node->ird_size;
-               cm_event.ord = cm_node->ord_size;
-       }
-
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       if (ret)
-               printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
-                      __func__, __LINE__, ret);
-       return;
-}
-
-
-static void cm_event_mpa_reject(struct nes_cm_event *event)
-{
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       struct nes_cm_node *cm_node;
-       int ret;
-       struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
-                                            &cm_event.local_addr;
-       struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
-                                            &cm_event.remote_addr;
-
-       cm_node = event->cm_node;
-       if (!cm_node)
-               return;
-       cm_id = cm_node->cm_id;
-
-       atomic_inc(&cm_connect_reqs);
-       nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
-                 cm_node, cm_id, jiffies);
-
-       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-       cm_event.status = -ECONNREFUSED;
-       cm_event.provider_data = cm_id->provider_data;
-
-       cm_event_laddr->sin_family = AF_INET;
-       cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
-       cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
-
-       cm_event_raddr->sin_family = AF_INET;
-       cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
-       cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
-
-       cm_event.private_data = cm_node->mpa_frame_buf;
-       cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
-
-       nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, "
-                 "remote_addr=%08x\n",
-                 cm_event_laddr->sin_addr.s_addr,
-                 cm_event_raddr->sin_addr.s_addr);
-
-       ret = cm_id->event_handler(cm_id, &cm_event);
-       if (ret)
-               printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
-                      __func__, __LINE__, ret);
-
-       return;
-}
-
-
-static void nes_cm_event_handler(struct work_struct *);
-
-/**
- * nes_cm_post_event
- * post an event to the cm event handler
- */
-static int nes_cm_post_event(struct nes_cm_event *event)
-{
-       atomic_inc(&event->cm_node->cm_core->events_posted);
-       add_ref_cm_node(event->cm_node);
-       event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
-       INIT_WORK(&event->event_work, nes_cm_event_handler);
-       nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n",
-                 event->cm_node, event);
-
-       queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
-
-       nes_debug(NES_DBG_CM, "Exit\n");
-       return 0;
-}
-
-
-/**
- * nes_cm_event_handler
- * worker function to handle cm events
- * will free instance of nes_cm_event
- */
-static void nes_cm_event_handler(struct work_struct *work)
-{
-       struct nes_cm_event *event = container_of(work, struct nes_cm_event,
-                                                 event_work);
-       struct nes_cm_core *cm_core;
-
-       if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core))
-               return;
-
-       cm_core = event->cm_node->cm_core;
-       nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n",
-                 event, event->type, atomic_read(&cm_core->events_posted));
-
-       switch (event->type) {
-       case NES_CM_EVENT_MPA_REQ:
-               cm_event_mpa_req(event);
-               nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n",
-                         event->cm_node);
-               break;
-       case NES_CM_EVENT_RESET:
-               nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n",
-                         event->cm_node);
-               cm_event_reset(event);
-               break;
-       case NES_CM_EVENT_CONNECTED:
-               if ((!event->cm_node->cm_id) ||
-                   (event->cm_node->state != NES_CM_STATE_TSA))
-                       break;
-               cm_event_connected(event);
-               nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
-               break;
-       case NES_CM_EVENT_MPA_REJECT:
-               if ((!event->cm_node->cm_id) ||
-                   (event->cm_node->state == NES_CM_STATE_TSA))
-                       break;
-               cm_event_mpa_reject(event);
-               nes_debug(NES_DBG_CM, "CM Event: REJECT\n");
-               break;
-
-       case NES_CM_EVENT_ABORTED:
-               if ((!event->cm_node->cm_id) ||
-                   (event->cm_node->state == NES_CM_STATE_TSA))
-                       break;
-               cm_event_connect_error(event);
-               nes_debug(NES_DBG_CM, "CM Event: ABORTED\n");
-               break;
-       case NES_CM_EVENT_DROPPED_PKT:
-               nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n");
-               break;
-       default:
-               nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n");
-               break;
-       }
-
-       atomic_dec(&cm_core->events_posted);
-       event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
-       rem_ref_cm_node(cm_core, event->cm_node);
-       kfree(event);
-
-       return;
-}
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
deleted file mode 100644 (file)
index b9cc02b..0000000
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef NES_CM_H
-#define NES_CM_H
-
-#define QUEUE_EVENTS
-
-#define NES_MANAGE_APBVT_DEL 0
-#define NES_MANAGE_APBVT_ADD 1
-
-#define NES_MPA_REQUEST_ACCEPT  1
-#define NES_MPA_REQUEST_REJECT  2
-
-/* IETF MPA -- defines, enums, structs */
-#define IEFT_MPA_KEY_REQ  "MPA ID Req Frame"
-#define IEFT_MPA_KEY_REP  "MPA ID Rep Frame"
-#define IETF_MPA_KEY_SIZE 16
-#define IETF_MPA_VERSION  1
-#define IETF_MAX_PRIV_DATA_LEN 512
-#define IETF_MPA_FRAME_SIZE    20
-#define IETF_RTR_MSG_SIZE      4
-#define IETF_MPA_V2_FLAG       0x10
-
-/* IETF RTR MSG Fields               */
-#define IETF_PEER_TO_PEER       0x8000
-#define IETF_FLPDU_ZERO_LEN     0x4000
-#define IETF_RDMA0_WRITE        0x8000
-#define IETF_RDMA0_READ         0x4000
-#define IETF_NO_IRD_ORD         0x3FFF
-#define NES_MAX_IRD             0x40
-#define NES_MAX_ORD             0x7F
-
-enum ietf_mpa_flags {
-       IETF_MPA_FLAGS_MARKERS = 0x80,  /* receive Markers */
-       IETF_MPA_FLAGS_CRC     = 0x40,  /* receive CRC */
-       IETF_MPA_FLAGS_REJECT  = 0x20,  /* Reject */
-};
-
-struct ietf_mpa_v1 {
-       u8 key[IETF_MPA_KEY_SIZE];
-       u8 flags;
-       u8 rev;
-       __be16 priv_data_len;
-       u8 priv_data[0];
-};
-
-#define ietf_mpa_req_resp_frame ietf_mpa_frame
-
-struct ietf_rtr_msg {
-       __be16 ctrl_ird;
-       __be16 ctrl_ord;
-};
-
-struct ietf_mpa_v2 {
-       u8 key[IETF_MPA_KEY_SIZE];
-       u8 flags;
-       u8 rev;
-        __be16 priv_data_len;
-       struct ietf_rtr_msg rtr_msg;
-       u8 priv_data[0];
-};
-
-struct nes_v4_quad {
-       u32 rsvd0;
-       __le32 DstIpAdrIndex;   /* Only most significant 5 bits are valid */
-       __be32 SrcIpadr;
-       __be16 TcpPorts[2];             /* src is low, dest is high */
-};
-
-struct nes_cm_node;
-enum nes_timer_type {
-       NES_TIMER_TYPE_SEND,
-       NES_TIMER_TYPE_RECV,
-       NES_TIMER_NODE_CLEANUP,
-       NES_TIMER_TYPE_CLOSE,
-};
-
-#define NES_PASSIVE_STATE_INDICATED    0
-#define NES_DO_NOT_SEND_RESET_EVENT    1
-#define NES_SEND_RESET_EVENT           2
-
-#define MAX_NES_IFS 4
-
-#define SET_ACK 1
-#define SET_SYN 2
-#define SET_FIN 4
-#define SET_RST 8
-
-#define TCP_OPTIONS_PADDING    3
-
-struct option_base {
-       u8 optionnum;
-       u8 length;
-};
-
-enum option_numbers {
-       OPTION_NUMBER_END,
-       OPTION_NUMBER_NONE,
-       OPTION_NUMBER_MSS,
-       OPTION_NUMBER_WINDOW_SCALE,
-       OPTION_NUMBER_SACK_PERM,
-       OPTION_NUMBER_SACK,
-       OPTION_NUMBER_WRITE0 = 0xbc
-};
-
-struct option_mss {
-       u8 optionnum;
-       u8 length;
-       __be16 mss;
-};
-
-struct option_windowscale {
-       u8 optionnum;
-       u8 length;
-       u8 shiftcount;
-};
-
-union all_known_options {
-       char as_end;
-       struct option_base as_base;
-       struct option_mss as_mss;
-       struct option_windowscale as_windowscale;
-};
-
-struct nes_timer_entry {
-       struct list_head list;
-       unsigned long timetosend;       /* jiffies */
-       struct sk_buff *skb;
-       u32 type;
-       u32 retrycount;
-       u32 retranscount;
-       u32 context;
-       u32 seq_num;
-       u32 send_retrans;
-       int close_when_complete;
-       struct net_device *netdev;
-};
-
-#define NES_DEFAULT_RETRYS  64
-#define NES_DEFAULT_RETRANS 8
-#ifdef CONFIG_INFINIBAND_NES_DEBUG
-#define NES_RETRY_TIMEOUT   (1000*HZ/1000)
-#else
-#define NES_RETRY_TIMEOUT   (3000*HZ/1000)
-#endif
-#define NES_SHORT_TIME      (10)
-#define NES_LONG_TIME       (2000*HZ/1000)
-#define NES_MAX_TIMEOUT     ((unsigned long) (12*HZ))
-
-#define NES_CM_HASHTABLE_SIZE         1024
-#define NES_CM_TCP_TIMER_INTERVAL     3000
-#define NES_CM_DEFAULT_MTU            1540
-#define NES_CM_DEFAULT_FRAME_CNT      10
-#define NES_CM_THREAD_STACK_SIZE      256
-#define NES_CM_DEFAULT_RCV_WND        64240    // before we know that window scaling is allowed
-#define NES_CM_DEFAULT_RCV_WND_SCALED 256960  // after we know that window scaling is allowed
-#define NES_CM_DEFAULT_RCV_WND_SCALE  2
-#define NES_CM_DEFAULT_FREE_PKTS      0x000A
-#define NES_CM_FREE_PKT_LO_WATERMARK  2
-
-#define NES_CM_DEFAULT_MSS   536
-
-#define NES_CM_DEF_SEQ       0x159bf75f
-#define NES_CM_DEF_LOCAL_ID  0x3b47
-
-#define NES_CM_DEF_SEQ2      0x18ed5740
-#define NES_CM_DEF_LOCAL_ID2 0xb807
-#define        MAX_CM_BUFFER   (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN)
-
-typedef u32 nes_addr_t;
-
-#define nes_cm_tsa_context nes_qp_context
-
-struct nes_qp;
-
-/* cm node transition states */
-enum nes_cm_node_state {
-       NES_CM_STATE_UNKNOWN,
-       NES_CM_STATE_INITED,
-       NES_CM_STATE_LISTENING,
-       NES_CM_STATE_SYN_RCVD,
-       NES_CM_STATE_SYN_SENT,
-       NES_CM_STATE_ONE_SIDE_ESTABLISHED,
-       NES_CM_STATE_ESTABLISHED,
-       NES_CM_STATE_ACCEPTING,
-       NES_CM_STATE_MPAREQ_SENT,
-       NES_CM_STATE_MPAREQ_RCVD,
-       NES_CM_STATE_MPAREJ_RCVD,
-       NES_CM_STATE_TSA,
-       NES_CM_STATE_FIN_WAIT1,
-       NES_CM_STATE_FIN_WAIT2,
-       NES_CM_STATE_CLOSE_WAIT,
-       NES_CM_STATE_TIME_WAIT,
-       NES_CM_STATE_LAST_ACK,
-       NES_CM_STATE_CLOSING,
-       NES_CM_STATE_LISTENER_DESTROYED,
-       NES_CM_STATE_CLOSED
-};
-
-enum mpa_frame_version {
-       IETF_MPA_V1 = 1,
-       IETF_MPA_V2 = 2
-};
-
-enum mpa_frame_key {
-       MPA_KEY_REQUEST,
-       MPA_KEY_REPLY
-};
-
-enum send_rdma0 {
-       SEND_RDMA_READ_ZERO = 1,
-       SEND_RDMA_WRITE_ZERO = 2
-};
-
-enum nes_tcpip_pkt_type {
-       NES_PKT_TYPE_UNKNOWN,
-       NES_PKT_TYPE_SYN,
-       NES_PKT_TYPE_SYNACK,
-       NES_PKT_TYPE_ACK,
-       NES_PKT_TYPE_FIN,
-       NES_PKT_TYPE_RST
-};
-
-
-/* type of nes connection */
-enum nes_cm_conn_type {
-       NES_CM_IWARP_CONN_TYPE,
-};
-
-/* CM context params */
-struct nes_cm_tcp_context {
-       u8  client;
-
-       u32 loc_seq_num;
-       u32 loc_ack_num;
-       u32 rem_ack_num;
-       u32 rcv_nxt;
-
-       u32 loc_id;
-       u32 rem_id;
-
-       u32 snd_wnd;
-       u32 max_snd_wnd;
-
-       u32 rcv_wnd;
-       u32 mss;
-       u8  snd_wscale;
-       u8  rcv_wscale;
-
-       struct nes_cm_tsa_context tsa_cntxt;
-};
-
-
-enum nes_cm_listener_state {
-       NES_CM_LISTENER_PASSIVE_STATE = 1,
-       NES_CM_LISTENER_ACTIVE_STATE = 2,
-       NES_CM_LISTENER_EITHER_STATE = 3
-};
-
-struct nes_cm_listener {
-       struct list_head           list;
-       struct nes_cm_core         *cm_core;
-       u8                         loc_mac[ETH_ALEN];
-       nes_addr_t                 loc_addr;
-       u16                        loc_port;
-       struct iw_cm_id            *cm_id;
-       enum nes_cm_conn_type      conn_type;
-       atomic_t                   ref_count;
-       struct nes_vnic            *nesvnic;
-       atomic_t                   pend_accepts_cnt;
-       int                        backlog;
-       enum nes_cm_listener_state listener_state;
-       u32                        reused_node;
-       u8                         tos;
-};
-
-/* per connection node and node state information */
-struct nes_cm_node {
-       nes_addr_t                loc_addr, rem_addr;
-       u16                       loc_port, rem_port;
-
-       u8                        loc_mac[ETH_ALEN];
-       u8                        rem_mac[ETH_ALEN];
-
-       enum nes_cm_node_state    state;
-       struct nes_cm_tcp_context tcp_cntxt;
-       struct nes_cm_core        *cm_core;
-       struct sk_buff_head       resend_list;
-       atomic_t                  ref_count;
-       struct net_device         *netdev;
-
-       struct nes_cm_node        *loopbackpartner;
-
-       struct nes_timer_entry    *send_entry;
-       struct nes_timer_entry    *recv_entry;
-       spinlock_t                retrans_list_lock;
-       enum send_rdma0           send_rdma0_op;
-
-       union {
-               struct ietf_mpa_v1 mpa_frame;
-               struct ietf_mpa_v2 mpa_v2_frame;
-               u8                 mpa_frame_buf[MAX_CM_BUFFER];
-       };
-       enum mpa_frame_version    mpa_frame_rev;
-       u16                       ird_size;
-       u16                       ord_size;
-       u16                       mpav2_ird_ord;
-
-       u16                       mpa_frame_size;
-       struct iw_cm_id           *cm_id;
-       struct list_head          list;
-       bool                      accelerated;
-       struct nes_cm_listener    *listener;
-       enum nes_cm_conn_type     conn_type;
-       struct nes_vnic           *nesvnic;
-       int                       apbvt_set;
-       int                       accept_pend;
-       struct list_head        timer_entry;
-       struct list_head        reset_entry;
-       struct nes_qp           *nesqp;
-       atomic_t                passive_state;
-       u8                      tos;
-};
-
-/* structure for client or CM to fill when making CM api calls. */
-/*     - only need to set relevant data, based on op. */
-struct nes_cm_info {
-       union {
-               struct iw_cm_id   *cm_id;
-               struct net_device *netdev;
-       };
-
-       u16 loc_port;
-       u16 rem_port;
-       nes_addr_t loc_addr;
-       nes_addr_t rem_addr;
-       enum nes_cm_conn_type  conn_type;
-       int backlog;
-};
-
-/* CM event codes */
-enum  nes_cm_event_type {
-       NES_CM_EVENT_UNKNOWN,
-       NES_CM_EVENT_ESTABLISHED,
-       NES_CM_EVENT_MPA_REQ,
-       NES_CM_EVENT_MPA_CONNECT,
-       NES_CM_EVENT_MPA_ACCEPT,
-       NES_CM_EVENT_MPA_REJECT,
-       NES_CM_EVENT_MPA_ESTABLISHED,
-       NES_CM_EVENT_CONNECTED,
-       NES_CM_EVENT_CLOSED,
-       NES_CM_EVENT_RESET,
-       NES_CM_EVENT_DROPPED_PKT,
-       NES_CM_EVENT_CLOSE_IMMED,
-       NES_CM_EVENT_CLOSE_HARD,
-       NES_CM_EVENT_CLOSE_CLEAN,
-       NES_CM_EVENT_ABORTED,
-       NES_CM_EVENT_SEND_FIRST
-};
-
-/* event to post to CM event handler */
-struct nes_cm_event {
-       enum nes_cm_event_type type;
-
-       struct nes_cm_info cm_info;
-       struct work_struct event_work;
-       struct nes_cm_node *cm_node;
-};
-
-struct nes_cm_core {
-       enum nes_cm_node_state  state;
-
-       atomic_t                listen_node_cnt;
-       struct nes_cm_node      listen_list;
-       spinlock_t              listen_list_lock;
-
-       u32                     mtu;
-       u32                     free_tx_pkt_max;
-       u32                     rx_pkt_posted;
-       atomic_t                ht_node_cnt;
-       struct list_head        connected_nodes;
-       /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
-       spinlock_t              ht_lock;
-
-       struct timer_list       tcp_timer;
-
-       const struct nes_cm_ops *api;
-
-       int (*post_event)(struct nes_cm_event *event);
-       atomic_t                events_posted;
-       struct workqueue_struct *event_wq;
-       struct workqueue_struct *disconn_wq;
-
-       atomic_t                node_cnt;
-       u64                     aborted_connects;
-       u32                     options;
-
-       struct nes_cm_node      *current_listen_node;
-};
-
-
-#define NES_CM_SET_PKT_SIZE        (1 << 1)
-#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
-
-/* CM ops/API for client interface */
-struct nes_cm_ops {
-       int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
-       struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
-                       struct nes_cm_info *);
-       int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
-       struct nes_cm_node * (*connect)(struct nes_cm_core *,
-                       struct nes_vnic *, u16, void *,
-                       struct nes_cm_info *);
-       int (*close)(struct nes_cm_core *, struct nes_cm_node *);
-       int (*accept)(struct nes_cm_core *, struct nes_cm_node *);
-       int (*reject)(struct nes_cm_core *, struct nes_cm_node *);
-       int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
-                       struct sk_buff *);
-       int (*destroy_cm_core)(struct nes_cm_core *);
-       int (*get)(struct nes_cm_core *);
-       int (*set)(struct nes_cm_core *, u32, u32);
-};
-
-int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
-               enum nes_timer_type, int, int);
-
-int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
-int nes_reject(struct iw_cm_id *, const void *, u8);
-int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
-int nes_create_listen(struct iw_cm_id *, int);
-int nes_destroy_listen(struct iw_cm_id *);
-
-int nes_cm_recv(struct sk_buff *, struct net_device *);
-int nes_cm_start(void);
-int nes_cm_stop(void);
-int nes_add_ref_cm_node(struct nes_cm_node *cm_node);
-int nes_rem_ref_cm_node(struct nes_cm_node *cm_node);
-
-#endif                 /* NES_CM_H */
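For reference, the IETF RTR message fields removed above follow the usual MPA v2 layout: ctrl_ird carries the peer-to-peer and zero-length-FPDU flags plus a 14-bit IRD value, and ctrl_ord carries the RDMA0 read/write flags plus a 14-bit ORD value, each stored big endian in struct ietf_rtr_msg. A minimal sketch of packing ctrl_ird, assuming that layout; pack_ctrl_ird is a hypothetical helper, not code from the removed driver:

#include <stdint.h>

/* Constants as defined in the removed nes_cm.h above. */
#define IETF_PEER_TO_PEER   0x8000
#define IETF_FLPDU_ZERO_LEN 0x4000
#define IETF_NO_IRD_ORD     0x3FFF

/* Clamp the IRD to the 14-bit field, then OR in the control flags. */
static uint16_t pack_ctrl_ird(uint16_t ird, int peer_to_peer, int zero_len_fpdu)
{
	uint16_t v = (ird > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : ird;

	if (peer_to_peer)
		v |= IETF_PEER_TO_PEER;
	if (zero_len_fpdu)
		v |= IETF_FLPDU_ZERO_LEN;
	return v;	/* caller converts to big endian (ctrl_ird is __be16) */
}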
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
deleted file mode 100644 (file)
index a69eef1..0000000
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef NES_CONTEXT_H
-#define NES_CONTEXT_H
-
-struct nes_qp_context {
-       __le32   misc;
-       __le32   cqs;
-       __le32   sq_addr_low;
-       __le32   sq_addr_high;
-       __le32   rq_addr_low;
-       __le32   rq_addr_high;
-       __le32   misc2;
-       __le16   tcpPorts[2];
-       __le32   ip0;
-       __le32   ip1;
-       __le32   ip2;
-       __le32   ip3;
-       __le32   mss;
-       __le32   arp_index_vlan;
-       __le32   tcp_state_flow_label;
-       __le32   pd_index_wscale;
-       __le32   keepalive;
-       u32   ts_recent;
-       u32   ts_age;
-       __le32   snd_nxt;
-       __le32   snd_wnd;
-       __le32   rcv_nxt;
-       __le32   rcv_wnd;
-       __le32   snd_max;
-       __le32   snd_una;
-       u32   srtt;
-       __le32   rttvar;
-       __le32   ssthresh;
-       __le32   cwnd;
-       __le32   snd_wl1;
-       __le32   snd_wl2;
-       __le32   max_snd_wnd;
-       __le32   ts_val_delta;
-       u32   retransmit;
-       u32   probe_cnt;
-       u32   hte_index;
-       __le32   q2_addr_low;
-       __le32   q2_addr_high;
-       __le32   ird_index;
-       u32   Rsvd3;
-       __le32   ird_ord_sizes;
-       u32   mrkr_offset;
-       __le32   aeq_token_low;
-       __le32   aeq_token_high;
-};
-
-/* QP Context Misc Field */
-
-#define NES_QPCONTEXT_MISC_IWARP_VER_MASK    0x00000003
-#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT   0
-#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK     0x000000C0
-#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT    6
-#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK      0x00000300
-#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT     8
-#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK      0x00000c00
-#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT     10
-#define NES_QPCONTEXT_MISC_PCI_FCN_MASK      0x00007000
-#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT     12
-#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK     0x00070000
-#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT    16
-
-enum nes_qp_context_misc_bits {
-       NES_QPCONTEXT_MISC_RX_WQE_SIZE         = 0x00000004,
-       NES_QPCONTEXT_MISC_IPV4                = 0x00000008,
-       NES_QPCONTEXT_MISC_DO_NOT_FRAG         = 0x00000010,
-       NES_QPCONTEXT_MISC_INSERT_VLAN         = 0x00000020,
-       NES_QPCONTEXT_MISC_DROS                = 0x00008000,
-       NES_QPCONTEXT_MISC_WSCALE              = 0x00080000,
-       NES_QPCONTEXT_MISC_KEEPALIVE           = 0x00100000,
-       NES_QPCONTEXT_MISC_TIMESTAMP           = 0x00200000,
-       NES_QPCONTEXT_MISC_SACK                = 0x00400000,
-       NES_QPCONTEXT_MISC_RDMA_WRITE_EN       = 0x00800000,
-       NES_QPCONTEXT_MISC_RDMA_READ_EN        = 0x01000000,
-       NES_QPCONTEXT_MISC_WBIND_EN            = 0x10000000,
-       NES_QPCONTEXT_MISC_FAST_REGISTER_EN    = 0x20000000,
-       NES_QPCONTEXT_MISC_PRIV_EN             = 0x40000000,
-       NES_QPCONTEXT_MISC_NO_NAGLE            = 0x80000000
-};
-
-enum nes_qp_acc_wq_sizes {
-       HCONTEXT_TSA_WQ_SIZE_4 = 0,
-       HCONTEXT_TSA_WQ_SIZE_32 = 1,
-       HCONTEXT_TSA_WQ_SIZE_128 = 2,
-       HCONTEXT_TSA_WQ_SIZE_512 = 3
-};
-
-/* QP Context Misc2 Fields */
-#define NES_QPCONTEXT_MISC2_TTL_MASK            0x000000ff
-#define NES_QPCONTEXT_MISC2_TTL_SHIFT           0
-#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK      0x000000ff
-#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT     0
-#define NES_QPCONTEXT_MISC2_LIMIT_MASK          0x00000300
-#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT         8
-#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK      0x0000fc00
-#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT     10
-#define NES_QPCONTEXT_MISC2_SRC_IP_MASK         0x001f0000
-#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT        16
-#define NES_QPCONTEXT_MISC2_TOS_MASK            0xff000000
-#define NES_QPCONTEXT_MISC2_TOS_SHIFT           24
-#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK  0xff000000
-#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
-
-/* QP Context Tcp State/Flow Label Fields */
-#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK   0x000fffff
-#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT  0
-#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK    0xf0000000
-#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT   28
-
-enum nes_qp_tcp_state {
-       NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
-       NES_QPCONTEXT_TCPSTATE_EST = 5,
-       NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
-};
-
-/* QP Context PD Index/wscale Fields */
-#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK  0x0000000f
-#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
-#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK  0x00000f00
-#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
-#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK     0xffff0000
-#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT    16
-
-/* QP Context Keepalive Fields */
-#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK      0x0000ffff
-#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT     0
-#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK  0x00ff0000
-#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
-#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK       0xff000000
-#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT      24
-
-/* QP Context ORD/IRD Fields */
-#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK       0x0000007f
-#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT      0
-#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK       0x00030000
-#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT      16
-#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK    0x30000000
-#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT   28
-
-enum nes_ord_ird_bits {
-       NES_QPCONTEXT_ORDIRD_WRPDU                   = 0x02000000,
-       NES_QPCONTEXT_ORDIRD_LSMM_PRESENT            = 0x04000000,
-       NES_QPCONTEXT_ORDIRD_ALSMM                   = 0x08000000,
-       NES_QPCONTEXT_ORDIRD_AAH                     = 0x40000000,
-       NES_QPCONTEXT_ORDIRD_RNMC                    = 0x80000000
-};
-
-enum nes_iwarp_qp_state {
-       NES_QPCONTEXT_IWARP_STATE_NONEXIST  = 0,
-       NES_QPCONTEXT_IWARP_STATE_IDLE      = 1,
-       NES_QPCONTEXT_IWARP_STATE_RTS       = 2,
-       NES_QPCONTEXT_IWARP_STATE_CLOSING   = 3,
-       NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
-       NES_QPCONTEXT_IWARP_STATE_ERROR     = 6
-};
-
-
-#endif         /* NES_CONTEXT_H */
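The QP context words removed above are built by shifting each field into position and masking it, the result being stored little endian (the struct members are __le32). A minimal sketch for the MISC2 word, assuming host-order composition before the endian conversion; build_misc2 is a hypothetical helper, not code from the removed driver:

#include <stdint.h>

/* Mask/shift pairs as defined in the removed nes_context.h above. */
#define NES_QPCONTEXT_MISC2_TTL_MASK        0x000000ff
#define NES_QPCONTEXT_MISC2_TTL_SHIFT       0
#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK  0x0000fc00
#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10
#define NES_QPCONTEXT_MISC2_TOS_MASK        0xff000000
#define NES_QPCONTEXT_MISC2_TOS_SHIFT       24

static uint32_t build_misc2(uint8_t ttl, uint8_t nic_index, uint8_t tos)
{
	uint32_t misc2 = 0;

	misc2 |= ((uint32_t)ttl << NES_QPCONTEXT_MISC2_TTL_SHIFT) &
		 NES_QPCONTEXT_MISC2_TTL_MASK;
	misc2 |= ((uint32_t)nic_index << NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT) &
		 NES_QPCONTEXT_MISC2_NIC_INDEX_MASK;
	misc2 |= ((uint32_t)tos << NES_QPCONTEXT_MISC2_TOS_SHIFT) &
		 NES_QPCONTEXT_MISC2_TOS_MASK;
	return misc2;	/* written to the hardware context as __le32 */
}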
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
deleted file mode 100644 (file)
index 5517e39..0000000
+++ /dev/null
@@ -1,3887 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/if_vlan.h>
-#include <linux/slab.h>
-
-#include "nes.h"
-
-static int wide_ppm_offset;
-module_param(wide_ppm_offset, int, 0644);
-MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm");
-
-static u32 crit_err_count;
-u32 int_mod_timer_init;
-u32 int_mod_cq_depth_256;
-u32 int_mod_cq_depth_128;
-u32 int_mod_cq_depth_32;
-u32 int_mod_cq_depth_24;
-u32 int_mod_cq_depth_16;
-u32 int_mod_cq_depth_4;
-u32 int_mod_cq_depth_1;
-static const u8 nes_max_critical_error_count = 100;
-#include "nes_cm.h"
-
-static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq);
-static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count);
-static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
-                               struct nes_adapter *nesadapter, u8  OneG_Mode);
-static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
-static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq);
-static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq);
-static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
-                                  struct nes_hw_aeqe *aeqe);
-static void process_critical_error(struct nes_device *nesdev);
-static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number);
-static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode);
-static void nes_terminate_start_timer(struct nes_qp *nesqp);
-
-static const char *const nes_iwarp_state_str[] = {
-       "Non-Existent",
-       "Idle",
-       "RTS",
-       "Closing",
-       "RSVD1",
-       "Terminate",
-       "Error",
-       "RSVD2",
-};
-
-static const char *const nes_tcp_state_str[] = {
-       "Non-Existent",
-       "Closed",
-       "Listen",
-       "SYN Sent",
-       "SYN Rcvd",
-       "Established",
-       "Close Wait",
-       "FIN Wait 1",
-       "Closing",
-       "Last Ack",
-       "FIN Wait 2",
-       "Time Wait",
-       "RSVD1",
-       "RSVD2",
-       "RSVD3",
-       "RSVD4",
-};
-
-static inline void print_ip(struct nes_cm_node *cm_node)
-{
-       unsigned char *rem_addr;
-       if (cm_node) {
-               rem_addr = (unsigned char *)&cm_node->rem_addr;
-               printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr);
-       }
-}
-
-/**
- * nes_nic_init_timer_defaults
- */
-void  nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
-{
-       unsigned long flags;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-
-       spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-       shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
-       shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
-       if (jumbomode) {
-               shared_timer->threshold_low    = DEFAULT_JUMBO_NES_QL_LOW;
-               shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
-               shared_timer->threshold_high   = DEFAULT_JUMBO_NES_QL_HIGH;
-       } else {
-               shared_timer->threshold_low    = DEFAULT_NES_QL_LOW;
-               shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
-               shared_timer->threshold_high   = DEFAULT_NES_QL_HIGH;
-       }
-
-       /* todo use netdev->mtu to set thresholds */
-       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_nic_init_timer
- */
-static void  nes_nic_init_timer(struct nes_device *nesdev)
-{
-       unsigned long flags;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-
-       spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-       if (shared_timer->timer_in_use_old == 0) {
-               nesdev->deepcq_count = 0;
-               shared_timer->timer_direction_upward = 0;
-               shared_timer->timer_direction_downward = 0;
-               shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
-               shared_timer->timer_in_use_old = 0;
-
-       }
-       if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
-               shared_timer->timer_in_use_old = shared_timer->timer_in_use;
-               nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
-                       0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
-       }
-       /* todo use netdev->mtu to set thresholds */
-       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_nic_tune_timer
- */
-static void nes_nic_tune_timer(struct nes_device *nesdev)
-{
-       unsigned long flags;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-       u16 cq_count = nesdev->currcq_count;
-
-       spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-
-       if (shared_timer->cq_count_old <= cq_count)
-               shared_timer->cq_direction_downward = 0;
-       else
-               shared_timer->cq_direction_downward++;
-       shared_timer->cq_count_old = cq_count;
-       if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
-               if (cq_count <= shared_timer->threshold_low &&
-                   shared_timer->threshold_low > 4) {
-                       shared_timer->threshold_low = shared_timer->threshold_low/2;
-                       shared_timer->cq_direction_downward=0;
-                       nesdev->currcq_count = 0;
-                       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-                       return;
-               }
-       }
-
-       if (cq_count > 1) {
-               nesdev->deepcq_count += cq_count;
-               if (cq_count <= shared_timer->threshold_low) {       /* increase timer gently */
-                       shared_timer->timer_direction_upward++;
-                       shared_timer->timer_direction_downward = 0;
-               } else if (cq_count <= shared_timer->threshold_target) { /* balanced */
-                       shared_timer->timer_direction_upward = 0;
-                       shared_timer->timer_direction_downward = 0;
-               } else if (cq_count <= shared_timer->threshold_high) {  /* decrease timer gently */
-                       shared_timer->timer_direction_downward++;
-                       shared_timer->timer_direction_upward = 0;
-               } else if (cq_count <= (shared_timer->threshold_high) * 2) {
-                       shared_timer->timer_in_use -= 2;
-                       shared_timer->timer_direction_upward = 0;
-                       shared_timer->timer_direction_downward++;
-               } else {
-                       shared_timer->timer_in_use -= 4;
-                       shared_timer->timer_direction_upward = 0;
-                       shared_timer->timer_direction_downward++;
-               }
-
-               if (shared_timer->timer_direction_upward > 3 ) {  /* using history */
-                       shared_timer->timer_in_use += 3;
-                       shared_timer->timer_direction_upward = 0;
-                       shared_timer->timer_direction_downward = 0;
-               }
-               if (shared_timer->timer_direction_downward > 5) { /* using history */
-                       shared_timer->timer_in_use -= 4 ;
-                       shared_timer->timer_direction_downward = 0;
-                       shared_timer->timer_direction_upward = 0;
-               }
-       }
-
-       /* boundary checking */
-       if (shared_timer->timer_in_use > shared_timer->threshold_high)
-               shared_timer->timer_in_use = shared_timer->threshold_high;
-       else if (shared_timer->timer_in_use < shared_timer->threshold_low)
-               shared_timer->timer_in_use = shared_timer->threshold_low;
-
-       nesdev->currcq_count = 0;
-
-       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-}
-
-
-/**
- * nes_init_adapter - initialize adapter
- */
-struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
-       struct nes_adapter *nesadapter = NULL;
-       unsigned long num_pds;
-       u32 u32temp;
-       u32 port_count;
-       u16 max_rq_wrs;
-       u16 max_sq_wrs;
-       u32 max_mr;
-       u32 max_256pbl;
-       u32 max_4kpbl;
-       u32 max_qp;
-       u32 max_irrq;
-       u32 max_cq;
-       u32 hte_index_mask;
-       u32 adapter_size;
-       u32 arp_table_size;
-       u16 vendor_id;
-       u16 device_id;
-       u8  OneG_Mode;
-       u8  func_index;
-
-       /* search the list of existing adapters */
-       list_for_each_entry(nesadapter, &nes_adapter_list, list) {
-               nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X,"
-                               " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n",
-                               nesdev->pcidev->devfn,
-                               PCI_SLOT(nesadapter->devfn),
-                               nesadapter->bus_number,
-                               PCI_SLOT(nesdev->pcidev->devfn),
-                               nesdev->pcidev->bus->number );
-               if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) &&
-                               (nesadapter->bus_number == nesdev->pcidev->bus->number)) {
-                       nesadapter->ref_count++;
-                       return nesadapter;
-               }
-       }
-
-       /* no adapter found */
-       num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;
-       if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) {
-               nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n",
-                               hw_rev);
-               return NULL;
-       }
-
-       nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n",
-                       nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8),
-                       nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS),
-                       nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4),
-                       nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8));
-
-       nes_debug(NES_DBG_INIT, "Reset and init NE020\n");
-
-
-       if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0)
-               return NULL;
-
-       max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE);
-       nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp);
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE);
-       if (max_qp > ((u32)1 << (u32temp & 0x001f))) {
-               nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n",
-                               max_qp, u32temp);
-               max_qp = (u32)1 << (u32temp & 0x001f);
-       }
-
-       hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1;
-       nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n",
-                       max_qp, hte_index_mask);
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT);
-
-       max_irrq = 1 << (u32temp & 0x001f);
-
-       if (max_qp > max_irrq) {
-               max_qp = max_irrq;
-               nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n",
-                               max_qp);
-       }
-
-       /* there should be no reason to allocate more pds than qps */
-       if (num_pds > max_qp)
-               num_pds = max_qp;
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE);
-       max_mr = (u32)8192 << (u32temp & 0x7);
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE);
-       max_256pbl = (u32)1 << (u32temp & 0x0000001f);
-       max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f);
-       max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE);
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE);
-       arp_table_size = 1 << u32temp;
-
-       adapter_size = (sizeof(struct nes_adapter) +
-                       (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1));
-       adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
-       adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
-       adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
-       adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
-       adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
-       adapter_size += sizeof(struct nes_qp **) * max_qp;
-
-       /* allocate a new adapter struct */
-       nesadapter = kzalloc(adapter_size, GFP_KERNEL);
-       if (!nesadapter)
-               return NULL;
-
-       nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n",
-                       nesadapter, (u32)sizeof(struct nes_adapter), adapter_size);
-
-       if (nes_read_eeprom_values(nesdev, nesadapter)) {
-               printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
-               kfree(nesadapter);
-               return NULL;
-       }
-
-       nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) |
-                               (nesadapter->mac_addr_low >> 24);
-
-       pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn,
-                                PCI_DEVICE_ID, &device_id);
-       nesadapter->vendor_part_id = device_id;
-
-       if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter,
-                                                       OneG_Mode)) {
-               kfree(nesadapter);
-               return NULL;
-       }
-       nes_init_csr_ne020(nesdev, hw_rev, port_count);
-
-       memset(nesadapter->pft_mcast_map, 255,
-              sizeof nesadapter->pft_mcast_map);
-
-       /* populate the new nesadapter */
-       nesadapter->nesdev = nesdev;
-       nesadapter->devfn = nesdev->pcidev->devfn;
-       nesadapter->bus_number = nesdev->pcidev->bus->number;
-       nesadapter->ref_count = 1;
-       nesadapter->timer_int_req = 0xffff0000;
-       nesadapter->OneG_Mode = OneG_Mode;
-       nesadapter->doorbell_start = nesdev->doorbell_region;
-
-       /* nesadapter->tick_delta = clk_divisor; */
-       nesadapter->hw_rev = hw_rev;
-       nesadapter->port_count = port_count;
-
-       nesadapter->max_qp = max_qp;
-       nesadapter->hte_index_mask = hte_index_mask;
-       nesadapter->max_irrq = max_irrq;
-       nesadapter->max_mr = max_mr;
-       nesadapter->max_256pbl = max_256pbl - 1;
-       nesadapter->max_4kpbl = max_4kpbl - 1;
-       nesadapter->max_cq = max_cq;
-       nesadapter->free_256pbl = max_256pbl - 1;
-       nesadapter->free_4kpbl = max_4kpbl - 1;
-       nesadapter->max_pd = num_pds;
-       nesadapter->arp_table_size = arp_table_size;
-
-       nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT;
-       if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) {
-               nesadapter->et_use_adaptive_rx_coalesce = 0;
-               nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
-               nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
-       } else {
-               nesadapter->et_use_adaptive_rx_coalesce = 1;
-               nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
-               nesadapter->et_rx_coalesce_usecs_irq = 0;
-               printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__);
-       }
-       /* Setup and enable the periodic timer */
-       if (nesadapter->et_rx_coalesce_usecs_irq)
-               nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 |
-                               ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8)));
-       else
-               nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000);
-
-       nesadapter->base_pd = 1;
-
-       nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
-                                      IB_DEVICE_MEM_WINDOW |
-                                      IB_DEVICE_MEM_MGT_EXTENSIONS;
-
-       nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
-                       [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
-       nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)];
-       nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)];
-       nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)];
-       nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)];
-       nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
-
-
-       /* mark the usual suspect QPs, MR and CQs as in use */
-       for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) {
-               set_bit(u32temp, nesadapter->allocated_qps);
-               set_bit(u32temp, nesadapter->allocated_cqs);
-       }
-       set_bit(0, nesadapter->allocated_mrs);
-
-       for (u32temp = 0; u32temp < 20; u32temp++)
-               set_bit(u32temp, nesadapter->allocated_pds);
-       u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES);
-
-       max_rq_wrs = ((u32temp >> 8) & 3);
-       switch (max_rq_wrs) {
-               case 0:
-                       max_rq_wrs = 4;
-                       break;
-               case 1:
-                       max_rq_wrs = 16;
-                       break;
-               case 2:
-                       max_rq_wrs = 32;
-                       break;
-               case 3:
-                       max_rq_wrs = 512;
-                       break;
-       }
-
-       max_sq_wrs = (u32temp & 3);
-       switch (max_sq_wrs) {
-               case 0:
-                       max_sq_wrs = 4;
-                       break;
-               case 1:
-                       max_sq_wrs = 16;
-                       break;
-               case 2:
-                       max_sq_wrs = 32;
-                       break;
-               case 3:
-                       max_sq_wrs = 512;
-                       break;
-       }
-       nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs);
-       nesadapter->max_irrq_wr = (u32temp >> 16) & 3;
-
-       nesadapter->max_sge = 4;
-       nesadapter->max_cqe = 32766;
-
-       if (nes_read_eeprom_values(nesdev, nesadapter)) {
-               printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
-               kfree(nesadapter);
-               return NULL;
-       }
-
-       u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG);
-       nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG,
-                       (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff));
-
-       /* setup port configuration */
-       if (nesadapter->port_count == 1) {
-               nesadapter->log_port = 0x00000000;
-               if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT)
-                       nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002);
-               else
-                       nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
-       } else {
-               if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
-                       nesadapter->log_port = 0x000000D8;
-               } else {
-                       if (nesadapter->port_count == 2)
-                               nesadapter->log_port = 0x00000044;
-                       else
-                               nesadapter->log_port = 0x000000e4;
-               }
-               nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
-       }
-
-       nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT,
-                                               nesadapter->log_port);
-       nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n",
-                       nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT));
-
-       spin_lock_init(&nesadapter->resource_lock);
-       spin_lock_init(&nesadapter->phy_lock);
-       spin_lock_init(&nesadapter->pbl_lock);
-       spin_lock_init(&nesadapter->periodic_timer_lock);
-
-       INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]);
-       INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]);
-       INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]);
-       INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]);
-
-       if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
-               u32 pcs_control_status0, pcs_control_status1;
-               u32 reset_value;
-               u32 i = 0;
-               u32 int_cnt = 0;
-               u32 ext_cnt = 0;
-               unsigned long flags;
-               u32 j = 0;
-
-               pcs_control_status0 = nes_read_indexed(nesdev,
-                       NES_IDX_PHY_PCS_CONTROL_STATUS0);
-               pcs_control_status1 = nes_read_indexed(nesdev,
-                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-
-               for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
-                       pcs_control_status0 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0);
-                       pcs_control_status1 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-                       if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
-                           || (0x0F000100 == (pcs_control_status1 & 0x0F000100)))
-                               int_cnt++;
-                       usleep_range(1000, 2000);
-               }
-               if (int_cnt > 1) {
-                       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-                       mh_detected++;
-                       reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-                       reset_value |= 0x0000003d;
-                       nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-                       while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-                               & 0x00000040) != 0x00000040) && (j++ < 5000));
-                       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-                       pcs_control_status0 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0);
-                       pcs_control_status1 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-
-                       for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
-                               pcs_control_status0 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0);
-                               pcs_control_status1 = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-                               if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
-                                       || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) {
-                                       if (++ext_cnt > int_cnt) {
-                                               spin_lock_irqsave(&nesadapter->phy_lock, flags);
-                                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1,
-                                                               0x0000F088);
-                                               mh_detected++;
-                                               reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-                                               reset_value |= 0x0000003d;
-                                               nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-                                               while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-                                                       & 0x00000040) != 0x00000040) && (j++ < 5000));
-                                               spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-                                               break;
-                                       }
-                               }
-                               usleep_range(1000, 2000);
-                       }
-               }
-       }
-
-       if (nesadapter->hw_rev == NE020_REV) {
-               timer_setup(&nesadapter->mh_timer, nes_mh_fix, 0);
-               nesadapter->mh_timer.expires = jiffies + (HZ/5);  /* 200 ms */
-               add_timer(&nesadapter->mh_timer);
-       } else {
-               nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000);
-       }
-
-       timer_setup(&nesadapter->lc_timer, nes_clc, 0);
-       nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
-       add_timer(&nesadapter->lc_timer);
-
-       list_add_tail(&nesadapter->list, &nes_adapter_list);
-
-       for (func_index = 0; func_index < 8; func_index++) {
-               pci_bus_read_config_word(nesdev->pcidev->bus,
-                                       PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn),
-                                       func_index), 0, &vendor_id);
-               if (vendor_id == 0xffff)
-                       break;
-       }
-       nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__,
-               func_index, pci_name(nesdev->pcidev));
-       nesadapter->adapter_fcn_count = func_index;
-
-       return nesadapter;
-}
-
-
-/**
- * nes_reset_adapter_ne020
- */
-static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode)
-{
-       u32 port_count;
-       u32 u32temp;
-       u32 i;
-
-       u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-       port_count = ((u32temp & 0x00000300) >> 8) + 1;
-       /* TODO: assuming that both SERDES are set the same for now */
-       *OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1;
-       nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n",
-                       u32temp, port_count);
-       if (*OneG_Mode)
-               nes_debug(NES_DBG_INIT, "Running in 1G mode.\n");
-       u32temp &= 0xff00ffc0;
-       switch (port_count) {
-               case 1:
-                       u32temp |= 0x00ee0000;
-                       break;
-               case 2:
-                       u32temp |= 0x00cc0000;
-                       break;
-               case 4:
-                       u32temp |= 0x00000000;
-                       break;
-               default:
-                       return 0;
-                       break;
-       }
-
-       /* check and do full reset if needed */
-       if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) {
-               nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd);
-               nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
-
-               i = 0;
-               while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
-                       mdelay(1);
-               if (i > 10000) {
-                       nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n");
-                       return 0;
-               }
-
-               i = 0;
-               while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000)
-                       mdelay(1);
-               if (i > 10000) {
-                       printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n",
-                              nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS));
-                       return 0;
-               }
-       }
-
-       /* port reset */
-       switch (port_count) {
-               case 1:
-                       u32temp |= 0x00ee0010;
-                       break;
-               case 2:
-                       u32temp |= 0x00cc0030;
-                       break;
-               case 4:
-                       u32temp |= 0x00000030;
-                       break;
-       }
-
-       nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd);
-       nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
-
-       i = 0;
-       while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
-               mdelay(1);
-       if (i > 10000) {
-               nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n");
-               return 0;
-       }
-
-       /* serdes 0 */
-       i = 0;
-       while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
-                       & 0x0000000f)) != 0x0000000f) && i++ < 5000)
-               mdelay(1);
-       if (i > 5000) {
-               nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp);
-               return 0;
-       }
-
-       /* serdes 1 */
-       if (port_count > 1) {
-               i = 0;
-               while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
-                               & 0x0000000f)) != 0x0000000f) && i++ < 5000)
-                       mdelay(1);
-               if (i > 5000) {
-                       nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp);
-                       return 0;
-               }
-       }
-
-       return port_count;
-}
-
-
-/**
- * nes_init_serdes
- */
-static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
-                               struct nes_adapter *nesadapter, u8  OneG_Mode)
-{
-       int i;
-       u32 u32temp;
-       u32 sds;
-
-       if (hw_rev != NE020_REV) {
-               /* init serdes 0 */
-               switch (nesadapter->phy_type[0]) {
-               case NES_PHY_TYPE_CX4:
-                       if (wide_ppm_offset)
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
-                       else
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-                       break;
-               case NES_PHY_TYPE_KR:
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
-                       break;
-               case NES_PHY_TYPE_PUMA_1G:
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-                       sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
-                       sds |= 0x00000100;
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
-                       break;
-               default:
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-                       break;
-               }
-
-               if (!OneG_Mode)
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
-
-               if (port_count < 2)
-                       return 0;
-
-               /* init serdes 1 */
-               if (!(OneG_Mode && (nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G)))
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF);
-
-               switch (nesadapter->phy_type[1]) {
-               case NES_PHY_TYPE_ARGUS:
-               case NES_PHY_TYPE_SFP_D:
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
-                       break;
-               case NES_PHY_TYPE_CX4:
-                       if (wide_ppm_offset)
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
-                       break;
-               case NES_PHY_TYPE_KR:
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
-                       break;
-               case NES_PHY_TYPE_PUMA_1G:
-                       sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-                       sds |= 0x000000100;
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
-               }
-               if (!OneG_Mode) {
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000);
-                       sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-                       sds &= 0xFFFFFFBF;
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
-               }
-       } else {
-               /* init serdes 0 */
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
-               i = 0;
-               while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
-                               & 0x0000000f)) != 0x0000000f) && i++ < 5000)
-                       mdelay(1);
-               if (i > 5000) {
-                       nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp);
-                       return 1;
-               }
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
-               if (OneG_Mode)
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
-               else
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
-
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
-               if (port_count > 1) {
-                       /* init serdes 1 */
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048);
-                       i = 0;
-                       while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
-                               & 0x0000000f)) != 0x0000000f) && (i++ < 5000))
-                               mdelay(1);
-                       if (i > 5000) {
-                               printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp);
-                               /* return 1; */
-                       }
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222);
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff);
-               }
-       }
-       return 0;
-}
-
-
-/**
- * nes_init_csr_ne020 - initialize registers for ne020 hardware
- */
-static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count)
-{
-       u32 u32temp;
-
-       nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count);
-
-       nes_write_indexed(nesdev, 0x000001E4, 0x00000007);
-       /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */
-       nes_write_indexed(nesdev, 0x000001E8, 0x00020874);
-       nes_write_indexed(nesdev, 0x000001D8, 0x00048002);
-       /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */
-       nes_write_indexed(nesdev, 0x000001FC, 0x00050005);
-       nes_write_indexed(nesdev, 0x00000600, 0x55555555);
-       nes_write_indexed(nesdev, 0x00000604, 0x55555555);
-
-       /* TODO: move these MAC register settings to NIC bringup */
-       nes_write_indexed(nesdev, 0x00002000, 0x00000001);
-       nes_write_indexed(nesdev, 0x00002004, 0x00000001);
-       nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF);
-       nes_write_indexed(nesdev, 0x0000200C, 0x00000001);
-       nes_write_indexed(nesdev, 0x00002010, 0x000003c1);
-       nes_write_indexed(nesdev, 0x0000201C, 0x75345678);
-       if (port_count > 1) {
-               nes_write_indexed(nesdev, 0x00002200, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002204, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF);
-               nes_write_indexed(nesdev, 0x0000220C, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002210, 0x000003c1);
-               nes_write_indexed(nesdev, 0x0000221C, 0x75345678);
-               nes_write_indexed(nesdev, 0x00000908, 0x20000001);
-       }
-       if (port_count > 2) {
-               nes_write_indexed(nesdev, 0x00002400, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002404, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF);
-               nes_write_indexed(nesdev, 0x0000240C, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002410, 0x000003c1);
-               nes_write_indexed(nesdev, 0x0000241C, 0x75345678);
-               nes_write_indexed(nesdev, 0x00000910, 0x20000001);
-
-               nes_write_indexed(nesdev, 0x00002600, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002604, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF);
-               nes_write_indexed(nesdev, 0x0000260C, 0x00000001);
-               nes_write_indexed(nesdev, 0x00002610, 0x000003c1);
-               nes_write_indexed(nesdev, 0x0000261C, 0x75345678);
-               nes_write_indexed(nesdev, 0x00000918, 0x20000001);
-       }
-
-       nes_write_indexed(nesdev, 0x00005000, 0x00018000);
-       /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */
-       nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) |
-                                                        0x00000001);
-       nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F);
-       nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F);
-       nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F);
-       nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F);
-       nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF);
-
-       /* TODO: move this to code, get from EEPROM */
-       nes_write_indexed(nesdev, 0x00000900, 0x20000001);
-       nes_write_indexed(nesdev, 0x000060C0, 0x0000028e);
-       nes_write_indexed(nesdev, 0x000060C8, 0x00000020);
-
-       nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0);
-       /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */
-
-       if (hw_rev != NE020_REV) {
-               u32temp = nes_read_indexed(nesdev, 0x000008e8);
-               u32temp |= 0x80000000;
-               nes_write_indexed(nesdev, 0x000008e8, u32temp);
-               u32temp = nes_read_indexed(nesdev, 0x000021f8);
-               u32temp &= 0x7fffffff;
-               u32temp |= 0x7fff0010;
-               nes_write_indexed(nesdev, 0x000021f8, u32temp);
-               if (port_count > 1) {
-                       u32temp = nes_read_indexed(nesdev, 0x000023f8);
-                       u32temp &= 0x7fffffff;
-                       u32temp |= 0x7fff0010;
-                       nes_write_indexed(nesdev, 0x000023f8, u32temp);
-               }
-       }
-}
-
-
-/**
- * nes_destroy_adapter - destroy the adapter structure
- */
-void nes_destroy_adapter(struct nes_adapter *nesadapter)
-{
-       struct nes_adapter *tmp_adapter;
-
-       list_for_each_entry(tmp_adapter, &nes_adapter_list, list) {
-               nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n",
-                               tmp_adapter);
-       }
-
-       nesadapter->ref_count--;
-       if (!nesadapter->ref_count) {
-               if (nesadapter->hw_rev == NE020_REV) {
-                       del_timer(&nesadapter->mh_timer);
-               }
-               del_timer(&nesadapter->lc_timer);
-
-               list_del(&nesadapter->list);
-               kfree(nesadapter);
-       }
-}
-
-
-/**
- * nes_init_cqp - allocate the CQP rings and create the CQP, CCQ, CEQs and AEQ
- */
-int nes_init_cqp(struct nes_device *nesdev)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_cqp_qp_context *cqp_qp_context;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_hw_ceq *ceq;
-       struct nes_hw_ceq *nic_ceq;
-       struct nes_hw_aeq *aeq;
-       void *vmem;
-       dma_addr_t pmem;
-       u32 count=0;
-       u32 cqp_head;
-       u64 u64temp;
-       u32 u32temp;
-
-       /* allocate CQP memory */
-       /* Need to add max_cq to the aeq size once cq overflow checking is added back */
-       /* SQ is 512 byte aligned, others are 256 byte aligned */
-       nesdev->cqp_mem_size = 512 +
-                       (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) +
-                       (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) +
-                       max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) +
-                       max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) +
-                       (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) +
-                       sizeof(struct nes_hw_cqp_qp_context);
-
-       nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev,
-                                                 nesdev->cqp_mem_size,
-                                                 &nesdev->cqp_pbase);
-       if (!nesdev->cqp_vbase) {
-               nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n");
-               return -ENOMEM;
-       }
-
-       /* Allocate twice as many CQP requests as the SQ size */
-       nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) *
-                       2 * NES_CQP_SQ_SIZE, GFP_KERNEL);
-       if (!nesdev->nes_cqp_requests) {
-               pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp_vbase,
-                               nesdev->cqp_pbase);
-               return -ENOMEM;
-       }
-
-       nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n",
-                       nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size);
-
-       spin_lock_init(&nesdev->cqp.lock);
-       init_waitqueue_head(&nesdev->cqp.waitq);
-
-       /* Setup Various Structures */
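-       /* round the base up to the 512-byte alignment required by the CQP SQ */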
-       vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) &
-                       ~(unsigned long)(512 - 1));
-       pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) &
-                       ~(unsigned long long)(512 - 1));
-
-       nesdev->cqp.sq_vbase = vmem;
-       nesdev->cqp.sq_pbase = pmem;
-       nesdev->cqp.sq_size = NES_CQP_SQ_SIZE;
-       nesdev->cqp.sq_head = 0;
-       nesdev->cqp.sq_tail = 0;
-       nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn);
-
-       vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
-       pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
-
-       nesdev->ccq.cq_vbase = vmem;
-       nesdev->ccq.cq_pbase = pmem;
-       nesdev->ccq.cq_size = NES_CCQ_SIZE;
-       nesdev->ccq.cq_head = 0;
-       nesdev->ccq.ce_handler = nes_cqp_ce_handler;
-       nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn);
-
-       vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
-       pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
-
-       nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn);
-       ceq = &nesadapter->ceq[nesdev->ceq_index];
-       ceq->ceq_vbase = vmem;
-       ceq->ceq_pbase = pmem;
-       ceq->ceq_size = NES_CCEQ_SIZE;
-       ceq->ceq_head = 0;
-
-       vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
-       pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
-
-       nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8;
-       nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index];
-       nic_ceq->ceq_vbase = vmem;
-       nic_ceq->ceq_pbase = pmem;
-       nic_ceq->ceq_size = NES_NIC_CEQ_SIZE;
-       nic_ceq->ceq_head = 0;
-
-       vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
-       pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
-
-       aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)];
-       aeq->aeq_vbase = vmem;
-       aeq->aeq_pbase = pmem;
-       aeq->aeq_size = nesadapter->max_qp;
-       aeq->aeq_head = 0;
-
-       /* Setup QP Context */
-       vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
-       pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
-
-       cqp_qp_context = vmem;
-       cqp_qp_context->context_words[0] =
-                       cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10));
-       cqp_qp_context->context_words[1] = 0;
-       cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase);
-       cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32);
-
-
-       /* Write the address to Create CQP */
-       if ((sizeof(dma_addr_t) > 4)) {
-               nes_write_indexed(nesdev,
-                               NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
-                               ((u64)pmem) >> 32);
-       } else {
-               nes_write_indexed(nesdev,
-                               NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0);
-       }
-       nes_write_indexed(nesdev,
-                       NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
-                       (u32)pmem);
-
-       INIT_LIST_HEAD(&nesdev->cqp_avail_reqs);
-       INIT_LIST_HEAD(&nesdev->cqp_pending_reqs);
-
-       for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) {
-               init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq);
-               list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs);
-       }
-
-       /* Write Create CCQ WQE */
-       cqp_head = nesdev->cqp.sq_head++;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                       (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-                       NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-                           (nesdev->ccq.cq_number |
-                            ((u32)nesdev->ceq_index << 16)));
-       u64temp = (u64)nesdev->ccq.cq_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
-       u64temp = (unsigned long)&nesdev->ccq;
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
-                       cpu_to_le32((u32)(u64temp >> 1));
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-                       cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-
-       /* Write Create CEQ WQE */
-       cqp_head = nesdev->cqp.sq_head++;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                           (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size);
-       u64temp = (u64)ceq->ceq_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-       /* Write Create AEQ WQE */
-       cqp_head = nesdev->cqp.sq_head++;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                       (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size);
-       u64temp = (u64)aeq->aeq_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-       /* Write Create NIC CEQ WQE */
-       cqp_head = nesdev->cqp.sq_head++;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                       (NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size);
-       u64temp = (u64)nic_ceq->ceq_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-
-       /* Poll until CCQP done */
-       count = 0;
-       do {
-               if (count++ > 1000) {
-                       printk(KERN_ERR PFX "Error creating CQP\n");
-                       pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
-                                       nesdev->cqp_vbase, nesdev->cqp_pbase);
-                       return -1;
-               }
-               udelay(10);
-       } while (!(nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8)));
-
-       nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-       u32temp = 0x04800000;
-       nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id);
-
-       /* wait for the CCQ, CEQ, and AEQ to get created */
-       count = 0;
-       do {
-               if (count++ > 1000) {
-                       printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n");
-                       pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
-                                       nesdev->cqp_vbase, nesdev->cqp_pbase);
-                       return -1;
-               }
-               udelay(10);
-       } while (((nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8)));
-
-       /* dump the QP status value */
-       nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-       nesdev->cqp.sq_tail++;
-
-       return 0;
-}
-
-
-/**
- * nes_destroy_cqp - destroy the AEQ, CEQs, CCQ and CQP and free the CQP memory
- */
-int nes_destroy_cqp(struct nes_device *nesdev)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       u32 count = 0;
-       u32 cqp_head;
-       unsigned long flags;
-
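-       /* wait up to ~10 ms for the CQP SQ to drain (head == tail) before tearing it down */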
-       do {
-               if (count++ > 1000)
-                       break;
-               udelay(10);
-       } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail));
-
-       /* Reset CCQ */
-       nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET |
-                       nesdev->ccq.cq_number);
-
-       /* Disable device interrupts */
-       nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
-
-       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-       /* Destroy the AEQ */
-       cqp_head = nesdev->cqp.sq_head++;
-       nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ |
-                       ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8));
-       cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
-
-       /* Destroy the NIC CEQ */
-       cqp_head = nesdev->cqp.sq_head++;
-       nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
-                       ((u32)nesdev->nic_ceq_index << 8));
-
-       /* Destroy the CEQ */
-       cqp_head = nesdev->cqp.sq_head++;
-       nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
-                       (nesdev->ceq_index << 8));
-
-       /* Destroy the CCQ */
-       cqp_head = nesdev->cqp.sq_head++;
-       nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ);
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number |
-                       ((u32)nesdev->ceq_index << 16));
-
-       /* Destroy CQP */
-       cqp_head = nesdev->cqp.sq_head++;
-       nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP |
-                       NES_CQP_QP_TYPE_CQP);
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id);
-
-       barrier();
-       /* Ring doorbell (5 WQEs) */
-       nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id);
-
-       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-       /* wait for the CCQ, CEQ, and AEQ to get destroyed */
-       count = 0;
-       do {
-               if (count++ > 1000) {
-                       printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n",
-                                       PCI_FUNC(nesdev->pcidev->devfn));
-                       break;
-               }
-               udelay(10);
-       } while (((nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0));
-
-       /* dump the QP status value */
-       nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n",
-                       PCI_FUNC(nesdev->pcidev->devfn),
-                       nes_read_indexed(nesdev,
-                       NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
-
-       kfree(nesdev->nes_cqp_requests);
-
-       /* Free the control structures */
-       pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
-                       nesdev->cqp.sq_pbase);
-
-       return 0;
-}
-
-
-/**
- * nes_init_1g_phy - reset and configure the 1G PHY (loopback off, flow control on, full duplex)
- */
-static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
-{
-       u32 counter = 0;
-       u16 phy_data;
-       int ret = 0;
-
-       nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
-       nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
-
-       /* Reset the PHY */
-       nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
-       udelay(100);
-       counter = 0;
-       do {
-               nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-               if (counter++ > 100) {
-                       ret = -1;
-                       break;
-               }
-       } while (phy_data & 0x8000);
-
-       /* Setting no phy loopback */
-       phy_data &= 0xbfff;
-       phy_data |= 0x1140;
-       nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
-       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-       nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
-       nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
-
-       /* Setting the interrupt mask */
-       nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-       nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
-       nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-
-       /* turning on flow control */
-       nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-       nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
-       nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-
-       /* Clear Half duplex */
-       nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-       nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
-       nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-
-       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-       nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
-
-       return ret;
-}
-
-
-/**
- * nes_init_2025_phy - configure and bring up the 10G PHY (ARGUS, SFP_D or KR) and its serdes
- */
-static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
-{
-       u32 temp_phy_data = 0;
-       u32 temp_phy_data2 = 0;
-       u32 counter = 0;
-       u32 sds;
-       u32 mac_index = nesdev->mac_index;
-       int ret = 0;
-       unsigned int first_attempt = 1;
-
-       /* Check firmware heartbeat */
-       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-       udelay(1500);
-       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-       temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
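-       /* the heartbeat counter advanced, so the PHY firmware is already up */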
-       if (temp_phy_data != temp_phy_data2) {
-               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-               if ((temp_phy_data & 0xff) > 0x20)
-                       return 0;
-               printk(PFX "Reinitialize external PHY\n");
-       }
-
-       /* no heartbeat, configure the PHY */
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-
-       switch (phy_type) {
-       case NES_PHY_TYPE_ARGUS:
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-               /* setup LEDs */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
-               break;
-
-       case NES_PHY_TYPE_SFP_D:
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-               /* setup LEDs */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
-               break;
-
-       case NES_PHY_TYPE_KR:
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
-
-               /* setup LEDs */
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
-
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
-               break;
-       }
-
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
-
-       /* Bring PHY out of reset */
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
-
-       /* Check for heartbeat */
-       counter = 0;
-       mdelay(690);
-       nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-       do {
-               if (counter++ > 150) {
-                       printk(PFX "No PHY heartbeat\n");
-                       break;
-               }
-               mdelay(1);
-               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-               temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-       } while (temp_phy_data2 == temp_phy_data);
-
-       /* wait for tracking */
-       counter = 0;
-       do {
-               nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-               if (counter++ > 300) {
-                       if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
-                               first_attempt = 0;
-                               counter = 0;
-                               /* reset AMCC PHY and try again */
-                               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
-                               nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
-                               continue;
-                       } else {
-                               ret = 1;
-                               break;
-                       }
-               }
-               mdelay(10);
-       } while ((temp_phy_data & 0xff) < 0x30);
-
-       /* setup signal integrity */
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
-       nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
-       if (phy_type == NES_PHY_TYPE_KR) {
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
-       } else {
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
-               nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
-       }
-
-       /* reset serdes */
-       sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
-       sds |= 0x1;
-       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
-       sds &= 0xfffffffe;
-       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
-
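-       /* busy-wait for the serdes reset to finish (bit 6 of the software reset register) */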
-       counter = 0;
-       while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
-                       && (counter++ < 5000))
-               ;
-
-       return ret;
-}
-
-
-/**
- * nes_init_phy - set up MDIO access for the MAC and initialize the attached PHY
- */
-int nes_init_phy(struct nes_device *nesdev)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 mac_index = nesdev->mac_index;
-       u32 tx_config = 0;
-       unsigned long flags;
-       u8  phy_type = nesadapter->phy_type[mac_index];
-       u8  phy_index = nesadapter->phy_index[mac_index];
-       int ret = 0;
-
-       tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-       if (phy_type == NES_PHY_TYPE_1G) {
-               /* setup 1G MDIO operation */
-               tx_config &= 0xFFFFFFE3;
-               tx_config |= 0x04;
-       } else {
-               /* setup 10G MDIO operation */
-               tx_config &= 0xFFFFFFE3;
-               tx_config |= 0x1D;
-       }
-       nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-
-       spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-
-       switch (phy_type) {
-       case NES_PHY_TYPE_1G:
-               ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
-               break;
-       case NES_PHY_TYPE_ARGUS:
-       case NES_PHY_TYPE_SFP_D:
-       case NES_PHY_TYPE_KR:
-               ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
-               break;
-       }
-
-       spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-
-       return ret;
-}
-
-
-/**
- * nes_replenish_nic_rq - refill the NIC receive queue with freshly allocated skbs
- */
-static void nes_replenish_nic_rq(struct nes_vnic *nesvnic)
-{
-       unsigned long flags;
-       dma_addr_t bus_address;
-       struct sk_buff *skb;
-       struct nes_hw_nic_rq_wqe *nic_rqe;
-       struct nes_hw_nic *nesnic;
-       struct nes_device *nesdev;
-       struct nes_rskb_cb *cb;
-       u32 rx_wqes_posted = 0;
-
-       nesnic = &nesvnic->nic;
-       nesdev = nesvnic->nesdev;
-       spin_lock_irqsave(&nesnic->rq_lock, flags);
-       if (nesnic->replenishing_rq != 0) {
-               if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
-                               (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
-                       atomic_set(&nesvnic->rx_skb_timer_running, 1);
-                       spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-                       nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);      /* 1/2 second */
-                       add_timer(&nesvnic->rq_wqes_timer);
-               } else
-                       spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-               return;
-       }
-       nesnic->replenishing_rq = 1;
-       spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-       do {
-               skb = dev_alloc_skb(nesvnic->max_frame_size);
-               if (skb) {
-                       skb->dev = nesvnic->netdev;
-
-                       bus_address = pci_map_single(nesdev->pcidev,
-                                       skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-                       cb = (struct nes_rskb_cb *)&skb->cb[0];
-                       cb->busaddr = bus_address;
-                       cb->maplen = nesvnic->max_frame_size;
-
-                       nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head];
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
-                                       cpu_to_le32(nesvnic->max_frame_size);
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
-                                       cpu_to_le32((u32)bus_address);
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
-                                       cpu_to_le32((u32)((u64)bus_address >> 32));
-                       nesnic->rx_skb[nesnic->rq_head] = skb;
-                       nesnic->rq_head++;
-                       nesnic->rq_head &= nesnic->rq_size - 1;
-                       atomic_dec(&nesvnic->rx_skbs_needed);
-                       barrier();
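-                       /* the doorbell's WQE count field is 8 bits, so ring after every 255 postings */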
-                       if (++rx_wqes_posted == 255) {
-                               nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
-                               rx_wqes_posted = 0;
-                       }
-               } else {
-                       spin_lock_irqsave(&nesnic->rq_lock, flags);
-                       if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
-                                       (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
-                               atomic_set(&nesvnic->rx_skb_timer_running, 1);
-                               spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-                               nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);      /* 1/2 second */
-                               add_timer(&nesvnic->rq_wqes_timer);
-                       } else
-                               spin_unlock_irqrestore(&nesnic->rq_lock, flags);
-                       break;
-               }
-       } while (atomic_read(&nesvnic->rx_skbs_needed));
-       barrier();
-       if (rx_wqes_posted)
-               nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
-       nesnic->replenishing_rq = 0;
-}
-
-
-/**
- * nes_rq_wqes_timeout - retry timer that refills the NIC RQ after an skb allocation failure
- */
-static void nes_rq_wqes_timeout(struct timer_list *t)
-{
-       struct nes_vnic *nesvnic = from_timer(nesvnic, t, rq_wqes_timer);
-       printk("%s: Timer fired.\n", __func__);
-       atomic_set(&nesvnic->rx_skb_timer_running, 0);
-       if (atomic_read(&nesvnic->rx_skbs_needed))
-               nes_replenish_nic_rq(nesvnic);
-}
-
-
-/**
- * nes_init_nic_qp - allocate and create the NIC QP and CQ for a net_device and populate the RQ
- */
-int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_hw_nic_sq_wqe *nic_sqe;
-       struct nes_hw_nic_qp_context *nic_context;
-       struct sk_buff *skb;
-       struct nes_hw_nic_rq_wqe *nic_rqe;
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       unsigned long flags;
-       void *vmem;
-       dma_addr_t pmem;
-       u64 u64temp;
-       int ret;
-       u32 cqp_head;
-       u32 counter;
-       u32 wqe_count;
-       struct nes_rskb_cb *cb;
-       u8 jumbomode=0;
-
-       /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */
-       nesvnic->nic_mem_size = 256 +
-                       (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) +
-                       (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) +
-                       (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) +
-                       (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) +
-                       sizeof(struct nes_hw_nic_qp_context);
-
-       nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev,
-                                                  nesvnic->nic_mem_size,
-                                                  &nesvnic->nic_pbase);
-       if (!nesvnic->nic_vbase) {
-               nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n");
-               return -ENOMEM;
-       }
-       nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n",
-                       nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size);
-
-       vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) &
-                       ~(unsigned long)(256 - 1));
-       pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) &
-                       ~(unsigned long long)(256 - 1));
-
-       /* Setup the first Fragment buffers */
-       nesvnic->nic.first_frag_vbase = vmem;
-
-       for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
-               nesvnic->nic.frag_paddr[counter] = pmem;
-               pmem += sizeof(struct nes_first_frag);
-       }
-
-       /* setup the SQ */
-       vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag));
-
-       nesvnic->nic.sq_vbase = (void *)vmem;
-       nesvnic->nic.sq_pbase = pmem;
-       nesvnic->nic.sq_head = 0;
-       nesvnic->nic.sq_tail = 0;
-       nesvnic->nic.sq_size = NES_NIC_WQ_SIZE;
-       for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
-               nic_sqe = &nesvnic->nic.sq_vbase[counter];
-               nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] =
-                               cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM |
-                               NES_NIC_SQ_WQE_COMPLETION);
-               nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] =
-                               cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16);
-               nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] =
-                               cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]);
-               nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] =
-                               cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32));
-       }
-
-       nesvnic->get_cqp_request = nes_get_cqp_request;
-       nesvnic->post_cqp_request = nes_post_cqp_request;
-       nesvnic->mcrq_mcast_filter = NULL;
-
-       spin_lock_init(&nesvnic->nic.rq_lock);
-
-       /* setup the RQ */
-       vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
-       pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
-
-
-       nesvnic->nic.rq_vbase = vmem;
-       nesvnic->nic.rq_pbase = pmem;
-       nesvnic->nic.rq_head = 0;
-       nesvnic->nic.rq_tail = 0;
-       nesvnic->nic.rq_size = NES_NIC_WQ_SIZE;
-
-       /* setup the CQ */
-       vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
-       pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
-
-       if (nesdev->nesadapter->netdev_count > 2)
-               nesvnic->mcrq_qp_id = nesvnic->nic_index + 32;
-       else
-               nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4;
-
-       nesvnic->nic_cq.cq_vbase = vmem;
-       nesvnic->nic_cq.cq_pbase = pmem;
-       nesvnic->nic_cq.cq_head = 0;
-       nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2;
-
-       nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler;
-
-       /* Send CreateCQ request to CQP */
-       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-       cqp_head = nesdev->cqp.sq_head;
-
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-                       NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-                       ((u32)nesvnic->nic_cq.cq_size << 16));
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
-                       nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16));
-       u64temp = (u64)nesvnic->nic_cq.cq_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =  0;
-       u64temp = (unsigned long)&nesvnic->nic_cq;
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =  cpu_to_le32((u32)(u64temp >> 1));
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-                       cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-       if (++cqp_head >= nesdev->cqp.sq_size)
-               cqp_head = 0;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-       /* Send CreateQP request to CQP */
-       nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]);
-       nic_context->context_words[NES_NIC_CTX_MISC_IDX] =
-                       cpu_to_le32((u32)NES_NIC_CTX_SIZE |
-                       ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
-       nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
-                       nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
-                       nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
-       if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) {
-               nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
-       }
-
-       u64temp = (u64)nesvnic->nic.sq_pbase;
-       nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
-       nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-       u64temp = (u64)nesvnic->nic.rq_pbase;
-       nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
-       nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
-                       NES_CQP_QP_TYPE_NIC);
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id);
-       u64temp = (u64)nesvnic->nic_cq.cq_pbase +
-                       (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-       if (++cqp_head >= nesdev->cqp.sq_size)
-               cqp_head = 0;
-       nesdev->cqp.sq_head = cqp_head;
-
-       barrier();
-
-       /* Ring doorbell (2 WQEs) */
-       nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-       nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n",
-                       nesvnic->nic.qp_id);
-
-       ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n",
-                       nesvnic->nic.qp_id, ret);
-       if (!ret) {
-               nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id);
-               pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
-                               nesvnic->nic_pbase);
-               return -EIO;
-       }
-
-       /* Populate the RQ */
-       for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) {
-               skb = dev_alloc_skb(nesvnic->max_frame_size);
-               if (!skb) {
-                       nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
-
-                       nes_destroy_nic_qp(nesvnic);
-                       return -ENOMEM;
-               }
-
-               skb->dev = netdev;
-
-               pmem = pci_map_single(nesdev->pcidev, skb->data,
-                               nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-               cb = (struct nes_rskb_cb *)&skb->cb[0];
-               cb->busaddr = pmem;
-               cb->maplen = nesvnic->max_frame_size;
-
-               nic_rqe = &nesvnic->nic.rq_vbase[counter];
-               nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size);
-               nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-               nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]  = cpu_to_le32((u32)pmem);
-               nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
-               nesvnic->nic.rx_skb[counter] = skb;
-       }
-
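-       /* post the freshly filled RQ WQEs, at most 255 per doorbell write */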
-       wqe_count = NES_NIC_WQ_SIZE - 1;
-       nesvnic->nic.rq_head = wqe_count;
-       barrier();
-       do {
-               counter = min(wqe_count, ((u32)255));
-               wqe_count -= counter;
-               nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id);
-       } while (wqe_count);
-       timer_setup(&nesvnic->rq_wqes_timer, nes_rq_wqes_timeout, 0);
-       nes_debug(NES_DBG_INIT, "NAPI support Enabled\n");
-       if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) {
-               nes_nic_init_timer(nesdev);
-               if (netdev->mtu > 1500)
-                       jumbomode = 1;
-               nes_nic_init_timer_defaults(nesdev, jumbomode);
-       }
-       if ((nesdev->nesadapter->allow_unaligned_fpdus) &&
-               (nes_init_mgt_qp(nesdev, netdev, nesvnic))) {
-               nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n",
-                         netdev->name);
-               nes_destroy_nic_qp(nesvnic);
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-
-/**
- * nes_destroy_nic_qp - free posted receive/transmit buffers and destroy the NIC QP and CQ
- */
-void nes_destroy_nic_qp(struct nes_vnic *nesvnic)
-{
-       u64 u64temp;
-       dma_addr_t bus_address;
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_hw_nic_sq_wqe *nic_sqe;
-       __le16 *wqe_fragment_length;
-       u16  wqe_fragment_index;
-       u32 cqp_head;
-       u32 wqm_cfg0;
-       unsigned long flags;
-       struct sk_buff *rx_skb;
-       struct nes_rskb_cb *cb;
-       int ret;
-
-       if (nesdev->nesadapter->allow_unaligned_fpdus)
-               nes_destroy_mgt(nesvnic);
-
-       /* clear wqe stall before destroying NIC QP */
-       wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0);
-       nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF);
-
-       /* Free remaining NIC receive buffers */
-       while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) {
-               rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail];
-               cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-               pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen,
-                       PCI_DMA_FROMDEVICE);
-
-               dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]);
-               nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1);
-       }
-
-       /* Free remaining NIC transmit buffers */
-       while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) {
-               nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail];
-               wqe_fragment_index = 1;
-               wqe_fragment_length = (__le16 *)
-                       &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-               /* bump past the vlan tag */
-               wqe_fragment_length++;
-               if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
-                       u64temp = (u64)le32_to_cpu(
-                               nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
-                               wqe_fragment_index*2]);
-                       u64temp += ((u64)le32_to_cpu(
-                               nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
-                               + wqe_fragment_index*2]))<<32;
-                       bus_address = (dma_addr_t)u64temp;
-                       if (test_and_clear_bit(nesvnic->nic.sq_tail,
-                                       nesvnic->nic.first_frag_overflow)) {
-                               pci_unmap_single(nesdev->pcidev,
-                                               bus_address,
-                                               le16_to_cpu(wqe_fragment_length[
-                                                       wqe_fragment_index++]),
-                                               PCI_DMA_TODEVICE);
-                       }
-                       for (; wqe_fragment_index < 5; wqe_fragment_index++) {
-                               if (wqe_fragment_length[wqe_fragment_index]) {
-                                       u64temp = le32_to_cpu(
-                                               nic_sqe->wqe_words[
-                                               NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
-                                               wqe_fragment_index*2]);
-                                       u64temp += ((u64)le32_to_cpu(
-                                               nic_sqe->wqe_words[
-                                               NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+
-                                               wqe_fragment_index*2]))<<32;
-                                       bus_address = (dma_addr_t)u64temp;
-                                       pci_unmap_page(nesdev->pcidev,
-                                                       bus_address,
-                                                       le16_to_cpu(
-                                                       wqe_fragment_length[
-                                                       wqe_fragment_index]),
-                                                       PCI_DMA_TODEVICE);
-                               } else
-                                       break;
-                       }
-               }
-               if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail])
-                       dev_kfree_skb(
-                               nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]);
-
-               nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1)
-                                       & (nesvnic->nic.sq_size - 1);
-       }
-
-       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-       /* Destroy NIC QP */
-       cqp_head = nesdev->cqp.sq_head;
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-               (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-               nesvnic->nic.qp_id);
-
-       if (++cqp_head >= nesdev->cqp.sq_size)
-               cqp_head = 0;
-
-       cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-
-       /* Destroy NIC CQ */
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-               (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-               (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)));
-
-       if (++cqp_head >= nesdev->cqp.sq_size)
-               cqp_head = 0;
-
-       nesdev->cqp.sq_head = cqp_head;
-       barrier();
-
-       /* Ring doorbell (2 WQEs) */
-       nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-       nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
-                       " cqp.sq_tail=%u, cqp.sq_size=%u\n",
-                       cqp_head, nesdev->cqp.sq_head,
-                       nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
-
-       ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-                       NES_EVENT_TIMEOUT);
-
-       nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
-                       " cqp.sq_head=%u, cqp.sq_tail=%u\n",
-                       ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-       if (!ret) {
-               nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n",
-                               nesvnic->nic.qp_id);
-       }
-
-       pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
-                       nesvnic->nic_pbase);
-
-       /* restore old wqm_cfg0 value */
-       nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0);
-}
-
-/**
- * nes_napi_isr - handle NIC-only interrupts inline; return 0 when the DPC must run
- */
-int nes_napi_isr(struct nes_device *nesdev)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 int_stat;
-
-       if (nesdev->napi_isr_ran) {
-               /* interrupt status has already been read in ISR */
-               int_stat = nesdev->int_stat;
-       } else {
-               int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
-               nesdev->int_stat = int_stat;
-               nesdev->napi_isr_ran = 1;
-       }
-
-       int_stat &= nesdev->int_req;
-       /* if only NIC interrupt bits are set, handle them here; otherwise leave them for the DPC */
-       if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) {
-               nesdev->napi_isr_ran = 0;
-               nes_write32(nesdev->regs + NES_INT_STAT,
-                       (int_stat &
-                       ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
-
-               /* Process the CEQs */
-               nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]);
-
-               if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) &&
-                                       (!nesadapter->et_use_adaptive_rx_coalesce)) ||
-                                       ((nesadapter->et_use_adaptive_rx_coalesce) &&
-                                        (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) {
-                       if ((nesdev->int_req & NES_INT_TIMER) == 0) {
-                               /* Enable Periodic timer interrupts */
-                               nesdev->int_req |= NES_INT_TIMER;
-                               /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */
-                               /* TODO: need to also ack other unused periodic timer values, get from nesadapter */
-                               nes_write32(nesdev->regs+NES_TIMER_STAT,
-                                               nesdev->timer_int_req  | ~(nesdev->nesadapter->timer_int_req));
-                               nes_write32(nesdev->regs+NES_INTF_INT_MASK,
-                                               ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
-                       }
-
-                       if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
-                               nes_nic_init_timer(nesdev);
-                       }
-                       /* Enable interrupts, except CEQs */
-                       nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-               } else {
-                       /* Enable interrupts, make sure timer is off */
-                       nesdev->int_req &= ~NES_INT_TIMER;
-                       nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-                       nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-               }
-               nesdev->deepcq_count = 0;
-               return 1;
-       } else {
-               return 0;
-       }
-}
-
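-/**
- * process_critical_error - log and ack a critical device error, masking the
- * offending module after too many occurrences
- */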
-static void process_critical_error(struct nes_device *nesdev)
-{
-       u32 debug_error;
-       u32 nes_idx_debug_error_masks0 = 0;
-       u16 error_module = 0;
-
-       debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS);
-       printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n",
-                       (u16)debug_error);
-       nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS,
-                       0x01010000 | (debug_error & 0x0000ffff));
-       if (crit_err_count++ > 10)
-               nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17);
-       error_module = (u16) (debug_error & 0x1F00) >> 8;
-       if (++nesdev->nesadapter->crit_error_count[error_module-1] >=
-                       nes_max_critical_error_count) {
-               printk(KERN_ERR PFX "Masking off critical error for module "
-                       "0x%02X\n", (u16)error_module);
-               nes_idx_debug_error_masks0 = nes_read_indexed(nesdev,
-                       NES_IDX_DEBUG_ERROR_MASKS0);
-               nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0,
-                       nes_idx_debug_error_masks0 | (1 << error_module));
-       }
-}
-/**
- * nes_dpc - deferred interrupt processing for completion, async and MAC events
- * @param: pointer to the struct nes_device, cast to an unsigned long
- */
-void nes_dpc(unsigned long param)
-{
-       struct nes_device *nesdev = (struct nes_device *)param;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 counter;
-       u32 loop_counter = 0;
-       u32 int_status_bit;
-       u32 int_stat;
-       u32 timer_stat;
-       u32 temp_int_stat;
-       u32 intf_int_stat;
-       u32 processed_intf_int = 0;
-       u16 processed_timer_int = 0;
-       u16 completion_ints = 0;
-       u16 timer_ints = 0;
-
-       /* nes_debug(NES_DBG_ISR, "\n"); */
-
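-       /*
-        * Main DPC loop: reuse the interrupt status cached by the ISR when
-        * available (otherwise re-read it), then dispatch CEQ, AEQ, MAC,
-        * timer and interface interrupts until no enabled sources remain or
-        * MAX_DPC_ITERATIONS is reached.
-        */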
-       do {
-               timer_stat = 0;
-               if (nesdev->napi_isr_ran) {
-                       nesdev->napi_isr_ran = 0;
-                       int_stat = nesdev->int_stat;
-               } else
-                       int_stat = nes_read32(nesdev->regs+NES_INT_STAT);
-               if (processed_intf_int != 0)
-                       int_stat &= nesdev->int_req & ~NES_INT_INTF;
-               else
-                       int_stat &= nesdev->int_req;
-               if (processed_timer_int == 0) {
-                       processed_timer_int = 1;
-                       if (int_stat & NES_INT_TIMER) {
-                               timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
-                               if ((timer_stat & nesdev->timer_int_req) == 0) {
-                                       int_stat &= ~NES_INT_TIMER;
-                               }
-                       }
-               } else {
-                       int_stat &= ~NES_INT_TIMER;
-               }
-
-               if (int_stat) {
-                       if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
-                                       NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) {
-                               /* Ack the interrupts */
-                               nes_write32(nesdev->regs+NES_INT_STAT,
-                                       (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
-                                       NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
-                       }
-
-                       temp_int_stat = int_stat;
-                       for (counter = 0, int_status_bit = 1; counter < 16; counter++) {
-                               if (int_stat & int_status_bit) {
-                                       nes_process_ceq(nesdev, &nesadapter->ceq[counter]);
-                                       temp_int_stat &= ~int_status_bit;
-                                       completion_ints = 1;
-                               }
-                               if (!(temp_int_stat & 0x0000ffff))
-                                       break;
-                               int_status_bit <<= 1;
-                       }
-
-                       /* Process the AEQ for this pci function */
-                       int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn));
-                       if (int_stat & int_status_bit) {
-                               nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]);
-                       }
-
-                       /* Process the MAC interrupt for this pci function */
-                       int_status_bit = 1 << (24 + nesdev->mac_index);
-                       if (int_stat & int_status_bit) {
-                               nes_process_mac_intr(nesdev, nesdev->mac_index);
-                       }
-
-                       if (int_stat & NES_INT_TIMER) {
-                               if (timer_stat & nesdev->timer_int_req) {
-                                       nes_write32(nesdev->regs + NES_TIMER_STAT,
-                                                       (timer_stat & nesdev->timer_int_req) |
-                                                       ~(nesdev->nesadapter->timer_int_req));
-                                       timer_ints = 1;
-                               }
-                       }
-
-                       if (int_stat & NES_INT_INTF) {
-                               processed_intf_int = 1;
-                               intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
-                               intf_int_stat &= nesdev->intf_int_req;
-                               if (NES_INTF_INT_CRITERR & intf_int_stat) {
-                                       process_critical_error(nesdev);
-                               }
-                               if (NES_INTF_INT_PCIERR & intf_int_stat) {
-                                       printk(KERN_ERR PFX "PCI Error reported by device!!!\n");
-                                       BUG();
-                               }
-                               if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) {
-                                       printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n");
-                                       BUG();
-                               }
-                               nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat);
-                       }
-
-                       if (int_stat & NES_INT_TSW) {
-                               /* nothing to do for NES_INT_TSW */
-                       }
-               }
-               /* Don't let the interface interrupt bit keep us in the loop */
-               int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 |
-                               NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3;
-       } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS));
-
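-       /*
-        * Re-arm interrupts based on what the loop saw: if the periodic timer
-        * keeps firing without any completion work, drop the timer interrupt;
-        * otherwise leave the coalescing timer (and, when configured, the
-        * adaptive timer) enabled.
-        */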
-       if (timer_ints == 1) {
-               if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) {
-                       if (completion_ints == 0) {
-                               nesdev->timer_only_int_count++;
-                               if (nesdev->timer_only_int_count >= nesadapter->timer_int_limit) {
-                                       nesdev->timer_only_int_count = 0;
-                                       nesdev->int_req &= ~NES_INT_TIMER;
-                                       nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-                                       nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req);
-                               } else {
-                                       nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-                               }
-                       } else {
-                               if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
-                                       nes_nic_init_timer(nesdev);
-                               }
-                               nesdev->timer_only_int_count = 0;
-                               nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-                       }
-               } else {
-                       nesdev->timer_only_int_count = 0;
-                       nesdev->int_req &= ~NES_INT_TIMER;
-                       nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
-                       nes_write32(nesdev->regs+NES_TIMER_STAT,
-                                       nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
-                       nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-               }
-       } else {
-               if ((completion_ints == 1) &&
-                   (((nesadapter->et_rx_coalesce_usecs_irq) &&
-                     (!nesadapter->et_use_adaptive_rx_coalesce)) ||
-                    ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) &&
-                     (nesadapter->et_use_adaptive_rx_coalesce)))) {
-                       /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */
-                       nesdev->timer_only_int_count = 0;
-                       nesdev->int_req |= NES_INT_TIMER;
-                       nes_write32(nesdev->regs+NES_TIMER_STAT,
-                                       nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
-                       nes_write32(nesdev->regs+NES_INTF_INT_MASK,
-                                       ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
-                       nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
-               } else {
-                       nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
-               }
-       }
-       nesdev->deepcq_count = 0;
-}
-
-
-/**
- * nes_process_ceq - process a completion event queue
- * @nesdev: the device that owns the CEQ
- * @ceq: the completion event queue to drain
- */
-static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq)
-{
-       u64 u64temp;
-       struct nes_hw_cq *cq;
-       u32 head;
-       u32 ceq_size;
-
-       /* nes_debug(NES_DBG_CQ, "\n"); */
-       head = ceq->ceq_head;
-       ceq_size = ceq->ceq_size;
-
-       do {
-               if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) &
-                               NES_CEQE_VALID) {
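-                       /*
-                        * Rebuild the 64-bit CQ context from the CEQE high/low
-                        * words and shift it left one bit before treating it
-                        * as the struct nes_hw_cq pointer.
-                        */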
-                       u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) |
-                                               ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX])));
-                       u64temp <<= 1;
-                       cq = *((struct nes_hw_cq **)&u64temp);
-                       /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */
-                       barrier();
-                       ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0;
-
-                       /* call the event handler */
-                       cq->ce_handler(nesdev, cq);
-
-                       if (++head >= ceq_size)
-                               head = 0;
-               } else {
-                       break;
-               }
-
-       } while (1);
-
-       ceq->ceq_head = head;
-}
-
-
-/**
- * nes_process_aeq - process the asynchronous event queue
- * @nesdev: the device that owns the AEQ
- * @aeq: the asynchronous event queue to drain
- */
-static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
-{
-       /* u64 u64temp; */
-       u32 head;
-       u32 aeq_size;
-       u32 aeqe_misc;
-       u32 aeqe_cq_id;
-       struct nes_hw_aeqe volatile *aeqe;
-
-       head = aeq->aeq_head;
-       aeq_size = aeq->aeq_size;
-
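-       /*
-        * Walk the AEQ until an invalid entry is reached.  QP/CQ events whose
-        * queue number is at or above NES_FIRST_QPN go to the iWARP AE
-        * handler; lower numbers belong to the CQP and are only logged.
-        */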
-       do {
-               aeqe = &aeq->aeq_vbase[head];
-               if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0)
-                       break;
-               aeqe_misc  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-               aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
-               if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) {
-                       if (aeqe_cq_id >= NES_FIRST_QPN) {
-                               /* dealing with an accelerated QP related AE */
-                               /*
-                                * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) |
-                                *           ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX])));
-                                */
-                               nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe);
-                       } else {
-                               /* TODO: dealing with a CQP related AE */
-                               nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n",
-                                               (u16)(aeqe_misc >> 16));
-                       }
-               }
-
-               aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0;
-
-               if (++head >= aeq_size)
-                       head = 0;
-
-               nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16);
-       } while (1);
-       aeq->aeq_head = head;
-}
-
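-/**
- * nes_reset_link - reset the MAC/SERDES link
- * @nesdev: the device whose link is being reset
- * @mac_index: index of the MAC to reset
- */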
-static void nes_reset_link(struct nes_device *nesdev, u32 mac_index)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 reset_value;
-       u32 i = 0;
-       u32 u32temp;
-
-       if (nesadapter->hw_rev == NE020_REV) {
-               return;
-       }
-       mh_detected++;
-
-       reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-
-       if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode)))
-               reset_value |= 0x0000001d;
-       else
-               reset_value |= 0x0000002d;
-
-       if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) {
-               if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
-                       nesadapter->link_interrupt_count[0] = 0;
-                       nesadapter->link_interrupt_count[1] = 0;
-                       u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-                       if (0x00000040 & u32temp)
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
-                       else
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-
-                       reset_value |= 0x0000003d;
-               }
-               nesadapter->link_interrupt_count[mac_index] = 0;
-       }
-
-       nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-       while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-                       & 0x00000040) != 0x00000040) && (i++ < 5000));
-
-       if (0x0000003d == (reset_value & 0x0000003d)) {
-               u32 pcs_control_status0, pcs_control_status1;
-
-               for (i = 0; i < 10; i++) {
-                       pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0);
-                       pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-                       if (((0x0F000000 == (pcs_control_status0 & 0x0F000000))
-                            && (pcs_control_status0 & 0x00100000))
-                           || ((0x0F000000 == (pcs_control_status1 & 0x0F000000))
-                               && (pcs_control_status1 & 0x00100000)))
-                               continue;
-                       else
-                               break;
-               }
-               if (10 == i) {
-                       u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
-                       if (0x00000040 & u32temp)
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
-                       else
-                               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
-
-                       nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
-
-                       while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET)
-                                & 0x00000040) != 0x00000040) && (i++ < 5000));
-               }
-       }
-}
-
-/**
- * nes_process_mac_intr - handle a MAC/PHY interrupt
- * @nesdev: the device reporting the interrupt
- * @mac_number: index of the MAC that raised the interrupt
- */
-static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
-{
-       unsigned long flags;
-       u32 pcs_control_status;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_vnic *nesvnic;
-       u32 mac_status;
-       u32 mac_index = nesdev->mac_index;
-       u32 u32temp;
-       u16 phy_data;
-       u16 temp_phy_data;
-       u32 pcs_val  = 0x0f0f0000;
-       u32 pcs_mask = 0x0f1f0000;
-       u32 cdr_ctrl;
-
-       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-       if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) {
-               spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-               return;
-       }
-       nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
-
-       /* ack the MAC interrupt */
-       mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
-       /* Clear the interrupt */
-       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status);
-
-       nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status);
-
-       if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
-               nesdev->link_status_interrupts++;
-               if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS)))
-                       nes_reset_link(nesdev, mac_index);
-
-               /* read the PHY interrupt status register */
-               if ((nesadapter->OneG_Mode) &&
-               (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-                       do {
-                               nes_read_1G_phy_reg(nesdev, 0x1a,
-                                               nesadapter->phy_index[mac_index], &phy_data);
-                               nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n",
-                                               nesadapter->phy_index[mac_index], phy_data);
-                       } while (phy_data & 0x8000);
-
-                       temp_phy_data = 0;
-                       do {
-                               nes_read_1G_phy_reg(nesdev, 0x11,
-                                               nesadapter->phy_index[mac_index], &phy_data);
-                               nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n",
-                                               nesadapter->phy_index[mac_index], phy_data);
-                               if (temp_phy_data == phy_data)
-                                       break;
-                               temp_phy_data = phy_data;
-                       } while (1);
-
-                       nes_read_1G_phy_reg(nesdev, 0x1e,
-                                       nesadapter->phy_index[mac_index], &phy_data);
-                       nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n",
-                                       nesadapter->phy_index[mac_index], phy_data);
-
-                       nes_read_1G_phy_reg(nesdev, 1,
-                                       nesadapter->phy_index[mac_index], &phy_data);
-                       nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n",
-                                       nesadapter->phy_index[mac_index], phy_data);
-
-                       if (temp_phy_data & 0x1000) {
-                               nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n");
-                               phy_data = 4;
-                       } else {
-                               nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n");
-                       }
-               }
-               nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n",
-                               nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0),
-                               nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200));
-
-               if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) {
-                       switch (mac_index) {
-                       case 1:
-                       case 3:
-                               pcs_control_status = nes_read_indexed(nesdev,
-                                               NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
-                               break;
-                       default:
-                               pcs_control_status = nes_read_indexed(nesdev,
-                                               NES_IDX_PHY_PCS_CONTROL_STATUS0);
-                               break;
-                       }
-               } else {
-                       pcs_control_status = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
-                       pcs_control_status = nes_read_indexed(nesdev,
-                                       NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
-               }
-
-               nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n",
-                               mac_index, pcs_control_status);
-               if ((nesadapter->OneG_Mode) &&
-                               (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-                       u32temp = 0x01010000;
-                       if (nesadapter->port_count > 2) {
-                               u32temp |= 0x02020000;
-                       }
-                       if ((pcs_control_status & u32temp) != u32temp) {
-                               phy_data = 0;
-                               nes_debug(NES_DBG_PHY, "PCS says the link is down\n");
-                       }
-               } else {
-                       switch (nesadapter->phy_type[mac_index]) {
-                       case NES_PHY_TYPE_ARGUS:
-                       case NES_PHY_TYPE_SFP_D:
-                       case NES_PHY_TYPE_KR:
-                               /* clear the alarms */
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005);
-                               /* check link status */
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-                               temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-                               nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                               nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-                               phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-                               phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-
-                               nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-                                       __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-                               break;
-
-                       case NES_PHY_TYPE_PUMA_1G:
-                               if (mac_index < 2)
-                                       pcs_val = pcs_mask = 0x01010000;
-                               else
-                                       pcs_val = pcs_mask = 0x02020000;
-                               /* fall through */
-                       default:
-                               phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0;
-                               break;
-                       }
-               }
-
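-               /*
-                * By this point bit 0x0004 of phy_data is the common
-                * "link is up" indication for every supported PHY type.
-                */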
-               if (phy_data & 0x0004) {
-                       if (wide_ppm_offset &&
-                           (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
-                           (nesadapter->hw_rev != NE020_REV)) {
-                               cdr_ctrl = nes_read_indexed(nesdev,
-                                                           NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-                                                           mac_index * 0x200);
-                               nes_write_indexed(nesdev,
-                                                 NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-                                                 mac_index * 0x200,
-                                                 cdr_ctrl | 0x000F0000);
-                       }
-                       nesadapter->mac_link_down[mac_index] = 0;
-                       list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-                               nes_debug(NES_DBG_PHY, "The Link is UP!!.  linkup was %d\n",
-                                               nesvnic->linkup);
-                               if (nesvnic->linkup == 0) {
-                                       printk(PFX "The Link is now up for port %s, netdev %p.\n",
-                                                       nesvnic->netdev->name, nesvnic->netdev);
-                                       if (netif_queue_stopped(nesvnic->netdev))
-                                               netif_start_queue(nesvnic->netdev);
-                                       nesvnic->linkup = 1;
-                                       netif_carrier_on(nesvnic->netdev);
-
-                                       spin_lock(&nesvnic->port_ibevent_lock);
-                                       if (nesvnic->of_device_registered) {
-                                               if (nesdev->iw_status == 0) {
-                                                       nesdev->iw_status = 1;
-                                                       nes_port_ibevent(nesvnic);
-                                               }
-                                       }
-                                       spin_unlock(&nesvnic->port_ibevent_lock);
-                               }
-                       }
-               } else {
-                       if (wide_ppm_offset &&
-                           (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
-                           (nesadapter->hw_rev != NE020_REV)) {
-                               cdr_ctrl = nes_read_indexed(nesdev,
-                                                           NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-                                                           mac_index * 0x200);
-                               nes_write_indexed(nesdev,
-                                                 NES_IDX_ETH_SERDES_CDR_CONTROL0 +
-                                                 mac_index * 0x200,
-                                                 cdr_ctrl & 0xFFF0FFFF);
-                       }
-                       nesadapter->mac_link_down[mac_index] = 1;
-                       list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-                               nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n",
-                                               nesvnic->linkup);
-                               if (nesvnic->linkup == 1) {
-                                       printk(PFX "The Link is now down for port %s, netdev %p.\n",
-                                                       nesvnic->netdev->name, nesvnic->netdev);
-                                       if (!(netif_queue_stopped(nesvnic->netdev)))
-                                               netif_stop_queue(nesvnic->netdev);
-                                       nesvnic->linkup = 0;
-                                       netif_carrier_off(nesvnic->netdev);
-
-                                       spin_lock(&nesvnic->port_ibevent_lock);
-                                       if (nesvnic->of_device_registered) {
-                                               if (nesdev->iw_status == 1) {
-                                                       nesdev->iw_status = 0;
-                                                       nes_port_ibevent(nesvnic);
-                                               }
-                                       }
-                                       spin_unlock(&nesvnic->port_ibevent_lock);
-                               }
-                       }
-               }
-               if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) {
-                       nesdev->link_recheck = 1;
-                       mod_delayed_work(system_wq, &nesdev->work,
-                                        NES_LINK_RECHECK_DELAY);
-               }
-       }
-
-       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-       nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE;
-}
-
-void nes_recheck_link_status(struct work_struct *work)
-{
-       unsigned long flags;
-       struct nes_device *nesdev = container_of(work, struct nes_device, work.work);
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_vnic *nesvnic;
-       u32 mac_index = nesdev->mac_index;
-       u16 phy_data;
-       u16 temp_phy_data;
-
-       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-
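-       /*
-        * Repeat the 10G PHY link check done in nes_process_mac_intr() and
-        * propagate any state change to the netdev queue, carrier state and
-        * the RDMA port event.
-        */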
-       /* check link status */
-       nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
-       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-       nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-       nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-       nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
-       phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-
-       phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-
-       nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-               __func__, phy_data,
-               nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-
-       if (phy_data & 0x0004) {
-               nesadapter->mac_link_down[mac_index] = 0;
-               list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-                       if (nesvnic->linkup == 0) {
-                               printk(PFX "The Link is now up for port %s, netdev %p.\n",
-                                               nesvnic->netdev->name, nesvnic->netdev);
-                               if (netif_queue_stopped(nesvnic->netdev))
-                                       netif_start_queue(nesvnic->netdev);
-                               nesvnic->linkup = 1;
-                               netif_carrier_on(nesvnic->netdev);
-
-                               spin_lock(&nesvnic->port_ibevent_lock);
-                               if (nesvnic->of_device_registered) {
-                                       if (nesdev->iw_status == 0) {
-                                               nesdev->iw_status = 1;
-                                               nes_port_ibevent(nesvnic);
-                                       }
-                               }
-                               spin_unlock(&nesvnic->port_ibevent_lock);
-                       }
-               }
-
-       } else {
-               nesadapter->mac_link_down[mac_index] = 1;
-               list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
-                       if (nesvnic->linkup == 1) {
-                               printk(PFX "The Link is now down for port %s, netdev %p.\n",
-                                               nesvnic->netdev->name, nesvnic->netdev);
-                               if (!(netif_queue_stopped(nesvnic->netdev)))
-                                       netif_stop_queue(nesvnic->netdev);
-                               nesvnic->linkup = 0;
-                               netif_carrier_off(nesvnic->netdev);
-
-                               spin_lock(&nesvnic->port_ibevent_lock);
-                               if (nesvnic->of_device_registered) {
-                                       if (nesdev->iw_status == 1) {
-                                               nesdev->iw_status = 0;
-                                               nes_port_ibevent(nesvnic);
-                                       }
-                               }
-                               spin_unlock(&nesvnic->port_ibevent_lock);
-                       }
-               }
-       }
-       if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX)
-               schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY);
-       else
-               nesdev->link_recheck = 0;
-
-       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-}
-
-
-static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-       struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
-
-       napi_schedule(&nesvnic->napi);
-}
-
-
-/*
- * MAX_RQES_TO_PROCESS defines the maximum number of receive queue entries to
- * complete before getting out of nic_ce_handler.
- */
-#define        MAX_RQES_TO_PROCESS     384
-
-/**
- * nes_nic_ce_handler - process completions for a NIC completion queue
- * @nesdev: the device that owns the CQ
- * @cq: the NIC completion queue to drain
- */
-void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-       u64 u64temp;
-       dma_addr_t bus_address;
-       struct nes_hw_nic *nesnic;
-       struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_nic_rq_wqe *nic_rqe;
-       struct nes_hw_nic_sq_wqe *nic_sqe;
-       struct sk_buff *skb;
-       struct sk_buff *rx_skb;
-       struct nes_rskb_cb *cb;
-       __le16 *wqe_fragment_length;
-       u32 head;
-       u32 cq_size;
-       u32 rx_pkt_size;
-       u32 cqe_count = 0;
-       u32 cqe_errv;
-       u32 cqe_misc;
-       u16 wqe_fragment_index = 1;     /* first fragment (0) is used by copy buffer */
-       u16 vlan_tag;
-       u16 pkt_type;
-       u16 rqes_processed = 0;
-       u8 sq_cqes = 0;
-
-       head = cq->cq_head;
-       cq_size = cq->cq_size;
-       cq->cqes_pending = 1;
-       do {
-               if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) &
-                               NES_NIC_CQE_VALID) {
-                       nesnic = &nesvnic->nic;
-                       cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
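-                       /*
-                        * SQ completions unmap the transmit fragments and free
-                        * the skb; RQ completions unmap the receive buffer,
-                        * apply checksum/VLAN handling and hand the skb to
-                        * napi_gro_receive().
-                        */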
-                       if (cqe_misc & NES_NIC_CQE_SQ) {
-                               sq_cqes++;
-                               wqe_fragment_index = 1;
-                               nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail];
-                               skb = nesnic->tx_skb[nesnic->sq_tail];
-                               wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-                               /* bump past the vlan tag */
-                               wqe_fragment_length++;
-                               if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
-                                       u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
-                                                       wqe_fragment_index * 2]);
-                                       u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX +
-                                                       wqe_fragment_index * 2])) << 32;
-                                       bus_address = (dma_addr_t)u64temp;
-                                       if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) {
-                                               pci_unmap_single(nesdev->pcidev,
-                                                               bus_address,
-                                                               le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]),
-                                                               PCI_DMA_TODEVICE);
-                                       }
-                                       for (; wqe_fragment_index < 5; wqe_fragment_index++) {
-                                               if (wqe_fragment_length[wqe_fragment_index]) {
-                                                       u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
-                                                                               wqe_fragment_index * 2]);
-                                                       u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
-                                                                               + wqe_fragment_index * 2])) <<32;
-                                                       bus_address = (dma_addr_t)u64temp;
-                                                       pci_unmap_page(nesdev->pcidev,
-                                                                       bus_address,
-                                                                       le16_to_cpu(wqe_fragment_length[wqe_fragment_index]),
-                                                                       PCI_DMA_TODEVICE);
-                                               } else
-                                                       break;
-                                       }
-                               }
-                               if (skb)
-                                       dev_kfree_skb_any(skb);
-                               nesnic->sq_tail++;
-                               nesnic->sq_tail &= nesnic->sq_size-1;
-                               if (sq_cqes > 128) {
-                                       barrier();
-                                       /* restart the queue if it had been stopped */
-                                       if (netif_queue_stopped(nesvnic->netdev))
-                                               netif_wake_queue(nesvnic->netdev);
-                                       sq_cqes = 0;
-                               }
-                       } else {
-                               rqes_processed++;
-
-                               cq->rx_cqes_completed++;
-                               cq->rx_pkts_indicated++;
-                               rx_pkt_size = cqe_misc & 0x0000ffff;
-                               nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail];
-                               /* Get the skb */
-                               rx_skb = nesnic->rx_skb[nesnic->rq_tail];
-                               nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail];
-                               bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
-                               bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
-                               pci_unmap_single(nesdev->pcidev, bus_address,
-                                               nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-                               cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-                               cb->busaddr = 0;
-                               /* rx_skb->tail = rx_skb->data + rx_pkt_size; */
-                               /* rx_skb->len = rx_pkt_size; */
-                               rx_skb->len = 0;  /* TODO: see if this is necessary */
-                               skb_put(rx_skb, rx_pkt_size);
-                               rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev);
-                               nesnic->rq_tail++;
-                               nesnic->rq_tail &= nesnic->rq_size - 1;
-
-                               atomic_inc(&nesvnic->rx_skbs_needed);
-                               if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) {
-                                       nes_write32(nesdev->regs+NES_CQE_ALLOC,
-                                                       cq->cq_number | (cqe_count << 16));
-                                       /* nesadapter->tune_timer.cq_count += cqe_count; */
-                                       nesdev->currcq_count += cqe_count;
-                                       cqe_count = 0;
-                                       nes_replenish_nic_rq(nesvnic);
-                               }
-                               pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]));
-                               cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT;
-                               rx_skb->ip_summed = CHECKSUM_NONE;
-
-                               if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) ||
-                                               (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) {
-                                       if ((cqe_errv &
-                                                       (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR |
-                                                       NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
-                                               if (nesvnic->netdev->features & NETIF_F_RXCSUM)
-                                                       rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
-                                       } else
-                                               nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
-                                                               " errv = 0x%X, pkt_type = 0x%X.\n",
-                                                               nesvnic->netdev->name, cqe_errv, pkt_type);
-
-                               } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) {
-                                       if ((cqe_errv &
-                                                       (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR |
-                                                       NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
-                                               if (nesvnic->netdev->features & NETIF_F_RXCSUM) {
-                                                       rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
-                                                       /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n",
-                                                                 nesvnic->netdev->name); */
-                                               }
-                                       } else
-                                               nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed IPv4 packet."
-                                                               " errv = 0x%X, pkt_type = 0x%X.\n",
-                                                               nesvnic->netdev->name, cqe_errv, pkt_type);
-                               }
-                               /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n",
-                                                       pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */
-
-                               if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) {
-                                       if (nes_cm_recv(rx_skb, nesvnic->netdev))
-                                               rx_skb = NULL;
-                               }
-                               if (rx_skb == NULL)
-                                       goto skip_rx_indicate0;
-
-
-                               if (cqe_misc & NES_NIC_CQE_TAG_VALID) {
-                                       vlan_tag = (u16)(le32_to_cpu(
-                                                       cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])
-                                                       >> 16);
-                                       nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n",
-                                                       nesvnic->netdev->name, vlan_tag);
-
-                                       __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag);
-                               }
-                               napi_gro_receive(&nesvnic->napi, rx_skb);
-
-skip_rx_indicate0:
-                               ;
-                               /* nesvnic->netstats.rx_packets++; */
-                               /* nesvnic->netstats.rx_bytes += rx_pkt_size; */
-                       }
-
-                       cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
-                       /* Accounting... */
-                       cqe_count++;
-                       if (++head >= cq_size)
-                               head = 0;
-                       if (cqe_count == 255) {
-                               /* Replenish Nic CQ */
-                               nes_write32(nesdev->regs+NES_CQE_ALLOC,
-                                               cq->cq_number | (cqe_count << 16));
-                               /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
-                               nesdev->currcq_count += cqe_count;
-                               cqe_count = 0;
-                       }
-
-                       if (cq->rx_cqes_completed >= nesvnic->budget)
-                               break;
-               } else {
-                       cq->cqes_pending = 0;
-                       break;
-               }
-
-       } while (1);
-
-       if (sq_cqes) {
-               barrier();
-               /* restart the queue if it had been stopped */
-               if (netif_queue_stopped(nesvnic->netdev))
-                       netif_wake_queue(nesvnic->netdev);
-       }
-       cq->cq_head = head;
-       /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n",
-                       cq->cq_number, cqe_count, cq->cq_head); */
-       cq->cqe_allocs_pending = cqe_count;
-       if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
-               /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
-               nesdev->currcq_count += cqe_count;
-               nes_nic_tune_timer(nesdev);
-       }
-       if (atomic_read(&nesvnic->rx_skbs_needed))
-               nes_replenish_nic_rq(nesvnic);
-}
-
-
-
-/**
- * nes_cqp_ce_handler - process completions from the Control QP's CQ
- * @nesdev: the device that owns the CQP
- * @cq: the CQP's completion queue
- */
-static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
-{
-       u64 u64temp;
-       unsigned long flags;
-       struct nes_hw_cqp *cqp = NULL;
-       struct nes_cqp_request *cqp_request;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       u32 head;
-       u32 cq_size;
-       u32 cqe_count = 0;
-       u32 error_code;
-       u32 opcode;
-       u32 ctx_index;
-       /* u32 counter; */
-
-       head = cq->cq_head;
-       cq_size = cq->cq_size;
-
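-       /*
-        * Drain valid CQEs from the CQP's CQ.  Each completion carries the
-        * originating nes_cqp_request in its context words; waiting callers
-        * are woken and callback-style requests have their callback run.
-        */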
-       do {
-               /* process the CQE */
-               /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head,
-                         le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */
-
-               opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]);
-               if (opcode & NES_CQE_VALID) {
-                       cqp = &nesdev->cqp;
-
-                       error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]);
-                       if (error_code) {
-                               nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP,"
-                                               " Major/Minor codes = 0x%04X:%04X.\n",
-                                               le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f,
-                                               (u16)(error_code >> 16),
-                                               (u16)error_code);
-                       }
-
-                       u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head].
-                                       cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) |
-                                       ((u64)(le32_to_cpu(cq->cq_vbase[head].
-                                       cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX])));
-
-                       cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp;
-                       if (cqp_request) {
-                               if (cqp_request->waiting) {
-                                       /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */
-                                       cqp_request->major_code = (u16)(error_code >> 16);
-                                       cqp_request->minor_code = (u16)error_code;
-                                       barrier();
-                                       cqp_request->request_done = 1;
-                                       wake_up(&cqp_request->waitq);
-                                       nes_put_cqp_request(nesdev, cqp_request);
-                               } else {
-                                       if (cqp_request->callback)
-                                               cqp_request->cqp_callback(nesdev, cqp_request);
-                                       nes_free_cqp_request(nesdev, cqp_request);
-                               }
-                       } else {
-                               wake_up(&nesdev->cqp.waitq);
-                       }
-
-                       cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
-                       nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16));
-                       if (++cqp->sq_tail >= cqp->sq_size)
-                               cqp->sq_tail = 0;
-
-                       /* Accounting... */
-                       cqe_count++;
-                       if (++head >= cq_size)
-                               head = 0;
-               } else {
-                       break;
-               }
-       } while (1);
-       cq->cq_head = head;
-
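-       /*
-        * With space freed on the CQP SQ, move any queued requests from
-        * cqp_pending_reqs onto the SQ and ring the doorbell for each one.
-        */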
-       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-       while ((!list_empty(&nesdev->cqp_pending_reqs)) &&
-                       ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) &
-                       (nesdev->cqp.sq_size - 1)) != 1)) {
-               cqp_request = list_entry(nesdev->cqp_pending_reqs.next,
-                               struct nes_cqp_request, list);
-               list_del_init(&cqp_request->list);
-               head = nesdev->cqp.sq_head++;
-               nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-               cqp_wqe = &nesdev->cqp.sq_vbase[head];
-               memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
-               barrier();
-
-               opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
-               if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
-                       ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
-               else
-                       ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
-               cqp_wqe->wqe_words[ctx_index] =
-                       cpu_to_le32((u32)((unsigned long)cqp_request));
-               cqp_wqe->wqe_words[ctx_index + 1] =
-                       cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request)));
-               nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n",
-                               cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head);
-               /* Ring doorbell (1 WQEs) */
-               barrier();
-               nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
-       }
-       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-       /* Arm the CCQ */
-       nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                       cq->cq_number);
-       nes_read32(nesdev->regs+NES_CQE_ALLOC);
-}
-
-static u8 *locate_mpa(u8 *pkt, u32 aeq_info)
-{
-       if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) {
-               /* skip over ethernet header */
-               pkt += ETH_HLEN;
-
-               /* Skip over IP and TCP headers */
-               pkt += 4 * (pkt[0] & 0x0f);
-               pkt += 4 * ((pkt[12] >> 4) & 0x0f);
-       }
-       return pkt;
-}
-
-/* Determine if incoming error pkt is rdma layer */
-static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info)
-{
-       u8 *pkt;
-       u16 *mpa;
-       u32 opcode = 0xffffffff;
-
-       if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-               pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-               mpa = (u16 *)locate_mpa(pkt, aeq_info);
-               opcode = be16_to_cpu(mpa[1]) & 0xf;
-       }
-
-       return opcode;
-}
-
-/* Build iWARP terminate header */
-static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info)
-{
-       u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-       u16 ddp_seg_len;
-       int copy_len = 0;
-       u8 is_tagged = 0;
-       u8 flush_code = 0;
-       struct nes_terminate_hdr *termhdr;
-
-       termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase;
-       memset(termhdr, 0, 64);
-
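-       /*
-        * If the offending packet was captured in Q2, inspect its DDP segment
-        * to decide how much of the DDP/RDMA header to echo back in the
-        * terminate message.
-        */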
-       if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-
-               /* Use data from offending packet to fill in ddp & rdma hdrs */
-               pkt = locate_mpa(pkt, aeq_info);
-               ddp_seg_len = be16_to_cpu(*(u16 *)pkt);
-               if (ddp_seg_len) {
-                       copy_len = 2;
-                       termhdr->hdrct = DDP_LEN_FLAG;
-                       if (pkt[2] & 0x80) {
-                               is_tagged = 1;
-                               if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) {
-                                       copy_len += TERM_DDP_LEN_TAGGED;
-                                       termhdr->hdrct |= DDP_HDR_FLAG;
-                               }
-                       } else {
-                               if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) {
-                                       copy_len += TERM_DDP_LEN_UNTAGGED;
-                                       termhdr->hdrct |= DDP_HDR_FLAG;
-                               }
-
-                               if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) {
-                                       if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) {
-                                               copy_len += TERM_RDMA_LEN;
-                                               termhdr->hdrct |= RDMA_HDR_FLAG;
-                                       }
-                               }
-                       }
-               }
-       }
-
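-       /*
-        * Map the asynchronous event ID (and, where relevant, the iWARP
-        * opcode of the offending packet) to an IB flush code and to the
-        * terminate header's layer/etype and error code.
-        */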
-       switch (async_event_id) {
-       case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
-               switch (iwarp_opcode(nesqp, aeq_info)) {
-               case IWARP_OPCODE_WRITE:
-                       flush_code = IB_WC_LOC_PROT_ERR;
-                       termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-                       termhdr->error_code = DDP_TAGGED_INV_STAG;
-                       break;
-               default:
-                       flush_code = IB_WC_REM_ACCESS_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-                       termhdr->error_code = RDMAP_INV_STAG;
-               }
-               break;
-       case NES_AEQE_AEID_AMP_INVALID_STAG:
-               flush_code = IB_WC_REM_ACCESS_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-               termhdr->error_code = RDMAP_INV_STAG;
-               break;
-       case NES_AEQE_AEID_AMP_BAD_QP:
-               flush_code = IB_WC_LOC_QP_OP_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_QN;
-               break;
-       case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
-       case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
-               switch (iwarp_opcode(nesqp, aeq_info)) {
-               case IWARP_OPCODE_SEND_INV:
-               case IWARP_OPCODE_SEND_SE_INV:
-                       flush_code = IB_WC_REM_OP_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-                       termhdr->error_code = RDMAP_CANT_INV_STAG;
-                       break;
-               default:
-                       flush_code = IB_WC_REM_ACCESS_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-                       termhdr->error_code = RDMAP_INV_STAG;
-               }
-               break;
-       case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
-               if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) {
-                       flush_code = IB_WC_LOC_PROT_ERR;
-                       termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-                       termhdr->error_code = DDP_TAGGED_BOUNDS;
-               } else {
-                       flush_code = IB_WC_REM_ACCESS_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-                       termhdr->error_code = RDMAP_INV_BOUNDS;
-               }
-               break;
-       case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
-       case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
-       case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
-               flush_code = IB_WC_REM_ACCESS_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-               termhdr->error_code = RDMAP_ACCESS;
-               break;
-       case NES_AEQE_AEID_AMP_TO_WRAP:
-               flush_code = IB_WC_REM_ACCESS_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-               termhdr->error_code = RDMAP_TO_WRAP;
-               break;
-       case NES_AEQE_AEID_AMP_BAD_PD:
-               switch (iwarp_opcode(nesqp, aeq_info)) {
-               case IWARP_OPCODE_WRITE:
-                       flush_code = IB_WC_LOC_PROT_ERR;
-                       termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-                       termhdr->error_code = DDP_TAGGED_UNASSOC_STAG;
-                       break;
-               case IWARP_OPCODE_SEND_INV:
-               case IWARP_OPCODE_SEND_SE_INV:
-                       flush_code = IB_WC_REM_ACCESS_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-                       termhdr->error_code = RDMAP_CANT_INV_STAG;
-                       break;
-               default:
-                       flush_code = IB_WC_REM_ACCESS_ERR;
-                       termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
-                       termhdr->error_code = RDMAP_UNASSOC_STAG;
-               }
-               break;
-       case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
-               flush_code = IB_WC_LOC_LEN_ERR;
-               termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
-               termhdr->error_code = MPA_MARKER;
-               break;
-       case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
-               flush_code = IB_WC_GENERAL_ERR;
-               termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
-               termhdr->error_code = MPA_CRC;
-               break;
-       case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
-       case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
-               flush_code = IB_WC_LOC_LEN_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
-               termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
-               break;
-       case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
-       case NES_AEQE_AEID_DDP_NO_L_BIT:
-               flush_code = IB_WC_FATAL_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
-               termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
-               break;
-       case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
-       case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
-               flush_code = IB_WC_GENERAL_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE;
-               break;
-       case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
-               flush_code = IB_WC_LOC_LEN_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG;
-               break;
-       case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
-               flush_code = IB_WC_GENERAL_ERR;
-               if (is_tagged) {
-                       termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
-                       termhdr->error_code = DDP_TAGGED_INV_DDP_VER;
-               } else {
-                       termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-                       termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER;
-               }
-               break;
-       case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
-               flush_code = IB_WC_GENERAL_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_MO;
-               break;
-       case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
-               flush_code = IB_WC_REM_OP_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF;
-               break;
-       case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-               flush_code = IB_WC_GENERAL_ERR;
-               termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
-               termhdr->error_code = DDP_UNTAGGED_INV_QN;
-               break;
-       case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
-               flush_code = IB_WC_GENERAL_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-               termhdr->error_code = RDMAP_INV_RDMAP_VER;
-               break;
-       case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
-               flush_code = IB_WC_LOC_QP_OP_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-               termhdr->error_code = RDMAP_UNEXPECTED_OP;
-               break;
-       default:
-               flush_code = IB_WC_FATAL_ERR;
-               termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
-               termhdr->error_code = RDMAP_UNSPECIFIED;
-               break;
-       }
-
-       if (copy_len)
-               memcpy(termhdr + 1, pkt, copy_len);
-
-       if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) {
-               if (aeq_info & NES_AEQE_SQ)
-                       nesqp->term_sq_flush_code = flush_code;
-               else
-                       nesqp->term_rq_flush_code = flush_code;
-       }
-
-       return sizeof(struct nes_terminate_hdr) + copy_len;
-}
-
-static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp,
-                struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype)
-{
-       u64 context;
-       unsigned long flags;
-       u32 aeq_info;
-       u16 async_event_id;
-       u8 tcp_state;
-       u8 iwarp_state;
-       u32 termlen = 0;
-       u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE |
-                          NES_CQP_QP_TERM_DONT_SEND_FIN;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       if (nesqp->term_flags & NES_TERM_SENT)
-               return; /* Sanity check */
-
-       aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-       tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-       iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-       async_event_id = (u16)aeq_info;
-
-       context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
-               aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
-       if (!context) {
-               WARN_ON(!context);
-               return;
-       }
-
-       nesqp = (struct nes_qp *)(unsigned long)context;
-       spin_lock_irqsave(&nesqp->lock, flags);
-       nesqp->hw_iwarp_state = iwarp_state;
-       nesqp->hw_tcp_state = tcp_state;
-       nesqp->last_aeq = async_event_id;
-       nesqp->terminate_eventtype = eventtype;
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-       if (nesadapter->send_term_ok)
-               termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info);
-       else
-               mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG;
-
-       if (!nesdev->iw_status)  {
-               nesqp->term_flags = NES_TERM_DONE;
-               nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0);
-               nes_cm_disconn(nesqp);
-       } else {
-               nes_terminate_start_timer(nesqp);
-               nesqp->term_flags |= NES_TERM_SENT;
-               nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0);
-       }
-}
-
-static void nes_terminate_send_fin(struct nes_device *nesdev,
-                         struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
-{
-       u32 aeq_info;
-       u16 async_event_id;
-       u8 tcp_state;
-       u8 iwarp_state;
-       unsigned long flags;
-
-       aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-       tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-       iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-       async_event_id = (u16)aeq_info;
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-       nesqp->hw_iwarp_state = iwarp_state;
-       nesqp->hw_tcp_state = tcp_state;
-       nesqp->last_aeq = async_event_id;
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-       /* Send the fin only */
-       nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE |
-               NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0);
-}
-
-/* Cleanup after a terminate sent or received */
-static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred)
-{
-       u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-       unsigned long flags;
-       struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u8 first_time = 0;
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-       if (nesqp->hte_added) {
-               nesqp->hte_added = 0;
-               next_iwarp_state |= NES_CQP_QP_DEL_HTE;
-       }
-
-       first_time = (nesqp->term_flags & NES_TERM_DONE) == 0;
-       nesqp->term_flags |= NES_TERM_DONE;
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-       /* Make sure we go through this only once */
-       if (first_time) {
-               if (timeout_occurred == 0)
-                       del_timer(&nesqp->terminate_timer);
-               else
-                       next_iwarp_state |= NES_CQP_QP_RESET;
-
-               nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-               nes_cm_disconn(nesqp);
-       }
-}
-
-static void nes_terminate_received(struct nes_device *nesdev,
-                               struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
-{
-       u32 aeq_info;
-       u8 *pkt;
-       u32 *mpa;
-       u8 ddp_ctl;
-       u8 rdma_ctl;
-       u16 aeq_id = 0;
-
-       aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-       if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
-               /* Terminate is not a performance path so the silicon */
-               /* did not validate the frame - do it now */
-               pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
-               mpa = (u32 *)locate_mpa(pkt, aeq_info);
-               ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff;
-               rdma_ctl = be32_to_cpu(mpa[0]) & 0xff;
-               if ((ddp_ctl & 0xc0) != 0x40)
-                       aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC;
-               else if ((ddp_ctl & 0x03) != 1)
-                       aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION;
-               else if (be32_to_cpu(mpa[2]) != 2)
-                       aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN;
-               else if (be32_to_cpu(mpa[3]) != 1)
-                       aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN;
-               else if (be32_to_cpu(mpa[4]) != 0)
-                       aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO;
-               else if ((rdma_ctl & 0xc0) != 0x40)
-                       aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION;
-
-               if (aeq_id) {
-                       /* Bad terminate recvd - send back a terminate */
-                       aeq_info = (aeq_info & 0xffff0000) | aeq_id;
-                       aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
-                       nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-                       return;
-               }
-       }
-
-       nesqp->term_flags |= NES_TERM_RCVD;
-       nesqp->terminate_eventtype = IB_EVENT_QP_FATAL;
-       nes_terminate_start_timer(nesqp);
-       nes_terminate_send_fin(nesdev, nesqp, aeqe);
-}
-
-/* Timeout routine in case terminate fails to complete */
-void nes_terminate_timeout(struct timer_list *t)
-{
-       struct nes_qp *nesqp = from_timer(nesqp, t, terminate_timer);
-
-       nes_terminate_done(nesqp, 1);
-}
-
-/* Set a timer in case hw cannot complete the terminate sequence */
-static void nes_terminate_start_timer(struct nes_qp *nesqp)
-{
-       mod_timer(&nesqp->terminate_timer, (jiffies + HZ));
-}
-
-/**
- * nes_process_iwarp_aeqe
- */
-static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
-                                  struct nes_hw_aeqe *aeqe)
-{
-       u64 context;
-       unsigned long flags;
-       struct nes_qp *nesqp;
-       struct nes_hw_cq *hw_cq;
-       struct nes_cq *nescq;
-       int resource_allocated;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 aeq_info;
-       u32 next_iwarp_state = 0;
-       u32 aeqe_cq_id;
-       u16 async_event_id;
-       u8 tcp_state;
-       u8 iwarp_state;
-       struct ib_event ibevent;
-
-       nes_debug(NES_DBG_AEQ, "\n");
-       aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
-       if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) {
-               context  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
-               context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
-       } else {
-               context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
-                                               aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
-               BUG_ON(!context);
-       }
-
-       /* context is nesqp unless async_event_id == CQ ERROR */
-       nesqp = (struct nes_qp *)(unsigned long)context;
-       async_event_id = (u16)aeq_info;
-       tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
-       iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
-       nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p,"
-                       " Tcp state = %s, iWARP state = %s\n",
-                       async_event_id,
-                       le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe,
-                       nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]);
-
-       aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
-       if (aeq_info & NES_AEQE_QP) {
-               if (!nes_is_resource_allocated(nesadapter,
-                               nesadapter->allocated_qps,
-                               aeqe_cq_id))
-                       return;
-       }
-
-       switch (async_event_id) {
-               case NES_AEQE_AEID_LLP_FIN_RECEIVED:
-                       if (nesqp->term_flags)
-                               return; /* Ignore it, wait for close complete */
-
-                       if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
-                               if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) &&
-                                       (nesqp->ibqp_state == IB_QPS_RTS)) {
-                                       spin_lock_irqsave(&nesqp->lock, flags);
-                                       nesqp->hw_iwarp_state = iwarp_state;
-                                       nesqp->hw_tcp_state = tcp_state;
-                                       nesqp->last_aeq = async_event_id;
-                                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-                                       nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-                                       spin_unlock_irqrestore(&nesqp->lock, flags);
-                                       nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-                                       nes_cm_disconn(nesqp);
-                               }
-                               nesqp->cm_id->add_ref(nesqp->cm_id);
-                               schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp,
-                                               NES_TIMER_TYPE_CLOSE, 1, 0);
-                               nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d),"
-                                               " need ae to finish up, original_last_aeq = 0x%04X."
-                                               " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n",
-                                               nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                                               async_event_id, nesqp->last_aeq, tcp_state);
-                       }
-                       break;
-               case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
-                       spin_lock_irqsave(&nesqp->lock, flags);
-                       nesqp->hw_iwarp_state = iwarp_state;
-                       nesqp->hw_tcp_state = tcp_state;
-                       nesqp->last_aeq = async_event_id;
-                       spin_unlock_irqrestore(&nesqp->lock, flags);
-                       nes_cm_disconn(nesqp);
-                       break;
-
-               case NES_AEQE_AEID_RESET_SENT:
-                       tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-                       spin_lock_irqsave(&nesqp->lock, flags);
-                       nesqp->hw_iwarp_state = iwarp_state;
-                       nesqp->hw_tcp_state = tcp_state;
-                       nesqp->last_aeq = async_event_id;
-                       nesqp->hte_added = 0;
-                       spin_unlock_irqrestore(&nesqp->lock, flags);
-                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
-                       nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-                       nes_cm_disconn(nesqp);
-                       break;
-
-               case NES_AEQE_AEID_LLP_CONNECTION_RESET:
-                       if (atomic_read(&nesqp->close_timer_started))
-                               return;
-                       spin_lock_irqsave(&nesqp->lock, flags);
-                       nesqp->hw_iwarp_state = iwarp_state;
-                       nesqp->hw_tcp_state = tcp_state;
-                       nesqp->last_aeq = async_event_id;
-                       spin_unlock_irqrestore(&nesqp->lock, flags);
-                       nes_cm_disconn(nesqp);
-                       break;
-
-               case NES_AEQE_AEID_TERMINATE_SENT:
-                       nes_terminate_send_fin(nesdev, nesqp, aeqe);
-                       break;
-
-               case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
-                       nes_terminate_received(nesdev, nesqp, aeqe);
-                       break;
-
-               case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
-               case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
-               case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
-               case NES_AEQE_AEID_AMP_INVALID_STAG:
-               case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
-               case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
-               case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
-               case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
-               case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
-               case NES_AEQE_AEID_AMP_TO_WRAP:
-                       printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
-                                       nesqp->hwqp.qp_id, async_event_id);
-                       nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
-                       break;
-
-               case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
-               case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
-               case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
-               case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-                       if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
-                               aeq_info &= 0xffff0000;
-                               aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
-                               aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
-                       }
-                       /* fall through */
-               case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE:
-               case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES:
-               case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
-               case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
-               case NES_AEQE_AEID_AMP_BAD_QP:
-               case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
-               case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
-               case NES_AEQE_AEID_DDP_NO_L_BIT:
-               case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
-               case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
-               case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
-               case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
-               case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
-               case NES_AEQE_AEID_AMP_BAD_PD:
-               case NES_AEQE_AEID_AMP_FASTREG_SHARED:
-               case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG:
-               case NES_AEQE_AEID_AMP_FASTREG_MW_STAG:
-               case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS:
-               case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW:
-               case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH:
-               case NES_AEQE_AEID_AMP_INVALIDATE_SHARED:
-               case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS:
-               case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG:
-               case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG:
-               case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG:
-               case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG:
-               case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS:
-               case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS:
-               case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT:
-               case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED:
-               case NES_AEQE_AEID_BAD_CLOSE:
-               case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO:
-               case NES_AEQE_AEID_STAG_ZERO_INVALID:
-               case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
-               case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-                       printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
-                                       nesqp->hwqp.qp_id, async_event_id);
-                       print_ip(nesqp->cm_node);
-                       if (!atomic_read(&nesqp->close_timer_started))
-                               nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-                       break;
-
-               case NES_AEQE_AEID_CQ_OPERATION_ERROR:
-                       context <<= 1;
-                       nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n",
-                                       le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context);
-                       resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs,
-                                       le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
-                       if (resource_allocated) {
-                               printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n",
-                                               __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
-                               hw_cq = (struct nes_hw_cq *)(unsigned long)context;
-                               if (hw_cq) {
-                                       nescq = container_of(hw_cq, struct nes_cq, hw_cq);
-                                       if (nescq->ibcq.event_handler) {
-                                               ibevent.device = nescq->ibcq.device;
-                                               ibevent.event = IB_EVENT_CQ_ERR;
-                                               ibevent.element.cq = &nescq->ibcq;
-                                               nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context);
-                                       }
-                               }
-                       }
-                       break;
-
-               default:
-                       nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n",
-                                       async_event_id);
-                       break;
-       }
-
-}
-
-/**
- * nes_iwarp_ce_handler
- */
-void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq)
-{
-       struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq);
-
-       /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n",
-                       nescq->hw_cq.cq_number); */
-       nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number);
-
-       if (nescq->ibcq.comp_handler)
-               nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context);
-
-       return;
-}
-
-
-/**
- * nes_manage_apbvt()
- */
-int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
-               u32 nic_index, u32 add_port)
-{
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       int ret = 0;
-       u16 major_code;
-
-       /* Send manage APBVT request to CQP */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n",
-                       (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL",
-                       accel_local_port, accel_local_port, nic_index);
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT |
-                       ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0)));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-                       ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port));
-
-       nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       if (add_port == NES_MANAGE_APBVT_ADD)
-               ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-                               NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_QP, "Completed, ret=%u,  CQP Major:Minor codes = 0x%04X:0x%04X\n",
-                       ret, cqp_request->major_code, cqp_request->minor_code);
-       major_code = cqp_request->major_code;
-
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if (!ret)
-               return -ETIME;
-       else if (major_code)
-               return -EIO;
-       else
-               return 0;
-}
-
-
-/**
- * nes_manage_arp_cache
- */
-void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
-               u32 ip_addr, u32 action)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev;
-       struct nes_cqp_request *cqp_request;
-       int arp_index;
-
-       nesdev = nesvnic->nesdev;
-       arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action);
-       if (arp_index == -1) {
-               return;
-       }
-
-       /* update the ARP entry */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n");
-               return;
-       }
-       cqp_request->waiting = 0;
-       cqp_wqe = &cqp_request->cqp_wqe;
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-                       NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM);
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(
-                       (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT);
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index);
-
-       if (action == NES_ARP_ADD) {
-               cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID);
-               cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32(
-                               (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) |
-                               (((u32)mac_addr[4]) << 8)  | (u32)mac_addr[5]);
-               cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32(
-                               (((u32)mac_addr[0]) << 8) | (u32)mac_addr[1]);
-       } else {
-               cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0;
-               cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0;
-       }
-
-       nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n",
-                       nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-
-       atomic_set(&cqp_request->refcount, 1);
-       nes_post_cqp_request(nesdev, cqp_request);
-}
-
-
-/**
- * flush_wqes
- */
-void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
-               u32 which_wq, u32 wait_completion)
-{
-       struct nes_cqp_request *cqp_request;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
-       u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
-       int ret;
-
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
-               return;
-       }
-       if (wait_completion) {
-               cqp_request->waiting = 1;
-               atomic_set(&cqp_request->refcount, 2);
-       } else {
-               cqp_request->waiting = 0;
-       }
-       cqp_wqe = &cqp_request->cqp_wqe;
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-       /* If wqe in error was identified, set code to be put into cqe */
-       if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) {
-               which_wq |= NES_CQP_FLUSH_MAJ_MIN;
-               sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code;
-               nesqp->term_sq_flush_code = 0;
-       }
-
-       if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) {
-               which_wq |= NES_CQP_FLUSH_MAJ_MIN;
-               rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code;
-               nesqp->term_rq_flush_code = 0;
-       }
-
-       if (which_wq & NES_CQP_FLUSH_MAJ_MIN) {
-               cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code);
-               cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code);
-       }
-
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
-                       cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
-       cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
-
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       if (wait_completion) {
-               /* Wait for CQP */
-               ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-                               NES_EVENT_TIMEOUT);
-               nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
-                               " CQP Major:Minor codes = 0x%04X:0x%04X\n",
-                               ret, cqp_request->major_code, cqp_request->minor_code);
-               nes_put_cqp_request(nesdev, cqp_request);
-       }
-}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
deleted file mode 100644
index 3c56470..0000000
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ /dev/null
@@ -1,1380 +0,0 @@
-/*
-* Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenIB.org BSD license below:
-*
-*     Redistribution and use in source and binary forms, with or
-*     without modification, are permitted provided that the following
-*     conditions are met:
-*
-*      - Redistributions of source code must retain the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer.
-*
-*      - Redistributions in binary form must reproduce the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer in the documentation and/or other materials
-*        provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef __NES_HW_H
-#define __NES_HW_H
-
-#define NES_PHY_TYPE_CX4       1
-#define NES_PHY_TYPE_1G        2
-#define NES_PHY_TYPE_ARGUS     4
-#define NES_PHY_TYPE_PUMA_1G   5
-#define NES_PHY_TYPE_PUMA_10G  6
-#define NES_PHY_TYPE_GLADIUS   7
-#define NES_PHY_TYPE_SFP_D     8
-#define NES_PHY_TYPE_KR               9
-
-#define NES_MULTICAST_PF_MAX 8
-#define NES_A0 3
-
-#define NES_ENABLE_PAU 0x07000001
-#define NES_DISABLE_PAU 0x07000000
-#define NES_PAU_COUNTER 10
-#define NES_CQP_OPCODE_MASK 0x3f
-
-enum pci_regs {
-       NES_INT_STAT = 0x0000,
-       NES_INT_MASK = 0x0004,
-       NES_INT_PENDING = 0x0008,
-       NES_INTF_INT_STAT = 0x000C,
-       NES_INTF_INT_MASK = 0x0010,
-       NES_TIMER_STAT = 0x0014,
-       NES_PERIODIC_CONTROL = 0x0018,
-       NES_ONE_SHOT_CONTROL = 0x001C,
-       NES_EEPROM_COMMAND = 0x0020,
-       NES_EEPROM_DATA = 0x0024,
-       NES_FLASH_COMMAND = 0x0028,
-       NES_FLASH_DATA  = 0x002C,
-       NES_SOFTWARE_RESET = 0x0030,
-       NES_CQ_ACK = 0x0034,
-       NES_WQE_ALLOC = 0x0040,
-       NES_CQE_ALLOC = 0x0044,
-       NES_AEQ_ALLOC = 0x0048
-};
-
-enum indexed_regs {
-       NES_IDX_CREATE_CQP_LOW = 0x0000,
-       NES_IDX_CREATE_CQP_HIGH = 0x0004,
-       NES_IDX_QP_CONTROL = 0x0040,
-       NES_IDX_FLM_CONTROL = 0x0080,
-       NES_IDX_INT_CPU_STATUS = 0x00a0,
-       NES_IDX_GPR_TRIGGER = 0x00bc,
-       NES_IDX_GPIO_CONTROL = 0x00f0,
-       NES_IDX_GPIO_DATA = 0x00f4,
-       NES_IDX_GPR2 = 0x010c,
-       NES_IDX_TCP_CONFIG0 = 0x01e4,
-       NES_IDX_TCP_TIMER_CONFIG = 0x01ec,
-       NES_IDX_TCP_NOW = 0x01f0,
-       NES_IDX_QP_MAX_CFG_SIZES = 0x0200,
-       NES_IDX_QP_CTX_SIZE = 0x0218,
-       NES_IDX_TCP_TIMER_SIZE0 = 0x0238,
-       NES_IDX_TCP_TIMER_SIZE1 = 0x0240,
-       NES_IDX_ARP_CACHE_SIZE = 0x0258,
-       NES_IDX_CQ_CTX_SIZE = 0x0260,
-       NES_IDX_MRT_SIZE = 0x0278,
-       NES_IDX_PBL_REGION_SIZE = 0x0280,
-       NES_IDX_IRRQ_COUNT = 0x02b0,
-       NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0,
-       NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300,
-       NES_IDX_DST_IP_ADDR = 0x0400,
-       NES_IDX_PCIX_DIAG = 0x08e8,
-       NES_IDX_MPP_DEBUG = 0x0a00,
-       NES_IDX_PORT_RX_DISCARDS = 0x0a30,
-       NES_IDX_PORT_TX_DISCARDS = 0x0a34,
-       NES_IDX_MPP_LB_DEBUG = 0x0b00,
-       NES_IDX_DENALI_CTL_22 = 0x1058,
-       NES_IDX_MAC_TX_CONTROL = 0x2000,
-       NES_IDX_MAC_TX_CONFIG = 0x2004,
-       NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008,
-       NES_IDX_MAC_RX_CONTROL = 0x200c,
-       NES_IDX_MAC_RX_CONFIG = 0x2010,
-       NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c,
-       NES_IDX_MAC_MDIO_CONTROL = 0x2084,
-       NES_IDX_MAC_TX_OCTETS_LOW = 0x2100,
-       NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104,
-       NES_IDX_MAC_TX_FRAMES_LOW = 0x2108,
-       NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c,
-       NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118,
-       NES_IDX_MAC_TX_ERRORS = 0x2138,
-       NES_IDX_MAC_RX_OCTETS_LOW = 0x213c,
-       NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140,
-       NES_IDX_MAC_RX_FRAMES_LOW = 0x2144,
-       NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148,
-       NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c,
-       NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150,
-       NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154,
-       NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174,
-       NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178,
-       NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c,
-       NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180,
-       NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184,
-       NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188,
-       NES_IDX_MAC_INT_STATUS = 0x21f0,
-       NES_IDX_MAC_INT_MASK = 0x21f4,
-       NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800,
-       NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00,
-       NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808,
-       NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08,
-       NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c,
-       NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c,
-       NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810,
-       NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10,
-       NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814,
-       NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14,
-       NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818,
-       NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18,
-       NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c,
-       NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c,
-       NES_IDX_ETH_SERDES_BYPASS0 = 0x2820,
-       NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20,
-       NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824,
-       NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24,
-       NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828,
-       NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28,
-       NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c,
-       NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c,
-       NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830,
-       NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30,
-       NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834,
-       NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34,
-       NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838,
-       NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38,
-       NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080,
-       NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000,
-       NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004,
-       NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008,
-       NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c,
-       NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000,
-       NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004,
-       NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008,
-       NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c,
-       NES_IDX_WQM_CONFIG0 = 0x5000,
-       NES_IDX_WQM_CONFIG1 = 0x5004,
-       NES_IDX_CM_CONFIG = 0x5100,
-       NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000,
-       NES_IDX_NIC_PHYPORT_TO_USW = 0x6008,
-       NES_IDX_NIC_ACTIVE = 0x6010,
-       NES_IDX_NIC_UNICAST_ALL = 0x6018,
-       NES_IDX_NIC_MULTICAST_ALL = 0x6020,
-       NES_IDX_NIC_MULTICAST_ENABLE = 0x6028,
-       NES_IDX_NIC_BROADCAST_ON = 0x6030,
-       NES_IDX_USED_CHUNKS_TX = 0x60b0,
-       NES_IDX_TX_POOL_SIZE = 0x60b8,
-       NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148,
-       NES_IDX_PERFECT_FILTER_LOW = 0x6200,
-       NES_IDX_PERFECT_FILTER_HIGH = 0x6204,
-       NES_IDX_IPV4_TCP_REXMITS = 0x7080,
-       NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c,
-       NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140,
-       NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144,
-       NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148,
-       NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c,
-       NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150,
-       NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154,
-};
-
-#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE   1
-#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17)
-
-enum nes_cqp_opcodes {
-       NES_CQP_CREATE_QP = 0x00,
-       NES_CQP_MODIFY_QP = 0x01,
-       NES_CQP_DESTROY_QP = 0x02,
-       NES_CQP_CREATE_CQ = 0x03,
-       NES_CQP_MODIFY_CQ = 0x04,
-       NES_CQP_DESTROY_CQ = 0x05,
-       NES_CQP_ALLOCATE_STAG = 0x09,
-       NES_CQP_REGISTER_STAG = 0x0a,
-       NES_CQP_QUERY_STAG = 0x0b,
-       NES_CQP_REGISTER_SHARED_STAG = 0x0c,
-       NES_CQP_DEALLOCATE_STAG = 0x0d,
-       NES_CQP_MANAGE_ARP_CACHE = 0x0f,
-       NES_CQP_DOWNLOAD_SEGMENT = 0x10,
-       NES_CQP_SUSPEND_QPS = 0x11,
-       NES_CQP_UPLOAD_CONTEXT = 0x13,
-       NES_CQP_CREATE_CEQ = 0x16,
-       NES_CQP_DESTROY_CEQ = 0x18,
-       NES_CQP_CREATE_AEQ = 0x19,
-       NES_CQP_DESTROY_AEQ = 0x1b,
-       NES_CQP_LMI_ACCESS = 0x20,
-       NES_CQP_FLUSH_WQES = 0x22,
-       NES_CQP_MANAGE_APBVT = 0x23,
-       NES_CQP_MANAGE_QUAD_HASH = 0x25
-};
-
-enum nes_cqp_wqe_word_idx {
-       NES_CQP_WQE_OPCODE_IDX = 0,
-       NES_CQP_WQE_ID_IDX = 1,
-       NES_CQP_WQE_COMP_CTX_LOW_IDX = 2,
-       NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3,
-       NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4,
-       NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-};
-
-enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */
-       NES_CQP_WQE_DL_OPCODE_IDX = 0,
-       NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1,
-       NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2,
-       NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3
-       /* For index values 4-15 use NES_NIC_SQ_WQE_ values */
-};
-
-enum nes_cqp_cq_wqeword_idx {
-       NES_CQP_CQ_WQE_PBL_LOW_IDX = 6,
-       NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7,
-       NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8,
-       NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9,
-       NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10,
-};
-
-enum nes_cqp_stag_wqeword_idx {
-       NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1,
-       NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6,
-       NES_CQP_STAG_WQE_LEN_LOW_IDX = 7,
-       NES_CQP_STAG_WQE_STAG_IDX = 8,
-       NES_CQP_STAG_WQE_VA_LOW_IDX = 10,
-       NES_CQP_STAG_WQE_VA_HIGH_IDX = 11,
-       NES_CQP_STAG_WQE_PA_LOW_IDX = 12,
-       NES_CQP_STAG_WQE_PA_HIGH_IDX = 13,
-       NES_CQP_STAG_WQE_PBL_LEN_IDX = 14
-};
-
-#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26
-#define NES_CQP_OP_IWARP_STATE_SHIFT 28
-#define NES_CQP_OP_TERMLEN_SHIFT     28
-
-enum nes_cqp_qp_bits {
-       NES_CQP_QP_ARP_VALID = (1<<8),
-       NES_CQP_QP_WINBUF_VALID = (1<<9),
-       NES_CQP_QP_CONTEXT_VALID = (1<<10),
-       NES_CQP_QP_ORD_VALID = (1<<11),
-       NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12),
-       NES_CQP_QP_VIRT_WQS = (1<<13),
-       NES_CQP_QP_DEL_HTE = (1<<14),
-       NES_CQP_QP_CQS_VALID = (1<<15),
-       NES_CQP_QP_TYPE_TSA = 0,
-       NES_CQP_QP_TYPE_IWARP = (1<<16),
-       NES_CQP_QP_TYPE_CQP = (4<<16),
-       NES_CQP_QP_TYPE_NIC = (5<<16),
-       NES_CQP_QP_MSS_CHG = (1<<20),
-       NES_CQP_QP_STATIC_RESOURCES = (1<<21),
-       NES_CQP_QP_IGNORE_MW_BOUND = (1<<22),
-       NES_CQP_QP_VWQ_USE_LMI = (1<<23),
-       NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT),
-       NES_CQP_QP_TERM_DONT_SEND_FIN = (1<<24),
-       NES_CQP_QP_TERM_DONT_SEND_TERM_MSG = (1<<25),
-       NES_CQP_QP_RESET = (1<<31),
-};
-
-enum nes_cqp_qp_wqe_word_idx {
-       NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6,
-       NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7,
-       NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8,
-       NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9,
-       NES_CQP_QP_WQE_NEW_MSS_IDX = 15,
-};
-
-enum nes_nic_ctx_bits {
-       NES_NIC_CTX_RQ_SIZE_32 = (3<<8),
-       NES_NIC_CTX_RQ_SIZE_512 = (3<<8),
-       NES_NIC_CTX_SQ_SIZE_32 = (1<<10),
-       NES_NIC_CTX_SQ_SIZE_512 = (3<<10),
-};
-
-enum nes_nic_qp_ctx_word_idx {
-       NES_NIC_CTX_MISC_IDX = 0,
-       NES_NIC_CTX_SQ_LOW_IDX = 2,
-       NES_NIC_CTX_SQ_HIGH_IDX = 3,
-       NES_NIC_CTX_RQ_LOW_IDX = 4,
-       NES_NIC_CTX_RQ_HIGH_IDX = 5,
-};
-
-enum nes_cqp_cq_bits {
-       NES_CQP_CQ_CEQE_MASK = (1<<9),
-       NES_CQP_CQ_CEQ_VALID = (1<<10),
-       NES_CQP_CQ_RESIZE = (1<<11),
-       NES_CQP_CQ_CHK_OVERFLOW = (1<<12),
-       NES_CQP_CQ_4KB_CHUNK = (1<<14),
-       NES_CQP_CQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_stag_bits {
-       NES_CQP_STAG_VA_TO = (1<<9),
-       NES_CQP_STAG_DEALLOC_PBLS = (1<<10),
-       NES_CQP_STAG_PBL_BLK_SIZE = (1<<11),
-       NES_CQP_STAG_MR = (1<<13),
-       NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16),
-       NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17),
-       NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18),
-       NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19),
-       NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20),
-       NES_CQP_STAG_REM_ACC_EN = (1<<21),
-       NES_CQP_STAG_LEAVE_PENDING = (1<<31),
-};
-
-enum nes_cqp_ceq_wqeword_idx {
-       NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1,
-       NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6,
-       NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7,
-};
-
-enum nes_cqp_ceq_bits {
-       NES_CQP_CEQ_4KB_CHUNK = (1<<14),
-       NES_CQP_CEQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_aeq_wqeword_idx {
-       NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1,
-       NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6,
-       NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7,
-};
-
-enum nes_cqp_aeq_bits {
-       NES_CQP_AEQ_4KB_CHUNK = (1<<14),
-       NES_CQP_AEQ_VIRT = (1<<15),
-};
-
-enum nes_cqp_lmi_wqeword_idx {
-       NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1,
-       NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8,
-       NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9,
-       NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10,
-};
-
-enum nes_cqp_arp_wqeword_idx {
-       NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6,
-       NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7,
-       NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1,
-};
-
-enum nes_cqp_upload_wqeword_idx {
-       NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6,
-       NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7,
-       NES_CQP_UPLOAD_WQE_HTE_IDX = 8,
-};
-
-enum nes_cqp_arp_bits {
-       NES_CQP_ARP_VALID = (1<<8),
-       NES_CQP_ARP_PERM = (1<<9),
-};
-
-enum nes_cqp_flush_bits {
-       NES_CQP_FLUSH_SQ = (1<<30),
-       NES_CQP_FLUSH_RQ = (1<<31),
-       NES_CQP_FLUSH_MAJ_MIN = (1<<28),
-};
-
-enum nes_cqe_opcode_bits {
-       NES_CQE_STAG_VALID = (1<<6),
-       NES_CQE_ERROR = (1<<7),
-       NES_CQE_SQ = (1<<8),
-       NES_CQE_SE = (1<<9),
-       NES_CQE_PSH = (1<<29),
-       NES_CQE_FIN = (1<<30),
-       NES_CQE_VALID = (1<<31),
-};
-
-
-enum nes_cqe_word_idx {
-       NES_CQE_PAYLOAD_LENGTH_IDX = 0,
-       NES_CQE_COMP_COMP_CTX_LOW_IDX = 2,
-       NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3,
-       NES_CQE_INV_STAG_IDX = 4,
-       NES_CQE_QP_ID_IDX = 5,
-       NES_CQE_ERROR_CODE_IDX = 6,
-       NES_CQE_OPCODE_IDX = 7,
-};
-
-enum nes_ceqe_word_idx {
-       NES_CEQE_CQ_CTX_LOW_IDX = 0,
-       NES_CEQE_CQ_CTX_HIGH_IDX = 1,
-};
-
-enum nes_ceqe_status_bit {
-       NES_CEQE_VALID = (1<<31),
-};
-
-enum nes_int_bits {
-       NES_INT_CEQ0 = (1<<0),
-       NES_INT_CEQ1 = (1<<1),
-       NES_INT_CEQ2 = (1<<2),
-       NES_INT_CEQ3 = (1<<3),
-       NES_INT_CEQ4 = (1<<4),
-       NES_INT_CEQ5 = (1<<5),
-       NES_INT_CEQ6 = (1<<6),
-       NES_INT_CEQ7 = (1<<7),
-       NES_INT_CEQ8 = (1<<8),
-       NES_INT_CEQ9 = (1<<9),
-       NES_INT_CEQ10 = (1<<10),
-       NES_INT_CEQ11 = (1<<11),
-       NES_INT_CEQ12 = (1<<12),
-       NES_INT_CEQ13 = (1<<13),
-       NES_INT_CEQ14 = (1<<14),
-       NES_INT_CEQ15 = (1<<15),
-       NES_INT_AEQ0 = (1<<16),
-       NES_INT_AEQ1 = (1<<17),
-       NES_INT_AEQ2 = (1<<18),
-       NES_INT_AEQ3 = (1<<19),
-       NES_INT_AEQ4 = (1<<20),
-       NES_INT_AEQ5 = (1<<21),
-       NES_INT_AEQ6 = (1<<22),
-       NES_INT_AEQ7 = (1<<23),
-       NES_INT_MAC0 = (1<<24),
-       NES_INT_MAC1 = (1<<25),
-       NES_INT_MAC2 = (1<<26),
-       NES_INT_MAC3 = (1<<27),
-       NES_INT_TSW = (1<<28),
-       NES_INT_TIMER = (1<<29),
-       NES_INT_INTF = (1<<30),
-};
-
-enum nes_intf_int_bits {
-       NES_INTF_INT_PCIERR = (1<<0),
-       NES_INTF_PERIODIC_TIMER = (1<<2),
-       NES_INTF_ONE_SHOT_TIMER = (1<<3),
-       NES_INTF_INT_CRITERR = (1<<14),
-       NES_INTF_INT_AEQ0_OFLOW = (1<<16),
-       NES_INTF_INT_AEQ1_OFLOW = (1<<17),
-       NES_INTF_INT_AEQ2_OFLOW = (1<<18),
-       NES_INTF_INT_AEQ3_OFLOW = (1<<19),
-       NES_INTF_INT_AEQ4_OFLOW = (1<<20),
-       NES_INTF_INT_AEQ5_OFLOW = (1<<21),
-       NES_INTF_INT_AEQ6_OFLOW = (1<<22),
-       NES_INTF_INT_AEQ7_OFLOW = (1<<23),
-       NES_INTF_INT_AEQ_OFLOW = (0xff<<16),
-};
-
-enum nes_mac_int_bits {
-       NES_MAC_INT_LINK_STAT_CHG = (1<<1),
-       NES_MAC_INT_XGMII_EXT = (1<<2),
-       NES_MAC_INT_TX_UNDERFLOW = (1<<6),
-       NES_MAC_INT_TX_ERROR = (1<<7),
-};
-
-enum nes_cqe_allocate_bits {
-       NES_CQE_ALLOC_INC_SELECT = (1<<28),
-       NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29),
-       NES_CQE_ALLOC_NOTIFY_SE = (1<<30),
-       NES_CQE_ALLOC_RESET = (1<<31),
-};
-
-enum nes_nic_rq_wqe_word_idx {
-       NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0,
-       NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1,
-       NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2,
-       NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3,
-       NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4,
-       NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5,
-       NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6,
-       NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7,
-       NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8,
-       NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9,
-};
-
-enum nes_nic_sq_wqe_word_idx {
-       NES_NIC_SQ_WQE_MISC_IDX = 0,
-       NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1,
-       NES_NIC_SQ_WQE_LSO_INFO_IDX = 2,
-       NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3,
-       NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4,
-       NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5,
-       NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6,
-       NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7,
-       NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8,
-       NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9,
-       NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10,
-       NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11,
-       NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12,
-       NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13,
-       NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14,
-       NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15,
-};
-
-enum nes_iwarp_sq_wqe_word_idx {
-       NES_IWARP_SQ_WQE_MISC_IDX = 0,
-       NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1,
-       NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2,
-       NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3,
-       NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
-       NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-       NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7,
-       NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8,
-       NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9,
-       NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10,
-       NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11,
-       NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12,
-       NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16,
-       NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17,
-       NES_IWARP_SQ_WQE_LENGTH0_IDX = 18,
-       NES_IWARP_SQ_WQE_STAG0_IDX = 19,
-       NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20,
-       NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21,
-       NES_IWARP_SQ_WQE_LENGTH1_IDX = 22,
-       NES_IWARP_SQ_WQE_STAG1_IDX = 23,
-       NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24,
-       NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25,
-       NES_IWARP_SQ_WQE_LENGTH2_IDX = 26,
-       NES_IWARP_SQ_WQE_STAG2_IDX = 27,
-       NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28,
-       NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29,
-       NES_IWARP_SQ_WQE_LENGTH3_IDX = 30,
-       NES_IWARP_SQ_WQE_STAG3_IDX = 31,
-};
-
-enum nes_iwarp_sq_bind_wqe_word_idx {
-       NES_IWARP_SQ_BIND_WQE_MR_IDX = 6,
-       NES_IWARP_SQ_BIND_WQE_MW_IDX = 7,
-       NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8,
-       NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9,
-       NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10,
-       NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11,
-};
-
-enum nes_iwarp_sq_fmr_wqe_word_idx {
-       NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7,
-       NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8,
-       NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9,
-       NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10,
-       NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11,
-       NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12,
-       NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13,
-       NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14,
-};
-
-enum nes_iwarp_sq_fmr_opcodes {
-       NES_IWARP_SQ_FMR_WQE_ZERO_BASED                 = (1<<6),
-       NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K               = (0<<7),
-       NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M               = (1<<7),
-       NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ   = (1<<16),
-       NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE  = (1<<17),
-       NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ  = (1<<18),
-       NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19),
-       NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND  = (1<<20),
-};
-
-#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK       0xFF;
-
-enum nes_iwarp_sq_locinv_wqe_word_idx {
-       NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6,
-};
-
-enum nes_iwarp_rq_wqe_word_idx {
-       NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1,
-       NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2,
-       NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3,
-       NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
-       NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
-       NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8,
-       NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9,
-       NES_IWARP_RQ_WQE_LENGTH0_IDX = 10,
-       NES_IWARP_RQ_WQE_STAG0_IDX = 11,
-       NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12,
-       NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13,
-       NES_IWARP_RQ_WQE_LENGTH1_IDX = 14,
-       NES_IWARP_RQ_WQE_STAG1_IDX = 15,
-       NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16,
-       NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17,
-       NES_IWARP_RQ_WQE_LENGTH2_IDX = 18,
-       NES_IWARP_RQ_WQE_STAG2_IDX = 19,
-       NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20,
-       NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21,
-       NES_IWARP_RQ_WQE_LENGTH3_IDX = 22,
-       NES_IWARP_RQ_WQE_STAG3_IDX = 23,
-};
-
-enum nes_nic_sq_wqe_bits {
-       NES_NIC_SQ_WQE_PHDR_CS_READY =  (1<<21),
-       NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22),
-       NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23),
-       NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30),
-       NES_NIC_SQ_WQE_COMPLETION = (1<<31),
-};
-
-enum nes_nic_cqe_word_idx {
-       NES_NIC_CQE_ACCQP_ID_IDX = 0,
-       NES_NIC_CQE_HASH_RCVNXT = 1,
-       NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2,
-       NES_NIC_CQE_MISC_IDX = 3,
-};
-
-#define NES_PKT_TYPE_APBVT_BITS 0xC112
-#define NES_PKT_TYPE_APBVT_MASK 0xff3e
-
-#define NES_PKT_TYPE_PVALID_BITS 0x10000000
-#define NES_PKT_TYPE_PVALID_MASK 0x30000000
-
-#define NES_PKT_TYPE_TCPV4_BITS 0x0110
-#define NES_PKT_TYPE_TCPV4_MASK 0x3f30
-
-#define NES_PKT_TYPE_UDPV4_BITS 0x0210
-#define NES_PKT_TYPE_UDPV4_MASK 0x3f30
-
-#define NES_PKT_TYPE_IPV4_BITS  0x0010
-#define NES_PKT_TYPE_IPV4_MASK  0x3f30
-
-#define NES_PKT_TYPE_OTHER_BITS 0x0000
-#define NES_PKT_TYPE_OTHER_MASK 0x0030
-
-#define NES_NIC_CQE_ERRV_SHIFT 16
-enum nes_nic_ev_bits {
-       NES_NIC_ERRV_BITS_MODE = (1<<0),
-       NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1),
-       NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2),
-       NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3),
-       NES_NIC_ERRV_BITS_IPH_ERR = (1<<4),
-};
-
-enum nes_nic_cqe_bits {
-       NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT),
-       NES_NIC_CQE_SQ = (1<<24),
-       NES_NIC_CQE_ACCQP_PORT = (1<<28),
-       NES_NIC_CQE_ACCQP_VALID = (1<<29),
-       NES_NIC_CQE_TAG_VALID = (1<<30),
-       NES_NIC_CQE_VALID = (1<<31),
-};
-
-enum nes_aeqe_word_idx {
-       NES_AEQE_COMP_CTXT_LOW_IDX = 0,
-       NES_AEQE_COMP_CTXT_HIGH_IDX = 1,
-       NES_AEQE_COMP_QP_CQ_ID_IDX = 2,
-       NES_AEQE_MISC_IDX = 3,
-};
-
-enum nes_aeqe_bits {
-       NES_AEQE_QP = (1<<16),
-       NES_AEQE_CQ = (1<<17),
-       NES_AEQE_SQ = (1<<18),
-       NES_AEQE_INBOUND_RDMA = (1<<19),
-       NES_AEQE_IWARP_STATE_MASK = (7<<20),
-       NES_AEQE_TCP_STATE_MASK = (0xf<<24),
-       NES_AEQE_Q2_DATA_WRITTEN = (0x3<<28),
-       NES_AEQE_VALID = (1<<31),
-};
-
-#define NES_AEQE_IWARP_STATE_SHIFT     20
-#define NES_AEQE_TCP_STATE_SHIFT       24
-#define NES_AEQE_Q2_DATA_ETHERNET       (1<<28)
-#define NES_AEQE_Q2_DATA_MPA            (1<<29)
-
-enum nes_aeqe_iwarp_state {
-       NES_AEQE_IWARP_STATE_NON_EXISTANT = 0,
-       NES_AEQE_IWARP_STATE_IDLE = 1,
-       NES_AEQE_IWARP_STATE_RTS = 2,
-       NES_AEQE_IWARP_STATE_CLOSING = 3,
-       NES_AEQE_IWARP_STATE_TERMINATE = 5,
-       NES_AEQE_IWARP_STATE_ERROR = 6
-};
-
-enum nes_aeqe_tcp_state {
-       NES_AEQE_TCP_STATE_NON_EXISTANT = 0,
-       NES_AEQE_TCP_STATE_CLOSED = 1,
-       NES_AEQE_TCP_STATE_LISTEN = 2,
-       NES_AEQE_TCP_STATE_SYN_SENT = 3,
-       NES_AEQE_TCP_STATE_SYN_RCVD = 4,
-       NES_AEQE_TCP_STATE_ESTABLISHED = 5,
-       NES_AEQE_TCP_STATE_CLOSE_WAIT = 6,
-       NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7,
-       NES_AEQE_TCP_STATE_CLOSING = 8,
-       NES_AEQE_TCP_STATE_LAST_ACK = 9,
-       NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10,
-       NES_AEQE_TCP_STATE_TIME_WAIT = 11
-};
-
-enum nes_aeqe_aeid {
-       NES_AEQE_AEID_AMP_UNALLOCATED_STAG                            = 0x0102,
-       NES_AEQE_AEID_AMP_INVALID_STAG                                = 0x0103,
-       NES_AEQE_AEID_AMP_BAD_QP                                      = 0x0104,
-       NES_AEQE_AEID_AMP_BAD_PD                                      = 0x0105,
-       NES_AEQE_AEID_AMP_BAD_STAG_KEY                                = 0x0106,
-       NES_AEQE_AEID_AMP_BAD_STAG_INDEX                              = 0x0107,
-       NES_AEQE_AEID_AMP_BOUNDS_VIOLATION                            = 0x0108,
-       NES_AEQE_AEID_AMP_RIGHTS_VIOLATION                            = 0x0109,
-       NES_AEQE_AEID_AMP_TO_WRAP                                     = 0x010a,
-       NES_AEQE_AEID_AMP_FASTREG_SHARED                              = 0x010b,
-       NES_AEQE_AEID_AMP_FASTREG_VALID_STAG                          = 0x010c,
-       NES_AEQE_AEID_AMP_FASTREG_MW_STAG                             = 0x010d,
-       NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS                      = 0x010e,
-       NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW                  = 0x010f,
-       NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH                      = 0x0110,
-       NES_AEQE_AEID_AMP_INVALIDATE_SHARED                           = 0x0111,
-       NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS          = 0x0112,
-       NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS            = 0x0113,
-       NES_AEQE_AEID_AMP_MWBIND_VALID_STAG                           = 0x0114,
-       NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG                           = 0x0115,
-       NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG                   = 0x0116,
-       NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG                           = 0x0117,
-       NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS                       = 0x0118,
-       NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS                       = 0x0119,
-       NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT                    = 0x011a,
-       NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED                        = 0x011b,
-       NES_AEQE_AEID_BAD_CLOSE                                       = 0x0201,
-       NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE                         = 0x0202,
-       NES_AEQE_AEID_CQ_OPERATION_ERROR                              = 0x0203,
-       NES_AEQE_AEID_PRIV_OPERATION_DENIED                           = 0x0204,
-       NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO                        = 0x0205,
-       NES_AEQE_AEID_STAG_ZERO_INVALID                               = 0x0206,
-       NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN                      = 0x0301,
-       NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID              = 0x0302,
-       NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303,
-       NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION                     = 0x0304,
-       NES_AEQE_AEID_DDP_UBE_INVALID_MO                              = 0x0305,
-       NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE         = 0x0306,
-       NES_AEQE_AEID_DDP_UBE_INVALID_QN                              = 0x0307,
-       NES_AEQE_AEID_DDP_NO_L_BIT                                    = 0x0308,
-       NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION                 = 0x0311,
-       NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE                     = 0x0312,
-       NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST                   = 0x0313,
-       NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP             = 0x0314,
-       NES_AEQE_AEID_INVALID_ARP_ENTRY                               = 0x0401,
-       NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD                         = 0x0402,
-       NES_AEQE_AEID_STALE_ARP_ENTRY                                 = 0x0403,
-       NES_AEQE_AEID_LLP_CLOSE_COMPLETE                              = 0x0501,
-       NES_AEQE_AEID_LLP_CONNECTION_RESET                            = 0x0502,
-       NES_AEQE_AEID_LLP_FIN_RECEIVED                                = 0x0503,
-       NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH =  0x0504,
-       NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR                      = 0x0505,
-       NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE                           = 0x0506,
-       NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL                           = 0x0507,
-       NES_AEQE_AEID_LLP_SYN_RECEIVED                                = 0x0508,
-       NES_AEQE_AEID_LLP_TERMINATE_RECEIVED                          = 0x0509,
-       NES_AEQE_AEID_LLP_TOO_MANY_RETRIES                            = 0x050a,
-       NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES                  = 0x050b,
-       NES_AEQE_AEID_RESET_SENT                                      = 0x0601,
-       NES_AEQE_AEID_TERMINATE_SENT                                  = 0x0602,
-       NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC                      = 0x0700
-};
-
-enum nes_iwarp_sq_opcodes {
-       NES_IWARP_SQ_WQE_WRPDU = (1<<15),
-       NES_IWARP_SQ_WQE_PSH = (1<<21),
-       NES_IWARP_SQ_WQE_STREAMING = (1<<23),
-       NES_IWARP_SQ_WQE_IMM_DATA = (1<<28),
-       NES_IWARP_SQ_WQE_READ_FENCE = (1<<29),
-       NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30),
-       NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31),
-};
-
-enum nes_iwarp_sq_wqe_bits {
-       NES_IWARP_SQ_OP_RDMAW = 0,
-       NES_IWARP_SQ_OP_RDMAR = 1,
-       NES_IWARP_SQ_OP_SEND = 3,
-       NES_IWARP_SQ_OP_SENDINV = 4,
-       NES_IWARP_SQ_OP_SENDSE = 5,
-       NES_IWARP_SQ_OP_SENDSEINV = 6,
-       NES_IWARP_SQ_OP_BIND = 8,
-       NES_IWARP_SQ_OP_FAST_REG = 9,
-       NES_IWARP_SQ_OP_LOCINV = 10,
-       NES_IWARP_SQ_OP_RDMAR_LOCINV = 11,
-       NES_IWARP_SQ_OP_NOP = 12,
-};
-
-enum nes_iwarp_cqe_major_code {
-       NES_IWARP_CQE_MAJOR_FLUSH = 1,
-       NES_IWARP_CQE_MAJOR_DRV = 0x8000
-};
-
-enum nes_iwarp_cqe_minor_code {
-       NES_IWARP_CQE_MINOR_FLUSH = 1
-};
-
-#define NES_EEPROM_READ_REQUEST (1<<16)
-#define NES_MAC_ADDR_VALID      (1<<20)
-
-/*
- * NES index registers init values.
- */
-struct nes_init_values {
-       u32 index;
-       u32 data;
-       u8  wrt;
-};
-
-/*
- * NES registers in BAR0.
- */
-struct nes_pci_regs {
-       u32 int_status;
-       u32 int_mask;
-       u32 int_pending;
-       u32 intf_int_status;
-       u32 intf_int_mask;
-       u32 other_regs[59];      /* pad out to 256 bytes for now */
-};
-
-#define NES_CQP_SQ_SIZE    128
-#define NES_CCQ_SIZE       128
-#define NES_NIC_WQ_SIZE    512
-#define NES_NIC_CTX_SIZE   ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512))
-#define NES_NIC_BACK_STORE 0x00038000
-
-struct nes_device;
-
-struct nes_hw_nic_qp_context {
-       __le32 context_words[6];
-};
-
-struct nes_hw_nic_sq_wqe {
-       __le32 wqe_words[16];
-};
-
-struct nes_hw_nic_rq_wqe {
-       __le32 wqe_words[16];
-};
-
-struct nes_hw_nic_cqe {
-       __le32 cqe_words[4];
-};
-
-struct nes_hw_cqp_qp_context {
-       __le32 context_words[4];
-};
-
-struct nes_hw_cqp_wqe {
-       __le32 wqe_words[16];
-};
-
-struct nes_hw_qp_wqe {
-       __le32 wqe_words[32];
-};
-
-struct nes_hw_cqe {
-       __le32 cqe_words[8];
-};
-
-struct nes_hw_ceqe {
-       __le32 ceqe_words[2];
-};
-
-struct nes_hw_aeqe {
-       __le32 aeqe_words[4];
-};
-
-struct nes_cqp_request {
-       union {
-               u64 cqp_callback_context;
-               void *cqp_callback_pointer;
-       };
-       wait_queue_head_t     waitq;
-       struct nes_hw_cqp_wqe cqp_wqe;
-       struct list_head      list;
-       atomic_t              refcount;
-       void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request);
-       u16                   major_code;
-       u16                   minor_code;
-       u8                    waiting;
-       u8                    request_done;
-       u8                    dynamic;
-       u8                    callback;
-};
-
-struct nes_hw_cqp {
-       struct nes_hw_cqp_wqe *sq_vbase;
-       dma_addr_t            sq_pbase;
-       spinlock_t            lock;
-       wait_queue_head_t     waitq;
-       u16                   qp_id;
-       u16                   sq_head;
-       u16                   sq_tail;
-       u16                   sq_size;
-};
-
-#define NES_FIRST_FRAG_SIZE 128
-struct nes_first_frag {
-       u8 buffer[NES_FIRST_FRAG_SIZE];
-};
-
-struct nes_hw_nic {
-       struct nes_first_frag    *first_frag_vbase;     /* virtual address of first frags */
-       struct nes_hw_nic_sq_wqe *sq_vbase;                     /* virtual address of sq */
-       struct nes_hw_nic_rq_wqe *rq_vbase;                     /* virtual address of rq */
-       struct sk_buff           *tx_skb[NES_NIC_WQ_SIZE];
-       struct sk_buff           *rx_skb[NES_NIC_WQ_SIZE];
-       dma_addr_t frag_paddr[NES_NIC_WQ_SIZE];
-       unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)];
-       dma_addr_t sq_pbase;                    /* PCI memory for host rings */
-       dma_addr_t rq_pbase;                    /* PCI memory for host rings */
-
-       u16 qp_id;
-       u16 sq_head;
-       u16 sq_tail;
-       u16 sq_size;
-       u16 rq_head;
-       u16 rq_tail;
-       u16 rq_size;
-       u8 replenishing_rq;
-       u8 reserved;
-
-       spinlock_t rq_lock;
-};
-
-struct nes_hw_nic_cq {
-       struct nes_hw_nic_cqe volatile *cq_vbase;       /* PCI memory for host rings */
-       void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
-       dma_addr_t cq_pbase;    /* PCI memory for host rings */
-       int rx_cqes_completed;
-       int cqe_allocs_pending;
-       int rx_pkts_indicated;
-       u16 cq_head;
-       u16 cq_size;
-       u16 cq_number;
-       u8  cqes_pending;
-};
-
-struct nes_hw_qp {
-       struct nes_hw_qp_wqe *sq_vbase;         /* PCI memory for host rings */
-       struct nes_hw_qp_wqe *rq_vbase;         /* PCI memory for host rings */
-       void                 *q2_vbase;                 /* PCI memory for host rings */
-       dma_addr_t sq_pbase;    /* PCI memory for host rings */
-       dma_addr_t rq_pbase;    /* PCI memory for host rings */
-       dma_addr_t q2_pbase;    /* PCI memory for host rings */
-       u32 qp_id;
-       u16 sq_head;
-       u16 sq_tail;
-       u16 sq_size;
-       u16 rq_head;
-       u16 rq_tail;
-       u16 rq_size;
-       u8  rq_encoded_size;
-       u8  sq_encoded_size;
-};
-
-struct nes_hw_cq {
-       struct nes_hw_cqe *cq_vbase;    /* PCI memory for host rings */
-       void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq);
-       dma_addr_t cq_pbase;    /* PCI memory for host rings */
-       u16 cq_head;
-       u16 cq_size;
-       u16 cq_number;
-};
-
-struct nes_hw_ceq {
-       struct nes_hw_ceqe volatile *ceq_vbase; /* PCI memory for host rings */
-       dma_addr_t ceq_pbase;   /* PCI memory for host rings */
-       u16 ceq_head;
-       u16 ceq_size;
-};
-
-struct nes_hw_aeq {
-       struct nes_hw_aeqe volatile *aeq_vbase; /* PCI memory for host rings */
-       dma_addr_t aeq_pbase;   /* PCI memory for host rings */
-       u16 aeq_head;
-       u16 aeq_size;
-};
-
-struct nic_qp_map {
-       u8 qpid;
-       u8 nic_index;
-       u8 logical_port;
-       u8 is_hnic;
-};
-
-#define        NES_CQP_ARP_AEQ_INDEX_MASK  0x000f0000
-#define        NES_CQP_ARP_AEQ_INDEX_SHIFT 16
-
-#define NES_CQP_APBVT_ADD                      0x00008000
-#define NES_CQP_APBVT_NIC_SHIFT                16
-
-#define NES_ARP_ADD     1
-#define NES_ARP_DELETE  2
-#define NES_ARP_RESOLVE 3
-
-#define NES_MAC_SW_IDLE      0
-#define NES_MAC_SW_INTERRUPT 1
-#define NES_MAC_SW_MH        2
-
-struct nes_arp_entry {
-       u32 ip_addr;
-       u8  mac_addr[ETH_ALEN];
-};
-
-#define NES_NIC_FAST_TIMER          96
-#define NES_NIC_FAST_TIMER_LOW      40
-#define NES_NIC_FAST_TIMER_HIGH     1000
-#define DEFAULT_NES_QL_HIGH         256
-#define DEFAULT_NES_QL_LOW          16
-#define DEFAULT_NES_QL_TARGET       64
-#define DEFAULT_JUMBO_NES_QL_LOW    12
-#define DEFAULT_JUMBO_NES_QL_TARGET 40
-#define DEFAULT_JUMBO_NES_QL_HIGH   128
-#define NES_NIC_CQ_DOWNWARD_TREND   16
-#define NES_PFT_SIZE               48
-
-#define NES_MGT_WQ_COUNT 32
-#define NES_MGT_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_32) | (NES_NIC_CTX_SQ_SIZE_32))
-#define NES_MGT_QP_OFFSET 36
-#define NES_MGT_QP_COUNT 4
-
-struct nes_hw_tune_timer {
-       /* u16 cq_count; */
-       u16 threshold_low;
-       u16 threshold_target;
-       u16 threshold_high;
-       u16 timer_in_use;
-       u16 timer_in_use_old;
-       u16 timer_in_use_min;
-       u16 timer_in_use_max;
-       u8  timer_direction_upward;
-       u8  timer_direction_downward;
-       u16 cq_count_old;
-       u8  cq_direction_downward;
-};
-
-#define NES_TIMER_INT_LIMIT         2
-#define NES_TIMER_INT_LIMIT_DYNAMIC 10
-#define NES_TIMER_ENABLE_LIMIT      4
-#define NES_MAX_LINK_INTERRUPTS     128
-#define NES_MAX_LINK_CHECK          200
-
-struct nes_adapter {
-       u64              fw_ver;
-       unsigned long    *allocated_qps;
-       unsigned long    *allocated_cqs;
-       unsigned long    *allocated_mrs;
-       unsigned long    *allocated_pds;
-       unsigned long    *allocated_arps;
-       struct nes_qp    **qp_table;
-       struct workqueue_struct *work_q;
-
-       struct list_head list;
-       struct list_head active_listeners;
-       /* list of the netdevs associated with each logical port */
-       struct list_head nesvnic_list[4];
-
-       struct timer_list  mh_timer;
-       struct timer_list  lc_timer;
-       struct work_struct work;
-       spinlock_t         resource_lock;
-       spinlock_t         phy_lock;
-       spinlock_t         pbl_lock;
-       spinlock_t         periodic_timer_lock;
-
-       struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE];
-
-       /* Adapter CEQ and AEQs */
-       struct nes_hw_ceq ceq[16];
-       struct nes_hw_aeq aeq[8];
-
-       struct nes_hw_tune_timer tune_timer;
-
-       unsigned long doorbell_start;
-
-       u32 hw_rev;
-       u32 vendor_id;
-       u32 vendor_part_id;
-       u32 device_cap_flags;
-       u32 tick_delta;
-       u32 timer_int_req;
-       u32 arp_table_size;
-       u32 next_arp_index;
-
-       u32 max_mr;
-       u32 max_256pbl;
-       u32 max_4kpbl;
-       u32 free_256pbl;
-       u32 free_4kpbl;
-       u32 max_mr_size;
-       u32 max_qp;
-       u32 next_qp;
-       u32 max_irrq;
-       u32 max_qp_wr;
-       u32 max_sge;
-       u32 max_cq;
-       u32 next_cq;
-       u32 max_cqe;
-       u32 max_pd;
-       u32 base_pd;
-       u32 next_pd;
-       u32 hte_index_mask;
-
-       /* EEPROM information */
-       u32 rx_pool_size;
-       u32 tx_pool_size;
-       u32 rx_threshold;
-       u32 tcp_timer_core_clk_divisor;
-       u32 iwarp_config;
-       u32 cm_config;
-       u32 sws_timer_config;
-       u32 tcp_config1;
-       u32 wqm_wat;
-       u32 core_clock;
-       u32 firmware_version;
-       u32 eeprom_version;
-
-       u32 nic_rx_eth_route_err;
-
-       u32 et_rx_coalesce_usecs;
-       u32 et_rx_max_coalesced_frames;
-       u32 et_rx_coalesce_usecs_irq;
-       u32 et_rx_max_coalesced_frames_irq;
-       u32 et_pkt_rate_low;
-       u32 et_rx_coalesce_usecs_low;
-       u32 et_rx_max_coalesced_frames_low;
-       u32 et_pkt_rate_high;
-       u32 et_rx_coalesce_usecs_high;
-       u32 et_rx_max_coalesced_frames_high;
-       u32 et_rate_sample_interval;
-       u32 timer_int_limit;
-       u32 wqm_quanta;
-       u8 allow_unaligned_fpdus;
-
-       /* Adapter base MAC address */
-       u32 mac_addr_low;
-       u16 mac_addr_high;
-
-       u16 firmware_eeprom_offset;
-       u16 software_eeprom_offset;
-
-       u16 max_irrq_wr;
-
-       /* pd config for each port */
-       u16 pd_config_size[4];
-       u16 pd_config_base[4];
-
-       u16 link_interrupt_count[4];
-       u8 crit_error_count[32];
-
-       /* the phy index for each port */
-       u8  phy_index[4];
-       u8  mac_sw_state[4];
-       u8  mac_link_down[4];
-       u8  phy_type[4];
-       u8  log_port;
-
-       /* PCI information */
-       struct nes_device *nesdev;
-       unsigned int  devfn;
-       unsigned char bus_number;
-       unsigned char OneG_Mode;
-
-       unsigned char ref_count;
-       u8            netdev_count;
-       u8            netdev_max;       /* from host nic address count in EEPROM */
-       u8            port_count;
-       u8            virtwq;
-       u8            send_term_ok;
-       u8            et_use_adaptive_rx_coalesce;
-       u8            adapter_fcn_count;
-       u8 pft_mcast_map[NES_PFT_SIZE];
-};
-
-struct nes_pbl {
-       u64              *pbl_vbase;
-       dma_addr_t       pbl_pbase;
-       struct page      *page;
-       unsigned long    user_base;
-       u32              pbl_size;
-       struct list_head list;
-       /* TODO: need to add list for two level tables */
-};
-
-#define NES_4K_PBL_CHUNK_SIZE  4096
-
-struct nes_fast_mr_wqe_pbl {
-       u64             *kva;
-       dma_addr_t      paddr;
-};
-
-struct nes_listener {
-       struct work_struct      work;
-       struct workqueue_struct *wq;
-       struct nes_vnic         *nesvnic;
-       struct iw_cm_id         *cm_id;
-       struct list_head        list;
-       unsigned long           socket;
-       u8                      accept_failed;
-};
-
-struct nes_ib_device;
-
-#define NES_EVENT_DELAY msecs_to_jiffies(100)
-
-struct nes_vnic {
-       struct nes_ib_device *nesibdev;
-       u64 sq_full;
-       u64 tso_requests;
-       u64 segmented_tso_requests;
-       u64 linearized_skbs;
-       u64 tx_sw_dropped;
-       u64 endnode_nstat_rx_discard;
-       u64 endnode_nstat_rx_octets;
-       u64 endnode_nstat_rx_frames;
-       u64 endnode_nstat_tx_octets;
-       u64 endnode_nstat_tx_frames;
-       u64 endnode_ipv4_tcp_retransmits;
-       /* void *mem; */
-       struct nes_device *nesdev;
-       struct net_device *netdev;
-       atomic_t          rx_skbs_needed;
-       atomic_t          rx_skb_timer_running;
-       int               budget;
-       u32               msg_enable;
-       /* u32 tx_avail; */
-       __be32            local_ipaddr;
-       struct napi_struct   napi;
-       spinlock_t           tx_lock;   /* could use netdev tx lock? */
-       struct timer_list    rq_wqes_timer;
-       u32                  nic_mem_size;
-       void                 *nic_vbase;
-       dma_addr_t           nic_pbase;
-       struct nes_hw_nic    nic;
-       struct nes_hw_nic_cq nic_cq;
-       u32    mcrq_qp_id;
-       struct nes_ucontext *mcrq_ucontext;
-       struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
-       void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
-       int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
-       struct net_device_stats netstats;
-       /* used to put the netdev on the adapter's logical port list */
-       struct list_head list;
-       u16 max_frame_size;
-       u8  netdev_open;
-       u8  linkup;
-       u8  logical_port;
-       u8  netdev_index;  /* might not be needed, indexes nesdev->netdev */
-       u8  perfect_filter_index;
-       u8  nic_index;
-       u8  qp_nic_index[4];
-       u8  next_qp_nic_index;
-       u8  of_device_registered;
-       u8  rdma_enabled;
-       struct timer_list event_timer;
-       enum ib_event_type delayed_event;
-       enum ib_event_type last_dispatched_event;
-       spinlock_t port_ibevent_lock;
-       u32 mgt_mem_size;
-       void *mgt_vbase;
-       dma_addr_t mgt_pbase;
-       struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT];
-       struct task_struct *mgt_thread;
-       wait_queue_head_t mgt_wait_queue;
-       struct sk_buff_head mgt_skb_list;
-
-};
-
-struct nes_ib_device {
-       struct ib_device ibdev;
-       struct nes_vnic *nesvnic;
-
-       /* Virtual RNIC Limits */
-       u32 max_mr;
-       u32 max_qp;
-       u32 max_cq;
-       u32 max_pd;
-       u32 num_mr;
-       u32 num_qp;
-       u32 num_cq;
-       u32 num_pd;
-};
-
-enum nes_hdrct_flags {
-       DDP_LEN_FLAG                    = 0x80,
-       DDP_HDR_FLAG                    = 0x40,
-       RDMA_HDR_FLAG                   = 0x20
-};
-
-enum nes_term_layers {
-       LAYER_RDMA                      = 0,
-       LAYER_DDP                       = 1,
-       LAYER_MPA                       = 2
-};
-
-enum nes_term_error_types {
-       RDMAP_CATASTROPHIC              = 0,
-       RDMAP_REMOTE_PROT               = 1,
-       RDMAP_REMOTE_OP                 = 2,
-       DDP_CATASTROPHIC                = 0,
-       DDP_TAGGED_BUFFER               = 1,
-       DDP_UNTAGGED_BUFFER             = 2,
-       DDP_LLP                         = 3
-};
-
-enum nes_term_rdma_errors {
-       RDMAP_INV_STAG                  = 0x00,
-       RDMAP_INV_BOUNDS                = 0x01,
-       RDMAP_ACCESS                    = 0x02,
-       RDMAP_UNASSOC_STAG              = 0x03,
-       RDMAP_TO_WRAP                   = 0x04,
-       RDMAP_INV_RDMAP_VER             = 0x05,
-       RDMAP_UNEXPECTED_OP             = 0x06,
-       RDMAP_CATASTROPHIC_LOCAL        = 0x07,
-       RDMAP_CATASTROPHIC_GLOBAL       = 0x08,
-       RDMAP_CANT_INV_STAG             = 0x09,
-       RDMAP_UNSPECIFIED               = 0xff
-};
-
-enum nes_term_ddp_errors {
-       DDP_CATASTROPHIC_LOCAL          = 0x00,
-       DDP_TAGGED_INV_STAG             = 0x00,
-       DDP_TAGGED_BOUNDS               = 0x01,
-       DDP_TAGGED_UNASSOC_STAG         = 0x02,
-       DDP_TAGGED_TO_WRAP              = 0x03,
-       DDP_TAGGED_INV_DDP_VER          = 0x04,
-       DDP_UNTAGGED_INV_QN             = 0x01,
-       DDP_UNTAGGED_INV_MSN_NO_BUF     = 0x02,
-       DDP_UNTAGGED_INV_MSN_RANGE      = 0x03,
-       DDP_UNTAGGED_INV_MO             = 0x04,
-       DDP_UNTAGGED_INV_TOO_LONG       = 0x05,
-       DDP_UNTAGGED_INV_DDP_VER        = 0x06
-};
-
-enum nes_term_mpa_errors {
-       MPA_CLOSED                      = 0x01,
-       MPA_CRC                         = 0x02,
-       MPA_MARKER                      = 0x03,
-       MPA_REQ_RSP                     = 0x04,
-};
-
-struct nes_terminate_hdr {
-       u8 layer_etype;
-       u8 error_code;
-       u8 hdrct;
-       u8 rsvd;
-};
-
-/* Used to determine how to fill in terminate error codes */
-#define IWARP_OPCODE_WRITE             0
-#define IWARP_OPCODE_READREQ           1
-#define IWARP_OPCODE_READRSP           2
-#define IWARP_OPCODE_SEND              3
-#define IWARP_OPCODE_SEND_INV          4
-#define IWARP_OPCODE_SEND_SE           5
-#define IWARP_OPCODE_SEND_SE_INV       6
-#define IWARP_OPCODE_TERM              7
-
-/* These values are used only during terminate processing */
-#define TERM_DDP_LEN_TAGGED    14
-#define TERM_DDP_LEN_UNTAGGED  18
-#define TERM_RDMA_LEN          28
-#define RDMA_OPCODE_MASK       0x0f
-#define RDMA_READ_REQ_OPCODE   1
-#define BAD_FRAME_OFFSET       64
-#define CQE_MAJOR_DRV          0x8000
-
-/* Used for link status recheck after interrupt processing */
-#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50)
-#define NES_LINK_RECHECK_MAX   60
-
-#endif         /* __NES_HW_H */
diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
deleted file mode 100644 (file)
index cc4dce5..0000000
+++ /dev/null
@@ -1,1155 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/skbuff.h>
-#include <linux/etherdevice.h>
-#include <linux/kthread.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <net/tcp.h>
-#include "nes.h"
-#include "nes_mgt.h"
-
-atomic_t pau_qps_created;
-atomic_t pau_qps_destroyed;
-
-static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic)
-{
-       unsigned long flags;
-       dma_addr_t bus_address;
-       struct sk_buff *skb;
-       struct nes_hw_nic_rq_wqe *nic_rqe;
-       struct nes_hw_mgt *nesmgt;
-       struct nes_device *nesdev;
-       struct nes_rskb_cb *cb;
-       u32 rx_wqes_posted = 0;
-
-       nesmgt = &mgtvnic->mgt;
-       nesdev = mgtvnic->nesvnic->nesdev;
-       spin_lock_irqsave(&nesmgt->rq_lock, flags);
-       if (nesmgt->replenishing_rq != 0) {
-               if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
-                   (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
-                       atomic_set(&mgtvnic->rx_skb_timer_running, 1);
-                       spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-                       mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
-                       add_timer(&mgtvnic->rq_wqes_timer);
-               } else {
-                       spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-               }
-               return;
-       }
-       nesmgt->replenishing_rq = 1;
-       spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-       do {
-               skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size);
-               if (skb) {
-                       skb->dev = mgtvnic->nesvnic->netdev;
-
-                       bus_address = pci_map_single(nesdev->pcidev,
-                                                    skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-                       cb = (struct nes_rskb_cb *)&skb->cb[0];
-                       cb->busaddr = bus_address;
-                       cb->maplen = mgtvnic->nesvnic->max_frame_size;
-
-                       nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head];
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
-                               cpu_to_le32(mgtvnic->nesvnic->max_frame_size);
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
-                               cpu_to_le32((u32)bus_address);
-                       nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
-                               cpu_to_le32((u32)((u64)bus_address >> 32));
-                       nesmgt->rx_skb[nesmgt->rq_head] = skb;
-                       nesmgt->rq_head++;
-                       nesmgt->rq_head &= nesmgt->rq_size - 1;
-                       atomic_dec(&mgtvnic->rx_skbs_needed);
-                       barrier();
-                       if (++rx_wqes_posted == 255) {
-                               nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
-                               rx_wqes_posted = 0;
-                       }
-               } else {
-                       spin_lock_irqsave(&nesmgt->rq_lock, flags);
-                       if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
-                           (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
-                               atomic_set(&mgtvnic->rx_skb_timer_running, 1);
-                               spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-                               mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
-                               add_timer(&mgtvnic->rq_wqes_timer);
-                       } else {
-                               spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
-                       }
-                       break;
-               }
-       } while (atomic_read(&mgtvnic->rx_skbs_needed));
-       barrier();
-       if (rx_wqes_posted)
-               nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
-       nesmgt->replenishing_rq = 0;
-}
-
-/**
- * nes_mgt_rq_wqes_timeout
- */
-static void nes_mgt_rq_wqes_timeout(struct timer_list *t)
-{
-       struct nes_vnic_mgt *mgtvnic = from_timer(mgtvnic, t,
-                                                      rq_wqes_timer);
-
-       atomic_set(&mgtvnic->rx_skb_timer_running, 0);
-       if (atomic_read(&mgtvnic->rx_skbs_needed))
-               nes_replenish_mgt_rq(mgtvnic);
-}
-
-/**
- * nes_mgt_free_skb - unmap and free skb
- */
-static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir)
-{
-       struct nes_rskb_cb *cb;
-
-       cb = (struct nes_rskb_cb *)&skb->cb[0];
-       pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir);
-       cb->busaddr = 0;
-       dev_kfree_skb_any(skb);
-}
-
-/**
- * nes_download_callback - handle download completions
- */
-static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-       struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer;
-       struct nes_qp *nesqp = fpdu_info->nesqp;
-       struct sk_buff *skb;
-       int i;
-
-       for (i = 0; i < fpdu_info->frag_cnt; i++) {
-               skb = fpdu_info->frags[i].skb;
-               if (fpdu_info->frags[i].cmplt) {
-                       nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
-                       nes_rem_ref_cm_node(nesqp->cm_node);
-               }
-       }
-
-       if (fpdu_info->hdr_vbase)
-               pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len,
-                                   fpdu_info->hdr_vbase, fpdu_info->hdr_pbase);
-       kfree(fpdu_info);
-}
-
-/**
- * nes_get_seq - Get the seq, ack_seq and window from the packet
- */
-static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
-{
-       struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0];
-       struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-       struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-
-       *ack = be32_to_cpu(tcph->ack_seq);
-       *wnd = be16_to_cpu(tcph->window);
-       *fin_rcvd = tcph->fin;
-       *rst_rcvd = tcph->rst;
-       return be32_to_cpu(tcph->seq);
-}
-
-/**
- * nes_get_next_skb - Get the next skb based on where current skb is in the queue
- */
-static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp,
-                                       struct sk_buff *skb, u32 nextseq, u32 *ack,
-                                       u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
-{
-       u32 seq;
-       bool processacks;
-       struct sk_buff *old_skb;
-
-       if (skb) {
-               /* Continue processing fpdu */
-               skb = skb_peek_next(skb, &nesqp->pau_list);
-               if (!skb)
-                       goto out;
-               processacks = false;
-       } else {
-               /* Starting a new one */
-               if (skb_queue_empty(&nesqp->pau_list))
-                       goto out;
-               skb = skb_peek(&nesqp->pau_list);
-               processacks = true;
-       }
-
-       while (1) {
-               if (skb_queue_empty(&nesqp->pau_list))
-                       goto out;
-
-               seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd);
-               if (seq == nextseq) {
-                       if (skb->len || processacks)
-                               break;
-               } else if (after(seq, nextseq)) {
-                       goto out;
-               }
-
-               old_skb = skb;
-               skb = skb_peek_next(skb, &nesqp->pau_list);
-               skb_unlink(old_skb, &nesqp->pau_list);
-               nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE);
-               nes_rem_ref_cm_node(nesqp->cm_node);
-               if (!skb)
-                       goto out;
-       }
-       return skb;
-
-out:
-       return NULL;
-}
-
-/**
- * get_fpdu_info - Find the next complete fpdu and return its fragments.
- */
-static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp,
-                        struct pau_fpdu_info **pau_fpdu_info)
-{
-       struct sk_buff *skb;
-       struct iphdr *iph;
-       struct tcphdr *tcph;
-       struct nes_rskb_cb *cb;
-       struct pau_fpdu_info *fpdu_info = NULL;
-       struct pau_fpdu_frag frags[MAX_FPDU_FRAGS];
-       u32 fpdu_len = 0;
-       u32 tmp_len;
-       int frag_cnt = 0;
-       u32 tot_len;
-       u32 frag_tot;
-       u32 ack;
-       u32 fin_rcvd;
-       u32 rst_rcvd;
-       u16 wnd;
-       int i;
-       int rc = 0;
-
-       *pau_fpdu_info = NULL;
-
-       skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd);
-       if (!skb)
-               goto out;
-
-       cb = (struct nes_rskb_cb *)&skb->cb[0];
-       if (skb->len) {
-               fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING;
-               fpdu_len = (fpdu_len + 3) & 0xfffffffc;
-               tmp_len = fpdu_len;
-
-               /* See if we have all of the fpdu */
-               frag_tot = 0;
-               memset(&frags, 0, sizeof frags);
-               for (i = 0; i < MAX_FPDU_FRAGS; i++) {
-                       frags[i].physaddr = cb->busaddr;
-                       frags[i].physaddr += skb->data - cb->data_start;
-                       frags[i].frag_len = min(tmp_len, skb->len);
-                       frags[i].skb = skb;
-                       frags[i].cmplt = (skb->len == frags[i].frag_len);
-                       frag_tot += frags[i].frag_len;
-                       frag_cnt++;
-
-                       tmp_len -= frags[i].frag_len;
-                       if (tmp_len == 0)
-                               break;
-
-                       skb = nes_get_next_skb(nesdev, nesqp, skb,
-                                              nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd);
-                       if (!skb)
-                               goto out;
-                       if (rst_rcvd) {
-                               /* rst received in the middle of fpdu */
-                               for (; i >= 0; i--) {
-                                       skb_unlink(frags[i].skb, &nesqp->pau_list);
-                                       nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE);
-                               }
-                               cb = (struct nes_rskb_cb *)&skb->cb[0];
-                               frags[0].physaddr = cb->busaddr;
-                               frags[0].physaddr += skb->data - cb->data_start;
-                               frags[0].frag_len = skb->len;
-                               frags[0].skb = skb;
-                               frags[0].cmplt = true;
-                               frag_cnt = 1;
-                               break;
-                       }
-
-                       cb = (struct nes_rskb_cb *)&skb->cb[0];
-               }
-       } else {
-               /* no data */
-               frags[0].physaddr = cb->busaddr;
-               frags[0].frag_len = 0;
-               frags[0].skb = skb;
-               frags[0].cmplt = true;
-               frag_cnt = 1;
-       }
-
-       /* Found one */
-       fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC);
-       if (!fpdu_info) {
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       fpdu_info->cqp_request = nes_get_cqp_request(nesdev);
-       if (fpdu_info->cqp_request == NULL) {
-               nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0];
-       iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-       tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-       fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start;
-       fpdu_info->data_len = fpdu_len;
-       tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN;
-
-       if (frags[0].cmplt) {
-               fpdu_info->hdr_pbase = cb->busaddr;
-               fpdu_info->hdr_vbase = NULL;
-       } else {
-               fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev,
-                                                           fpdu_info->hdr_len, &fpdu_info->hdr_pbase);
-               if (!fpdu_info->hdr_vbase) {
-                       nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n");
-                       rc = -ENOMEM;
-                       goto out;
-               }
-
-               /* Copy hdrs, adjusting len and seqnum */
-               memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len);
-               iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN);
-               tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-       }
-
-       iph->tot_len = cpu_to_be16(tot_len);
-       iph->saddr = cpu_to_be32(0x7f000001);
-
-       tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
-       tcph->ack_seq = cpu_to_be32(ack);
-       tcph->window = cpu_to_be16(wnd);
-
-       nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd;
-
-       memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags));
-       fpdu_info->frag_cnt = frag_cnt;
-       fpdu_info->nesqp = nesqp;
-       *pau_fpdu_info = fpdu_info;
-
-       /* Update skbs for next pass */
-       for (i = 0; i < frag_cnt; i++) {
-               cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0];
-               skb_pull(frags[i].skb, frags[i].frag_len);
-
-               if (frags[i].skb->len == 0) {
-                       /* Pull skb off the list - it will be freed in the callback */
-                       if (!skb_queue_empty(&nesqp->pau_list))
-                               skb_unlink(frags[i].skb, &nesqp->pau_list);
-               } else {
-                       /* Last skb still has data so update the seq */
-                       iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
-                       tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-                       tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
-               }
-       }
-
-out:
-       if (rc) {
-               if (fpdu_info) {
-                       if (fpdu_info->cqp_request)
-                               nes_put_cqp_request(nesdev, fpdu_info->cqp_request);
-                       kfree(fpdu_info);
-               }
-       }
-       return rc;
-}
-
-/**
- * forward_fpdus - send complete fpdus, one at a time
- */
-static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct pau_fpdu_info *fpdu_info;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       unsigned long flags;
-       u64 u64tmp;
-       u32 u32tmp;
-       int rc;
-
-       while (1) {
-               spin_lock_irqsave(&nesqp->pau_lock, flags);
-               rc = get_fpdu_info(nesdev, nesqp, &fpdu_info);
-               if (rc || (fpdu_info == NULL)) {
-                       spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-                       return rc;
-               }
-
-               cqp_request = fpdu_info->cqp_request;
-               cqp_wqe = &cqp_request->cqp_wqe;
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX,
-                                   NES_CQP_DOWNLOAD_SEGMENT |
-                                   (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT));
-
-               u32tmp = fpdu_info->hdr_len << 16;
-               u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len;
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX,
-                                   u32tmp);
-
-               u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len;
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX,
-                                   u32tmp);
-
-               u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len;
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX,
-                                   u32tmp);
-
-               u64tmp = (u64)fpdu_info->hdr_pbase;
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX,
-                                   lower_32_bits(u64tmp));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX,
-                                   upper_32_bits(u64tmp));
-
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-                                   lower_32_bits(fpdu_info->frags[0].physaddr));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX,
-                                   upper_32_bits(fpdu_info->frags[0].physaddr));
-
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX,
-                                   lower_32_bits(fpdu_info->frags[1].physaddr));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX,
-                                   upper_32_bits(fpdu_info->frags[1].physaddr));
-
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX,
-                                   lower_32_bits(fpdu_info->frags[2].physaddr));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX,
-                                   upper_32_bits(fpdu_info->frags[2].physaddr));
-
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX,
-                                   lower_32_bits(fpdu_info->frags[3].physaddr));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX,
-                                   upper_32_bits(fpdu_info->frags[3].physaddr));
-
-               cqp_request->cqp_callback_pointer = fpdu_info;
-               cqp_request->callback = 1;
-               cqp_request->cqp_callback = nes_download_callback;
-
-               atomic_set(&cqp_request->refcount, 1);
-               nes_post_cqp_request(nesdev, cqp_request);
-               spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-       }
-
-       return 0;
-}
-
-static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-       int again = 1;
-       unsigned long flags;
-
-       do {
-               /* Ignore rc - if it failed, tcp retries will cause it to try again */
-               forward_fpdus(nesvnic, nesqp);
-
-               spin_lock_irqsave(&nesqp->pau_lock, flags);
-               if (nesqp->pau_pending) {
-                       nesqp->pau_pending = 0;
-               } else {
-                       nesqp->pau_busy = 0;
-                       again = 0;
-               }
-
-               spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-       } while (again);
-}
-
-/**
- * queue_fpdus - Handle fpdus that hw passed up to sw
- */
-static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-       struct sk_buff *tmpskb;
-       struct nes_rskb_cb *cb;
-       struct iphdr *iph;
-       struct tcphdr *tcph;
-       unsigned char *tcph_end;
-       u32 rcv_nxt;
-       u32 rcv_wnd;
-       u32 seqnum;
-       u32 len;
-       bool process_it = false;
-       unsigned long flags;
-
-       /* Move data ptr to after tcp header */
-       iph = (struct iphdr *)skb->data;
-       tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
-       seqnum = be32_to_cpu(tcph->seq);
-       tcph_end = (((char *)tcph) + (4 * tcph->doff));
-
-       len = be16_to_cpu(iph->tot_len);
-       if (skb->len > len)
-               skb_trim(skb, len);
-       skb_pull(skb, tcph_end - skb->data);
-
-       /* Initialize tracking values */
-       cb = (struct nes_rskb_cb *)&skb->cb[0];
-       cb->seqnum = seqnum;
-
-       /* Make sure data is in the receive window */
-       rcv_nxt = nesqp->pau_rcv_nxt;
-       rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd);
-       if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) {
-               nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE);
-               nes_rem_ref_cm_node(nesqp->cm_node);
-               return;
-       }
-
-       spin_lock_irqsave(&nesqp->pau_lock, flags);
-
-       if (nesqp->pau_busy)
-               nesqp->pau_pending = 1;
-       else
-               nesqp->pau_busy = 1;
-
-       /* Queue skb by sequence number */
-       if (skb_queue_len(&nesqp->pau_list) == 0) {
-               __skb_queue_head(&nesqp->pau_list, skb);
-       } else {
-               skb_queue_walk(&nesqp->pau_list, tmpskb) {
-                       cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
-                       if (before(seqnum, cb->seqnum))
-                               break;
-               }
-               __skb_insert(skb, tmpskb->prev, tmpskb, &nesqp->pau_list);
-       }
-       if (nesqp->pau_state == PAU_READY)
-               process_it = true;
-       spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-
-       if (process_it)
-               process_fpdus(nesvnic, nesqp);
-
-       return;
-}
-
-/**
- * mgt_thread - Handle mgt skbs in a safe context
- */
-static int mgt_thread(void *context)
-{
-       struct nes_vnic *nesvnic = context;
-       struct sk_buff *skb;
-       struct nes_rskb_cb *cb;
-
-       while (!kthread_should_stop()) {
-               wait_event_interruptible(nesvnic->mgt_wait_queue,
-                                        skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop());
-               while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) {
-                       skb = skb_dequeue(&nesvnic->mgt_skb_list);
-                       cb = (struct nes_rskb_cb *)&skb->cb[0];
-                       cb->data_start = skb->data - ETH_HLEN;
-                       cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start,
-                                                    nesvnic->max_frame_size, PCI_DMA_TODEVICE);
-                       queue_fpdus(skb, nesvnic, cb->nesqp);
-               }
-       }
-
-       /* Closing down so delete any entries on the queue */
-       while (skb_queue_len(&nesvnic->mgt_skb_list)) {
-               skb = skb_dequeue(&nesvnic->mgt_skb_list);
-               cb = (struct nes_rskb_cb *)&skb->cb[0];
-               nes_rem_ref_cm_node(cb->nesqp->cm_node);
-               dev_kfree_skb_any(skb);
-       }
-       return 0;
-}
-
-/**
- * nes_queue_mgt_skbs - Queue skb so it can be handled in a thread context
- */
-void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-       struct nes_rskb_cb *cb;
-
-       cb = (struct nes_rskb_cb *)&skb->cb[0];
-       cb->nesqp = nesqp;
-       skb_queue_tail(&nesvnic->mgt_skb_list, skb);
-       wake_up_interruptible(&nesvnic->mgt_wait_queue);
-}
-
-void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp)
-{
-       struct sk_buff *skb;
-       unsigned long flags;
-       atomic_inc(&pau_qps_destroyed);
-
-       /* Free packets that have not yet been forwarded */
-       /* Lock is acquired by skb_dequeue when removing the skb */
-       spin_lock_irqsave(&nesqp->pau_lock, flags);
-       while (skb_queue_len(&nesqp->pau_list)) {
-               skb = skb_dequeue(&nesqp->pau_list);
-               nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
-               nes_rem_ref_cm_node(nesqp->cm_node);
-       }
-       spin_unlock_irqrestore(&nesqp->pau_lock, flags);
-}
-
-static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
-{
-       struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer;
-       struct nes_cqp_request *new_request;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_adapter *nesadapter;
-       struct nes_qp *nesqp;
-       struct nes_v4_quad nes_quad;
-       u32 crc_value;
-       u64 u64temp;
-
-       nesadapter = nesdev->nesadapter;
-       nesqp = qh_chg->nesqp;
-
-       /* Should we handle the bad completion */
-       if (cqp_request->major_code)
-               WARN(1, PFX "Invalid cqp_request major_code=0x%x\n",
-                      cqp_request->major_code);
-
-       switch (nesqp->pau_state) {
-       case PAU_DEL_QH:
-               /* Old hash code deleted, now set the new one */
-               nesqp->pau_state = PAU_ADD_LB_QH;
-               new_request = nes_get_cqp_request(nesdev);
-               if (new_request == NULL) {
-                       nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n");
-                       WARN_ON(1);
-                       return;
-               }
-
-               memset(&nes_quad, 0, sizeof(nes_quad));
-               nes_quad.DstIpAdrIndex =
-                       cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
-               nes_quad.SrcIpadr = cpu_to_be32(0x7f000001);
-               nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]);
-               nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]);
-
-               /* Produce hash key */
-               crc_value = get_crc_value(&nes_quad);
-               nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
-               nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n",
-                         nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
-
-               nesqp->hte_index &= nesadapter->hte_index_mask;
-               nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
-               nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001);
-               nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt);
-
-               cqp_wqe = &new_request->cqp_wqe;
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-               set_wqe_32bit_value(cqp_wqe->wqe_words,
-                                   NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH |
-                                   NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-               u64temp = (u64)nesqp->nesqp_context_pbase;
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-               nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n");
-
-               new_request->cqp_callback_pointer = qh_chg;
-               new_request->callback = 1;
-               new_request->cqp_callback = nes_chg_qh_handler;
-               atomic_set(&new_request->refcount, 1);
-               nes_post_cqp_request(nesdev, new_request);
-               break;
-
-       case PAU_ADD_LB_QH:
-               /* Start processing the queued fpdu's */
-               nesqp->pau_state = PAU_READY;
-               process_fpdus(qh_chg->nesvnic, qh_chg->nesqp);
-               kfree(qh_chg);
-               break;
-       }
-}
-
-/**
- * nes_change_quad_hash
- */
-static int nes_change_quad_hash(struct nes_device *nesdev,
-                               struct nes_vnic *nesvnic, struct nes_qp *nesqp)
-{
-       struct nes_cqp_request *cqp_request = NULL;
-       struct pau_qh_chg *qh_chg = NULL;
-       u64 u64temp;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       int ret = 0;
-
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
-               ret = -ENOMEM;
-               goto chg_qh_err;
-       }
-
-       qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC);
-       if (!qh_chg) {
-               ret = -ENOMEM;
-               goto chg_qh_err;
-       }
-       qh_chg->nesdev = nesdev;
-       qh_chg->nesvnic = nesvnic;
-       qh_chg->nesqp = nesqp;
-       nesqp->pau_state = PAU_DEL_QH;
-
-       cqp_wqe = &cqp_request->cqp_wqe;
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words,
-                           NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE |
-                           NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-       u64temp = (u64)nesqp->nesqp_context_pbase;
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-       nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n");
-
-       cqp_request->cqp_callback_pointer = qh_chg;
-       cqp_request->callback = 1;
-       cqp_request->cqp_callback = nes_chg_qh_handler;
-       atomic_set(&cqp_request->refcount, 1);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       return ret;
-
-chg_qh_err:
-       kfree(qh_chg);
-       if (cqp_request)
-               nes_put_cqp_request(nesdev, cqp_request);
-       return ret;
-}
-
-/**
- * nes_mgt_ce_handler
- * This management code deals with any packed and unaligned (pau) fpdus
- * that the hardware cannot handle.
- */
-static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
-{
-       struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq);
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 head;
-       u32 cq_size;
-       u32 cqe_count = 0;
-       u32 cqe_misc;
-       u32 qp_id = 0;
-       u32 skbs_needed;
-       unsigned long context;
-       struct nes_qp *nesqp;
-       struct sk_buff *rx_skb;
-       struct nes_rskb_cb *cb;
-
-       head = cq->cq_head;
-       cq_size = cq->cq_size;
-
-       while (1) {
-               cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
-               if (!(cqe_misc & NES_NIC_CQE_VALID))
-                       break;
-
-               nesqp = NULL;
-               if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) {
-                       qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]);
-                       qp_id &= 0x001fffff;
-                       if (qp_id < nesadapter->max_qp) {
-                               context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN];
-                               nesqp = (struct nes_qp *)context;
-                       }
-               }
-
-               if (nesqp) {
-                       if (nesqp->pau_mode == false) {
-                               nesqp->pau_mode = true; /* First time for this qp */
-                               nesqp->pau_rcv_nxt = le32_to_cpu(
-                                       cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]);
-                               skb_queue_head_init(&nesqp->pau_list);
-                               spin_lock_init(&nesqp->pau_lock);
-                               atomic_inc(&pau_qps_created);
-                               nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp);
-                       }
-
-                       rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
-                       rx_skb->len = 0;
-                       skb_put(rx_skb, cqe_misc & 0x0000ffff);
-                       rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev);
-                       cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
-                       pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE);
-                       cb->busaddr = 0;
-                       mgtvnic->mgt.rq_tail++;
-                       mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1;
-
-                       nes_add_ref_cm_node(nesqp->cm_node);
-                       nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp);
-               } else {
-                       printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id);
-               }
-
-               cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
-               cqe_count++;
-               if (++head >= cq_size)
-                       head = 0;
-
-               if (cqe_count == 255) {
-                       /* Replenish mgt CQ */
-                       nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16));
-                       nesdev->currcq_count += cqe_count;
-                       cqe_count = 0;
-               }
-
-               skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed);
-               if (skbs_needed > (mgtvnic->mgt.rq_size >> 1))
-                       nes_replenish_mgt_rq(mgtvnic);
-       }
-
-       cq->cq_head = head;
-       nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                   cq->cq_number | (cqe_count << 16));
-       nes_read32(nesdev->regs + NES_CQE_ALLOC);
-       nesdev->currcq_count += cqe_count;
-}
-
-/**
- * nes_init_mgt_qp
- */
-int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic)
-{
-       struct nes_vnic_mgt *mgtvnic;
-       u32 counter;
-       void *vmem;
-       dma_addr_t pmem;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       u32 cqp_head;
-       unsigned long flags;
-       struct nes_hw_nic_qp_context *mgt_context;
-       u64 u64temp;
-       struct nes_hw_nic_rq_wqe *mgt_rqe;
-       struct sk_buff *skb;
-       u32 wqe_count;
-       struct nes_rskb_cb *cb;
-       u32 mgt_mem_size;
-       void *mgt_vbase;
-       dma_addr_t mgt_pbase;
-       int i;
-       int ret;
-
-       /* Allocate space for all the mgt QPs once */
-       mgtvnic = kcalloc(NES_MGT_QP_COUNT, sizeof(struct nes_vnic_mgt),
-                         GFP_KERNEL);
-       if (!mgtvnic)
-               return -ENOMEM;
-
-       /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */
-       /* We are not sending from this NIC so sq is not allocated */
-       mgt_mem_size = 256 +
-                      (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) +
-                      (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) +
-                      sizeof(struct nes_hw_nic_qp_context);
-       mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
-       mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase);
-       if (!mgt_vbase) {
-               kfree(mgtvnic);
-               nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n");
-               return -ENOMEM;
-       }
-
-       nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size;
-       nesvnic->mgt_vbase = mgt_vbase;
-       nesvnic->mgt_pbase = mgt_pbase;
-
-       skb_queue_head_init(&nesvnic->mgt_skb_list);
-       init_waitqueue_head(&nesvnic->mgt_wait_queue);
-       nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread");
-
-       for (i = 0; i < NES_MGT_QP_COUNT; i++) {
-               mgtvnic->nesvnic = nesvnic;
-               mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i;
-               memset(mgt_vbase, 0, mgt_mem_size);
-               nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n",
-                         mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size);
-
-               vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) &
-                               ~(unsigned long)(256 - 1));
-               pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) &
-                                   ~(unsigned long long)(256 - 1));
-
-               spin_lock_init(&mgtvnic->mgt.rq_lock);
-
-               /* setup the RQ */
-               mgtvnic->mgt.rq_vbase = vmem;
-               mgtvnic->mgt.rq_pbase = pmem;
-               mgtvnic->mgt.rq_head = 0;
-               mgtvnic->mgt.rq_tail = 0;
-               mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT;
-
-               /* setup the CQ */
-               vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
-               pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
-
-               mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id;
-               mgtvnic->mgt_cq.cq_vbase = vmem;
-               mgtvnic->mgt_cq.cq_pbase = pmem;
-               mgtvnic->mgt_cq.cq_head = 0;
-               mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT;
-
-               mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler;
-
-               /* Send CreateCQ request to CQP */
-               spin_lock_irqsave(&nesdev->cqp.lock, flags);
-               cqp_head = nesdev->cqp.sq_head;
-
-               cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-               cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
-                       NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-                       ((u32)mgtvnic->mgt_cq.cq_size << 16));
-               cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
-                       mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16));
-               u64temp = (u64)mgtvnic->mgt_cq.cq_pbase;
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-               cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
-               u64temp = (unsigned long)&mgtvnic->mgt_cq;
-               cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1));
-               cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-                       cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-               cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-
-               if (++cqp_head >= nesdev->cqp.sq_size)
-                       cqp_head = 0;
-               cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-               /* Send CreateQP request to CQP */
-               mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]);
-               mgt_context->context_words[NES_NIC_CTX_MISC_IDX] =
-                       cpu_to_le32((u32)NES_MGT_CTX_SIZE |
-                                   ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
-               nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
-                         nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
-                         nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
-               if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0)
-                       mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
-
-               u64temp = (u64)mgtvnic->mgt.rq_pbase;
-               mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
-               mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-               u64temp = (u64)mgtvnic->mgt.rq_pbase;
-               mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
-               mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
-
-               cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
-                                                                        NES_CQP_QP_TYPE_NIC);
-               cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id);
-               u64temp = (u64)mgtvnic->mgt_cq.cq_pbase +
-                         (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-               if (++cqp_head >= nesdev->cqp.sq_size)
-                       cqp_head = 0;
-               nesdev->cqp.sq_head = cqp_head;
-
-               barrier();
-
-               /* Ring doorbell (2 WQEs) */
-               nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-               spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-               nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n",
-                         mgtvnic->mgt.qp_id);
-
-               ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-                                        NES_EVENT_TIMEOUT);
-               nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n",
-                         mgtvnic->mgt.qp_id, ret);
-               if (!ret) {
-                       nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id);
-                       if (i == 0) {
-                               pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
-                                                   nesvnic->mgt_pbase);
-                               kfree(mgtvnic);
-                       } else {
-                               nes_destroy_mgt(nesvnic);
-                       }
-                       return -EIO;
-               }
-
-               /* Populate the RQ */
-               for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) {
-                       skb = dev_alloc_skb(nesvnic->max_frame_size);
-                       if (!skb) {
-                               nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
-                               return -ENOMEM;
-                       }
-
-                       skb->dev = netdev;
-
-                       pmem = pci_map_single(nesdev->pcidev, skb->data,
-                                             nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
-                       cb = (struct nes_rskb_cb *)&skb->cb[0];
-                       cb->busaddr = pmem;
-                       cb->maplen = nesvnic->max_frame_size;
-
-                       mgt_rqe = &mgtvnic->mgt.rq_vbase[counter];
-                       mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size);
-                       mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
-                       mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
-                       mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
-                       mgtvnic->mgt.rx_skb[counter] = skb;
-               }
-
-               timer_setup(&mgtvnic->rq_wqes_timer, nes_mgt_rq_wqes_timeout,
-                           0);
-
-               wqe_count = NES_MGT_WQ_COUNT - 1;
-               mgtvnic->mgt.rq_head = wqe_count;
-               barrier();
-               do {
-                       counter = min(wqe_count, ((u32)255));
-                       wqe_count -= counter;
-                       nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id);
-               } while (wqe_count);
-
-               nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                           mgtvnic->mgt_cq.cq_number);
-               nes_read32(nesdev->regs + NES_CQE_ALLOC);
-
-               mgt_vbase += mgt_mem_size;
-               mgt_pbase += mgt_mem_size;
-               nesvnic->mgtvnic[i] = mgtvnic++;
-       }
-       return 0;
-}
-
-
-void nes_destroy_mgt(struct nes_vnic *nesvnic)
-{
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_vnic_mgt *mgtvnic;
-       struct nes_vnic_mgt *first_mgtvnic;
-       unsigned long flags;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       u32 cqp_head;
-       struct sk_buff *rx_skb;
-       int i;
-       int ret;
-
-       kthread_stop(nesvnic->mgt_thread);
-
-       /* Free remaining NIC receive buffers */
-       first_mgtvnic = nesvnic->mgtvnic[0];
-       for (i = 0; i < NES_MGT_QP_COUNT; i++) {
-               mgtvnic = nesvnic->mgtvnic[i];
-               if (mgtvnic == NULL)
-                       continue;
-
-               while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) {
-                       rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
-                       nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE);
-                       mgtvnic->mgt.rq_tail++;
-                       mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1);
-               }
-
-               spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-               /* Destroy NIC QP */
-               cqp_head = nesdev->cqp.sq_head;
-               cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                                   (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-                                   mgtvnic->mgt.qp_id);
-
-               if (++cqp_head >= nesdev->cqp.sq_size)
-                       cqp_head = 0;
-
-               cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-
-               /* Destroy NIC CQ */
-               nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                                   (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16)));
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-                                   (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
-
-               if (++cqp_head >= nesdev->cqp.sq_size)
-                       cqp_head = 0;
-
-               nesdev->cqp.sq_head = cqp_head;
-               barrier();
-
-               /* Ring doorbell (2 WQEs) */
-               nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
-
-               spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-               nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
-                         " cqp.sq_tail=%u, cqp.sq_size=%u\n",
-                         cqp_head, nesdev->cqp.sq_head,
-                         nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
-
-               ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
-                                        NES_EVENT_TIMEOUT);
-
-               nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
-                         " cqp.sq_head=%u, cqp.sq_tail=%u\n",
-                         ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
-               if (!ret)
-                       nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n",
-                                 mgtvnic->mgt.qp_id);
-
-               nesvnic->mgtvnic[i] = NULL;
-       }
-
-       if (nesvnic->mgt_vbase) {
-               pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
-                                   nesvnic->mgt_pbase);
-               nesvnic->mgt_vbase = NULL;
-               nesvnic->mgt_pbase = 0;
-       }
-
-       kfree(first_mgtvnic);
-}
diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h
deleted file mode 100644 (file)
index 4f7f701..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-* Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
-*
-* This software is available to you under a choice of one of two
-* licenses.  You may choose to be licensed under the terms of the GNU
-* General Public License (GPL) Version 2, available from the file
-* COPYING in the main directory of this source tree, or the
-* OpenIB.org BSD license below:
-*
-*     Redistribution and use in source and binary forms, with or
-*     without modification, are permitted provided that the following
-*     conditions are met:
-*
-*      - Redistributions of source code must retain the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer.
-*
-*      - Redistributions in binary form must reproduce the above
-*        copyright notice, this list of conditions and the following
-*        disclaimer in the documentation and/or other materials
-*        provided with the distribution.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef __NES_MGT_H
-#define __NES_MGT_H
-
-#define MPA_FRAMING 6  /* length is 2 bytes, crc is 4 bytes */
-
-int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic);
-void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp);
-void nes_destroy_mgt(struct nes_vnic *nesvnic);
-void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp);
-
-struct nes_hw_mgt {
-       struct nes_hw_nic_rq_wqe *rq_vbase;     /* virtual address of rq */
-       dma_addr_t rq_pbase;                    /* PCI memory for host rings */
-       struct sk_buff *rx_skb[NES_NIC_WQ_SIZE];
-       u16 qp_id;
-       u16 sq_head;
-       u16 rq_head;
-       u16 rq_tail;
-       u16 rq_size;
-       u8 replenishing_rq;
-       u8 reserved;
-       spinlock_t rq_lock;
-};
-
-struct nes_vnic_mgt {
-       struct nes_vnic        *nesvnic;
-       struct nes_hw_mgt      mgt;
-       struct nes_hw_nic_cq   mgt_cq;
-       atomic_t               rx_skbs_needed;
-       struct timer_list      rq_wqes_timer;
-       atomic_t               rx_skb_timer_running;
-};
-
-#define MAX_FPDU_FRAGS 4
-struct pau_fpdu_frag {
-       struct sk_buff         *skb;
-       u64                    physaddr;
-       u32                    frag_len;
-       bool                   cmplt;
-};
-
-struct pau_fpdu_info {
-       struct nes_qp          *nesqp;
-       struct nes_cqp_request *cqp_request;
-       void                   *hdr_vbase;
-       dma_addr_t             hdr_pbase;
-       int                    hdr_len;
-       u16                    data_len;
-       u16                    frag_cnt;
-       struct pau_fpdu_frag   frags[MAX_FPDU_FRAGS];
-};
-
-enum pau_qh_state {
-       PAU_DEL_QH,
-       PAU_ADD_LB_QH,
-       PAU_READY
-};
-
-struct pau_qh_chg {
-       struct nes_device *nesdev;
-       struct nes_vnic *nesvnic;
-       struct nes_qp *nesqp;
-};
-
-#endif          /* __NES_MGT_H */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
deleted file mode 100644 (file)
index 16f3345..0000000
+++ /dev/null
@@ -1,1870 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/if_arp.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/slab.h>
-#include <net/tcp.h>
-
-#include <net/inet_common.h>
-#include <linux/inet.h>
-
-#include "nes.h"
-
-static struct nic_qp_map nic_qp_mapping_0[] = {
-       {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0},
-       {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0},
-       {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
-       {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_1[] = {
-       {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
-       {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_2[] = {
-       {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_3[] = {
-       {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_4[] = {
-       {28,8,0,0},{32,12,0,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_5[] = {
-       {29,9,1,0},{33,13,1,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_6[] = {
-       {30,10,2,0},{34,14,2,0}
-};
-
-static struct nic_qp_map nic_qp_mapping_7[] = {
-       {31,11,3,0},{35,15,3,0}
-};
-
-static struct nic_qp_map *nic_qp_mapping_per_function[] = {
-       nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3,
-       nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7
-};
-
-static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
-               | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
-static int debug = -1;
-static int nics_per_function = 1;
-
-/**
- * nes_netdev_poll
- */
-static int nes_netdev_poll(struct napi_struct *napi, int budget)
-{
-       struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq;
-
-       nesvnic->budget = budget;
-       nescq->cqes_pending = 0;
-       nescq->rx_cqes_completed = 0;
-       nescq->cqe_allocs_pending = 0;
-       nescq->rx_pkts_indicated = 0;
-
-       nes_nic_ce_handler(nesdev, nescq);
-
-       if (nescq->cqes_pending == 0) {
-               napi_complete(napi);
-               /* clear out completed cqes and arm */
-               nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                               nescq->cq_number | (nescq->cqe_allocs_pending << 16));
-               nes_read32(nesdev->regs+NES_CQE_ALLOC);
-       } else {
-               /* clear out completed cqes but don't arm */
-               nes_write32(nesdev->regs+NES_CQE_ALLOC,
-                               nescq->cq_number | (nescq->cqe_allocs_pending << 16));
-               nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n",
-                               nesvnic->netdev->name);
-       }
-       return nescq->rx_pkts_indicated;
-}
-
-
-/**
- * nes_netdev_open - Activate the network interface; ifconfig
- * ethx up.
- */
-static int nes_netdev_open(struct net_device *netdev)
-{
-       u32 macaddr_low;
-       u16 macaddr_high;
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       int ret;
-       int i;
-       struct nes_vnic *first_nesvnic = NULL;
-       u32 nic_active_bit;
-       u32 nic_active;
-       struct list_head *list_pos, *list_temp;
-       unsigned long flags;
-
-       if (nesvnic->netdev_open == 1)
-               return 0;
-
-       if (netif_msg_ifup(nesvnic))
-               printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name);
-
-       ret = nes_init_nic_qp(nesdev, netdev);
-       if (ret) {
-               return ret;
-       }
-
-       netif_carrier_off(netdev);
-       netif_stop_queue(netdev);
-
-       if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) {
-               nesvnic->nesibdev = nes_init_ofa_device(netdev);
-               if (nesvnic->nesibdev == NULL) {
-                       printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name);
-               } else {
-                       nesvnic->nesibdev->nesvnic = nesvnic;
-                       ret = nes_register_ofa_device(nesvnic->nesibdev);
-                       if (ret) {
-                               printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n",
-                                               netdev->name, ret);
-                       }
-               }
-       }
-       /* Set packet filters */
-       nic_active_bit = 1 << nesvnic->nic_index;
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
-       nic_active |= nic_active_bit;
-       nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
-       nic_active |= nic_active_bit;
-       nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
-       nic_active |= nic_active_bit;
-       nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
-
-       macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
-       macaddr_high += (u16)netdev->dev_addr[1];
-
-       macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
-       macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
-       macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
-       macaddr_low  += (u32)netdev->dev_addr[5];
-
-       /* Program the various MAC regs */
-       for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
-               if (nesvnic->qp_nic_index[i] == 0xf) {
-                       break;
-               }
-               nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW"
-                               " (Addr:%08X) = %08X, HIGH = %08X.\n",
-                               i, nesvnic->qp_nic_index[i],
-                               NES_IDX_PERFECT_FILTER_LOW+
-                                       (nesvnic->qp_nic_index[i] * 8),
-                               macaddr_low,
-                               (u32)macaddr_high | NES_MAC_ADDR_VALID |
-                               ((((u32)nesvnic->nic_index) << 16)));
-               nes_write_indexed(nesdev,
-                               NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
-                               macaddr_low);
-               nes_write_indexed(nesdev,
-                               NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
-                               (u32)macaddr_high | NES_MAC_ADDR_VALID |
-                               ((((u32)nesvnic->nic_index) << 16)));
-       }
-
-
-       nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
-                       nesvnic->nic_cq.cq_number);
-       nes_read32(nesdev->regs+NES_CQE_ALLOC);
-       list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
-               first_nesvnic = container_of(list_pos, struct nes_vnic, list);
-               if (first_nesvnic->netdev_open == 1)
-                       break;
-       }
-       if (first_nesvnic->netdev_open == 0) {
-               nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n");
-               nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index),
-                               ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
-                               NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
-               first_nesvnic = nesvnic;
-       }
-
-       if (first_nesvnic->linkup) {
-               /* Enable network packets */
-               nesvnic->linkup = 1;
-               netif_start_queue(netdev);
-               netif_carrier_on(netdev);
-       }
-
-       spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-       if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) {
-               nesdev->link_recheck = 1;
-               mod_delayed_work(system_wq, &nesdev->work,
-                                NES_LINK_RECHECK_DELAY);
-       }
-       spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-
-       spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
-       if (nesvnic->of_device_registered) {
-               nesdev->nesadapter->send_term_ok = 1;
-               if (nesvnic->linkup == 1) {
-                       if (nesdev->iw_status == 0) {
-                               nesdev->iw_status = 1;
-                               nes_port_ibevent(nesvnic);
-                       }
-               } else {
-                       nesdev->iw_status = 0;
-               }
-       }
-       spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
-
-       napi_enable(&nesvnic->napi);
-       nesvnic->netdev_open = 1;
-
-       return 0;
-}
-
-
-/**
- * nes_netdev_stop
- */
-static int nes_netdev_stop(struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u32 nic_active_mask;
-       u32 nic_active;
-       struct nes_vnic *first_nesvnic = NULL;
-       struct list_head *list_pos, *list_temp;
-       unsigned long flags;
-
-       nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n",
-                       nesvnic, nesdev, netdev, netdev->name);
-       if (nesvnic->netdev_open == 0)
-               return 0;
-
-       if (netif_msg_ifdown(nesvnic))
-               printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name);
-       netif_carrier_off(netdev);
-
-       /* Disable network packets */
-       napi_disable(&nesvnic->napi);
-       netif_stop_queue(netdev);
-       list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
-               first_nesvnic = container_of(list_pos, struct nes_vnic, list);
-               if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic))
-                       break;
-       }
-
-       if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)  &&
-               (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) !=
-               PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) {
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+
-                               (0x200*nesdev->mac_index), 0xffffffff);
-                       nes_write_indexed(first_nesvnic->nesdev,
-                               NES_IDX_MAC_INT_MASK+
-                               (0x200*first_nesvnic->nesdev->mac_index),
-                       ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
-                       NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
-       } else {
-               nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff);
-       }
-
-       nic_active_mask = ~((u32)(1 << nesvnic->nic_index));
-       nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+
-                       (nesvnic->perfect_filter_index*8), 0);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
-       nic_active &= nic_active_mask;
-       nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-       nic_active &= nic_active_mask;
-       nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
-       nic_active &= nic_active_mask;
-       nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-       nic_active &= nic_active_mask;
-       nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
-       nic_active &= nic_active_mask;
-       nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
-
-       spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
-       if (nesvnic->of_device_registered) {
-               nesdev->nesadapter->send_term_ok = 0;
-               nesdev->iw_status = 0;
-               if (nesvnic->linkup == 1)
-                       nes_port_ibevent(nesvnic);
-       }
-       del_timer_sync(&nesvnic->event_timer);
-       nesvnic->event_timer.function = NULL;
-       spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
-
-       nes_destroy_nic_qp(nesvnic);
-
-       nesvnic->netdev_open = 0;
-
-       return 0;
-}
-
-
-/**
- * nes_nic_send
- */
-static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_nic *nesnic = &nesvnic->nic;
-       struct nes_hw_nic_sq_wqe *nic_sqe;
-       struct tcphdr *tcph;
-       __le16 *wqe_fragment_length;
-       u32 wqe_misc;
-       u16 wqe_fragment_index = 1;     /* first fragment (0) is used by copy buffer */
-       u16 skb_fragment_index;
-       dma_addr_t bus_address;
-
-       nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
-       wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-
-       /* setup the VLAN tag if present */
-       if (skb_vlan_tag_present(skb)) {
-               nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-                               netdev->name, skb_vlan_tag_get(skb));
-               wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-               wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
-       } else
-               wqe_misc = 0;
-
-       /* bump past the vlan tag */
-       wqe_fragment_length++;
-       /*      wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */
-       wqe_misc |= NES_NIC_SQ_WQE_COMPLETION;
-
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               if (skb_is_gso(skb)) {
-                       tcph = tcp_hdr(skb);
-                       /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n",
-                                       netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */
-                       wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size;
-                       set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
-                                       ((u32)tcph->doff) |
-                                       (((u32)(((unsigned char *)tcph) - skb->data)) << 4));
-               }
-       } else {        /* CHECKSUM_HW */
-               wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM;
-       }
-
-       set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
-                               skb->len);
-       memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
-                       skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb)));
-       wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
-                       skb_headlen(skb)));
-       wqe_fragment_length[1] = 0;
-       if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
-               if ((skb_shinfo(skb)->nr_frags + 1) > 4) {
-                       nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n",
-                                       netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
-                       kfree_skb(skb);
-                       nesvnic->tx_sw_dropped++;
-                       return false;
-               }
-               set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
-               bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
-                               skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE);
-               wqe_fragment_length[wqe_fragment_index++] =
-                               cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE);
-               wqe_fragment_length[wqe_fragment_index] = 0;
-               set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-                               ((u64)(bus_address)));
-               nesnic->tx_skb[nesnic->sq_head] = skb;
-       }
-
-       if (skb_headlen(skb) == skb->len) {
-               if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
-                       nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
-                       nesnic->tx_skb[nesnic->sq_head] = skb;
-               }
-       } else {
-               /* Deal with Fragments */
-               nesnic->tx_skb[nesnic->sq_head] = skb;
-               for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags;
-                               skb_fragment_index++) {
-                       skb_frag_t *frag =
-                               &skb_shinfo(skb)->frags[skb_fragment_index];
-                       bus_address = skb_frag_dma_map(&nesdev->pcidev->dev,
-                                                      frag, 0, skb_frag_size(frag),
-                                                      DMA_TO_DEVICE);
-                       wqe_fragment_length[wqe_fragment_index] =
-                                       cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index]));
-                       set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
-                               bus_address);
-                       wqe_fragment_index++;
-                       if (wqe_fragment_index < 5)
-                               wqe_fragment_length[wqe_fragment_index] = 0;
-               }
-       }
-
-       set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
-       nesnic->sq_head++;
-       nesnic->sq_head &= nesnic->sq_size - 1;
-       return true;
-}
-
-
-/**
- * nes_netdev_start_xmit
- */
-static netdev_tx_t nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_hw_nic *nesnic = &nesvnic->nic;
-       struct nes_hw_nic_sq_wqe *nic_sqe;
-       struct tcphdr *tcph;
-       /* struct udphdr *udph; */
-#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS
-       /* 64K segment plus overflow on each side */
-       dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS];
-       dma_addr_t bus_address;
-       u32 tso_frag_index;
-       u32 tso_frag_count;
-       u32 tso_wqe_length;
-       u32 curr_tcp_seq;
-       u32 wqe_count=1;
-       struct iphdr *iph;
-       __le16 *wqe_fragment_length;
-       u32 nr_frags;
-       u32 original_first_length;
-       /* u64 *wqe_fragment_address; */
-       /* first fragment (0) is used by copy buffer */
-       u16 wqe_fragment_index=1;
-       u16 hoffset;
-       u16 nhoffset;
-       u16 wqes_needed;
-       u16 wqes_available;
-       u32 wqe_misc;
-
-       /*
-        * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
-        *              " (%u frags), tso_size=%u\n",
-        *              netdev->name, skb->len, skb_headlen(skb),
-        *              skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
-        */
-
-       if (netif_queue_stopped(netdev))
-               return NETDEV_TX_BUSY;
-
-       /* Check if SQ is full */
-       if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
-               if (!netif_queue_stopped(netdev)) {
-                       netif_stop_queue(netdev);
-                       barrier();
-                       if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) {
-                               netif_start_queue(netdev);
-                               goto sq_no_longer_full;
-                       }
-               }
-               nesvnic->sq_full++;
-               return NETDEV_TX_BUSY;
-       }
-
-sq_no_longer_full:
-       nr_frags = skb_shinfo(skb)->nr_frags;
-       if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
-               nr_frags++;
-       }
-       /* Check if too many fragments */
-       if (unlikely((nr_frags > 4))) {
-               if (skb_is_gso(skb)) {
-                       nesvnic->segmented_tso_requests++;
-                       nesvnic->tso_requests++;
-                       /* Basically 4 fragments available per WQE with extended fragments */
-                       wqes_needed = nr_frags >> 2;
-                       wqes_needed += (nr_frags&3)?1:0;
-                       wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) &
-                                       (nesnic->sq_size - 1);
-
-                       if (unlikely(wqes_needed > wqes_available)) {
-                               if (!netif_queue_stopped(netdev)) {
-                                       netif_stop_queue(netdev);
-                                       barrier();
-                                       wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) &
-                                               (nesnic->sq_size - 1);
-                                       if (wqes_needed <= wqes_available) {
-                                               netif_start_queue(netdev);
-                                               goto tso_sq_no_longer_full;
-                                       }
-                               }
-                               nesvnic->sq_full++;
-                               nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
-                                               netdev->name);
-                               return NETDEV_TX_BUSY;
-                       }
-tso_sq_no_longer_full:
-                       /* Map all the buffers */
-                       for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags;
-                                       tso_frag_count++) {
-                               skb_frag_t *frag =
-                                       &skb_shinfo(skb)->frags[tso_frag_count];
-                               tso_bus_address[tso_frag_count] =
-                                       skb_frag_dma_map(&nesdev->pcidev->dev,
-                                                        frag, 0, skb_frag_size(frag),
-                                                        DMA_TO_DEVICE);
-                       }
-
-                       tso_frag_index = 0;
-                       curr_tcp_seq = ntohl(tcp_hdr(skb)->seq);
-                       hoffset = skb_transport_header(skb) - skb->data;
-                       nhoffset = skb_network_header(skb) - skb->data;
-                       original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2);
-
-                       for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) {
-                               tso_wqe_length = 0;
-                               nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
-                               wqe_fragment_length =
-                                               (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
-                               /* setup the VLAN tag if present */
-                               if (skb_vlan_tag_present(skb)) {
-                                       nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
-                                                       netdev->name,
-                                                 skb_vlan_tag_get(skb));
-                                       wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
-                                       wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
-                               } else
-                                       wqe_misc = 0;
-
-                               /* bump past the vlan tag */
-                               wqe_fragment_length++;
-
-                               /* Assumes header totally fits in allocated buffer and is in first fragment */
-                               if (original_first_length > NES_FIRST_FRAG_SIZE) {
-                                       nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n",
-                                                       original_first_length, NES_FIRST_FRAG_SIZE);
-                                       nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
-                                                       " (%u frags), is_gso = %u tso_size=%u\n",
-                                                       netdev->name,
-                                                       skb->len, skb_headlen(skb),
-                                                       skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size);
-                               }
-                               memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
-                                               skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE),
-                                               original_first_length));
-                               iph = (struct iphdr *)
-                               (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]);
-                               tcph = (struct tcphdr *)
-                               (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]);
-                               if ((wqe_count+1)!=(u32)wqes_needed) {
-                                       tcph->fin = 0;
-                                       tcph->psh = 0;
-                                       tcph->rst = 0;
-                                       tcph->urg = 0;
-                               }
-                               if (wqe_count) {
-                                       tcph->syn = 0;
-                               }
-                               tcph->seq = htonl(curr_tcp_seq);
-                               wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
-                                               original_first_length));
-
-                               wqe_fragment_index = 1;
-                               if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) {
-                                       set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
-                                       bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length,
-                                                       skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE);
-                                       wqe_fragment_length[wqe_fragment_index++] =
-                                               cpu_to_le16(skb_headlen(skb) - original_first_length);
-                                       wqe_fragment_length[wqe_fragment_index] = 0;
-                                       set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
-                                                                       bus_address);
-                                       tso_wqe_length += skb_headlen(skb) -
-                                                       original_first_length;
-                               }
-                               while (wqe_fragment_index < 5) {
-                                       wqe_fragment_length[wqe_fragment_index] =
-                                                       cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index]));
-                                       set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
-                                               (u64)tso_bus_address[tso_frag_index]);
-                                       wqe_fragment_index++;
-                                       tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]);
-                                       if (wqe_fragment_index < 5)
-                                               wqe_fragment_length[wqe_fragment_index] = 0;
-                                       if (tso_frag_index == tso_frag_count)
-                                               break;
-                               }
-                               if ((wqe_count+1) == (u32)wqes_needed) {
-                                       nesnic->tx_skb[nesnic->sq_head] = skb;
-                               } else {
-                                       nesnic->tx_skb[nesnic->sq_head] = NULL;
-                               }
-                               wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size;
-                               if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) {
-                                       wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE;
-                               } else {
-                                       iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset);
-                               }
-
-                               set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX,
-                                                wqe_misc);
-                               set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
-                                               ((u32)tcph->doff) | (((u32)hoffset) << 4));
-
-                               set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
-                                               tso_wqe_length + original_first_length);
-                               curr_tcp_seq += tso_wqe_length;
-                               nesnic->sq_head++;
-                               nesnic->sq_head &= nesnic->sq_size-1;
-                       }
-               } else {
-                       hoffset = skb_transport_header(skb) - skb->data;
-                       nhoffset = skb_network_header(skb) - skb->data;
-                       if (skb_linearize(skb)) {
-                               nesvnic->tx_sw_dropped++;
-                               kfree_skb(skb);
-                               return NETDEV_TX_OK;
-                       }
-                       nesvnic->linearized_skbs++;
-                       skb_set_transport_header(skb, hoffset);
-                       skb_set_network_header(skb, nhoffset);
-                       if (!nes_nic_send(skb, netdev))
-                               return NETDEV_TX_OK;
-               }
-       } else {
-               if (!nes_nic_send(skb, netdev))
-                       return NETDEV_TX_OK;
-       }
-
-       barrier();
-
-       if (wqe_count)
-               nes_write32(nesdev->regs+NES_WQE_ALLOC,
-                               (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
-
-       netif_trans_update(netdev);
-
-       return NETDEV_TX_OK;
-}
-
-
-/**
- * nes_netdev_get_stats
- */
-static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u64 u64temp;
-       u32 u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->endnode_nstat_rx_discard += u32temp;
-
-       u64temp = (u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200));
-       u64temp += ((u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
-
-       nesvnic->endnode_nstat_rx_octets += u64temp;
-       nesvnic->netstats.rx_bytes += u64temp;
-
-       u64temp = (u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200));
-       u64temp += ((u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
-
-       nesvnic->endnode_nstat_rx_frames += u64temp;
-       nesvnic->netstats.rx_packets += u64temp;
-
-       u64temp = (u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200));
-       u64temp += ((u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
-
-       nesvnic->endnode_nstat_tx_octets += u64temp;
-       nesvnic->netstats.tx_bytes += u64temp;
-
-       u64temp = (u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200));
-       u64temp += ((u64)nes_read_indexed(nesdev,
-                       NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
-
-       nesvnic->endnode_nstat_tx_frames += u64temp;
-       nesvnic->netstats.tx_packets += u64temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_short_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_length_errors += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_crc_errors += u32temp;
-       nesvnic->netstats.rx_crc_errors += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_tx_errors += u32temp;
-       nesvnic->netstats.tx_errors += u32temp;
-
-       return &nesvnic->netstats;
-}
-
-
-/**
- * nes_netdev_tx_timeout
- */
-static void nes_netdev_tx_timeout(struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-       if (netif_msg_timer(nesvnic))
-               nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name);
-}
-
-
-/**
- * nes_netdev_set_mac_address - set a new MAC address and reprogram the perfect filter registers
- */
-static int nes_netdev_set_mac_address(struct net_device *netdev, void *p)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct sockaddr *mac_addr = p;
-       int i;
-       u32 macaddr_low;
-       u16 macaddr_high;
-
-       if (!is_valid_ether_addr(mac_addr->sa_data))
-               return -EADDRNOTAVAIL;
-
-       memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len);
-       printk(PFX "%s: Address length = %d, Address = %pM\n",
-              __func__, netdev->addr_len, mac_addr->sa_data);
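-       /* split the address: bytes 0-1 form the high register value, bytes 2-5 the low one */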
-       macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
-       macaddr_high += (u16)netdev->dev_addr[1];
-       macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
-       macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
-       macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
-       macaddr_low  += (u32)netdev->dev_addr[5];
-
-       for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
-               if (nesvnic->qp_nic_index[i] == 0xf) {
-                       break;
-               }
-               nes_write_indexed(nesdev,
-                               NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
-                               macaddr_low);
-               nes_write_indexed(nesdev,
-                               NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
-                               (u32)macaddr_high | NES_MAC_ADDR_VALID |
-                               ((((u32)nesvnic->nic_index) << 16)));
-       }
-       return 0;
-}
-
-
-static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit)
-{
-       u32 nic_active;
-
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-       nic_active |= nic_active_bit;
-       nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-       nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-       nic_active &= ~nic_active_bit;
-       nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-}
-
-#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN)
-
-/**
- * nes_netdev_set_multicast_list - update promiscuous, allmulti and multicast perfect filter settings
- */
-static void nes_netdev_set_multicast_list(struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-       u32 nic_active_bit;
-       u32 nic_active;
-       u32 perfect_filter_register_address;
-       u32 macaddr_low;
-       u16 macaddr_high;
-       u8 mc_all_on = 0;
-       u8 mc_index;
-       int mc_nic_index = -1;
-       u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count *
-                                       nics_per_function, 4);
-       u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated;
-       unsigned long flags;
-       int mc_count = netdev_mc_count(netdev);
-
-       spin_lock_irqsave(&nesadapter->resource_lock, flags);
-       nic_active_bit = 1 << nesvnic->nic_index;
-
-       if (netdev->flags & IFF_PROMISC) {
-               nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-               nic_active |= nic_active_bit;
-               nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-               nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-               nic_active |= nic_active_bit;
-               nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-               mc_all_on = 1;
-       } else if ((netdev->flags & IFF_ALLMULTI) ||
-                          (nesvnic->nic_index > 3)) {
-               set_allmulti(nesdev, nic_active_bit);
-               mc_all_on = 1;
-       } else {
-               nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
-               nic_active &= ~nic_active_bit;
-               nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
-               nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-               nic_active &= ~nic_active_bit;
-               nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-       }
-
-       nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n",
-                 mc_count, !!(netdev->flags & IFF_PROMISC),
-                 !!(netdev->flags & IFF_ALLMULTI));
-       if (!mc_all_on) {
-               char *addrs;
-               int i;
-               struct netdev_hw_addr *ha;
-
-               addrs = kmalloc_array(mc_count, ETH_ALEN, GFP_ATOMIC);
-               if (!addrs) {
-                       set_allmulti(nesdev, nic_active_bit);
-                       goto unlock;
-               }
-               i = 0;
-               netdev_for_each_mc_addr(ha, netdev)
-                       memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN);
-
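-               /* program the gathered multicast addresses into the remaining perfect filter entries */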
-               perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW +
-                                               pft_entries_preallocated * 0x8;
-               for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable;
-                    mc_index++) {
-                       while (i < mc_count && nesvnic->mcrq_mcast_filter &&
-                       ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic,
-                                       get_addr(addrs, i++))) == 0));
-                       if (mc_nic_index < 0)
-                               mc_nic_index = nesvnic->nic_index;
-                       while (nesadapter->pft_mcast_map[mc_index] < 16 &&
-                               nesadapter->pft_mcast_map[mc_index] !=
-                                       nesvnic->nic_index &&
-                                       mc_index < max_pft_entries_avaiable) {
-                               nes_debug(NES_DBG_NIC_RX,
-                                         "mc_index=%d skipping nic_index=%d, used for=%d\n",
-                                         mc_index, nesvnic->nic_index,
-                                         nesadapter->pft_mcast_map[mc_index]);
-                               mc_index++;
-                       }
-                       if (mc_index >= max_pft_entries_avaiable)
-                               break;
-                       if (i < mc_count) {
-                               char *addr = get_addr(addrs, i++);
-
-                               nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n",
-                                         addr,
-                                         perfect_filter_register_address+(mc_index * 8),
-                                         mc_nic_index);
-                               macaddr_high  = ((u8) addr[0]) << 8;
-                               macaddr_high += (u8) addr[1];
-                               macaddr_low   = ((u8) addr[2]) << 24;
-                               macaddr_low  += ((u8) addr[3]) << 16;
-                               macaddr_low  += ((u8) addr[4]) << 8;
-                               macaddr_low  += (u8) addr[5];
-
-                               nes_write_indexed(nesdev,
-                                               perfect_filter_register_address+(mc_index * 8),
-                                               macaddr_low);
-                               nes_write_indexed(nesdev,
-                                               perfect_filter_register_address+4+(mc_index * 8),
-                                               (u32)macaddr_high | NES_MAC_ADDR_VALID |
-                                               ((((u32)(1<<mc_nic_index)) << 16)));
-                               nesadapter->pft_mcast_map[mc_index] =
-                                                       nesvnic->nic_index;
-                       } else {
-                               nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n",
-                                                 perfect_filter_register_address+(mc_index * 8));
-                               nes_write_indexed(nesdev,
-                                               perfect_filter_register_address+4+(mc_index * 8),
-                                               0);
-                               nesadapter->pft_mcast_map[mc_index] = 255;
-                       }
-               }
-               kfree(addrs);
-               /* PFT is not large enough */
-               if (i < mc_count)
-                       set_allmulti(nesdev, nic_active_bit);
-       }
-
-unlock:
-       spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
-}
-
-
-/**
- * nes_netdev_change_mtu - change the MTU, update jumbo-frame timer defaults and restart the interface if running
- */
-static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u8 jumbomode = 0;
-       u32 nic_active;
-       u32 nic_active_bit;
-       u32 uc_all_active;
-       u32 mc_all_active;
-
-       netdev->mtu = new_mtu;
-       nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN;
-
-       if (netdev->mtu > ETH_DATA_LEN) {
-               jumbomode=1;
-       }
-       nes_nic_init_timer_defaults(nesdev, jumbomode);
-
-       if (netif_running(netdev)) {
-               nic_active_bit = 1 << nesvnic->nic_index;
-               mc_all_active = nes_read_indexed(nesdev,
-                               NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit;
-               uc_all_active = nes_read_indexed(nesdev,
-                               NES_IDX_NIC_UNICAST_ALL)  & nic_active_bit;
-
-               nes_netdev_stop(netdev);
-               nes_netdev_open(netdev);
-
-               nic_active = nes_read_indexed(nesdev,
-                                       NES_IDX_NIC_MULTICAST_ALL);
-               nic_active |= mc_all_active;
-               nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL,
-                                                       nic_active);
-
-               nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
-               nic_active |= uc_all_active;
-               nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
-       }
-
-       return 0;
-}
-
-
-static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = {
-       "Link Change Interrupts",
-       "Linearized SKBs",
-       "T/GSO Requests",
-       "Pause Frames Sent",
-       "Pause Frames Received",
-       "Internal Routing Errors",
-       "SQ SW Dropped SKBs",
-       "SQ Full",
-       "Segmented TSO Requests",
-       "Rx Symbol Errors",
-       "Rx Jabber Errors",
-       "Rx Oversized Frames",
-       "Rx Short Frames",
-       "Rx Length Errors",
-       "Rx CRC Errors",
-       "Rx Port Discard",
-       "Endnode Rx Discards",
-       "Endnode Rx Octets",
-       "Endnode Rx Frames",
-       "Endnode Tx Octets",
-       "Endnode Tx Frames",
-       "Tx Errors",
-       "mh detected",
-       "mh pauses",
-       "Retransmission Count",
-       "CM Connects",
-       "CM Accepts",
-       "Disconnects",
-       "Connected Events",
-       "Connect Requests",
-       "CM Rejects",
-       "ModifyQP Timeouts",
-       "CreateQPs",
-       "SW DestroyQPs",
-       "DestroyQPs",
-       "CM Closes",
-       "CM Packets Sent",
-       "CM Packets Bounced",
-       "CM Packets Created",
-       "CM Packets Rcvd",
-       "CM Packets Dropped",
-       "CM Packets Retrans",
-       "CM Listens Created",
-       "CM Listens Destroyed",
-       "CM Backlog Drops",
-       "CM Loopbacks",
-       "CM Nodes Created",
-       "CM Nodes Destroyed",
-       "CM Accel Drops",
-       "CM Resets Received",
-       "Free 4Kpbls",
-       "Free 256pbls",
-       "Timer Inits",
-       "PAU CreateQPs",
-       "PAU DestroyQPs",
-};
-#define NES_ETHTOOL_STAT_COUNT  ARRAY_SIZE(nes_ethtool_stringset)
-
-
-/**
- * nes_netdev_get_sset_count - return the number of ethtool statistics strings
- */
-static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset)
-{
-       if (stringset == ETH_SS_STATS)
-               return NES_ETHTOOL_STAT_COUNT;
-       else
-               return -EINVAL;
-}
-
-
-/**
- * nes_netdev_get_strings - copy the ethtool statistics string set
- */
-static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset,
-               u8 *ethtool_strings)
-{
-       if (stringset == ETH_SS_STATS)
-               memcpy(ethtool_strings,
-                               &nes_ethtool_stringset,
-                               sizeof(nes_ethtool_stringset));
-}
-
-
-/**
- * nes_netdev_get_ethtool_stats - gather hardware and software counters for ethtool statistics
- */
-
-static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
-               struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values)
-{
-       u64 u64temp;
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 nic_count;
-       u32 u32temp;
-       u32 index = 0;
-
-       target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
-       target_stat_values[index] = nesvnic->nesdev->link_status_interrupts;
-       target_stat_values[++index] = nesvnic->linearized_skbs;
-       target_stat_values[++index] = nesvnic->tso_requests;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_pause_frames_sent += u32temp;
-       target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_pause_frames_received += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
-       nesvnic->nesdev->port_rx_discards += u32temp;
-       nesvnic->netstats.rx_dropped += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
-       nesvnic->nesdev->port_tx_discards += u32temp;
-       nesvnic->netstats.tx_dropped += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_short_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_dropped += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->netstats.rx_length_errors += u32temp;
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_rx_errors += u32temp;
-       nesvnic->nesdev->mac_rx_crc_errors += u32temp;
-       nesvnic->netstats.rx_crc_errors += u32temp;
-
-       u32temp = nes_read_indexed(nesdev,
-                       NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
-       nesvnic->nesdev->mac_tx_errors += u32temp;
-       nesvnic->netstats.tx_errors += u32temp;
-
-       for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) {
-               if (nesvnic->qp_nic_index[nic_count] == 0xf)
-                       break;
-
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_RX_DISCARD +
-                               (nesvnic->qp_nic_index[nic_count]*0x200));
-               nesvnic->netstats.rx_dropped += u32temp;
-               nesvnic->endnode_nstat_rx_discard += u32temp;
-
-               u64temp = (u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO +
-                               (nesvnic->qp_nic_index[nic_count]*0x200));
-               u64temp += ((u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI +
-                               (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-               nesvnic->endnode_nstat_rx_octets += u64temp;
-               nesvnic->netstats.rx_bytes += u64temp;
-
-               u64temp = (u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO +
-                               (nesvnic->qp_nic_index[nic_count]*0x200));
-               u64temp += ((u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI +
-                               (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-               nesvnic->endnode_nstat_rx_frames += u64temp;
-               nesvnic->netstats.rx_packets += u64temp;
-
-               u64temp = (u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO +
-                               (nesvnic->qp_nic_index[nic_count]*0x200));
-               u64temp += ((u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI +
-                               (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-               nesvnic->endnode_nstat_tx_octets += u64temp;
-               nesvnic->netstats.tx_bytes += u64temp;
-
-               u64temp = (u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO +
-                               (nesvnic->qp_nic_index[nic_count]*0x200));
-               u64temp += ((u64)nes_read_indexed(nesdev,
-                               NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI +
-                               (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
-
-               nesvnic->endnode_nstat_tx_frames += u64temp;
-               nesvnic->netstats.tx_packets += u64temp;
-
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200));
-               nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
-       }
-
-       target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received;
-       target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err;
-       target_stat_values[++index] = nesvnic->tx_sw_dropped;
-       target_stat_values[++index] = nesvnic->sq_full;
-       target_stat_values[++index] = nesvnic->segmented_tso_requests;
-       target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames;
-       target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames;
-       target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames;
-       target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames;
-       target_stat_values[++index] = nesvnic->netstats.rx_length_errors;
-       target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors;
-       target_stat_values[++index] = nesvnic->nesdev->port_rx_discards;
-       target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard;
-       target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets;
-       target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames;
-       target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets;
-       target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames;
-       target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors;
-       target_stat_values[++index] = mh_detected;
-       target_stat_values[++index] = mh_pauses_sent;
-       target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits;
-       target_stat_values[++index] = atomic_read(&cm_connects);
-       target_stat_values[++index] = atomic_read(&cm_accepts);
-       target_stat_values[++index] = atomic_read(&cm_disconnects);
-       target_stat_values[++index] = atomic_read(&cm_connecteds);
-       target_stat_values[++index] = atomic_read(&cm_connect_reqs);
-       target_stat_values[++index] = atomic_read(&cm_rejects);
-       target_stat_values[++index] = atomic_read(&mod_qp_timouts);
-       target_stat_values[++index] = atomic_read(&qps_created);
-       target_stat_values[++index] = atomic_read(&sw_qps_destroyed);
-       target_stat_values[++index] = atomic_read(&qps_destroyed);
-       target_stat_values[++index] = atomic_read(&cm_closes);
-       target_stat_values[++index] = cm_packets_sent;
-       target_stat_values[++index] = cm_packets_bounced;
-       target_stat_values[++index] = cm_packets_created;
-       target_stat_values[++index] = cm_packets_received;
-       target_stat_values[++index] = cm_packets_dropped;
-       target_stat_values[++index] = cm_packets_retrans;
-       target_stat_values[++index] = atomic_read(&cm_listens_created);
-       target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
-       target_stat_values[++index] = cm_backlog_drops;
-       target_stat_values[++index] = atomic_read(&cm_loopbacks);
-       target_stat_values[++index] = atomic_read(&cm_nodes_created);
-       target_stat_values[++index] = atomic_read(&cm_nodes_destroyed);
-       target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts);
-       target_stat_values[++index] = atomic_read(&cm_resets_recvd);
-       target_stat_values[++index] = nesadapter->free_4kpbl;
-       target_stat_values[++index] = nesadapter->free_256pbl;
-       target_stat_values[++index] = int_mod_timer_init;
-       target_stat_values[++index] = atomic_read(&pau_qps_created);
-       target_stat_values[++index] = atomic_read(&pau_qps_destroyed);
-}
-
-/**
- * nes_netdev_get_drvinfo - report driver, firmware and bus information to ethtool
- */
-static void nes_netdev_get_drvinfo(struct net_device *netdev,
-               struct ethtool_drvinfo *drvinfo)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
-
-       strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
-       strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev),
-               sizeof(drvinfo->bus_info));
-       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                "%u.%u", nesadapter->firmware_version >> 16,
-                nesadapter->firmware_version & 0x000000ff);
-       strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
-}
-
-
-/**
- * nes_netdev_set_coalesce - apply ethtool interrupt coalescing settings
- */
-static int nes_netdev_set_coalesce(struct net_device *netdev,
-               struct ethtool_coalesce *et_coalesce)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-       unsigned long flags;
-
-       spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
-       if (et_coalesce->rx_max_coalesced_frames_low) {
-               shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low;
-       }
-       if (et_coalesce->rx_max_coalesced_frames_irq) {
-               shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq;
-       }
-       if (et_coalesce->rx_max_coalesced_frames_high) {
-               shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high;
-       }
-       if (et_coalesce->rx_coalesce_usecs_low) {
-               shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low;
-       }
-       if (et_coalesce->rx_coalesce_usecs_high) {
-               shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high;
-       }
-       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-
-       /* using this to drive total interrupt moderation */
-       nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq;
-       if (et_coalesce->use_adaptive_rx_coalesce) {
-               nesadapter->et_use_adaptive_rx_coalesce = 1;
-               nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
-               nesadapter->et_rx_coalesce_usecs_irq = 0;
-               if (et_coalesce->pkt_rate_low) {
-                       nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low;
-               }
-       } else {
-               nesadapter->et_use_adaptive_rx_coalesce = 0;
-               nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
-               if (nesadapter->et_rx_coalesce_usecs_irq) {
-                       nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
-                                       0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8)));
-               }
-       }
-       return 0;
-}
-
-
-/**
- * nes_netdev_get_coalesce - report the current interrupt coalescing settings
- */
-static int nes_netdev_get_coalesce(struct net_device *netdev,
-               struct ethtool_coalesce *et_coalesce)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct ethtool_coalesce temp_et_coalesce;
-       struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
-       unsigned long flags;
-
-       memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce));
-       temp_et_coalesce.rx_coalesce_usecs_irq    = nesadapter->et_rx_coalesce_usecs_irq;
-       temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce;
-       temp_et_coalesce.rate_sample_interval     = nesadapter->et_rate_sample_interval;
-       temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low;
-       spin_lock_irqsave(&nesadapter->periodic_timer_lock,     flags);
-       temp_et_coalesce.rx_max_coalesced_frames_low  = shared_timer->threshold_low;
-       temp_et_coalesce.rx_max_coalesced_frames_irq  = shared_timer->threshold_target;
-       temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high;
-       temp_et_coalesce.rx_coalesce_usecs_low  = shared_timer->timer_in_use_min;
-       temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max;
-       if (nesadapter->et_use_adaptive_rx_coalesce) {
-               temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use;
-       }
-       spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
-       memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce));
-       return 0;
-}
-
-
-/**
- * nes_netdev_get_pauseparam - report the current flow control (pause frame) settings
- */
-static void nes_netdev_get_pauseparam(struct net_device *netdev,
-               struct ethtool_pauseparam *et_pauseparam)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-       et_pauseparam->autoneg = 0;
-       et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0;
-       et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0;
-}
-
-
-/**
- * nes_netdev_set_pauseparam - enable or disable TX/RX flow control (pause frames)
- */
-static int nes_netdev_set_pauseparam(struct net_device *netdev,
-               struct ethtool_pauseparam *et_pauseparam)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u32 u32temp;
-
-       if (et_pauseparam->autoneg) {
-               /* TODO: should return unsupported */
-               return 0;
-       }
-       if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) {
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
-               u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
-               nes_write_indexed(nesdev,
-                               NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
-               nesdev->disable_tx_flow_control = 0;
-       } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) {
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
-               u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
-               nes_write_indexed(nesdev,
-                               NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
-               nesdev->disable_tx_flow_control = 1;
-       }
-       if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) {
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
-               u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
-               nes_write_indexed(nesdev,
-                               NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
-               nesdev->disable_rx_flow_control = 0;
-       } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) {
-               u32temp = nes_read_indexed(nesdev,
-                               NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
-               u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
-               nes_write_indexed(nesdev,
-                               NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
-               nesdev->disable_rx_flow_control = 1;
-       }
-
-       return 0;
-}
-
-
-/**
- * nes_netdev_get_link_ksettings - report link settings to ethtool
- */
-static int nes_netdev_get_link_ksettings(struct net_device *netdev,
-                                        struct ethtool_link_ksettings *cmd)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 mac_index = nesdev->mac_index;
-       u8 phy_type = nesadapter->phy_type[mac_index];
-       u8 phy_index = nesadapter->phy_index[mac_index];
-       u16 phy_data;
-       u32 supported, advertising;
-
-       cmd->base.duplex = DUPLEX_FULL;
-       cmd->base.port   = PORT_MII;
-
-       if (nesadapter->OneG_Mode) {
-               cmd->base.speed = SPEED_1000;
-               if (phy_type == NES_PHY_TYPE_PUMA_1G) {
-                       supported   = SUPPORTED_1000baseT_Full;
-                       advertising = ADVERTISED_1000baseT_Full;
-                       cmd->base.autoneg     = AUTONEG_DISABLE;
-                       cmd->base.phy_address = mac_index;
-               } else {
-                       unsigned long flags;
-
-                       supported = SUPPORTED_1000baseT_Full
-                               | SUPPORTED_Autoneg;
-                       advertising = ADVERTISED_1000baseT_Full
-                               | ADVERTISED_Autoneg;
-                       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-                       nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-                       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-                       if (phy_data & 0x1000)
-                               cmd->base.autoneg = AUTONEG_ENABLE;
-                       else
-                               cmd->base.autoneg = AUTONEG_DISABLE;
-                       cmd->base.phy_address = phy_index;
-               }
-               ethtool_convert_legacy_u32_to_link_mode(
-                       cmd->link_modes.supported, supported);
-               ethtool_convert_legacy_u32_to_link_mode(
-                       cmd->link_modes.advertising, advertising);
-               return 0;
-       }
-       if ((phy_type == NES_PHY_TYPE_ARGUS) ||
-           (phy_type == NES_PHY_TYPE_SFP_D) ||
-           (phy_type == NES_PHY_TYPE_KR)) {
-               cmd->base.port        = PORT_FIBRE;
-               supported   = SUPPORTED_FIBRE;
-               advertising = ADVERTISED_FIBRE;
-               cmd->base.phy_address = phy_index;
-       } else {
-               supported   = SUPPORTED_10000baseT_Full;
-               advertising = ADVERTISED_10000baseT_Full;
-               cmd->base.phy_address = mac_index;
-       }
-       cmd->base.speed = SPEED_10000;
-       cmd->base.autoneg = AUTONEG_DISABLE;
-       ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
-                                               supported);
-       ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
-                                               advertising);
-
-       return 0;
-}
-
-
-/**
- * nes_netdev_set_link_ksettings - apply link settings (1G PHY autonegotiation control)
- */
-static int
-nes_netdev_set_link_ksettings(struct net_device *netdev,
-                             const struct ethtool_link_ksettings *cmd)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       if ((nesadapter->OneG_Mode) &&
-           (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) {
-               unsigned long flags;
-               u16 phy_data;
-               u8 phy_index = nesadapter->phy_index[nesdev->mac_index];
-
-               spin_lock_irqsave(&nesadapter->phy_lock, flags);
-               nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-               if (cmd->base.autoneg) {
-                       /* Turn on Full duplex, Autoneg, and restart autonegotiation */
-                       phy_data |= 0x1300;
-               } else {
-                       /* Turn off autoneg */
-                       phy_data &= ~0x1000;
-               }
-               nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
-               spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-       }
-
-       return 0;
-}
-
-
-static const struct ethtool_ops nes_ethtool_ops = {
-       .get_link = ethtool_op_get_link,
-       .get_strings = nes_netdev_get_strings,
-       .get_sset_count = nes_netdev_get_sset_count,
-       .get_ethtool_stats = nes_netdev_get_ethtool_stats,
-       .get_drvinfo = nes_netdev_get_drvinfo,
-       .get_coalesce = nes_netdev_get_coalesce,
-       .set_coalesce = nes_netdev_set_coalesce,
-       .get_pauseparam = nes_netdev_get_pauseparam,
-       .set_pauseparam = nes_netdev_set_pauseparam,
-       .get_link_ksettings = nes_netdev_get_link_ksettings,
-       .set_link_ksettings = nes_netdev_set_link_ksettings,
-};
-
-static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 u32temp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-
-       nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name);
-
-       /* Enable/Disable VLAN Stripping */
-       u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG);
-       if (features & NETIF_F_HW_VLAN_CTAG_RX)
-               u32temp &= 0xfdffffff;
-       else
-               u32temp |= 0x02000000;
-
-       nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp);
-       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-}
-
-static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features)
-{
-       /*
-        * Since there is no support for separate rx/tx vlan accel
-        * enable/disable make sure tx flag is always in same state as rx.
-        */
-       if (features & NETIF_F_HW_VLAN_CTAG_RX)
-               features |= NETIF_F_HW_VLAN_CTAG_TX;
-       else
-               features &= ~NETIF_F_HW_VLAN_CTAG_TX;
-
-       return features;
-}
-
-static int nes_set_features(struct net_device *netdev, netdev_features_t features)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       u32 changed = netdev->features ^ features;
-
-       if (changed & NETIF_F_HW_VLAN_CTAG_RX)
-               nes_vlan_mode(netdev, nesdev, features);
-
-       return 0;
-}
-
-static const struct net_device_ops nes_netdev_ops = {
-       .ndo_open               = nes_netdev_open,
-       .ndo_stop               = nes_netdev_stop,
-       .ndo_start_xmit         = nes_netdev_start_xmit,
-       .ndo_get_stats          = nes_netdev_get_stats,
-       .ndo_tx_timeout         = nes_netdev_tx_timeout,
-       .ndo_set_mac_address    = nes_netdev_set_mac_address,
-       .ndo_set_rx_mode        = nes_netdev_set_multicast_list,
-       .ndo_change_mtu         = nes_netdev_change_mtu,
-       .ndo_validate_addr      = eth_validate_addr,
-       .ndo_fix_features       = nes_fix_features,
-       .ndo_set_features       = nes_set_features,
-};
-
-/**
- * nes_netdev_init - initialize network device
- */
-struct net_device *nes_netdev_init(struct nes_device *nesdev,
-               void __iomem *mmio_addr)
-{
-       u64 u64temp;
-       struct nes_vnic *nesvnic;
-       struct net_device *netdev;
-       struct nic_qp_map *curr_qp_map;
-       u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
-
-       netdev = alloc_etherdev(sizeof(struct nes_vnic));
-       if (!netdev) {
-               printk(KERN_ERR PFX "nesvnic etherdev alloc failed");
-               return NULL;
-       }
-       nesvnic = netdev_priv(netdev);
-
-       nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
-
-       SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev);
-
-       netdev->watchdog_timeo = NES_TX_TIMEOUT;
-       netdev->irq = nesdev->pcidev->irq;
-       netdev->max_mtu = NES_MAX_MTU;
-       netdev->hard_header_len = ETH_HLEN;
-       netdev->addr_len = ETH_ALEN;
-       netdev->type = ARPHRD_ETHER;
-       netdev->netdev_ops = &nes_netdev_ops;
-       netdev->ethtool_ops = &nes_ethtool_ops;
-       netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
-       nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
-
-       /* Fill in the port structure */
-       nesvnic->netdev = netdev;
-       nesvnic->nesdev = nesdev;
-       nesvnic->msg_enable = netif_msg_init(debug, default_msg);
-       nesvnic->netdev_index = nesdev->netdev_count;
-       nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count;
-       nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN;
-
-       curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)];
-       nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid;
-       nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index;
-       nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port;
-
-       /* Setup the burned in MAC address */
-       u64temp = (u64)nesdev->nesadapter->mac_addr_low;
-       u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32;
-       u64temp += nesvnic->nic_index;
-       netdev->dev_addr[0] = (u8)(u64temp>>40);
-       netdev->dev_addr[1] = (u8)(u64temp>>32);
-       netdev->dev_addr[2] = (u8)(u64temp>>24);
-       netdev->dev_addr[3] = (u8)(u64temp>>16);
-       netdev->dev_addr[4] = (u8)(u64temp>>8);
-       netdev->dev_addr[5] = (u8)u64temp;
-
-       netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
-       if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV))
-               netdev->hw_features |= NETIF_F_TSO;
-
-       netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX;
-
-       nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d,"
-                       " nic_index = %d, logical_port = %d, mac_index = %d.\n",
-                       nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id,
-                       nesvnic->nic_index, nesvnic->logical_port,  nesdev->mac_index);
-
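-       /* assign the NIC QP indices owned by this vnic based on port and PCI function count */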
-       if (nesvnic->nesdev->nesadapter->port_count == 1 &&
-               nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) {
-
-               nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-               nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1;
-               if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) {
-                       nesvnic->qp_nic_index[2] = 0xf;
-                       nesvnic->qp_nic_index[3] = 0xf;
-               } else {
-                       nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2;
-                       nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3;
-               }
-       } else {
-               if (nesvnic->nesdev->nesadapter->port_count == 2 ||
-                       (nesvnic->nesdev->nesadapter->port_count == 1 &&
-                       nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) {
-                               nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-                               nesvnic->qp_nic_index[1] = nesvnic->nic_index
-                                                                       + 2;
-                               nesvnic->qp_nic_index[2] = 0xf;
-                               nesvnic->qp_nic_index[3] = 0xf;
-               } else {
-                       nesvnic->qp_nic_index[0] = nesvnic->nic_index;
-                       nesvnic->qp_nic_index[1] = 0xf;
-                       nesvnic->qp_nic_index[2] = 0xf;
-                       nesvnic->qp_nic_index[3] = 0xf;
-               }
-       }
-       nesvnic->next_qp_nic_index = 0;
-
-       if (nesdev->netdev_count == 0) {
-               nesvnic->rdma_enabled = 1;
-       } else {
-               nesvnic->rdma_enabled = 0;
-       }
-       nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id;
-       timer_setup(&nesvnic->event_timer, NULL, 0);
-       spin_lock_init(&nesvnic->tx_lock);
-       spin_lock_init(&nesvnic->port_ibevent_lock);
-       nesdev->netdev[nesdev->netdev_count] = netdev;
-
-       nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n",
-                       nesvnic, nesdev->mac_index);
-       list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]);
-
-       if ((nesdev->netdev_count == 0) &&
-           ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
-            ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
-             (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
-              ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
-               u32 u32temp;
-               u32 link_mask = 0;
-               u32 link_val = 0;
-               u16 temp_phy_data;
-               u16 phy_data = 0;
-               unsigned long flags;
-
-               u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-                               (0x200 * (nesdev->mac_index & 1)));
-               if (phy_type != NES_PHY_TYPE_PUMA_1G) {
-                       u32temp |= 0x00200000;
-                       nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-                               (0x200 * (nesdev->mac_index & 1)), u32temp);
-               }
-
-               /* Check and set linkup here.  This is for back to back */
-               /* configuration where second port won't get link interrupt */
-               switch (phy_type) {
-               case NES_PHY_TYPE_PUMA_1G:
-                       if (nesdev->mac_index < 2) {
-                               link_mask = 0x01010000;
-                               link_val = 0x01010000;
-                       } else {
-                               link_mask = 0x02020000;
-                               link_val = 0x02020000;
-                       }
-                       break;
-               case NES_PHY_TYPE_SFP_D:
-                       spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
-                       nes_read_10G_phy_reg(nesdev,
-                                            nesdev->nesadapter->phy_index[nesdev->mac_index],
-                                            1, 0x9003);
-                       temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                       nes_read_10G_phy_reg(nesdev,
-                                            nesdev->nesadapter->phy_index[nesdev->mac_index],
-                                            3, 0x0021);
-                       nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                       nes_read_10G_phy_reg(nesdev,
-                                            nesdev->nesadapter->phy_index[nesdev->mac_index],
-                                            3, 0x0021);
-                       phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-                       spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
-                       phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
-                       break;
-               default:
-                       link_mask = 0x0f1f0000;
-                       link_val = 0x0f0f0000;
-                       break;
-               }
-
-               u32temp = nes_read_indexed(nesdev,
-                                          NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-                                          (0x200 * (nesdev->mac_index & 1)));
-
-               if (phy_type == NES_PHY_TYPE_SFP_D) {
-                       if (phy_data & 0x0004)
-                               nesvnic->linkup = 1;
-               } else {
-                       if ((u32temp & link_mask) == link_val)
-                               nesvnic->linkup = 1;
-               }
-
-               /* clear the MAC interrupt status, assumes direct logical to physical mapping */
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
-               nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
-               nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
-
-               nes_init_phy(nesdev);
-       }
-
-       nes_vlan_mode(netdev, nesdev, netdev->features);
-
-       return netdev;
-}
-
-
-/**
- * nes_netdev_destroy - destroy network device structure
- */
-void nes_netdev_destroy(struct net_device *netdev)
-{
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-
-       /* make sure 'stop' method is called by Linux stack */
-       /* nes_netdev_stop(netdev); */
-
-       list_del(&nesvnic->list);
-
-       if (nesvnic->of_device_registered) {
-               nes_destroy_ofa_device(nesvnic->nesibdev);
-       }
-
-       free_netdev(netdev);
-}
-
-
-/**
- * nes_nic_cm_xmit -- CM calls this to send out pkts
- */
-int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev)
-{
-       int ret;
-
-       skb->dev = netdev;
-       ret = dev_queue_xmit(skb);
-       if (ret) {
-               nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret);
-       }
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
deleted file mode 100644
index 21b4a83..0000000
+++ /dev/null
@@ -1,916 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/slab.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include "nes.h"
-
-static u16 nes_read16_eeprom(void __iomem *addr, u16 offset);
-
-u32 mh_detected;
-u32 mh_pauses_sent;
-
-static u32 nes_set_pau(struct nes_device *nesdev)
-{
-       u32 ret = 0;
-       u32 counter;
-
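-       /* request PAU via GPR2 and poll until the adapter clears the register (PAU supported) */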
-       nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU);
-       nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
-
-       for (counter = 0; counter < NES_PAU_COUNTER; counter++) {
-               udelay(30);
-               if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) {
-                       printk(KERN_INFO PFX "PAU is supported.\n");
-                       break;
-               }
-               nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
-       }
-       if (counter == NES_PAU_COUNTER) {
-               printk(KERN_INFO PFX "PAU is not supported.\n");
-               return -EPERM;
-       }
-       return ret;
-}
-
-/**
- * nes_read_eeprom_values - read adapter configuration, firmware version and MAC addresses from the EEPROM
- */
-int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter)
-{
-       u32 mac_addr_low;
-       u16 mac_addr_high;
-       u16 eeprom_data;
-       u16 eeprom_offset;
-       u16 next_section_address;
-       u16 sw_section_ver;
-       u8  major_ver = 0;
-       u8  minor_ver = 0;
-
-       /* TODO: deal with EEPROM endian issues */
-       if (nesadapter->firmware_eeprom_offset == 0) {
-               /* Read the EEPROM Parameters */
-               eeprom_data = nes_read16_eeprom(nesdev->regs, 0);
-               nes_debug(NES_DBG_HW, "EEPROM Offset 0  = 0x%04X\n", eeprom_data);
-               eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) <<
-                               ((eeprom_data & 0x0080) >> 7));
-               nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset);
-               nesadapter->firmware_eeprom_offset = eeprom_offset;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
-               if (eeprom_data != 0x5746) {
-                       nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data);
-                       return -1;
-               }
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8);
-               nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset);
-               nesadapter->software_eeprom_offset = eeprom_offset;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
-               if (eeprom_data != 0x5753) {
-                       printk("Not a valid Software Image = 0x%04X\n", eeprom_data);
-                       return -1;
-               }
-               sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset  + 6);
-               nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n",
-                               sw_section_ver);
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
-                               ((eeprom_data & 0x0100) >> 8));
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x414d) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_offset = next_section_address;
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
-                               ((eeprom_data & 0x0100) >> 8));
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x4f52) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_offset = next_section_address;
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x5746) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_offset = next_section_address;
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x5753) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_offset = next_section_address;
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x414d) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_offset = next_section_address;
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
-               nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
-                               eeprom_offset + 2, eeprom_data);
-               next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
-               if (eeprom_data != 0x464e) {
-                       nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n",
-                                       eeprom_data);
-                       goto no_fw_rev;
-               }
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8);
-               printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
-               major_ver = (u8)(eeprom_data >> 8);
-               minor_ver = (u8)(eeprom_data);
-
-               if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) {
-                       nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n");
-               } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) {
-                       nesadapter->virtwq = 1;
-               }
-               if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3))
-                       nesadapter->send_term_ok = 1;
-
-               if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) {
-                       if (!nes_set_pau(nesdev))
-                               nesadapter->allow_unaligned_fpdus = 1;
-               }
-
-               nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8))  <<  16) +
-                               (u32)((u8)eeprom_data);
-
-               eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10);
-               printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
-               nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) +
-                               (u32)((u8)eeprom_data);
-
-no_fw_rev:
-               /* eeprom is valid */
-               eeprom_offset = nesadapter->software_eeprom_offset;
-               eeprom_offset += 8;
-               nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               mac_addr_low <<= 16;
-               mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n",
-                               mac_addr_high, mac_addr_low);
-               nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max);
-
-               nesadapter->mac_addr_low = mac_addr_low;
-               nesadapter->mac_addr_high = mac_addr_high;
-
-               /* Read the Phy Type array */
-               eeprom_offset += 10;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->phy_type[0] = (u8)(eeprom_data >> 8);
-               nesadapter->phy_type[1] = (u8)eeprom_data;
-
-               /* Read the port array */
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->phy_type[2] = (u8)(eeprom_data >> 8);
-               nesadapter->phy_type[3] = (u8)eeprom_data;
-               /* port_count is set by soft reset reg */
-               nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u,"
-                               " port 2 -> %u, port 3 -> %u\n",
-                               nesadapter->port_count,
-                               nesadapter->phy_type[0], nesadapter->phy_type[1],
-                               nesadapter->phy_type[2], nesadapter->phy_type[3]);
-
-               /* Read PD config array */
-               eeprom_offset += 10;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_size[0] = eeprom_data;
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_base[0] = eeprom_data;
-               nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n",
-                               nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_size[1] = eeprom_data;
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_base[1] = eeprom_data;
-               nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n",
-                               nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_size[2] = eeprom_data;
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_base[2] = eeprom_data;
-               nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n",
-                               nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_size[3] = eeprom_data;
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nesadapter->pd_config_base[3] = eeprom_data;
-               nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n",
-                               nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]);
-
-               /* Read Rx Pool Size */
-               eeprom_offset += 22;   /* 46 */
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->rx_threshold = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n",
-                               nesadapter->tcp_timer_core_clk_divisor);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->iwarp_config = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->cm_config = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->wqm_wat = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat);
-
-               eeprom_offset += 2;
-               eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               eeprom_offset += 2;
-               nesadapter->core_clock = (((u32)eeprom_data) << 16) +
-                               nes_read16_eeprom(nesdev->regs, eeprom_offset);
-               nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock);
-
-               if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) {
-                       eeprom_offset += 2;
-                       eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-                       nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8;
-                       nesadapter->phy_index[1] = eeprom_data & 0x00ff;
-                       eeprom_offset += 2;
-                       eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
-                       nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8;
-                       nesadapter->phy_index[3] = eeprom_data & 0x00ff;
-               } else {
-                       nesadapter->phy_index[0] = 4;
-                       nesadapter->phy_index[1] = 5;
-                       nesadapter->phy_index[2] = 6;
-                       nesadapter->phy_index[3] = 7;
-               }
-               nes_debug(NES_DBG_HW, "Phy address map = 0 > %u,  1 > %u, 2 > %u, 3 > %u\n",
-                          nesadapter->phy_index[0],nesadapter->phy_index[1],
-                          nesadapter->phy_index[2],nesadapter->phy_index[3]);
-       }
-
-       return 0;
-}
-
-
-/**
- * nes_read16_eeprom
- */
-static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
-{
-       writel(NES_EEPROM_READ_REQUEST + (offset >> 1),
-                       (void __iomem *)addr + NES_EEPROM_COMMAND);
-
-       do {
-       } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) &
-                       NES_EEPROM_READ_REQUEST);
-
-       return readw((void __iomem *)addr + NES_EEPROM_DATA);
-}
-
-
-/**
- * nes_write_1G_phy_reg
- */
-void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
-{
-       u32 u32temp;
-       u32 counter;
-
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1))
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-}
-
-
-/**
- * nes_read_1G_phy_reg
 - * Issues the MDIO read and, when the PHY responds, returns the
 - * register contents through *data.
- */
-void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
-{
-       u32 u32temp;
-       u32 counter;
-
-       /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
-                       phy_addr, nesdev->mac_index); */
-
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1)) {
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-               *data = 0xffff;
-       } else {
-               *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-       }
-}
-
-
-/**
- * nes_write_10G_phy_reg
- */
-void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg,
-               u16 data)
-{
-       u32 port_addr;
-       u32 u32temp;
-       u32 counter;
-
-       port_addr = phy_addr;
-
-       /* set address */
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1))
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-
-       /* set data */
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1))
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-}
-
-
-/**
- * nes_read_10G_phy_reg
 - * This routine only issues the read; the data must be read
 - * separately.
- */
-void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg)
-{
-       u32 port_addr;
-       u32 u32temp;
-       u32 counter;
-
-       port_addr = phy_addr;
-
-       /* set address */
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1))
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-
-       /* issue read */
-       nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
-                       0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
-       for (counter = 0; counter < 100 ; counter++) {
-               udelay(30);
-               u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
-               if (u32temp & 1) {
-                       nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
-                       break;
-               }
-       }
-       if (!(u32temp & 1))
-               nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
-                               u32temp);
-}
-
-
-/**
- * nes_get_cqp_request
- */
-struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
-{
-       unsigned long flags;
-       struct nes_cqp_request *cqp_request = NULL;
-
-       if (!list_empty(&nesdev->cqp_avail_reqs)) {
-               spin_lock_irqsave(&nesdev->cqp.lock, flags);
-               if (!list_empty(&nesdev->cqp_avail_reqs)) {
-                       cqp_request = list_entry(nesdev->cqp_avail_reqs.next,
-                               struct nes_cqp_request, list);
-                       list_del_init(&cqp_request->list);
-               }
-               spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-       }
-       if (cqp_request == NULL) {
-               cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC);
-               if (cqp_request) {
-                       cqp_request->dynamic = 1;
-                       INIT_LIST_HEAD(&cqp_request->list);
-               }
-       }
-
-       if (cqp_request) {
-               init_waitqueue_head(&cqp_request->waitq);
-               cqp_request->waiting = 0;
-               cqp_request->request_done = 0;
-               cqp_request->callback = 0;
-               init_waitqueue_head(&cqp_request->waitq);
-               nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n",
-                               cqp_request);
-       } else
-               printk(KERN_ERR PFX "%s: Could not allocate a CQP request.\n",
-                          __func__);
-
-       return cqp_request;
-}
-
-void nes_free_cqp_request(struct nes_device *nesdev,
-                         struct nes_cqp_request *cqp_request)
-{
-       unsigned long flags;
-
-       nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
-                 cqp_request,
-                 le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
-
-       if (cqp_request->dynamic) {
-               kfree(cqp_request);
-       } else {
-               spin_lock_irqsave(&nesdev->cqp.lock, flags);
-               list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
-               spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-       }
-}
-
-void nes_put_cqp_request(struct nes_device *nesdev,
-                        struct nes_cqp_request *cqp_request)
-{
-       if (atomic_dec_and_test(&cqp_request->refcount))
-               nes_free_cqp_request(nesdev, cqp_request);
-}
-
-
-/**
- * nes_post_cqp_request
- */
-void nes_post_cqp_request(struct nes_device *nesdev,
-                         struct nes_cqp_request *cqp_request)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       unsigned long flags;
-       u32 cqp_head;
-       u64 u64temp;
-       u32 opcode;
-       int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
-
-       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-
-       if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) &
-                       (nesdev->cqp.sq_size - 1)) != 1)
-                       && (list_empty(&nesdev->cqp_pending_reqs))) {
-               cqp_head = nesdev->cqp.sq_head++;
-               nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
-               cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
-               memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
-               opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
-               if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
-                       ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
-               barrier();
-               u64temp = (unsigned long)cqp_request;
-               set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp);
-               nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ,"
-                       " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u,"
-                       " waiting = %d, refcount = %d.\n",
-                       opcode & NES_CQP_OPCODE_MASK,
-                       le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request,
-                       nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
-                       cqp_request->waiting, atomic_read(&cqp_request->refcount));
-
-               barrier();
-
-               /* Ring doorbell (1 WQEs) */
-               nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
-
-               barrier();
-       } else {
-               nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X"
-                               " put on the pending queue.\n",
-                               cqp_request,
-                               le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
-                               le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX]));
-               list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs);
-       }
-
-       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-       return;
-}
-
-/**
- * nes_arp_table
- */
-int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action)
-{
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       int arp_index;
-       int err = 0;
-       __be32 tmp_addr;
-
-       for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) {
-               if (nesadapter->arp_table[arp_index].ip_addr == ip_addr)
-                       break;
-       }
-
-       if (action == NES_ARP_ADD) {
-               if (arp_index != nesadapter->arp_table_size) {
-                       return -1;
-               }
-
-               arp_index = 0;
-               err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps,
-                               nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP);
-               if (err) {
-                       nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err);
-                       return err;
-               }
-               nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index);
-
-               nesadapter->arp_table[arp_index].ip_addr = ip_addr;
-               memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN);
-               return arp_index;
-       }
-
-       /* DELETE or RESOLVE */
-       if (arp_index == nesadapter->arp_table_size) {
-               tmp_addr = cpu_to_be32(ip_addr);
-               nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n",
-                         &tmp_addr, action == NES_ARP_RESOLVE ? "resolve" : "delete");
-               return -1;
-       }
-
-       if (action == NES_ARP_RESOLVE) {
-               nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index);
-               return arp_index;
-       }
-
-       if (action == NES_ARP_DELETE) {
-               nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
-               nesadapter->arp_table[arp_index].ip_addr = 0;
-               eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr);
-               nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
-               return arp_index;
-       }
-
-       return -1;
-}
-
-
-/**
- * nes_mh_fix
- */
-void nes_mh_fix(struct timer_list *t)
-{
-       struct nes_adapter *nesadapter = from_timer(nesadapter, t, mh_timer);
-       struct nes_device *nesdev = nesadapter->nesdev;
-       unsigned long flags;
-       struct nes_vnic *nesvnic;
-       u32 used_chunks_tx;
-       u32 temp_used_chunks_tx;
-       u32 temp_last_used_chunks_tx;
-       u32 used_chunks_mask;
-       u32 mac_tx_frames_low;
-       u32 mac_tx_frames_high;
-       u32 mac_tx_pauses;
-       u32 reset_value;
-       u32 tx_control;
-       u32 tx_config;
-       u32 tx_pause_quanta;
-       u32 rx_control;
-       u32 rx_config;
-       u32 mac_exact_match;
-       u32 mpp_debug;
-       u32 i=0;
-       u32 chunks_tx_progress = 0;
-
-       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-       if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) {
-               spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-               goto no_mh_work;
-       }
-       nesadapter->mac_sw_state[0] = NES_MAC_SW_MH;
-       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-       do {
-               mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW);
-               mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH);
-               mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-               used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX);
-               nesdev->mac_pause_frames_sent += mac_tx_pauses;
-               used_chunks_mask = 0;
-               temp_used_chunks_tx = used_chunks_tx;
-               temp_last_used_chunks_tx = nesdev->last_used_chunks_tx;
-
-               if (nesdev->netdev[0]) {
-                       nesvnic = netdev_priv(nesdev->netdev[0]);
-               } else {
-                       break;
-               }
-
-               for (i=0; i<4; i++) {
-                       used_chunks_mask <<= 8;
-                       if (nesvnic->qp_nic_index[i] != 0xff) {
-                               used_chunks_mask |= 0xff;
-                               if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) {
-                                       chunks_tx_progress = 1;
-                               }
-                       }
-                       temp_used_chunks_tx >>= 8;
-                       temp_last_used_chunks_tx >>= 8;
-               }
-               if ((mac_tx_frames_low) || (mac_tx_frames_high) ||
-                       (!(used_chunks_tx&used_chunks_mask)) ||
-                       (!(nesdev->last_used_chunks_tx&used_chunks_mask)) ||
-                       (chunks_tx_progress) ) {
-                       nesdev->last_used_chunks_tx = used_chunks_tx;
-                       break;
-               }
-               nesdev->last_used_chunks_tx = used_chunks_tx;
-               barrier();
-
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005);
-               mh_pauses_sent++;
-               mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-               if (mac_tx_pauses) {
-                       nesdev->mac_pause_frames_sent += mac_tx_pauses;
-                       break;
-               }
-
-               tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL);
-               tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-               tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA);
-               rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL);
-               rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG);
-               mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM);
-               mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG);
-
-               /* one last ditch effort to avoid a false positive */
-               mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
-               if (mac_tx_pauses) {
-                       nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent;
-                       nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n");
-                       break;
-               }
-               mh_detected++;
-
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000);
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000);
-               reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
-
-               nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d);
-
-               while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
-                               & 0x00000040) != 0x00000040) && (i++ < 5000)) {
-                       /* mdelay(1); */
-               }
-
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
-               nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0);
-
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
-               if (nesadapter->OneG_Mode) {
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
-               } else {
-                       nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
-               }
-               nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0);
-               nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
-
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control);
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-               nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta);
-               nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control);
-               nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config);
-               nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match);
-               nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug);
-
-       } while (0);
-
-       nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE;
-no_mh_work:
-       nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5);
-       add_timer(&nesdev->nesadapter->mh_timer);
-}
-
-/**
- * nes_clc
- */
-void nes_clc(struct timer_list *t)
-{
-       struct nes_adapter *nesadapter = from_timer(nesadapter, t, lc_timer);
-       unsigned long flags;
-
-       spin_lock_irqsave(&nesadapter->phy_lock, flags);
-       nesadapter->link_interrupt_count[0] = 0;
-       nesadapter->link_interrupt_count[1] = 0;
-       nesadapter->link_interrupt_count[2] = 0;
-       nesadapter->link_interrupt_count[3] = 0;
-       spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
-
-       nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
-       add_timer(&nesadapter->lc_timer);
-}
-
-
-/**
- * nes_dump_mem
- */
-void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
-{
-       if (!(nes_debug_level & dump_debug_level)) {
-               return;
-       }
-
-       if (length > 0x100) {
-               nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
-               length = 0x100;
-       }
-       nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length);
-
-       print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true);
-}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
deleted file mode 100644 (file)
index 4902432..0000000
+++ /dev/null
@@ -1,3759 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/random.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/iw_cm.h>
-#include <rdma/ib_user_verbs.h>
-#include <rdma/uverbs_ioctl.h>
-
-#include "nes.h"
-
-#include <rdma/ib_umem.h>
-
-atomic_t mod_qp_timouts;
-atomic_t qps_created;
-atomic_t sw_qps_destroyed;
-
-static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
-static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
-
-/**
- * nes_alloc_mw
- */
-static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type,
-                                 struct ib_udata *udata)
-{
-       struct nes_pd *nespd = to_nespd(ibpd);
-       struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_cqp_request *cqp_request;
-       struct nes_mr *nesmr;
-       struct ib_mw *ibmw;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       int ret;
-       u32 stag;
-       u32 stag_index = 0;
-       u32 next_stag_index = 0;
-       u32 driver_key = 0;
-       u8 stag_key = 0;
-
-       if (type != IB_MW_TYPE_1)
-               return ERR_PTR(-EINVAL);
-
-       get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-       stag_key = (u8)next_stag_index;
-
-       driver_key = 0;
-
-       next_stag_index >>= 8;
-       next_stag_index %= nesadapter->max_mr;
-
-       ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-                       nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW);
-       if (ret) {
-               return ERR_PTR(ret);
-       }
-
-       nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-       if (!nesmr) {
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       stag = stag_index << 8;
-       stag |= driver_key;
-       stag += (u32)stag_key;
-
-       nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
-                       stag, stag_index);
-
-       /* Register the region with the adapter */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               kfree(nesmr);
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
-                       cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ |
-                       NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO |
-                       NES_CQP_STAG_REM_ACC_EN);
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
-                       " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                       stag, ret, cqp_request->major_code, cqp_request->minor_code);
-       if ((!ret) || (cqp_request->major_code)) {
-               nes_put_cqp_request(nesdev, cqp_request);
-               kfree(nesmr);
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               if (!ret) {
-                       return ERR_PTR(-ETIME);
-               } else {
-                       return ERR_PTR(-ENOMEM);
-               }
-       }
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       nesmr->ibmw.rkey = stag;
-       nesmr->mode = IWNES_MEMREG_TYPE_MW;
-       ibmw = &nesmr->ibmw;
-       nesmr->pbl_4k = 0;
-       nesmr->pbls_used = 0;
-
-       return ibmw;
-}
-
-
-/**
- * nes_dealloc_mw
- */
-static int nes_dealloc_mw(struct ib_mw *ibmw)
-{
-       struct nes_mr *nesmr = to_nesmw(ibmw);
-       struct nes_vnic *nesvnic = to_nesvnic(ibmw->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       int err = 0;
-       int ret;
-
-       /* Deallocate the window with the adapter */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
-                       ibmw->rkey);
-       ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
-                       " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                       ret, cqp_request->major_code, cqp_request->minor_code);
-       if (!ret)
-               err = -ETIME;
-       else if (cqp_request->major_code)
-               err = -EIO;
-
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-                       (ibmw->rkey & 0x0fffff00) >> 8);
-       kfree(nesmr);
-
-       return err;
-}
-
-
-/*
- * nes_alloc_fast_mr
- */
-static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
-                            u32 stag, u32 page_count)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       unsigned long flags;
-       int ret;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 opcode = 0;
-       u16 major_code;
-       u64 region_length = page_count * PAGE_SIZE;
-
-
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, "
-                             "region_length = %llu\n",
-                             page_count, region_length);
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-       if (nesadapter->free_4kpbl > 0) {
-               nesadapter->free_4kpbl--;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-       } else {
-               /* No 4kpbl's available: */
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               nes_debug(NES_DBG_MR, "Out of Pbls\n");
-               nes_free_cqp_request(nesdev, cqp_request);
-               return -ENOMEM;
-       }
-
-       opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR |
-                NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO |
-                NES_CQP_STAG_REM_ACC_EN;
-       /*
-        * The current OFED API does not support the zero-based TO option.
-        * If it is added, the NES_CQP_STAG_VA* option will need to change.
-        * Also, the API does not support the ability to have the MR set for
-        * local access only when created and not allow the SQ op to override.
-        * Given this, the remote enable must be set here.
-        */
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1);
-
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
-                       cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
-                       cpu_to_le32(nespd->pd_id & 0x00007fff);
-
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0);
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8));
-       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
-       barrier();
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       ret = wait_event_timeout(cqp_request->waitq,
-                                (0 != cqp_request->request_done),
-                                NES_EVENT_TIMEOUT);
-
-       nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, "
-                 "wait_event_timeout ret = %u, CQP Major:Minor codes = "
-                 "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code,
-                 cqp_request->minor_code);
-       major_code = cqp_request->major_code;
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if (!ret || major_code) {
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               nesadapter->free_4kpbl++;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-       }
-
-       if (!ret)
-               return -ETIME;
-       else if (major_code)
-               return -EIO;
-       return 0;
-}
-
-/*
- * nes_alloc_mr
- */
-static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-                                 u32 max_num_sg, struct ib_udata *udata)
-{
-       struct nes_pd *nespd = to_nespd(ibpd);
-       struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       u32 next_stag_index;
-       u8 stag_key = 0;
-       u32 driver_key = 0;
-       int err = 0;
-       u32 stag_index = 0;
-       struct nes_mr *nesmr;
-       u32 stag;
-       int ret;
-       struct ib_mr *ibmr;
-
-       if (mr_type != IB_MR_TYPE_MEM_REG)
-               return ERR_PTR(-EINVAL);
-
-/*
- * Note:  Set to always use a fixed-length, single page entry PBL.  This
- *      allows the fast_reg_mr operation to always know the size of the PBL.
- */
-       if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
-               return ERR_PTR(-E2BIG);
-
-       get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-       stag_key = (u8)next_stag_index;
-       next_stag_index >>= 8;
-       next_stag_index %= nesadapter->max_mr;
-
-       err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-                                nesadapter->max_mr, &stag_index,
-                                &next_stag_index, NES_RESOURCE_FAST_MR);
-       if (err)
-               return ERR_PTR(err);
-
-       nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-       if (!nesmr) {
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       stag = stag_index << 8;
-       stag |= driver_key;
-       stag += (u32)stag_key;
-
-       nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
-                 stag, stag_index);
-
-       ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg);
-
-       if (ret == 0) {
-               nesmr->ibmr.rkey = stag;
-               nesmr->ibmr.lkey = stag;
-               nesmr->mode = IWNES_MEMREG_TYPE_FMEM;
-               ibmr = &nesmr->ibmr;
-       } else {
-               kfree(nesmr);
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       nesmr->pages = pci_alloc_consistent(nesdev->pcidev,
-                                           max_num_sg * sizeof(u64),
-                                           &nesmr->paddr);
-       if (!nesmr->paddr)
-               goto err;
-
-       nesmr->max_pages = max_num_sg;
-
-       return ibmr;
-
-err:
-       nes_dereg_mr(ibmr, udata);
-
-       return ERR_PTR(-ENOMEM);
-}
-
-static int nes_set_page(struct ib_mr *ibmr, u64 addr)
-{
-       struct nes_mr *nesmr = to_nesmr(ibmr);
-
-       if (unlikely(nesmr->npages == nesmr->max_pages))
-               return -ENOMEM;
-
-       nesmr->pages[nesmr->npages++] = cpu_to_le64(addr);
-
-       return 0;
-}
-
-static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
-                        int sg_nents, unsigned int *sg_offset)
-{
-       struct nes_mr *nesmr = to_nesmr(ibmr);
-
-       nesmr->npages = 0;
-
-       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page);
-}
-
-/**
- * nes_query_device
- */
-static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                           struct ib_udata *uhw)
-{
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       memset(props, 0, sizeof(*props));
-       memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6);
-
-       props->fw_ver = nesdev->nesadapter->firmware_version;
-       props->device_cap_flags = nesdev->nesadapter->device_cap_flags;
-       props->vendor_id = nesdev->nesadapter->vendor_id;
-       props->vendor_part_id = nesdev->nesadapter->vendor_part_id;
-       props->hw_ver = nesdev->nesadapter->hw_rev;
-       props->max_mr_size = 0x80000000;
-       props->max_qp = nesibdev->max_qp;
-       props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2;
-       props->max_send_sge = nesdev->nesadapter->max_sge;
-       props->max_recv_sge = nesdev->nesadapter->max_sge;
-       props->max_cq = nesibdev->max_cq;
-       props->max_cqe = nesdev->nesadapter->max_cqe;
-       props->max_mr = nesibdev->max_mr;
-       props->max_mw = nesibdev->max_mr;
-       props->max_pd = nesibdev->max_pd;
-       props->max_sge_rd = 1;
-       switch (nesdev->nesadapter->max_irrq_wr) {
-               case 0:
-                       props->max_qp_rd_atom = 2;
-                       break;
-               case 1:
-                       props->max_qp_rd_atom = 8;
-                       break;
-               case 2:
-                       props->max_qp_rd_atom = 32;
-                       break;
-               case 3:
-                       props->max_qp_rd_atom = 64;
-                       break;
-               default:
-                       props->max_qp_rd_atom = 0;
-       }
-       props->max_qp_init_rd_atom = props->max_qp_rd_atom;
-       props->atomic_cap = IB_ATOMIC_NONE;
-       props->max_map_per_fmr = 1;
-
-       return 0;
-}
-
-
-/**
- * nes_query_port
- */
-static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
-{
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-       struct net_device *netdev = nesvnic->netdev;
-
-       /* props being zeroed by the caller, avoid zeroing it here */
-
-       props->max_mtu = IB_MTU_4096;
-       props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
-
-       props->lid = 1;
-       if (netif_queue_stopped(netdev))
-               props->state = IB_PORT_DOWN;
-       else if (nesvnic->linkup)
-               props->state = IB_PORT_ACTIVE;
-       else
-               props->state = IB_PORT_DOWN;
-       props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
-                       IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-       props->gid_tbl_len = 1;
-       props->pkey_tbl_len = 1;
-       props->active_width = IB_WIDTH_4X;
-       props->active_speed = IB_SPEED_SDR;
-       props->max_msg_sz = 0x80000000;
-
-       return 0;
-}
-
-/**
- * nes_query_pkey
- */
-static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
-{
-       *pkey = 0;
-       return 0;
-}
-
-
-/**
- * nes_query_gid
- */
-static int nes_query_gid(struct ib_device *ibdev, u8 port,
-               int index, union ib_gid *gid)
-{
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-
-       memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-       memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6);
-
-       return 0;
-}
-
-
-/**
- * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
- * of all objects associated with a particular user-mode client.
- */
-static int nes_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
-{
-       struct ib_device *ibdev = uctx->device;
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_alloc_ucontext_req req;
-       struct nes_alloc_ucontext_resp uresp = {};
-       struct nes_ucontext *nes_ucontext = to_nesucontext(uctx);
-       struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-
-
-       if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
-               printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
-               return -EINVAL;
-       }
-
-       if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
-               printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
-                       req.userspace_ver, NES_ABI_USERSPACE_VER);
-               return -EINVAL;
-       }
-
-
-       uresp.max_qps = nesibdev->max_qp;
-       uresp.max_pds = nesibdev->max_pd;
-       uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
-       uresp.virtwq = nesadapter->virtwq;
-       uresp.kernel_ver = NES_ABI_KERNEL_VER;
-
-       nes_ucontext->nesdev = nesdev;
-       nes_ucontext->mmap_wq_offset = uresp.max_pds;
-       nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
-                       ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) /
-                       PAGE_SIZE;
-
-
-       if (ib_copy_to_udata(udata, &uresp, sizeof(uresp)))
-               return -EFAULT;
-
-       INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
-       INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
-       return 0;
-}
-
-/**
- * nes_dealloc_ucontext
- */
-static void nes_dealloc_ucontext(struct ib_ucontext *context)
-{
-       return;
-}
-
-/**
- * nes_mmap
- */
-static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       unsigned long index;
-       struct nes_vnic *nesvnic = to_nesvnic(context->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       /* struct nes_adapter *nesadapter = nesdev->nesadapter; */
-       struct nes_ucontext *nes_ucontext;
-       struct nes_qp *nesqp;
-
-       nes_ucontext = to_nesucontext(context);
-
-
-       if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) {
-               index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE;
-               index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) +
-                               PAGE_SIZE-1) & (~(PAGE_SIZE-1));
-               if (!test_bit(index, nes_ucontext->allocated_wqs)) {
-                       nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index);
-                       return -EFAULT;
-               }
-               nesqp = nes_ucontext->mmap_nesqp[index];
-               if (nesqp == NULL) {
-                       nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index);
-                       return -EFAULT;
-               }
-               if (remap_pfn_range(vma, vma->vm_start,
-                               virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT,
-                               vma->vm_end - vma->vm_start,
-                               vma->vm_page_prot)) {
-                       nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n");
-                       return -EAGAIN;
-               }
-               vma->vm_private_data = nesqp;
-               return 0;
-       } else {
-               index = vma->vm_pgoff;
-               if (!test_bit(index, nes_ucontext->allocated_doorbells))
-                       return -EFAULT;
-
-               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-               if (io_remap_pfn_range(vma, vma->vm_start,
-                               (nesdev->doorbell_start +
-                               ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096))
-                               >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
-                       return -EAGAIN;
-               vma->vm_private_data = nes_ucontext;
-               return 0;
-       }
-
-       return -ENOSYS;
-}
-
-
-/**
- * nes_alloc_pd
- */
-static int nes_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
-{
-       struct ib_device *ibdev = pd->device;
-       struct nes_pd *nespd = to_nespd(pd);
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_alloc_pd_resp uresp;
-       u32 pd_num = 0;
-       int err;
-       struct nes_ucontext *nesucontext = rdma_udata_to_drv_context(
-               udata, struct nes_ucontext, ibucontext);
-
-       nes_debug(
-               NES_DBG_PD,
-               "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
-               nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev,
-               &nesucontext->ibucontext, netdev_refcnt_read(nesvnic->netdev));
-
-       err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
-                       nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
-       if (err)
-               return err;
-
-       nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
-                       nespd, dev_name(&nesvnic->nesibdev->ibdev.dev));
-
-       nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
-
-       if (udata) {
-               nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
-                               NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
-               nes_debug(NES_DBG_PD, "find_next_zero_bit on doorbells returned %u, mapping pd_id %u.\n",
-                               nespd->mmap_db_index, nespd->pd_id);
-               if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
-                       nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
-                       nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-                       return -ENOMEM;
-               }
-
-               uresp.pd_id = nespd->pd_id;
-               uresp.mmap_db_index = nespd->mmap_db_index;
-               if (ib_copy_to_udata(udata, &uresp, sizeof(struct nes_alloc_pd_resp))) {
-                       nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
-                       return -EFAULT;
-               }
-
-               set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
-               nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id;
-               nesucontext->first_free_db = nespd->mmap_db_index + 1;
-       }
-
-       nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
-       return 0;
-}
-
-
-/**
- * nes_dealloc_pd - free a protection domain and release its doorbell region
- */
-static void nes_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
-{
-       struct nes_ucontext *nesucontext;
-       struct nes_pd *nespd = to_nespd(ibpd);
-       struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-
-       if (udata) {
-               nesucontext =
-                       rdma_udata_to_drv_context(
-                               udata,
-                               struct nes_ucontext,
-                               ibucontext);
-               nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
-                               nespd->mmap_db_index);
-               clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
-               nesucontext->mmap_db_index[nespd->mmap_db_index] = 0;
-               if (nesucontext->first_free_db > nespd->mmap_db_index) {
-                       nesucontext->first_free_db = nespd->mmap_db_index;
-               }
-       }
-
-       nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n",
-                       nespd->pd_id, nespd);
-       nes_free_resource(nesadapter, nesadapter->allocated_pds,
-                       (nespd->pd_id - nesadapter->base_pd) >> (PAGE_SHIFT - 12));
-}
-
-
-/**
- * nes_get_encoded_size - round *size up to 32/128/512 and return its HW encoding (0 if too large)
- */
-static inline u8 nes_get_encoded_size(int *size)
-{
-       u8 encoded_size = 0;
-       if (*size <= 32) {
-               *size = 32;
-               encoded_size = 1;
-       } else if (*size <= 128) {
-               *size = 128;
-               encoded_size = 2;
-       } else if (*size <= 512) {
-               *size = 512;
-               encoded_size = 3;
-       }
-       return encoded_size;
-}
-
-
-
-/**
- * nes_setup_virt_qp - build the hardware PBL, Q2 and QP context for a userspace (virtually mapped) QP
- */
-static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl,
-               struct nes_vnic *nesvnic, int sq_size, int rq_size)
-{
-       unsigned long flags;
-       void *mem;
-       __le64 *pbl = NULL;
-       __le64 *tpbl;
-       __le64 *pblbuffer;
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       u32 pbl_entries;
-       u8 rq_pbl_entries;
-       u8 sq_pbl_entries;
-
-       pbl_entries = nespbl->pbl_size >> 3;
-       nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n",
-                       nespbl->pbl_size, pbl_entries,
-                       (void *)nespbl->pbl_vbase,
-                       (unsigned long) nespbl->pbl_pbase);
-       pbl = (__le64 *)nespbl->pbl_vbase; /* points to first pbl entry */
-       /* now let's set the sq_vbase as well as rq_vbase addrs; we will assign */
-       /* the first pbl to be for the rq_vbase... */
-       rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
-       sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
-       nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
-       if (!nespbl->page) {
-               nes_debug(NES_DBG_QP, "QP nespbl->page is NULL\n");
-               kfree(nespbl);
-               return -ENOMEM;
-       }
-
-       nesqp->hwqp.sq_vbase = kmap(nespbl->page);
-       nesqp->page = nespbl->page;
-       if (!nesqp->hwqp.sq_vbase) {
-               nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n");
-               kfree(nespbl);
-               return -ENOMEM;
-       }
-
-       /* Now to get to sq.. we need to calculate how many */
-       /* PBL entries were used by the rq.. */
-       pbl += sq_pbl_entries;
-       nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
-       /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */
-       /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */
-
-       nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n",
-                 nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase,
-                 nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase);
-       spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-       if (!nesadapter->free_256pbl) {
-               pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                               nespbl->pbl_pbase);
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               kunmap(nesqp->page);
-               kfree(nespbl);
-               return -ENOMEM;
-       }
-       nesadapter->free_256pbl--;
-       spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-       nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase);
-       pblbuffer = nesqp->pbl_vbase;
-       if (!nesqp->pbl_vbase) {
-               /* memory allocated during nes_reg_user_mr() */
-               pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                                   nespbl->pbl_pbase);
-               kfree(nespbl);
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               nesadapter->free_256pbl++;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               kunmap(nesqp->page);
-               return -ENOMEM;
-       }
-       memset(nesqp->pbl_vbase, 0, 256);
-       /* fill in the page address in the pbl buffer.. */
-       tpbl = pblbuffer + 16;
-       pbl = (__le64 *)nespbl->pbl_vbase;
-       while (sq_pbl_entries--)
-               *tpbl++ = *pbl++;
-       tpbl = pblbuffer;
-       while (rq_pbl_entries--)
-               *tpbl++ = *pbl++;
-
-       /* done with memory allocated during nes_reg_user_mr() */
-       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                           nespbl->pbl_pbase);
-       kfree(nespbl);
-
-       nesqp->qp_mem_size =
-                       max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256;     /* this is Q2 */
-       /* Round up to a multiple of a page */
-       nesqp->qp_mem_size += PAGE_SIZE - 1;
-       nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
-
-       mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-                       &nesqp->hwqp.q2_pbase);
-
-       if (!mem) {
-               pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
-               nesqp->pbl_vbase = NULL;
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               nesadapter->free_256pbl++;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               kunmap(nesqp->page);
-               return -ENOMEM;
-       }
-       nesqp->sq_kmapped = 1;
-       nesqp->hwqp.q2_vbase = mem;
-       mem += 256;
-       memset(nesqp->hwqp.q2_vbase, 0, 256);
-       nesqp->nesqp_context = mem;
-       memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
-       nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
-
-       return 0;
-}
-
-
-/**
- * nes_setup_mmap_qp - allocate contiguous PCI memory for a kernel-mode QP's SQ, RQ, Q2 and context
- */
-static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
-               int sq_size, int rq_size)
-{
-       void *mem;
-       struct nes_device *nesdev = nesvnic->nesdev;
-
-       nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) +
-                       (sizeof(struct nes_hw_qp_wqe) * rq_size) +
-                       max((u32)sizeof(struct nes_qp_context), ((u32)256)) +
-                       256; /* this is Q2 */
-       /* Round up to a multiple of a page */
-       nesqp->qp_mem_size += PAGE_SIZE - 1;
-       nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
-
-       mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-                       &nesqp->hwqp.sq_pbase);
-       if (!mem)
-               return -ENOMEM;
-       nes_debug(NES_DBG_QP, "PCI consistent memory for "
-                       "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n",
-                       mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size);
-
-       memset(mem, 0, nesqp->qp_mem_size);
-
-       nesqp->hwqp.sq_vbase = mem;
-       mem += sizeof(struct nes_hw_qp_wqe) * sq_size;
-
-       nesqp->hwqp.rq_vbase = mem;
-       nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase +
-                       sizeof(struct nes_hw_qp_wqe) * sq_size;
-       mem += sizeof(struct nes_hw_qp_wqe) * rq_size;
-
-       nesqp->hwqp.q2_vbase = mem;
-       nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase +
-                       sizeof(struct nes_hw_qp_wqe) * rq_size;
-       mem += 256;
-       memset(nesqp->hwqp.q2_vbase, 0, 256);
-
-       nesqp->nesqp_context = mem;
-       nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
-       memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
-       return 0;
-}
-
-
-/**
- * nes_free_qp_mem() frees the QP's pci_alloc_consistent() memory.
- */
-static void nes_free_qp_mem(struct nes_device *nesdev,
-               struct nes_qp *nesqp, int virt_wqs)
-{
-       unsigned long flags;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       if (!virt_wqs) {
-               pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
-                               nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
-       } else {
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               nesadapter->free_256pbl++;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-               pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
-               pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
-               nesqp->pbl_vbase = NULL;
-               if (nesqp->sq_kmapped) {
-                       nesqp->sq_kmapped = 0;
-                       kunmap(nesqp->page);
-               }
-       }
-}
-
-
-/**
- * nes_create_qp - create an iWARP QP and issue the CQP CreateQP request
- */
-static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
-               struct ib_qp_init_attr *init_attr, struct ib_udata *udata)
-{
-       u64 u64temp = 0;
-       u64 u64nesqp = 0;
-       struct nes_pd *nespd = to_nespd(ibpd);
-       struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_qp *nesqp;
-       struct nes_cq *nescq;
-       struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
-               udata, struct nes_ucontext, ibucontext);
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       struct nes_create_qp_req req;
-       struct nes_create_qp_resp uresp;
-       struct nes_pbl  *nespbl = NULL;
-       u32 qp_num = 0;
-       u32 opcode = 0;
-       /* u32 counter = 0; */
-       void *mem;
-       unsigned long flags;
-       int ret;
-       int err;
-       int virt_wqs = 0;
-       int sq_size;
-       int rq_size;
-       u8 sq_encoded_size;
-       u8 rq_encoded_size;
-       /* int counter; */
-
-       if (init_attr->create_flags)
-               return ERR_PTR(-EINVAL);
-
-       atomic_inc(&qps_created);
-       switch (init_attr->qp_type) {
-               case IB_QPT_RC:
-                       if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
-                               init_attr->cap.max_inline_data = 0;
-                       } else {
-                               init_attr->cap.max_inline_data = 64;
-                       }
-                       sq_size = init_attr->cap.max_send_wr;
-                       rq_size = init_attr->cap.max_recv_wr;
-
-                       /* check if the encoded sizes are OK or not... */
-                       sq_encoded_size = nes_get_encoded_size(&sq_size);
-                       rq_encoded_size = nes_get_encoded_size(&rq_size);
-
-                       if ((!sq_encoded_size) || (!rq_encoded_size)) {
-                               nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n",
-                                               rq_size, sq_size);
-                               return ERR_PTR(-EINVAL);
-                       }
-
-                       init_attr->cap.max_send_wr = sq_size - 2;
-                       init_attr->cap.max_recv_wr = rq_size - 1;
-                       nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size);
-
-                       ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
-                                       nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP);
-                       if (ret) {
-                               return ERR_PTR(ret);
-                       }
-
-                       /* Need 512 (actually now 1024) byte alignment on this structure */
-                       mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL);
-                       if (!mem) {
-                               nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                               return ERR_PTR(-ENOMEM);
-                       }
-                       u64nesqp = (unsigned long)mem;
-                       u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1;
-                       u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1;
-                       u64nesqp &= ~u64temp;
-                       nesqp = (struct nes_qp *)(unsigned long)u64nesqp;
-                       /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p.  Rounded to closest %u\n",
-                                       nesqp, mem, NES_SW_CONTEXT_ALIGN); */
-                       nesqp->allocated_buffer = mem;
-
-                       if (udata) {
-                               if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) {
-                                       nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                       kfree(nesqp->allocated_buffer);
-                                       nes_debug(NES_DBG_QP, "ib_copy_from_udata() failed\n");
-                                       return ERR_PTR(-EFAULT);
-                               }
-                               if (req.user_wqe_buffers) {
-                                       virt_wqs = 1;
-                               }
-                               if (req.user_qp_buffer)
-                                       nesqp->nesuqp_addr = req.user_qp_buffer;
-
-                               nesqp->user_mode = 1;
-                               if (virt_wqs) {
-                                       err = 1;
-                                       list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
-                                               if (nespbl->user_base == (unsigned long)req.user_wqe_buffers) {
-                                                       list_del(&nespbl->list);
-                                                       err = 0;
-                                                       nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
-                                                                 nespbl, nespbl->user_base);
-                                                       break;
-                                               }
-                                       }
-                                       if (err) {
-                                               nes_debug(NES_DBG_QP, "Didn't find PBL for virtual QP, address = %llx.\n",
-                                                         (long long unsigned int)req.user_wqe_buffers);
-                                               nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                               kfree(nesqp->allocated_buffer);
-                                               return ERR_PTR(-EFAULT);
-                                       }
-                               }
-
-                               nesqp->mmap_sq_db_index =
-                                       find_next_zero_bit(nes_ucontext->allocated_wqs,
-                                                          NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
-                               /* nes_debug(NES_DBG_QP, "find_next_zero_bit on wqs returned %u\n",
-                                               nespd->mmap_db_index); */
-                               if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
-                                       nes_debug(NES_DBG_QP,
-                                                 "db index > max user regions, failing create QP\n");
-                                       nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                       if (virt_wqs) {
-                                               pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                                                                   nespbl->pbl_pbase);
-                                               kfree(nespbl);
-                                       }
-                                       kfree(nesqp->allocated_buffer);
-                                       return ERR_PTR(-ENOMEM);
-                               }
-                               set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
-                               nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
-                               nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
-                       }
-                       err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
-                                       nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
-                       if (err) {
-                               nes_debug(NES_DBG_QP,
-                                         "error getting qp mem, code = %d\n", err);
-                               nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                               kfree(nesqp->allocated_buffer);
-                               return ERR_PTR(-ENOMEM);
-                       }
-
-                       nesqp->hwqp.sq_size = sq_size;
-                       nesqp->hwqp.sq_encoded_size = sq_encoded_size;
-                       nesqp->hwqp.sq_head = 1;
-                       nesqp->hwqp.rq_size = rq_size;
-                       nesqp->hwqp.rq_encoded_size = rq_encoded_size;
-                       /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n",
-                                       (void *)nesqp->nesqp_context_pbase);
-                       */
-                       nesqp->hwqp.qp_id = qp_num;
-                       nesqp->ibqp.qp_num = nesqp->hwqp.qp_id;
-                       nesqp->nespd = nespd;
-
-                       nescq = to_nescq(init_attr->send_cq);
-                       nesqp->nesscq = nescq;
-                       nescq = to_nescq(init_attr->recv_cq);
-                       nesqp->nesrcq = nescq;
-
-                       nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) <<
-                                       NES_QPCONTEXT_MISC_PCI_FCN_SHIFT);
-                       nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size <<
-                                       NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT);
-                       nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size <<
-                                       NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT);
-                       if (!udata) {
-                               nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN);
-                               nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN);
-                       }
-                       nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number +
-                                       ((u32)nesqp->nesrcq->hw_cq.cq_number << 16));
-                       u64temp = (u64)nesqp->hwqp.sq_pbase;
-                       nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
-                       nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-
-
-                       if (!virt_wqs) {
-                               u64temp = (u64)nesqp->hwqp.sq_pbase;
-                               nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
-                               nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-                               u64temp = (u64)nesqp->hwqp.rq_pbase;
-                               nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
-                               nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-                       } else {
-                               u64temp = (u64)nesqp->pbl_pbase;
-                               nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
-                               nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-                       }
-
-                       /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n",
-                                       nesvnic->next_qp_nic_index,
-                                       nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */
-                       spin_lock_irqsave(&nesdev->cqp.lock, flags);
-                       nesqp->nesqp_context->misc2 |= cpu_to_le32(
-                                       (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] <<
-                                       NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT);
-                       nesvnic->next_qp_nic_index++;
-                       if ((nesvnic->next_qp_nic_index > 3) ||
-                                       (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) {
-                               nesvnic->next_qp_nic_index = 0;
-                       }
-                       spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
-
-                       nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16);
-                       u64temp = (u64)nesqp->hwqp.q2_pbase;
-                       nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp);
-                       nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32));
-                       nesqp->nesqp_context->aeq_token_low =  cpu_to_le32((u32)((unsigned long)(nesqp)));
-                       nesqp->nesqp_context->aeq_token_high =  cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp))));
-                       nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM |
-                                       NES_QPCONTEXT_ORDIRD_AAH |
-                                       ((((u32)nesadapter->max_irrq_wr) <<
-                                       NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK));
-                       if (disable_mpa_crc) {
-                               nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n");
-                               nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC);
-                       }
-
-
-                       /* Create the QP */
-                       cqp_request = nes_get_cqp_request(nesdev);
-                       if (cqp_request == NULL) {
-                               nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n");
-                               nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                               nes_free_qp_mem(nesdev, nesqp, virt_wqs);
-                               kfree(nesqp->allocated_buffer);
-                               return ERR_PTR(-ENOMEM);
-                       }
-                       cqp_request->waiting = 1;
-                       cqp_wqe = &cqp_request->cqp_wqe;
-
-                       if (!virt_wqs) {
-                               opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP |
-                                       NES_CQP_QP_IWARP_STATE_IDLE;
-                       } else {
-                               opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS |
-                                       NES_CQP_QP_IWARP_STATE_IDLE;
-                       }
-                       opcode |= NES_CQP_QP_CQS_VALID;
-                       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-                       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-                       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-
-                       u64temp = (u64)nesqp->nesqp_context_pbase;
-                       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
-
-                       atomic_set(&cqp_request->refcount, 2);
-                       nes_post_cqp_request(nesdev, cqp_request);
-
-                       /* Wait for CQP */
-                       nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
-                                       nesqp->hwqp.qp_id);
-                       ret = wait_event_timeout(cqp_request->waitq,
-                                       (cqp_request->request_done != 0), NES_EVENT_TIMEOUT);
-                       nes_debug(NES_DBG_QP, "Create iWARP QP%u completed, wait_event_timeout ret=%u,"
-                                       " nesdev->cqp.sq_head = %u, nesdev->cqp.sq_tail = %u,"
-                                       " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                                       nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
-                                       cqp_request->major_code, cqp_request->minor_code);
-                       if ((!ret) || (cqp_request->major_code)) {
-                               nes_put_cqp_request(nesdev, cqp_request);
-                               nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                               nes_free_qp_mem(nesdev, nesqp, virt_wqs);
-                               kfree(nesqp->allocated_buffer);
-                               if (!ret) {
-                                       return ERR_PTR(-ETIME);
-                               } else {
-                                       return ERR_PTR(-EIO);
-                               }
-                       }
-
-                       nes_put_cqp_request(nesdev, cqp_request);
-
-                       if (udata) {
-                               uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
-                               uresp.mmap_rq_db_index = 0;
-                               uresp.actual_sq_size = sq_size;
-                               uresp.actual_rq_size = rq_size;
-                               uresp.qp_id = nesqp->hwqp.qp_id;
-                               uresp.nes_drv_opt = nes_drv_opt;
-                               if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
-                                       nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                       nes_free_qp_mem(nesdev, nesqp, virt_wqs);
-                                       kfree(nesqp->allocated_buffer);
-                                       return ERR_PTR(-EFAULT);
-                               }
-                       }
-
-                       nes_debug(NES_DBG_QP, "QP%u structure located @%p. Size = %u.\n",
-                                       nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp));
-                       spin_lock_init(&nesqp->lock);
-                       nes_add_ref(&nesqp->ibqp);
-                       break;
-               default:
-                       nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
-                       return ERR_PTR(-EINVAL);
-       }
-       init_completion(&nesqp->sq_drained);
-       init_completion(&nesqp->rq_drained);
-
-       nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
-       timer_setup(&nesqp->terminate_timer, nes_terminate_timeout, 0);
-
-       /* update the QP table */
-       nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
-       nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
-                       netdev_refcnt_read(nesvnic->netdev));
-
-       return &nesqp->ibqp;
-}
-
-/**
- * nes_clean_cq - zero out any valid CQEs that still reference the QP being destroyed
- */
-static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq)
-{
-       u32 cq_head;
-       u32 lo;
-       u32 hi;
-       u64 u64temp;
-       unsigned long flags = 0;
-
-       spin_lock_irqsave(&nescq->lock, flags);
-
-       cq_head = nescq->hw_cq.cq_head;
-       while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) {
-               rmb();
-               lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-               hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]);
-               u64temp = (((u64)hi) << 32) | ((u64)lo);
-               u64temp &= ~(NES_SW_CONTEXT_ALIGN-1);
-               if (u64temp == (u64)(unsigned long)nesqp) {
-                       /* Zero the context value so cqe will be ignored */
-                       nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0;
-                       nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0;
-               }
-
-               if (++cq_head >= nescq->hw_cq.cq_size)
-                       cq_head = 0;
-       }
-
-       spin_unlock_irqrestore(&nescq->lock, flags);
-}
-
-
-/**
- * nes_destroy_qp - tear down a QP, flushing its connection and pending completions
- */
-static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
-{
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-       struct nes_ucontext *nes_ucontext;
-       struct ib_qp_attr attr;
-       struct iw_cm_id *cm_id;
-       struct iw_cm_event cm_event;
-       int ret = 0;
-
-       atomic_inc(&sw_qps_destroyed);
-       nesqp->destroyed = 1;
-
-       /* Blow away the connection if it exists. */
-       if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) {
-               /* if (nesqp->ibqp_state == IB_QPS_RTS) { */
-               attr.qp_state = IB_QPS_ERR;
-               nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
-       }
-
-       if (((nesqp->ibqp_state == IB_QPS_INIT) ||
-                       (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) {
-               cm_id = nesqp->cm_id;
-               cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-               cm_event.status = -ETIMEDOUT;
-               cm_event.local_addr = cm_id->local_addr;
-               cm_event.remote_addr = cm_id->remote_addr;
-               cm_event.private_data = NULL;
-               cm_event.private_data_len = 0;
-
-               nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for "
-                               "QP%u. cm_id = %p, refcount = %u.\n",
-                               nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount));
-
-               cm_id->rem_ref(cm_id);
-               ret = cm_id->event_handler(cm_id, &cm_event);
-               if (ret)
-                       nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret);
-       }
-
-       if (nesqp->user_mode) {
-               if (udata) {
-                       nes_ucontext =
-                               rdma_udata_to_drv_context(
-                                       udata,
-                                       struct nes_ucontext,
-                                       ibucontext);
-                       clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
-                       nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
-                       if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
-                               nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index;
-                       }
-               }
-               if (nesqp->pbl_pbase && nesqp->sq_kmapped) {
-                       nesqp->sq_kmapped = 0;
-                       kunmap(nesqp->page);
-               }
-       } else {
-               /* Clean any pending completions from the cq(s) */
-               if (nesqp->nesscq)
-                       nes_clean_cq(nesqp, nesqp->nesscq);
-
-               if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq))
-                       nes_clean_cq(nesqp, nesqp->nesrcq);
-       }
-       nes_rem_ref(&nesqp->ibqp);
-       return 0;
-}
-
-
-/**
- * nes_create_cq - create a completion queue and issue the CQP CreateCQ request
- */
-static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
-                                  const struct ib_cq_init_attr *attr,
-                                  struct ib_udata *udata)
-{
-       int entries = attr->cqe;
-       u64 u64temp;
-       struct nes_vnic *nesvnic = to_nesvnic(ibdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_cq *nescq;
-       struct nes_ucontext *nes_ucontext = NULL;
-       struct nes_cqp_request *cqp_request;
-       void *mem = NULL;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_pbl *nespbl = NULL;
-       struct nes_create_cq_req req;
-       struct nes_create_cq_resp resp;
-       u32 cq_num = 0;
-       u32 opcode = 0;
-       u32 pbl_entries = 1;
-       int err;
-       unsigned long flags;
-       int ret;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (entries > nesadapter->max_cqe)
-               return ERR_PTR(-EINVAL);
-
-       err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
-                       nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ);
-       if (err) {
-               return ERR_PTR(err);
-       }
-
-       nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
-       if (!nescq) {
-               nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       nescq->hw_cq.cq_size = max(entries + 1, 5);
-       nescq->hw_cq.cq_number = cq_num;
-       nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
-
-       if (udata) {
-               nes_ucontext = rdma_udata_to_drv_context(udata,
-                               struct nes_ucontext, ibucontext);
-
-               if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_cq_req))) {
-                       nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                       kfree(nescq);
-                       return ERR_PTR(-EFAULT);
-               }
-               nesvnic->mcrq_ucontext = nes_ucontext;
-               nes_ucontext->mcrqf = req.mcrqf;
-               if (nes_ucontext->mcrqf) {
-                       if (nes_ucontext->mcrqf & 0x80000000)
-                               nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1);
-                       else if (nes_ucontext->mcrqf & 0x40000000)
-                               nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff;
-                       else
-                               nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1;
-                       nescq->mcrqf = nes_ucontext->mcrqf;
-                       nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-               }
-               nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
-                               (unsigned long)req.user_cq_buffer, entries);
-               err = 1;
-               list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
-                       if (nespbl->user_base == (unsigned long)req.user_cq_buffer) {
-                               list_del(&nespbl->list);
-                               err = 0;
-                               nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n",
-                                               nespbl);
-                               break;
-                       }
-               }
-               if (err) {
-                       nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                       kfree(nescq);
-                       return ERR_PTR(-EFAULT);
-               }
-
-               pbl_entries = nespbl->pbl_size >> 3;
-               nescq->cq_mem_size = 0;
-       } else {
-               nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe);
-               nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n",
-                               entries, nescq->cq_mem_size, nescq->hw_cq.cq_number);
-
-               /* allocate the physical buffer space */
-               mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size,
-                                           &nescq->hw_cq.cq_pbase);
-               if (!mem) {
-                       printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
-                       nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                       kfree(nescq);
-                       return ERR_PTR(-ENOMEM);
-               }
-
-               nescq->hw_cq.cq_vbase = mem;
-               nescq->hw_cq.cq_head = 0;
-               nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
-                               nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
-                               (u32)nescq->hw_cq.cq_pbase);
-       }
-
-       nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
-       spin_lock_init(&nescq->lock);
-
-       /* send CreateCQ request to CQP */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
-               if (!udata)
-                       pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-                                       nescq->hw_cq.cq_pbase);
-               else {
-                       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-                                           nespbl->pbl_vbase, nespbl->pbl_pbase);
-                       kfree(nespbl);
-               }
-
-               nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-               kfree(nescq);
-               return ERR_PTR(-ENOMEM);
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
-                       NES_CQP_CQ_CHK_OVERFLOW |
-                       NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16);
-
-       spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-
-       if (pbl_entries != 1) {
-               if (pbl_entries > 32) {
-                       /* use 4k pbl */
-                       nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries);
-                       if (nesadapter->free_4kpbl == 0) {
-                               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-                               nes_free_cqp_request(nesdev, cqp_request);
-                               if (!udata)
-                                       pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-                                                       nescq->hw_cq.cq_pbase);
-                               else {
-                                       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-                                                           nespbl->pbl_vbase, nespbl->pbl_pbase);
-                                       kfree(nespbl);
-                               }
-                               nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                               kfree(nescq);
-                               return ERR_PTR(-ENOMEM);
-                       } else {
-                               opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
-                               nescq->virtual_cq = 2;
-                               nesadapter->free_4kpbl--;
-                       }
-               } else {
-                       /* use 256 byte pbl */
-                       nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
-                       if (nesadapter->free_256pbl == 0) {
-                               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-                               nes_free_cqp_request(nesdev, cqp_request);
-                               if (!udata)
-                                       pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-                                                       nescq->hw_cq.cq_pbase);
-                               else {
-                                       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-                                                           nespbl->pbl_vbase, nespbl->pbl_pbase);
-                                       kfree(nespbl);
-                               }
-                               nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                               kfree(nescq);
-                               return ERR_PTR(-ENOMEM);
-                       } else {
-                               opcode |= NES_CQP_CQ_VIRT;
-                               nescq->virtual_cq = 1;
-                               nesadapter->free_256pbl--;
-                       }
-               }
-       }
-
-       spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-                       (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
-
-       if (udata) {
-               if (pbl_entries != 1)
-                       u64temp = (u64)nespbl->pbl_pbase;
-               else
-                       u64temp = le64_to_cpu(nespbl->pbl_vbase[0]);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
-                               nes_ucontext->mmap_db_index[0]);
-       } else {
-               u64temp = (u64)nescq->hw_cq.cq_pbase;
-               cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
-       }
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
-       u64temp = (u64)(unsigned long)&nescq->hw_cq;
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
-                       cpu_to_le32((u32)(u64temp >> 1));
-       cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
-                       cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
-                       nescq->hw_cq.cq_number);
-       ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-                       NES_EVENT_TIMEOUT * 2);
-       nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
-                       nescq->hw_cq.cq_number, ret);
-       if ((!ret) || (cqp_request->major_code)) {
-               nes_put_cqp_request(nesdev, cqp_request);
-               if (!udata)
-                       pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
-                                       nescq->hw_cq.cq_pbase);
-               else {
-                       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
-                                           nespbl->pbl_vbase, nespbl->pbl_pbase);
-                       kfree(nespbl);
-               }
-               nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-               kfree(nescq);
-               return ERR_PTR(-EIO);
-       }
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if (udata) {
-               /* free the nespbl */
-               pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                               nespbl->pbl_pbase);
-               kfree(nespbl);
-               resp.cq_id = nescq->hw_cq.cq_number;
-               resp.cq_size = nescq->hw_cq.cq_size;
-               resp.mmap_db_index = 0;
-               if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) {
-                       nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
-                       kfree(nescq);
-                       return ERR_PTR(-EFAULT);
-               }
-       }
-
-       return &nescq->ibcq;
-}
-
-
-/**
- * nes_destroy_cq - issue the CQP DestroyCQ request and free the CQ resources
- */
-static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
-{
-       struct nes_cq *nescq;
-       struct nes_device *nesdev;
-       struct nes_vnic *nesvnic;
-       struct nes_adapter *nesadapter;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       unsigned long flags;
-       u32 opcode = 0;
-       int ret;
-
-       if (ib_cq == NULL)
-               return 0;
-
-       nescq = to_nescq(ib_cq);
-       nesvnic = to_nesvnic(ib_cq->device);
-       nesdev = nesvnic->nesdev;
-       nesadapter = nesdev->nesadapter;
-
-       nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number);
-
-       /* Send DestroyCQ request to CQP */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-       opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
-       spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-       if (nescq->virtual_cq == 1) {
-               nesadapter->free_256pbl++;
-               if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
-                       printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n",
-                                       __func__, nesadapter->free_256pbl, nesadapter->max_256pbl);
-               }
-       } else if (nescq->virtual_cq == 2) {
-               nesadapter->free_4kpbl++;
-               if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
-                       printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n",
-                                       __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl);
-               }
-               opcode |= NES_CQP_CQ_4KB_CHUNK;
-       }
-
-       spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
-               (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
-       if (!nescq->mcrqf)
-               nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
-                       nescq->hw_cq.cq_number);
-       ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
-                       " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                       nescq->hw_cq.cq_number, ret, cqp_request->major_code,
-                       cqp_request->minor_code);
-       if (!ret) {
-               nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
-                                       nescq->hw_cq.cq_number);
-               ret = -ETIME;
-       } else if (cqp_request->major_code) {
-               nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
-                                       nescq->hw_cq.cq_number);
-               ret = -EIO;
-       } else {
-               ret = 0;
-       }
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if (nescq->cq_mem_size)
-               pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
-                                   nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
-       kfree(nescq);
-
-       return ret;
-}
-
-/**
- * root_256 - rebuild a root PBL so each 4K leaf is referenced as sixteen 256-byte PBLs
- */
-static u32 root_256(struct nes_device *nesdev,
-                   struct nes_root_vpbl *root_vpbl,
-                   struct nes_root_vpbl *new_root,
-                   u16 pbl_count_4k)
-{
-       u64 leaf_pbl;
-       int i, j, k;
-
-       if (pbl_count_4k == 1) {
-               new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
-                                               512, &new_root->pbl_pbase);
-
-               if (new_root->pbl_vbase == NULL)
-                       return 0;
-
-               leaf_pbl = (u64)root_vpbl->pbl_pbase;
-               for (i = 0; i < 16; i++) {
-                       new_root->pbl_vbase[i].pa_low =
-                               cpu_to_le32((u32)leaf_pbl);
-                       new_root->pbl_vbase[i].pa_high =
-                               cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
-                       leaf_pbl += 256;
-               }
-       } else {
-               for (i = 3; i >= 0; i--) {
-                       j = i * 16;
-                       root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
-                       leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
-                           (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
-                               << 32);
-                       for (k = 1; k < 16; k++) {
-                               leaf_pbl += 256;
-                               root_vpbl->pbl_vbase[j + k].pa_low =
-                                               cpu_to_le32((u32)leaf_pbl);
-                               root_vpbl->pbl_vbase[j + k].pa_high =
-                                   cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
-                       }
-               }
-       }
-
-       return 1;
-}
-
-
-/**
- * nes_reg_mr - post a CQP Register STag request, choosing 256-byte or 4K PBLs
- */
-static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
-               u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
-               dma_addr_t single_buffer, u16 pbl_count_4k,
-               u16 residual_page_count_4k, int acc, u64 *iova_start,
-               u16 *actual_pbl_cnt, u8 *used_4k_pbls)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       unsigned long flags;
-       int ret;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       uint pg_cnt = 0;
-       u16 pbl_count_256 = 0;
-       u16 pbl_count = 0;
-       u8  use_256_pbls = 0;
-       u8  use_4k_pbls = 0;
-       u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
-       struct nes_root_vpbl new_root = { 0, NULL, NULL };
-       u32 opcode = 0;
-       u16 major_code;
-
-       /* Register the region with the adapter */
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       if (pbl_count_4k) {
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-
-               pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
-               pbl_count_256 = (pg_cnt + 31) / 32;
-               if (pg_cnt <= 32) {
-                       if (pbl_count_256 <= nesadapter->free_256pbl)
-                               use_256_pbls = 1;
-                       else if (pbl_count_4k <= nesadapter->free_4kpbl)
-                               use_4k_pbls = 1;
-               } else if (pg_cnt <= 2048) {
-                       if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
-                           (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
-                               use_4k_pbls = 1;
-                       } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
-                               use_256_pbls = 1;
-                               use_two_level = 1;
-                       } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
-                               use_4k_pbls = 1;
-                       }
-               } else {
-                       if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
-                               use_4k_pbls = 1;
-               }
-
-               if (use_256_pbls) {
-                       pbl_count = pbl_count_256;
-                       nesadapter->free_256pbl -= pbl_count + use_two_level;
-               } else if (use_4k_pbls) {
-                       pbl_count =  pbl_count_4k;
-                       nesadapter->free_4kpbl -= pbl_count + use_two_level;
-               } else {
-                       spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-                       nes_debug(NES_DBG_MR, "Out of Pbls\n");
-                       nes_free_cqp_request(nesdev, cqp_request);
-                       return -ENOMEM;
-               }
-
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-       }
-
-       if (use_256_pbls && use_two_level) {
-               if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) {
-                       if (new_root.pbl_pbase != 0)
-                               root_vpbl = &new_root;
-               } else {
-                       spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-                       nesadapter->free_256pbl += pbl_count_256 + use_two_level;
-                       use_256_pbls = 0;
-
-                       if (pbl_count_4k == 1)
-                               use_two_level = 0;
-                       pbl_count = pbl_count_4k;
-
-                       if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
-                               nesadapter->free_4kpbl -= pbl_count + use_two_level;
-                               use_4k_pbls = 1;
-                       }
-                       spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-
-                       if (use_4k_pbls == 0)
-                               return -ENOMEM;
-               }
-       }
-
-       opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
-                                       NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
-       if (acc & IB_ACCESS_LOCAL_WRITE)
-               opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE;
-       if (acc & IB_ACCESS_REMOTE_WRITE)
-               opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN;
-       if (acc & IB_ACCESS_REMOTE_READ)
-               opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN;
-       if (acc & IB_ACCESS_MW_BIND)
-               opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN;
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length);
-
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
-                       cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
-       cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
-                       cpu_to_le32(nespd->pd_id & 0x00007fff);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
-
-       if (pbl_count == 0) {
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer);
-       } else {
-               set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
-
-               if (use_4k_pbls)
-                       cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
-       }
-       barrier();
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
-                       " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                       stag, ret, cqp_request->major_code, cqp_request->minor_code);
-       major_code = cqp_request->major_code;
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if ((!ret || major_code) && pbl_count != 0) {
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               if (use_256_pbls)
-                       nesadapter->free_256pbl += pbl_count + use_two_level;
-               else if (use_4k_pbls)
-                       nesadapter->free_4kpbl += pbl_count + use_two_level;
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-       }
-       if (new_root.pbl_pbase)
-               pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
-                                   new_root.pbl_pbase);
-
-       if (!ret)
-               return -ETIME;
-       else if (major_code)
-               return -EIO;
-
-       *actual_pbl_cnt = pbl_count + use_two_level;
-       *used_4k_pbls = use_4k_pbls;
-       return 0;
-}
-
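The densest logic in the removed nes_reg_mr() above is the PBL (physical buffer list) accounting: the adapter keeps separate pools of 256-byte and 4 KB PBL blocks, picks one size per registration, and charges an extra block when a second (root) level is needed. Below is a minimal stand-alone sketch of just the sizing arithmetic, assuming 8-byte PBL entries (32 per 256-byte block, 512 per 4 KB block, which matches the pg_cnt * 8 length programmed into the WQE above); pbl_plan() is an illustrative name, not a driver function.

#include <stdio.h>

#define PBL_ENTRY_SIZE   8                        /* pa_low + pa_high */
#define ENTRIES_PER_256  (256 / PBL_ENTRY_SIZE)   /* 32 entries  */
#define ENTRIES_PER_4K   (4096 / PBL_ENTRY_SIZE)  /* 512 entries */

/* Sketch only: how many leaf blocks each PBL size would need for pg_cnt
 * pages, and whether a root block (two-level layout) would be required. */
static void pbl_plan(unsigned int pg_cnt)
{
        unsigned int count_256 = (pg_cnt + ENTRIES_PER_256 - 1) / ENTRIES_PER_256;
        unsigned int count_4k  = (pg_cnt + ENTRIES_PER_4K - 1) / ENTRIES_PER_4K;

        printf("%5u pages: %3u x 256B (%s), %2u x 4KB (%s)\n", pg_cnt,
               count_256, count_256 > 1 ? "two-level" : "single level",
               count_4k,  count_4k  > 1 ? "two-level" : "single level");
}

int main(void)
{
        pbl_plan(16);    /* fits a single 256-byte block */
        pbl_plan(300);   /* several 256B leaves plus a root, or one 4KB block */
        pbl_plan(4096);  /* large region: 4KB blocks behind a root block */
        return 0;
}

The free_256pbl/free_4kpbl comparisons above then decide which of those plans the adapter can actually afford, and the same counters are credited back if the CQP register request times out or fails.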
-
-/**
- * nes_reg_phys_mr
- */
-struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
-               int acc, u64 *iova_start)
-{
-       u64 region_length;
-       struct nes_pd *nespd = to_nespd(ib_pd);
-       struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_mr *nesmr;
-       struct ib_mr *ibmr;
-       struct nes_vpbl vpbl;
-       struct nes_root_vpbl root_vpbl;
-       u32 stag;
-       unsigned long mask;
-       u32 stag_index = 0;
-       u32 next_stag_index = 0;
-       u32 driver_key = 0;
-       int err = 0;
-       int ret = 0;
-       u16 pbl_count = 0;
-       u8 single_page = 1;
-       u8 stag_key = 0;
-
-       region_length = 0;
-       vpbl.pbl_vbase = NULL;
-       root_vpbl.pbl_vbase = NULL;
-       root_vpbl.pbl_pbase = 0;
-
-       get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-       stag_key = (u8)next_stag_index;
-
-       driver_key = 0;
-
-       next_stag_index >>= 8;
-       next_stag_index %= nesadapter->max_mr;
-
-       if ((addr ^ *iova_start) & ~PAGE_MASK)
-               return ERR_PTR(-EINVAL);
-
-       err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
-                       &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR);
-       if (err) {
-               return ERR_PTR(err);
-       }
-
-       nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-       if (!nesmr) {
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       /* Allocate a 4K buffer for the PBL */
-       vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-                       &vpbl.pbl_pbase);
-       nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
-                       vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
-       if (!vpbl.pbl_vbase) {
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               ibmr = ERR_PTR(-ENOMEM);
-               kfree(nesmr);
-               goto reg_phys_err;
-       }
-
-
-       mask = !size;
-
-       if (mask & ~PAGE_MASK) {
-               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-               nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
-               ibmr = ERR_PTR(-EINVAL);
-               kfree(nesmr);
-               goto reg_phys_err;
-       }
-
-       region_length += size;
-       vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK);
-       vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32)));
-
-       stag = stag_index << 8;
-       stag |= driver_key;
-       stag += (u32)stag_key;
-
-       nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX,"
-                       " length = 0x%016lX, index = 0x%08X\n",
-                       stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
-
-       /* Make the leaf PBL the root if only one PBL */
-       root_vpbl.pbl_pbase = vpbl.pbl_pbase;
-
-       if (single_page) {
-               pbl_count = 0;
-       } else {
-               pbl_count = 1;
-       }
-       ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
-                       addr, pbl_count, 1, acc, iova_start,
-                       &nesmr->pbls_used, &nesmr->pbl_4k);
-
-       if (ret == 0) {
-               nesmr->ibmr.rkey = stag;
-               nesmr->ibmr.lkey = stag;
-               nesmr->mode = IWNES_MEMREG_TYPE_MEM;
-               ibmr = &nesmr->ibmr;
-       } else {
-               kfree(nesmr);
-               ibmr = ERR_PTR(-ENOMEM);
-       }
-
-reg_phys_err:
-       /* single PBL case */
-       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
-       return ibmr;
-}
-
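Both registration paths build the memory key the same way: the allocated MR index sits above the low byte, a driver key is OR-ed into the top bits (masked with 0x70000000 in nes_reg_user_mr below), and a random 8-bit key lands in the low byte; nes_dereg_mr later recovers the index with (rkey & 0x0fffff00) >> 8. A small self-contained sketch of that packing; make_stag() and stag_to_index() are illustrative names, not driver functions.

#include <stdint.h>
#include <stdio.h>

/* Sketch of the STag layout used above: index above the low byte,
 * driver key in the top bits, 8-bit consumer key in the low byte. */
static uint32_t make_stag(uint32_t stag_index, uint32_t driver_key, uint8_t stag_key)
{
        uint32_t stag = stag_index << 8;
        stag |= driver_key;
        stag += (uint32_t)stag_key;
        return stag;
}

/* How nes_dereg_mr recovers the resource index from the rkey. */
static uint32_t stag_to_index(uint32_t rkey)
{
        return (rkey & 0x0fffff00) >> 8;
}

int main(void)
{
        uint32_t stag = make_stag(0x1234, 0x30000000, 0xab);

        printf("stag  = 0x%08x\n", stag);                 /* 0x301234ab */
        printf("index = 0x%05x\n", stag_to_index(stag));  /* 0x01234    */
        return 0;
}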
-
-/**
- * nes_get_dma_mr
- */
-static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       u64 kva = 0;
-
-       nes_debug(NES_DBG_MR, "\n");
-
-       return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
-}
-
-/**
- * nes_reg_user_mr
- */
-static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-               u64 virt, int acc, struct ib_udata *udata)
-{
-       u64 iova_start;
-       __le64 *pbl;
-       u64 region_length;
-       dma_addr_t last_dma_addr = 0;
-       dma_addr_t first_dma_addr = 0;
-       struct nes_pd *nespd = to_nespd(pd);
-       struct nes_vnic *nesvnic = to_nesvnic(pd->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct ib_mr *ibmr = ERR_PTR(-EINVAL);
-       struct sg_dma_page_iter dma_iter;
-       struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
-               udata, struct nes_ucontext, ibucontext);
-       struct nes_pbl *nespbl;
-       struct nes_mr *nesmr;
-       struct ib_umem *region;
-       struct nes_mem_reg_req req;
-       struct nes_vpbl vpbl;
-       struct nes_root_vpbl root_vpbl;
-       int page_index;
-       int page_count = 0;
-       int err, pbl_depth = 0;
-       int ret;
-       u32 stag;
-       u32 stag_index = 0;
-       u32 next_stag_index;
-       u32 driver_key;
-       u32 root_pbl_index = 0;
-       u32 cur_pbl_index = 0;
-       u32 skip_pages;
-       u16 pbl_count;
-       u8 single_page = 1;
-       u8 stag_key;
-
-       region = ib_umem_get(udata, start, length, acc, 0);
-       if (IS_ERR(region)) {
-               return (struct ib_mr *)region;
-       }
-
-       nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
-                       " offset = %u, page size = %lu.\n",
-                       (unsigned long int)start, (unsigned long int)virt, (u32)length,
-                       ib_umem_offset(region), BIT(region->page_shift));
-
-       skip_pages = ((u32)ib_umem_offset(region)) >> 12;
-
-       if (ib_copy_from_udata(&req, udata, sizeof(req))) {
-               ib_umem_release(region);
-               return ERR_PTR(-EFAULT);
-       }
-       nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type);
-
-       switch (req.reg_type) {
-               case IWNES_MEMREG_TYPE_MEM:
-                       pbl_depth = 0;
-                       region_length = 0;
-                       vpbl.pbl_vbase = NULL;
-                       root_vpbl.pbl_vbase = NULL;
-                       root_vpbl.pbl_pbase = 0;
-
-                       get_random_bytes(&next_stag_index, sizeof(next_stag_index));
-                       stag_key = (u8)next_stag_index;
-
-                       driver_key = next_stag_index & 0x70000000;
-
-                       next_stag_index >>= 8;
-                       next_stag_index %= nesadapter->max_mr;
-
-                       err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
-                                       nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR);
-                       if (err) {
-                               ib_umem_release(region);
-                               return ERR_PTR(err);
-                       }
-
-                       nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-                       if (!nesmr) {
-                               ib_umem_release(region);
-                               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                               return ERR_PTR(-ENOMEM);
-                       }
-                       nesmr->region = region;
-
-                       for_each_sg_dma_page (region->sg_head.sgl, &dma_iter, region->nmap, 0) {
-
-                               region_length += PAGE_SIZE;
-                               region_length -= skip_pages << 12;
-                               skip_pages = 0;
-                               if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
-                                       goto enough_pages;
-                               if ((page_count & 0x01FF) == 0) {
-                                       if (page_count >= 1024 * 512) {
-                                               ib_umem_release(region);
-                                               nes_free_resource(nesadapter,
-                                                                 nesadapter->allocated_mrs, stag_index);
-                                               kfree(nesmr);
-                                               ibmr = ERR_PTR(-E2BIG);
-                                               goto reg_user_mr_err;
-                                       }
-                                       if (root_pbl_index == 1) {
-                                               root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
-                                                               8192, &root_vpbl.pbl_pbase);
-                                               nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
-                                                         root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
-                                               if (!root_vpbl.pbl_vbase) {
-                                                       ib_umem_release(region);
-                                                       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-                                                                           vpbl.pbl_pbase);
-                                                       nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-                                                                         stag_index);
-                                                       kfree(nesmr);
-                                                       ibmr = ERR_PTR(-ENOMEM);
-                                                       goto reg_user_mr_err;
-                                               }
-                                               root_vpbl.leaf_vpbl = kcalloc(1024,
-                                                                             sizeof(*root_vpbl.leaf_vpbl),
-                                                                             GFP_KERNEL);
-                                               if (!root_vpbl.leaf_vpbl) {
-                                                       ib_umem_release(region);
-                                                       pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-                                                                           root_vpbl.pbl_pbase);
-                                                       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-                                                                           vpbl.pbl_pbase);
-                                                       nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-                                                                         stag_index);
-                                                       kfree(nesmr);
-                                                       ibmr = ERR_PTR(-ENOMEM);
-                                                       goto reg_user_mr_err;
-                                               }
-                                               root_vpbl.pbl_vbase[0].pa_low =
-                                                               cpu_to_le32((u32)vpbl.pbl_pbase);
-                                               root_vpbl.pbl_vbase[0].pa_high =
-                                                               cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-                                               root_vpbl.leaf_vpbl[0] = vpbl;
-                                       }
-                                       vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-                                                       &vpbl.pbl_pbase);
-                                       nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
-                                                 vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
-                                       if (!vpbl.pbl_vbase) {
-                                               ib_umem_release(region);
-                                               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                                               ibmr = ERR_PTR(-ENOMEM);
-                                               kfree(nesmr);
-                                               goto reg_user_mr_err;
-                                       }
-                                       if (1 <= root_pbl_index) {
-                                               root_vpbl.pbl_vbase[root_pbl_index].pa_low =
-                                                               cpu_to_le32((u32)vpbl.pbl_pbase);
-                                               root_vpbl.pbl_vbase[root_pbl_index].pa_high =
-                                                               cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-                                               root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
-                                       }
-                                       root_pbl_index++;
-                                       cur_pbl_index = 0;
-                               }
-                               if (single_page) {
-                                       if (page_count != 0) {
-                                               if ((last_dma_addr + 4096) != sg_page_iter_dma_address(&dma_iter))
-                                                       single_page = 0;
-                                               last_dma_addr = sg_page_iter_dma_address(&dma_iter);
-                                       } else {
-                                               first_dma_addr = sg_page_iter_dma_address(&dma_iter);
-                                               last_dma_addr = first_dma_addr;
-                                       }
-                               }
-
-                               vpbl.pbl_vbase[cur_pbl_index].pa_low =
-                                               cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter)));
-                               vpbl.pbl_vbase[cur_pbl_index].pa_high =
-                                               cpu_to_le32((u32)((u64)(sg_page_iter_dma_address(&dma_iter))));
-                               cur_pbl_index++;
-                               page_count++;
-                       }
-
-enough_pages:
-                       nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
-                                       " stag_key=0x%08x\n",
-                                       stag_index, driver_key, stag_key);
-                       stag = stag_index << 8;
-                       stag |= driver_key;
-                       stag += (u32)stag_key;
-
-                       iova_start = virt;
-                       /* Make the leaf PBL the root if only one PBL */
-                       if (root_pbl_index == 1) {
-                               root_vpbl.pbl_pbase = vpbl.pbl_pbase;
-                       }
-
-                       if (single_page) {
-                               pbl_count = 0;
-                       } else {
-                               pbl_count = root_pbl_index;
-                               first_dma_addr = 0;
-                       }
-                       nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X,"
-                                       " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n",
-                                       stag, (unsigned int)iova_start,
-                                       (unsigned int)region_length, stag_index,
-                                       (unsigned long long)region->length, pbl_count);
-                       ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
-                                        first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
-                                        &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
-
-                       nes_debug(NES_DBG_MR, "ret=%d\n", ret);
-
-                       if (ret == 0) {
-                               nesmr->ibmr.rkey = stag;
-                               nesmr->ibmr.lkey = stag;
-                               nesmr->mode = IWNES_MEMREG_TYPE_MEM;
-                               ibmr = &nesmr->ibmr;
-                       } else {
-                               ib_umem_release(region);
-                               kfree(nesmr);
-                               ibmr = ERR_PTR(-ENOMEM);
-                       }
-
-reg_user_mr_err:
-                       /* free the resources */
-                       if (root_pbl_index == 1) {
-                               pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-                                               vpbl.pbl_pbase);
-                       } else {
-                               for (page_index=0; page_index<root_pbl_index; page_index++) {
-                                       pci_free_consistent(nesdev->pcidev, 4096,
-                                                       root_vpbl.leaf_vpbl[page_index].pbl_vbase,
-                                                       root_vpbl.leaf_vpbl[page_index].pbl_pbase);
-                               }
-                               kfree(root_vpbl.leaf_vpbl);
-                               pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-                                               root_vpbl.pbl_pbase);
-                       }
-
-                       nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr);
-
-                       return ibmr;
-               case IWNES_MEMREG_TYPE_QP:
-               case IWNES_MEMREG_TYPE_CQ:
-                       if (!region->length) {
-                               nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n");
-                               ib_umem_release(region);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL);
-                       if (!nespbl) {
-                               ib_umem_release(region);
-                               return ERR_PTR(-ENOMEM);
-                       }
-                       nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
-                       if (!nesmr) {
-                               ib_umem_release(region);
-                               kfree(nespbl);
-                               return ERR_PTR(-ENOMEM);
-                       }
-                       nesmr->region = region;
-                       pbl_depth = region->length >> 12;
-                       pbl_depth += (region->length & (4096-1)) ? 1 : 0;
-                       nespbl->pbl_size = pbl_depth*sizeof(u64);
-                       if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
-                               nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory");
-                       } else {
-                               nes_debug(NES_DBG_MR, "Attempting to allocate CQ PBL memory");
-                       }
-
-                       nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n",
-                                       nespbl->pbl_size, pbl_depth);
-                       pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size,
-                                       &nespbl->pbl_pbase);
-                       if (!pbl) {
-                               ib_umem_release(region);
-                               kfree(nesmr);
-                               kfree(nespbl);
-                               nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n");
-                               return ERR_PTR(-ENOMEM);
-                       }
-
-                       nespbl->pbl_vbase = (u64 *)pbl;
-                       nespbl->user_base = start;
-                       nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx,"
-                                       " pbl_vbase=%p user_base=0x%lx\n",
-                                 nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
-                                 (void *) nespbl->pbl_vbase, nespbl->user_base);
-
-                       nespbl->page = sg_page(region->sg_head.sgl);
-                       for_each_sg_dma_page(region->sg_head.sgl, &dma_iter, region->nmap, 0) {
-                               ((__le32 *)pbl)[0] = cpu_to_le32((u32)(sg_page_iter_dma_address(&dma_iter)));
-                               ((__le32 *)pbl)[1] = cpu_to_le32(((u64)(sg_page_iter_dma_address(&dma_iter)))>>32);
-                               nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
-                                         (unsigned long long)*pbl,
-                                         le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
-                               pbl++;
-                       }
-
-                       if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
-                               list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);
-                       } else {
-                               list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list);
-                       }
-                       nesmr->ibmr.rkey = -1;
-                       nesmr->ibmr.lkey = -1;
-                       nesmr->mode = req.reg_type;
-                       return &nesmr->ibmr;
-       }
-
-       ib_umem_release(region);
-       return ERR_PTR(-ENOSYS);
-}
-
-
-/**
- * nes_dereg_mr
- */
-static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
-{
-       struct nes_mr *nesmr = to_nesmr(ib_mr);
-       struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       struct nes_cqp_request *cqp_request;
-       unsigned long flags;
-       int ret;
-       u16 major_code;
-       u16 minor_code;
-
-
-       if (nesmr->pages)
-               pci_free_consistent(nesdev->pcidev,
-                                   nesmr->max_pages * sizeof(u64),
-                                   nesmr->pages,
-                                   nesmr->paddr);
-
-       if (nesmr->region) {
-               ib_umem_release(nesmr->region);
-       }
-       if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) {
-               kfree(nesmr);
-               return 0;
-       }
-
-       /* Deallocate the region with the adapter */
-
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       cqp_request->waiting = 1;
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                       NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
-                       NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
-       ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-                       NES_EVENT_TIMEOUT);
-       nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u,"
-                       " CQP Major:Minor codes = 0x%04X:0x%04X\n",
-                       ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
-
-       major_code = cqp_request->major_code;
-       minor_code = cqp_request->minor_code;
-
-       nes_put_cqp_request(nesdev, cqp_request);
-
-       if (!ret) {
-               nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
-                               " ib_mr=%p, rkey = 0x%08X\n",
-                               ib_mr, ib_mr->rkey);
-               return -ETIME;
-       } else if (major_code) {
-               nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting"
-                               " to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
-                               major_code, minor_code, ib_mr, ib_mr->rkey);
-               return -EIO;
-       }
-
-       if (nesmr->pbls_used != 0) {
-               spin_lock_irqsave(&nesadapter->pbl_lock, flags);
-               if (nesmr->pbl_4k) {
-                       nesadapter->free_4kpbl += nesmr->pbls_used;
-                       if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
-                               printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
-                                       "exceeded the max(%u)\n",
-                                       nesadapter->free_4kpbl,
-                                       nesadapter->max_4kpbl);
-               } else {
-                       nesadapter->free_256pbl += nesmr->pbls_used;
-                       if (nesadapter->free_256pbl > nesadapter->max_256pbl)
-                               printk(KERN_ERR PFX "free 256B PBLs(%u) has "
-                                       "exceeded the max(%u)\n",
-                                       nesadapter->free_256pbl,
-                                       nesadapter->max_256pbl);
-               }
-               spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
-       }
-       nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-                       (ib_mr->rkey & 0x0fffff00) >> 8);
-
-       kfree(nesmr);
-
-       return 0;
-}
-
-
-/**
- * show_rev
- */
-static ssize_t hw_rev_show(struct device *dev,
-                          struct device_attribute *attr, char *buf)
-{
-       struct nes_ib_device *nesibdev =
-               rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev);
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-       nes_debug(NES_DBG_INIT, "\n");
-       return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev);
-}
-static DEVICE_ATTR_RO(hw_rev);
-
-/**
- * show_hca
- */
-static ssize_t hca_type_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
-{
-       nes_debug(NES_DBG_INIT, "\n");
-       return sprintf(buf, "NES020\n");
-}
-static DEVICE_ATTR_RO(hca_type);
-
-/**
- * show_board
- */
-static ssize_t board_id_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
-{
-       nes_debug(NES_DBG_INIT, "\n");
-       return sprintf(buf, "%.*s\n", 32, "NES020 Board ID");
-}
-static DEVICE_ATTR_RO(board_id);
-
-static struct attribute *nes_dev_attributes[] = {
-       &dev_attr_hw_rev.attr,
-       &dev_attr_hca_type.attr,
-       &dev_attr_board_id.attr,
-       NULL
-};
-
-static const struct attribute_group nes_attr_group = {
-       .attrs = nes_dev_attributes,
-};
-
-/**
- * nes_query_qp
- */
-static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-               int attr_mask, struct ib_qp_init_attr *init_attr)
-{
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-
-       nes_debug(NES_DBG_QP, "\n");
-
-       attr->qp_access_flags = 0;
-       attr->cap.max_send_wr = nesqp->hwqp.sq_size;
-       attr->cap.max_recv_wr = nesqp->hwqp.rq_size;
-       attr->cap.max_recv_sge = 1;
-       if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA)
-               attr->cap.max_inline_data = 0;
-       else
-               attr->cap.max_inline_data = 64;
-
-       init_attr->event_handler = nesqp->ibqp.event_handler;
-       init_attr->qp_context = nesqp->ibqp.qp_context;
-       init_attr->send_cq = nesqp->ibqp.send_cq;
-       init_attr->recv_cq = nesqp->ibqp.recv_cq;
-       init_attr->srq = nesqp->ibqp.srq;
-       init_attr->cap = attr->cap;
-
-       return 0;
-}
-
-
-/**
- * nes_hw_modify_qp
- */
-int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
-               u32 next_iwarp_state, u32 termlen, u32 wait_completion)
-{
-       struct nes_hw_cqp_wqe *cqp_wqe;
-       /* struct iw_cm_id *cm_id = nesqp->cm_id; */
-       /* struct iw_cm_event cm_event; */
-       struct nes_cqp_request *cqp_request;
-       int ret;
-       u16 major_code;
-
-       nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n",
-                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-
-       cqp_request = nes_get_cqp_request(nesdev);
-       if (cqp_request == NULL) {
-               nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n");
-               return -ENOMEM;
-       }
-       if (wait_completion) {
-               cqp_request->waiting = 1;
-       } else {
-               cqp_request->waiting = 0;
-       }
-       cqp_wqe = &cqp_request->cqp_wqe;
-
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
-                       NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state);
-       nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n",
-                       next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]));
-       nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
-       set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
-       set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
-
-       /* If sending a terminate message, fill in the length (in words) */
-       if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) &&
-           !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) {
-               termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT;
-               set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen);
-       }
-
-       atomic_set(&cqp_request->refcount, 2);
-       nes_post_cqp_request(nesdev, cqp_request);
-
-       /* Wait for CQP */
-       if (wait_completion) {
-               /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n",
-                               nesqp->hwqp.qp_id); */
-               ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
-                               NES_EVENT_TIMEOUT);
-               nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, "
-                               "CQP Major:Minor codes = 0x%04X:0x%04X.\n",
-                               nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code);
-               major_code = cqp_request->major_code;
-               if (major_code) {
-                       nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed"
-                                       "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n",
-                                       nesqp->hwqp.qp_id, cqp_request->major_code,
-                                       cqp_request->minor_code, next_iwarp_state);
-               }
-
-               nes_put_cqp_request(nesdev, cqp_request);
-
-               if (!ret)
-                       return -ETIME;
-               else if (major_code)
-                       return -EIO;
-               else
-                       return 0;
-       } else {
-               return 0;
-       }
-}
-
-
-/**
- * nes_modify_qp
- */
-int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-               int attr_mask, struct ib_udata *udata)
-{
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-       struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       /* u32 cqp_head; */
-       /* u32 counter; */
-       u32 next_iwarp_state = 0;
-       int err;
-       unsigned long qplockflags;
-       int ret;
-       u16 original_last_aeq;
-       u8 issue_modify_qp = 0;
-       u8 dont_wait = 0;
-
-       nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u,"
-                       " iwarp_state=0x%X, refcount=%d\n",
-                       nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state,
-                       nesqp->iwarp_state, atomic_read(&nesqp->refcount));
-
-       spin_lock_irqsave(&nesqp->lock, qplockflags);
-
-       nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X,"
-                       " QP Access Flags=0x%X, attr_mask = 0x%0x\n",
-                       nesqp->hwqp.qp_id, nesqp->hw_iwarp_state,
-                       nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask);
-
-       if (attr_mask & IB_QP_STATE) {
-               switch (attr->qp_state) {
-                       case IB_QPS_INIT:
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n",
-                                               nesqp->hwqp.qp_id);
-                               if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
-                               issue_modify_qp = 1;
-                               break;
-                       case IB_QPS_RTR:
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n",
-                                               nesqp->hwqp.qp_id);
-                               if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
-                               issue_modify_qp = 1;
-                               break;
-                       case IB_QPS_RTS:
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n",
-                                               nesqp->hwqp.qp_id);
-                               if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               if (nesqp->cm_id == NULL) {
-                                       nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n",
-                                                       nesqp->hwqp.qp_id );
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS;
-                               if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS)
-                                       next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID |
-                                                       NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID;
-                               issue_modify_qp = 1;
-                               nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED;
-                               nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS;
-                               nesqp->hte_added = 1;
-                               break;
-                       case IB_QPS_SQD:
-                               issue_modify_qp = 1;
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n",
-                                               nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail);
-                               if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return 0;
-                               } else {
-                                       if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
-                                               nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
-                                                               " ignored due to current iWARP state\n",
-                                                               nesqp->hwqp.qp_id);
-                                               spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                               return -EINVAL;
-                                       }
-                                       if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) {
-                                               nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
-                                                               " already done based on hw state.\n",
-                                                               nesqp->hwqp.qp_id);
-                                               issue_modify_qp = 0;
-                                       }
-                                       switch (nesqp->hw_iwarp_state) {
-                                               case NES_AEQE_IWARP_STATE_CLOSING:
-                                                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-                                                       break;
-                                               case NES_AEQE_IWARP_STATE_TERMINATE:
-                                                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
-                                                       break;
-                                               case NES_AEQE_IWARP_STATE_ERROR:
-                                                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-                                                       break;
-                                               default:
-                                                       next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-                                                       nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-                                                       break;
-                                       }
-                               }
-                               break;
-                       case IB_QPS_SQE:
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n",
-                                               nesqp->hwqp.qp_id);
-                               if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
-                               nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
-                               issue_modify_qp = 1;
-                               break;
-                       case IB_QPS_ERR:
-                       case IB_QPS_RESET:
-                               if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       return -EINVAL;
-                               }
-                               nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n",
-                                               nesqp->hwqp.qp_id);
-                               if (nesqp->term_flags)
-                                       del_timer(&nesqp->terminate_timer);
-
-                               next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
-                               /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
-                               if (nesqp->hte_added) {
-                                       nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n");
-                                       next_iwarp_state |= NES_CQP_QP_DEL_HTE;
-                                       nesqp->hte_added = 0;
-                               }
-                               if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) &&
-                                               (nesdev->iw_status) &&
-                                               (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) {
-                                       next_iwarp_state |= NES_CQP_QP_RESET;
-                               } else {
-                                       nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n",
-                                                       nesqp->hwqp.qp_id, nesqp->hw_tcp_state);
-                                       dont_wait = 1;
-                               }
-                               issue_modify_qp = 1;
-                               nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
-                               break;
-                       default:
-                               spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                               return -EINVAL;
-                               break;
-               }
-
-               nesqp->ibqp_state = attr->qp_state;
-               nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
-               nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
-                               nesqp->iwarp_state);
-       }
-
-       if (attr_mask & IB_QP_ACCESS_FLAGS) {
-               if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) {
-                       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
-                                       NES_QPCONTEXT_MISC_RDMA_READ_EN);
-                       issue_modify_qp = 1;
-               }
-               if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
-                       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN);
-                       issue_modify_qp = 1;
-               }
-               if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
-                       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN);
-                       issue_modify_qp = 1;
-               }
-               if (attr->qp_access_flags & IB_ACCESS_MW_BIND) {
-                       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN);
-                       issue_modify_qp = 1;
-               }
-
-               if (nesqp->user_mode) {
-                       nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
-                                       NES_QPCONTEXT_MISC_RDMA_READ_EN);
-                       issue_modify_qp = 1;
-               }
-       }
-
-       original_last_aeq = nesqp->last_aeq;
-       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-
-       nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp);
-
-       ret = 0;
-
-
-       if (issue_modify_qp) {
-               nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n");
-               ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1);
-               if (ret)
-                       nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)"
-                                       " failed for QP%u.\n",
-                                       next_iwarp_state, nesqp->hwqp.qp_id);
-
-       }
-
-       if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) {
-               nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d),"
-                               " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-                               nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                               original_last_aeq, nesqp->last_aeq);
-               if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
-                       if (dont_wait) {
-                               if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {
-                                       nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d),"
-                                                       " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-                                                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                                                       original_last_aeq, nesqp->last_aeq);
-                                       /* this one is for the cm_disconnect thread */
-                                       spin_lock_irqsave(&nesqp->lock, qplockflags);
-                                       nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-                                       nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       nes_cm_disconn(nesqp);
-                               } else {
-                                       nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n",
-                                                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-                               }
-                       } else {
-                               spin_lock_irqsave(&nesqp->lock, qplockflags);
-                               if (nesqp->cm_id) {
-                                       /* These two are for the timer thread */
-                                       if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
-                                               nesqp->cm_id->add_ref(nesqp->cm_id);
-                                               nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
-                                                               " need ae to finish up, original_last_aeq = 0x%04X."
-                                                               " last_aeq = 0x%04X, scheduling timer.\n",
-                                                               nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                                                               original_last_aeq, nesqp->last_aeq);
-                                               schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0);
-                                       }
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                               } else {
-                                       spin_unlock_irqrestore(&nesqp->lock, qplockflags);
-                                       nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
-                                                       " need ae to finish up, original_last_aeq = 0x%04X."
-                                                       " last_aeq = 0x%04X.\n",
-                                                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                                                       original_last_aeq, nesqp->last_aeq);
-                               }
-                       }
-               } else {
-                       nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
-                                       " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-                                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                                       original_last_aeq, nesqp->last_aeq);
-               }
-       } else {
-               nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
-                               " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
-                               nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
-                               original_last_aeq, nesqp->last_aeq);
-       }
-
-       err = 0;
-
-       nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n",
-                       nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
-
-       return err;
-}
-
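Stripped of the flag handling and legality checks, the large switch in nes_modify_qp() above reduces to a small IB-to-iWARP state mapping. The restatement below is hypothetical: the enums and values are placeholders, not the driver's NES_CQP_QP_IWARP_STATE_* encodings, and the real code also folds in CONTEXT/ARP/ORD valid bits, DEL_HTE and RESET flags, and rejects backwards transitions.

#include <stdio.h>

/* Placeholder enums, for illustration only. */
enum qps   { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_SQD, QPS_SQE, QPS_ERR };
enum iwarp { IWARP_IDLE, IWARP_RTS, IWARP_CLOSING, IWARP_TERMINATE, IWARP_ERROR };

static enum iwarp next_iwarp_state(enum qps new_ib_state)
{
        switch (new_ib_state) {
        case QPS_INIT:
        case QPS_RTR:
                return IWARP_IDLE;      /* both map to the idle iWARP state */
        case QPS_RTS:
                return IWARP_RTS;
        case QPS_SQD:
                return IWARP_CLOSING;   /* "closing", further adjusted by the hw state above */
        case QPS_SQE:
                return IWARP_TERMINATE;
        case QPS_ERR:
        case QPS_RESET:
        default:
                return IWARP_ERROR;     /* the driver itself rejects unknown states with -EINVAL */
        }
}

int main(void)
{
        printf("SQD -> %d (closing)\n", next_iwarp_state(QPS_SQD));
        return 0;
}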
-static inline void
-fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, const struct ib_send_wr *ib_wr,
-                u32 uselkey)
-{
-       int sge_index;
-       int total_payload_length = 0;
-       for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) {
-               set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
-                       ib_wr->sg_list[sge_index].addr);
-               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4),
-                       ib_wr->sg_list[sge_index].length);
-               if (uselkey)
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4),
-                                               (ib_wr->sg_list[sge_index].lkey));
-               else
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0);
-
-               total_payload_length += ib_wr->sg_list[sge_index].length;
-       }
-       nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n",
-                       total_payload_length);
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-                               total_payload_length);
-}
-
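A few lines below, nes_post_send() rejects a work request once the send queue ring already holds qsize - 1 entries (one slot is left unused so head never catches tail). A stand-alone restatement of that overflow test; sq_would_overflow() is an illustrative name, not a driver function.

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the check in nes_post_send below: assuming head and
 * tail are already reduced modulo qsize, (head + 2*qsize - tail) % qsize
 * is the number of occupied slots, and the ring counts as full at qsize - 1. */
static int sq_would_overflow(uint32_t head, uint32_t tail, uint32_t qsize)
{
        return ((head + 2 * qsize - tail) % qsize) == (qsize - 1);
}

int main(void)
{
        printf("%d\n", sq_would_overflow(0, 1, 128));  /* 1: 127 slots in use, full */
        printf("%d\n", sq_would_overflow(5, 5, 128));  /* 0: ring empty             */
        printf("%d\n", sq_would_overflow(9, 5, 128));  /* 0: 4 slots in use         */
        return 0;
}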
-/**
- * nes_post_send
- */
-static int nes_post_send(struct ib_qp *ibqp, const struct ib_send_wr *ib_wr,
-                        const struct ib_send_wr **bad_wr)
-{
-       u64 u64temp;
-       unsigned long flags = 0;
-       struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-       struct nes_hw_qp_wqe *wqe;
-       int err = 0;
-       u32 qsize = nesqp->hwqp.sq_size;
-       u32 head;
-       u32 wqe_misc = 0;
-       u32 wqe_count = 0;
-       u32 counter;
-
-       if (nesqp->ibqp_state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-
-       head = nesqp->hwqp.sq_head;
-
-       while (ib_wr) {
-               /* Check for QP error */
-               if (nesqp->term_flags) {
-                       err = -EINVAL;
-                       break;
-               }
-
-               /* Check for SQ overflow */
-               if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
-                       err = -ENOMEM;
-                       break;
-               }
-
-               wqe = &nesqp->hwqp.sq_vbase[head];
-               /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n",
-                               nesqp->hwqp.qp_id, wqe, head); */
-               nes_fill_init_qp_wqe(wqe, nesqp, head);
-               u64temp = (u64)(ib_wr->wr_id);
-               set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
-                                       u64temp);
-               switch (ib_wr->opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_INV:
-                       if (IB_WR_SEND == ib_wr->opcode) {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       wqe_misc = NES_IWARP_SQ_OP_SENDSE;
-                               else
-                                       wqe_misc = NES_IWARP_SQ_OP_SEND;
-                       } else {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       wqe_misc = NES_IWARP_SQ_OP_SENDSEINV;
-                               else
-                                       wqe_misc = NES_IWARP_SQ_OP_SENDINV;
-
-                               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
-                                                   ib_wr->ex.invalidate_rkey);
-                       }
-
-                       if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-                               err = -EINVAL;
-                               break;
-                       }
-
-                       if (ib_wr->send_flags & IB_SEND_FENCE)
-                               wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
-                       if ((ib_wr->send_flags & IB_SEND_INLINE) &&
-                           ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
-                            (ib_wr->sg_list[0].length <= 64)) {
-                               memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
-                                      (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
-                               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-                                                   ib_wr->sg_list[0].length);
-                               wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
-                       } else {
-                               fill_wqe_sg_send(wqe, ib_wr, 1);
-                       }
-
-                       break;
-               case IB_WR_RDMA_WRITE:
-                       wqe_misc = NES_IWARP_SQ_OP_RDMAW;
-                       if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-                               nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n",
-                                         ib_wr->num_sge, nesdev->nesadapter->max_sge);
-                               err = -EINVAL;
-                               break;
-                       }
-
-                       if (ib_wr->send_flags & IB_SEND_FENCE)
-                               wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
-                                           rdma_wr(ib_wr)->rkey);
-                       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
-                                           rdma_wr(ib_wr)->remote_addr);
-
-                       if ((ib_wr->send_flags & IB_SEND_INLINE) &&
-                           ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
-                            (ib_wr->sg_list[0].length <= 64)) {
-                               memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
-                                      (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
-                               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
-                                                   ib_wr->sg_list[0].length);
-                               wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
-                       } else {
-                               fill_wqe_sg_send(wqe, ib_wr, 1);
-                       }
-
-                       wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] =
-                               wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX];
-                       break;
-               case IB_WR_RDMA_READ:
-               case IB_WR_RDMA_READ_WITH_INV:
-                       /* iWARP only supports 1 sge for RDMA reads */
-                       if (ib_wr->num_sge > 1) {
-                               nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n",
-                                         ib_wr->num_sge);
-                               err = -EINVAL;
-                               break;
-                       }
-                       if (ib_wr->opcode == IB_WR_RDMA_READ) {
-                               wqe_misc = NES_IWARP_SQ_OP_RDMAR;
-                       } else {
-                               wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV;
-                               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
-                                                   ib_wr->ex.invalidate_rkey);
-                       }
-
-                       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
-                                           rdma_wr(ib_wr)->remote_addr);
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
-                                           rdma_wr(ib_wr)->rkey);
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX,
-                                           ib_wr->sg_list->length);
-                       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
-                                           ib_wr->sg_list->addr);
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX,
-                                           ib_wr->sg_list->lkey);
-                       break;
-               case IB_WR_LOCAL_INV:
-                       wqe_misc = NES_IWARP_SQ_OP_LOCINV;
-                       set_wqe_32bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX,
-                                           ib_wr->ex.invalidate_rkey);
-                       break;
-               case IB_WR_REG_MR:
-               {
-                       struct nes_mr *mr = to_nesmr(reg_wr(ib_wr)->mr);
-                       int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size);
-                       int flags = reg_wr(ib_wr)->access;
-
-                       if (mr->npages > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) {
-                               nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n");
-                               err = -EINVAL;
-                               break;
-                       }
-                       wqe_misc = NES_IWARP_SQ_OP_FAST_REG;
-                       set_wqe_64bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX,
-                                           mr->ibmr.iova);
-                       set_wqe_32bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX,
-                                           lower_32_bits(mr->ibmr.length));
-                       set_wqe_32bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0);
-                       set_wqe_32bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX,
-                                           reg_wr(ib_wr)->key);
-
-                       if (page_shift == 12) {
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K;
-                       } else if (page_shift == 21) {
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M;
-                       } else {
-                               nes_debug(NES_DBG_IW_TX, "Invalid page shift,"
-                                         " ib_wr=%u, max=1\n", ib_wr->num_sge);
-                               err = -EINVAL;
-                               break;
-                       }
-
-                       /* Set access_flags */
-                       wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ;
-                       if (flags & IB_ACCESS_LOCAL_WRITE)
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE;
-
-                       if (flags & IB_ACCESS_REMOTE_WRITE)
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE;
-
-                       if (flags & IB_ACCESS_REMOTE_READ)
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ;
-
-                       if (flags & IB_ACCESS_MW_BIND)
-                               wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND;
-
-                       /* Fill in PBL info: */
-                       set_wqe_64bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX,
-                                           mr->paddr);
-
-                       set_wqe_32bit_value(wqe->wqe_words,
-                                           NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX,
-                                           mr->npages * 8);
-
-                       nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, "
-                                 "length: %lld, rkey: %0x, pgl_paddr: %llx, "
-                                 "page_list_len: %u, wqe_misc: %x\n",
-                                 (unsigned long long) mr->ibmr.iova,
-                                 mr->ibmr.length,
-                                 reg_wr(ib_wr)->key,
-                                 (unsigned long long) mr->paddr,
-                                 mr->npages,
-                                 wqe_misc);
-                       break;
-               }
-               default:
-                       /* error */
-                       err = -EINVAL;
-                       break;
-               }
-
-               if (err)
-                       break;
-
-               if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all)
-                       wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
-
-               wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc);
-
-               ib_wr = ib_wr->next;
-               head++;
-               wqe_count++;
-               if (head >= qsize)
-                       head = 0;
-
-       }
-
-       nesqp->hwqp.sq_head = head;
-       barrier();
-       while (wqe_count) {
-               counter = min(wqe_count, ((u32)255));
-               wqe_count -= counter;
-               nes_write32(nesdev->regs + NES_WQE_ALLOC,
-                               (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-       }
-
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-
-/**
- * nes_post_recv
- */
-static int nes_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr,
-                        const struct ib_recv_wr **bad_wr)
-{
-       u64 u64temp;
-       unsigned long flags = 0;
-       struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-       struct nes_hw_qp_wqe *wqe;
-       int err = 0;
-       int sge_index;
-       u32 qsize = nesqp->hwqp.rq_size;
-       u32 head;
-       u32 wqe_count = 0;
-       u32 counter;
-       u32 total_payload_length;
-
-       if (nesqp->ibqp_state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-
-       head = nesqp->hwqp.rq_head;
-
-       while (ib_wr) {
-               /* Check for QP error */
-               if (nesqp->term_flags) {
-                       err = -EINVAL;
-                       break;
-               }
-
-               if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
-                       err = -EINVAL;
-                       break;
-               }
-               /* Check for RQ overflow */
-               if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
-                       err = -ENOMEM;
-                       break;
-               }
-
-               nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge);
-               wqe = &nesqp->hwqp.rq_vbase[head];
-
-               /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n",
-                               nesqp->hwqp.qp_id, wqe, head); */
-               nes_fill_init_qp_wqe(wqe, nesqp, head);
-               u64temp = (u64)(ib_wr->wr_id);
-               set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
-                                       u64temp);
-               total_payload_length = 0;
-               for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) {
-                       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
-                                       ib_wr->sg_list[sge_index].addr);
-                       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4),
-                                       ib_wr->sg_list[sge_index].length);
-                       set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4),
-                                       ib_wr->sg_list[sge_index].lkey);
-
-                       total_payload_length += ib_wr->sg_list[sge_index].length;
-               }
-               set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX,
-                                       total_payload_length);
-
-               ib_wr = ib_wr->next;
-               head++;
-               wqe_count++;
-               if (head >= qsize)
-                       head = 0;
-       }
-
-       nesqp->hwqp.rq_head = head;
-       barrier();
-       while (wqe_count) {
-               counter = min(wqe_count, ((u32)255));
-               wqe_count -= counter;
-               nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id);
-       }
-
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-/**
- * nes_drain_sq - drain sq
- * @ibqp: pointer to ibqp
- */
-static void nes_drain_sq(struct ib_qp *ibqp)
-{
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-
-       if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)
-               wait_for_completion(&nesqp->sq_drained);
-}
-
-/**
- * nes_drain_rq - drain rq
- * @ibqp: pointer to ibqp
- */
-static void nes_drain_rq(struct ib_qp *ibqp)
-{
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-
-       if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)
-               wait_for_completion(&nesqp->rq_drained);
-}
-
-/**
- * nes_poll_cq
- */
-static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-       u64 u64temp;
-       u64 wrid;
-       unsigned long flags = 0;
-       struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_cq *nescq = to_nescq(ibcq);
-       struct nes_qp *nesqp;
-       struct nes_hw_cqe cqe;
-       u32 head;
-       u32 wq_tail = 0;
-       u32 cq_size;
-       u32 cqe_count = 0;
-       u32 wqe_index;
-       u32 u32temp;
-       u32 move_cq_head = 1;
-       u32 err_code;
-
-       nes_debug(NES_DBG_CQ, "\n");
-
-       spin_lock_irqsave(&nescq->lock, flags);
-
-       head = nescq->hw_cq.cq_head;
-       cq_size = nescq->hw_cq.cq_size;
-
-       while (cqe_count < num_entries) {
-               if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) &
-                               NES_CQE_VALID) == 0)
-                       break;
-
-               /*
-                * Make sure we read CQ entry contents *after*
-                * we've checked the valid bit.
-                */
-               rmb();
-
-               cqe = nescq->hw_cq.cq_vbase[head];
-               u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-               wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1);
-               u32temp &= ~(NES_SW_CONTEXT_ALIGN-1);
-               /* parse CQE, get completion context from WQE (either rq or sq) */
-               u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
-                               ((u64)u32temp);
-
-               if (u64temp) {
-                       nesqp = (struct nes_qp *)(unsigned long)u64temp;
-                       memset(entry, 0, sizeof *entry);
-                       if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) {
-                               entry->status = IB_WC_SUCCESS;
-                       } else {
-                               err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]);
-                               if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) {
-                                       entry->status = err_code & 0x0000ffff;
-
-                                       /* The rest of the cqe's will be marked as flushed */
-                                       nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] =
-                                               cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) |
-                                                           NES_IWARP_CQE_MINOR_FLUSH);
-                               } else
-                                       entry->status = IB_WC_WR_FLUSH_ERR;
-                       }
-
-                       entry->qp = &nesqp->ibqp;
-                       entry->src_qp = nesqp->hwqp.qp_id;
-
-                       if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) {
-                               if (nesqp->skip_lsmm) {
-                                       nesqp->skip_lsmm = 0;
-                                       nesqp->hwqp.sq_tail++;
-                               }
-
-                               /* Working on a SQ Completion*/
-                               wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
-                                               wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) |
-                                               ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
-                                               wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX])));
-                               entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-                                               wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]);
-
-                               switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-                                               wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) {
-                                       case NES_IWARP_SQ_OP_RDMAW:
-                                               nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n");
-                                               entry->opcode = IB_WC_RDMA_WRITE;
-                                               break;
-                                       case NES_IWARP_SQ_OP_RDMAR:
-                                               nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n");
-                                               entry->opcode = IB_WC_RDMA_READ;
-                                               entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
-                                                               wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]);
-                                               break;
-                                       case NES_IWARP_SQ_OP_SENDINV:
-                                       case NES_IWARP_SQ_OP_SENDSEINV:
-                                       case NES_IWARP_SQ_OP_SEND:
-                                       case NES_IWARP_SQ_OP_SENDSE:
-                                               nes_debug(NES_DBG_CQ, "Operation = Send.\n");
-                                               entry->opcode = IB_WC_SEND;
-                                               break;
-                                       case NES_IWARP_SQ_OP_LOCINV:
-                                               entry->opcode = IB_WC_LOCAL_INV;
-                                               break;
-                                       case NES_IWARP_SQ_OP_FAST_REG:
-                                               entry->opcode = IB_WC_REG_MR;
-                                               break;
-                               }
-
-                               nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1);
-                               if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) {
-                                       move_cq_head = 0;
-                                       wq_tail = nesqp->hwqp.sq_tail;
-                               }
-                       } else {
-                               /* Working on a RQ Completion*/
-                               entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]);
-                               wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) |
-                                       ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32);
-                               entry->opcode = IB_WC_RECV;
-
-                               nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1);
-                               if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) {
-                                       move_cq_head = 0;
-                                       wq_tail = nesqp->hwqp.rq_tail;
-                               }
-                       }
-
-                       if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) {
-                               if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head)
-                                       complete(&nesqp->sq_drained);
-                               if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head)
-                                       complete(&nesqp->rq_drained);
-                       }
-
-                       entry->wr_id = wrid;
-                       entry++;
-                       cqe_count++;
-               }
-
-               if (move_cq_head) {
-                       nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
-                       if (++head >= cq_size)
-                               head = 0;
-                       nescq->polled_completions++;
-
-                       if ((nescq->polled_completions > (cq_size / 2)) ||
-                                       (nescq->polled_completions == 255)) {
-                               nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes"
-                                       " are pending %u of %u.\n",
-                                       nescq->hw_cq.cq_number, nescq->polled_completions, cq_size);
-                               nes_write32(nesdev->regs+NES_CQE_ALLOC,
-                                       nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
-                               nescq->polled_completions = 0;
-                       }
-               } else {
-                       /* Update the wqe index and set status to flush */
-                       wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
-                       wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail;
-                       nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] =
-                               cpu_to_le32(wqe_index);
-                       move_cq_head = 1; /* ready for next pass */
-               }
-       }
-
-       if (nescq->polled_completions) {
-               nes_write32(nesdev->regs+NES_CQE_ALLOC,
-                               nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
-               nescq->polled_completions = 0;
-       }
-
-       nescq->hw_cq.cq_head = head;
-       nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n",
-                       cqe_count, nescq->hw_cq.cq_number);
-
-       spin_unlock_irqrestore(&nescq->lock, flags);
-
-       return cqe_count;
-}
-
-
-/**
- * nes_req_notify_cq
- */
-static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-               {
-       struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_cq *nescq = to_nescq(ibcq);
-       u32 cq_arm;
-
-       nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n",
-                       nescq->hw_cq.cq_number);
-
-       cq_arm = nescq->hw_cq.cq_number;
-       if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
-               cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT;
-       else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-               cq_arm |= NES_CQE_ALLOC_NOTIFY_SE;
-       else
-               return -EINVAL;
-
-       nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm);
-       nes_read32(nesdev->regs+NES_CQE_ALLOC);
-
-       return 0;
-}
-
-static int nes_port_immutable(struct ib_device *ibdev, u8 port_num,
-                             struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-       err = nes_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-
-       return 0;
-}
-
-static void get_dev_fw_str(struct ib_device *dev, char *str)
-{
-       struct nes_ib_device *nesibdev =
-                       container_of(dev, struct nes_ib_device, ibdev);
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-       nes_debug(NES_DBG_INIT, "\n");
-       snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u",
-                (nesvnic->nesdev->nesadapter->firmware_version >> 16),
-                (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
-}
-
-static const struct ib_device_ops nes_dev_ops = {
-       .alloc_mr = nes_alloc_mr,
-       .alloc_mw = nes_alloc_mw,
-       .alloc_pd = nes_alloc_pd,
-       .alloc_ucontext = nes_alloc_ucontext,
-       .create_cq = nes_create_cq,
-       .create_qp = nes_create_qp,
-       .dealloc_mw = nes_dealloc_mw,
-       .dealloc_pd = nes_dealloc_pd,
-       .dealloc_ucontext = nes_dealloc_ucontext,
-       .dereg_mr = nes_dereg_mr,
-       .destroy_cq = nes_destroy_cq,
-       .destroy_qp = nes_destroy_qp,
-       .drain_rq = nes_drain_rq,
-       .drain_sq = nes_drain_sq,
-       .get_dev_fw_str = get_dev_fw_str,
-       .get_dma_mr = nes_get_dma_mr,
-       .get_port_immutable = nes_port_immutable,
-       .iw_accept = nes_accept,
-       .iw_add_ref = nes_add_ref,
-       .iw_connect = nes_connect,
-       .iw_create_listen = nes_create_listen,
-       .iw_destroy_listen = nes_destroy_listen,
-       .iw_get_qp = nes_get_qp,
-       .iw_reject = nes_reject,
-       .iw_rem_ref = nes_rem_ref,
-       .map_mr_sg = nes_map_mr_sg,
-       .mmap = nes_mmap,
-       .modify_qp = nes_modify_qp,
-       .poll_cq = nes_poll_cq,
-       .post_recv = nes_post_recv,
-       .post_send = nes_post_send,
-       .query_device = nes_query_device,
-       .query_gid = nes_query_gid,
-       .query_pkey = nes_query_pkey,
-       .query_port = nes_query_port,
-       .query_qp = nes_query_qp,
-       .reg_user_mr = nes_reg_user_mr,
-       .req_notify_cq = nes_req_notify_cq,
-       INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd),
-       INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext),
-};
-
-/**
- * nes_init_ofa_device
- */
-struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
-{
-       struct nes_ib_device *nesibdev;
-       struct nes_vnic *nesvnic = netdev_priv(netdev);
-       struct nes_device *nesdev = nesvnic->nesdev;
-
-       nesibdev = ib_alloc_device(nes_ib_device, ibdev);
-       if (nesibdev == NULL) {
-               return NULL;
-       }
-       nesibdev->ibdev.owner = THIS_MODULE;
-
-       nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
-       memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
-       memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6);
-
-       nesibdev->ibdev.uverbs_cmd_mask =
-                       (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-                       (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-                       (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-                       (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-                       (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-                       (1ull << IB_USER_VERBS_CMD_REG_MR) |
-                       (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-                       (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-                       (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-                       (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
-                       (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-                       (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-                       (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-                       (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-                       (1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
-                       (1ull << IB_USER_VERBS_CMD_BIND_MW) |
-                       (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
-                       (1ull << IB_USER_VERBS_CMD_POST_RECV) |
-                       (1ull << IB_USER_VERBS_CMD_POST_SEND);
-
-       nesibdev->ibdev.phys_port_cnt = 1;
-       nesibdev->ibdev.num_comp_vectors = 1;
-       nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
-
-       ib_set_device_ops(&nesibdev->ibdev, &nes_dev_ops);
-       memcpy(nesibdev->ibdev.iw_ifname, netdev->name,
-              sizeof(nesibdev->ibdev.iw_ifname));
-
-       return nesibdev;
-}
-
-
-/**
- * nes_handle_delayed_event
- */
-static void nes_handle_delayed_event(struct timer_list *t)
-{
-       struct nes_vnic *nesvnic = from_timer(nesvnic, t, event_timer);
-
-       if (nesvnic->delayed_event != nesvnic->last_dispatched_event) {
-               struct ib_event event;
-
-               event.device = &nesvnic->nesibdev->ibdev;
-               if (!event.device)
-                       goto stop_timer;
-               event.event = nesvnic->delayed_event;
-               event.element.port_num = nesvnic->logical_port + 1;
-               ib_dispatch_event(&event);
-       }
-
-stop_timer:
-       nesvnic->event_timer.function = NULL;
-}
-
-
-void  nes_port_ibevent(struct nes_vnic *nesvnic)
-{
-       struct nes_ib_device *nesibdev = nesvnic->nesibdev;
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct ib_event event;
-       event.device = &nesibdev->ibdev;
-       event.element.port_num = nesvnic->logical_port + 1;
-       event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
-
-       if (!nesvnic->event_timer.function) {
-               ib_dispatch_event(&event);
-               nesvnic->last_dispatched_event = event.event;
-               nesvnic->event_timer.function = nes_handle_delayed_event;
-               nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
-               add_timer(&nesvnic->event_timer);
-       } else {
-               mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY);
-       }
-       nesvnic->delayed_event = event.event;
-}
-
-
-/**
- * nes_destroy_ofa_device
- */
-void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
-{
-       if (nesibdev == NULL)
-               return;
-
-       nes_unregister_ofa_device(nesibdev);
-
-       ib_dealloc_device(&nesibdev->ibdev);
-}
-
-
-/**
- * nes_register_ofa_device
- */
-int nes_register_ofa_device(struct nes_ib_device *nesibdev)
-{
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-       struct nes_device *nesdev = nesvnic->nesdev;
-       struct nes_adapter *nesadapter = nesdev->nesadapter;
-       int ret;
-
-       rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group);
-       nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES;
-       ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d");
-       if (ret) {
-               return ret;
-       }
-
-       /* Get the resources allocated to this device */
-       nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count;
-       nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count;
-       nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count;
-       nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count;
-
-       nesvnic->of_device_registered = 1;
-
-       return 0;
-}
-
-
-/**
- * nes_unregister_ofa_device
- */
-static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev)
-{
-       struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
-       if (nesvnic->of_device_registered)
-               ib_unregister_device(&nesibdev->ibdev);
-
-       nesvnic->of_device_registered = 0;
-}
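The posting paths deleted above share a fixed-size ring discipline: a work queue is treated as full when advancing head would leave it one slot short of tail, and newly posted WQEs are reported to the hardware in doorbell writes that carry at most 255 entries each. The following is a minimal user-space sketch of that arithmetic, not driver code; the helper names are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the ((head + 2*qsize - tail) % qsize) == (qsize - 1) test used in
 * nes_post_send()/nes_post_recv(): adding 2*qsize keeps the sum non-negative
 * for unsigned arithmetic, and one slot is always left unused so that a full
 * ring can be told apart from an empty one. */
static int ring_full(uint32_t head, uint32_t tail, uint32_t qsize)
{
	return ((head + 2 * qsize - tail) % qsize) == (qsize - 1);
}

/* The WQE_ALLOC doorbell carries the count in an 8-bit field (counter << 24),
 * so a large batch of posted WQEs is flushed in chunks of at most 255,
 * as in the loops above. */
static void ring_doorbells(uint32_t wqe_count)
{
	while (wqe_count) {
		uint32_t counter = wqe_count < 255 ? wqe_count : 255;

		wqe_count -= counter;
		printf("doorbell write: %u WQEs\n", (unsigned)counter);
	}
}

int main(void)
{
	printf("full=%d\n", ring_full(255, 0, 256));	/* 1: only the reserved slot is free */
	ring_doorbells(600);				/* three writes: 255, 255, 90 */
	return 0;
}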
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
deleted file mode 100644 (file)
index 114a9b5..0000000
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef NES_VERBS_H
-#define NES_VERBS_H
-
-struct nes_device;
-
-#define NES_MAX_USER_DB_REGIONS  4096
-#define NES_MAX_USER_WQ_REGIONS  4096
-
-#define NES_TERM_SENT            0x01
-#define NES_TERM_RCVD            0x02
-#define NES_TERM_DONE            0x04
-
-struct nes_ucontext {
-       struct ib_ucontext ibucontext;
-       struct nes_device  *nesdev;
-       unsigned long      mmap_wq_offset;
-       unsigned long      mmap_cq_offset; /* to be removed */
-       int                index;               /* rnic index (minor) */
-       unsigned long      allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)];
-       u16                mmap_db_index[NES_MAX_USER_DB_REGIONS];
-       u16                first_free_db;
-       unsigned long      allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)];
-       struct nes_qp      *mmap_nesqp[NES_MAX_USER_WQ_REGIONS];
-       u16                first_free_wq;
-       struct list_head   cq_reg_mem_list;
-       struct list_head   qp_reg_mem_list;
-       u32                mcrqf;
-};
-
-struct nes_pd {
-       struct ib_pd ibpd;
-       u16          pd_id;
-       atomic_t     sqp_count;
-       u16          mmap_db_index;
-};
-
-struct nes_mr {
-       union {
-               struct ib_mr  ibmr;
-               struct ib_mw  ibmw;
-               struct ib_fmr ibfmr;
-       };
-       struct ib_umem    *region;
-       u16               pbls_used;
-       u8                mode;
-       u8                pbl_4k;
-       __le64            *pages;
-       dma_addr_t        paddr;
-       u32               max_pages;
-       u32               npages;
-};
-
-struct nes_hw_pb {
-       __le32 pa_low;
-       __le32 pa_high;
-};
-
-struct nes_vpbl {
-       dma_addr_t       pbl_pbase;
-       struct nes_hw_pb *pbl_vbase;
-};
-
-struct nes_root_vpbl {
-       dma_addr_t       pbl_pbase;
-       struct nes_hw_pb *pbl_vbase;
-       struct nes_vpbl  *leaf_vpbl;
-};
-
-struct nes_fmr {
-       struct nes_mr        nesmr;
-       u32                  leaf_pbl_cnt;
-       struct nes_root_vpbl root_vpbl;
-       struct ib_qp         *ib_qp;
-       int                  access_rights;
-       struct ib_fmr_attr   attr;
-};
-
-struct nes_av;
-
-struct nes_cq {
-       struct ib_cq     ibcq;
-       struct nes_hw_cq hw_cq;
-       u32              polled_completions;
-       u32              cq_mem_size;
-       spinlock_t       lock;
-       u8               virtual_cq;
-       u8               pad[3];
-       u32              mcrqf;
-};
-
-struct nes_wq {
-       spinlock_t lock;
-};
-
-struct disconn_work {
-       struct work_struct    work;
-       struct nes_qp         *nesqp;
-};
-
-struct iw_cm_id;
-struct ietf_mpa_frame;
-
-struct nes_qp {
-       struct ib_qp          ibqp;
-       void                  *allocated_buffer;
-       struct iw_cm_id       *cm_id;
-       struct nes_cq         *nesscq;
-       struct nes_cq         *nesrcq;
-       struct nes_pd         *nespd;
-       void *cm_node; /* handle of the node this QP is associated with */
-       void                  *ietf_frame;
-       u8                    ietf_frame_size;
-       dma_addr_t            ietf_frame_pbase;
-       struct ib_mr          *lsmm_mr;
-       struct nes_hw_qp      hwqp;
-       struct work_struct    work;
-       enum ib_qp_state      ibqp_state;
-       u32                   iwarp_state;
-       u32                   hte_index;
-       u32                   last_aeq;
-       u32                   qp_mem_size;
-       atomic_t              refcount;
-       atomic_t              close_timer_started;
-       u32                   mmap_sq_db_index;
-       u32                   mmap_rq_db_index;
-       spinlock_t            lock;
-       spinlock_t            pau_lock;
-       struct nes_qp_context *nesqp_context;
-       dma_addr_t            nesqp_context_pbase;
-       void                  *pbl_vbase;
-       dma_addr_t            pbl_pbase;
-       struct page           *page;
-       struct timer_list     terminate_timer;
-       enum ib_event_type    terminate_eventtype;
-       struct sk_buff_head   pau_list;
-       u32                   pau_rcv_nxt;
-       u16                   active_conn:1;
-       u16                   skip_lsmm:1;
-       u16                   user_mode:1;
-       u16                   hte_added:1;
-       u16                   flush_issued:1;
-       u16                   destroyed:1;
-       u16                   sig_all:1;
-       u16                   pau_mode:1;
-       u16                   rsvd:8;
-       u16                   private_data_len;
-       u16                   term_sq_flush_code;
-       u16                   term_rq_flush_code;
-       u8                    hw_iwarp_state;
-       u8                    hw_tcp_state;
-       u8                    term_flags;
-       u8                    sq_kmapped;
-       u8                    pau_busy;
-       u8                    pau_pending;
-       u8                    pau_state;
-       __u64                 nesuqp_addr;
-       struct completion     sq_drained;
-       struct completion     rq_drained;
-};
-
-struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
-               u64 addr, u64 size, int acc, u64 *iova_start);
-
-#endif                 /* NES_VERBS_H */
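Throughout the removed header, each driver object embeds its ib_* counterpart (struct nes_cq wraps ib_cq, struct nes_pd wraps ib_pd, struct nes_qp wraps ib_qp), and helpers such as to_nescq()/to_nesqp() recover the outer structure with container_of(); the INIT_RDMA_OBJ_SIZE() entries in nes_dev_ops above declare how large those containers are. Below is a self-contained sketch of that embedding pattern with invented structure names, not kernel code.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct ib_cq_like { int cqe; };			/* stand-in for struct ib_cq */

struct drv_cq {					/* stand-in for struct nes_cq */
	struct ib_cq_like ibcq;			/* embedded core-visible object */
	unsigned int polled_completions;	/* driver-private state */
};

static struct drv_cq *to_drv_cq(struct ib_cq_like *ibcq)
{
	return container_of(ibcq, struct drv_cq, ibcq);
}

int main(void)
{
	struct drv_cq cq = { .ibcq = { .cqe = 16 }, .polled_completions = 3 };
	struct ib_cq_like *core_view = &cq.ibcq;

	/* The core passes around core_view only; the driver gets its own
	 * state back by subtracting the member offset. */
	printf("cqe=%d polled=%u\n",
	       core_view->cqe, to_drv_cq(core_view)->polled_completions);
	return 0;
}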
index 5127e2ea4bdd2d510153ae06c81f28b72f52e88b..d82d3ec3649ea0cfa9ea337b0b89ac948f65b11d 100644 (file)
@@ -1351,7 +1351,6 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev)
        mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa);
        mqe->u.nonemb_req.sge[0].len = dma.size;
 
-       memset(dma.va, 0, dma.size);
        ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va,
                        OCRDMA_CMD_GET_CTRL_ATTRIBUTES,
                        OCRDMA_SUBSYS_COMMON,
@@ -1690,7 +1689,6 @@ static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev)
                goto mem_err_ah;
        dev->av_tbl.pa = pa;
        dev->av_tbl.num_ah = max_ah;
-       memset(dev->av_tbl.va, 0, dev->av_tbl.size);
 
        pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va;
        for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) {
@@ -1888,14 +1886,13 @@ mem_err:
        return status;
 }
 
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
 {
-       int status = -ENOMEM;
        struct ocrdma_destroy_cq *cmd;
 
        cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd));
        if (!cmd)
-               return status;
+               return;
        ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ,
                        OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
 
@@ -1903,11 +1900,10 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
            (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &
            OCRDMA_DESTROY_CQ_QID_MASK;
 
-       status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+       ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
        ocrdma_unbind_eq(dev, cq->eqn);
        dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa);
        kfree(cmd);
-       return status;
 }
 
 int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
@@ -2905,7 +2901,6 @@ static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
        mqe_sge->pa_hi = (u32) upper_32_bits(pa);
        mqe_sge->len = cmd.hdr.pyld_len;
 
-       memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req));
        ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG,
                        OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len);
        req->param_type = ptype;
index 06ec59326a90fdce71716463074a998973d561ad..12c23a7652b96f7f8fb416e14873c10e98f99db4 100644 (file)
@@ -122,7 +122,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
                        u32 pd_id, int acc);
 int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
                                int entries, int dpp_cq, u16 pd_id);
-int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
+void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq);
 
 int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
                         u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
index fc6c0962dea987f40e49bd5e62af45625af9ae6f..c15cfc6cef81ac2850804e595fb06775e7d671ae 100644 (file)
@@ -144,6 +144,10 @@ static const struct attribute_group ocrdma_attr_group = {
 };
 
 static const struct ib_device_ops ocrdma_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_OCRDMA,
+       .uverbs_abi_ver = OCRDMA_ABI_VERSION,
+
        .alloc_mr = ocrdma_alloc_mr,
        .alloc_pd = ocrdma_alloc_pd,
        .alloc_ucontext = ocrdma_alloc_ucontext,
@@ -178,6 +182,7 @@ static const struct ib_device_ops ocrdma_dev_ops = {
        .resize_cq = ocrdma_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, ocrdma_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
 };
@@ -200,8 +205,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
        BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
        memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
               sizeof(OCRDMA_NODE_DESC));
-       dev->ibdev.owner = THIS_MODULE;
-       dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
        dev->ibdev.uverbs_cmd_mask =
            OCRDMA_UVERBS(GET_CONTEXT) |
            OCRDMA_UVERBS(QUERY_DEVICE) |
@@ -249,7 +252,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
                ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops);
        }
        rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
-       dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
        ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1);
        if (ret)
                return ret;
index 35ec87015792ce16ecbec853b6a20ed402759603..bccc1137810924780a60dedc100a784edac45a9a 100644 (file)
@@ -925,8 +925,7 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
        ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
 
        /* it could be user registered memory. */
-       if (mr->umem)
-               ib_umem_release(mr->umem);
+       ib_umem_release(mr->umem);
        kfree(mr);
 
        /* Don't stop cleanup, in case FW is unresponsive */
@@ -977,12 +976,12 @@ err:
        return status;
 }
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-                              const struct ib_cq_init_attr *attr,
-                              struct ib_udata *udata)
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                    struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
-       struct ocrdma_cq *cq;
+       struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
        struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
        struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
                udata, struct ocrdma_ucontext, ibucontext);
@@ -991,16 +990,13 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
        struct ocrdma_create_cq_ureq ureq;
 
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (udata) {
                if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
-                       return ERR_PTR(-EFAULT);
+                       return -EFAULT;
        } else
                ureq.dpp_cq = 0;
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
 
        spin_lock_init(&cq->cq_lock);
        spin_lock_init(&cq->comp_handler_lock);
@@ -1011,10 +1007,9 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
                pd_id = uctx->cntxt_pd->id;
 
        status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
-       if (status) {
-               kfree(cq);
-               return ERR_PTR(status);
-       }
+       if (status)
+               return status;
+
        if (udata) {
                status = ocrdma_copy_cq_uresp(dev, cq, udata);
                if (status)
@@ -1022,12 +1017,11 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
        }
        cq->phase = OCRDMA_CQE_VALID;
        dev->cq_tbl[cq->id] = cq;
-       return &cq->ibcq;
+       return 0;
 
 ctx_err:
        ocrdma_mbx_destroy_cq(dev, cq);
-       kfree(cq);
-       return ERR_PTR(status);
+       return status;
 }
 
 int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,
@@ -1070,7 +1064,7 @@ static void ocrdma_flush_cq(struct ocrdma_cq *cq)
        spin_unlock_irqrestore(&cq->cq_lock, flags);
 }
 
-int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
        struct ocrdma_eq *eq = NULL;
@@ -1080,14 +1074,13 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
        dev->cq_tbl[cq->id] = NULL;
        indx = ocrdma_get_eq_table_index(dev, cq->eqn);
-       BUG_ON(indx == -EINVAL);
 
        eq = &dev->eq_tbl[indx];
        irq = ocrdma_get_irq(dev, eq);
        synchronize_irq(irq);
        ocrdma_flush_cq(cq);
 
-       (void)ocrdma_mbx_destroy_cq(dev, cq);
+       ocrdma_mbx_destroy_cq(dev, cq);
        if (cq->ucontext) {
                pdid = cq->ucontext->cntxt_pd->id;
                ocrdma_del_mmap(cq->ucontext, (u64) cq->pa,
@@ -1096,9 +1089,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
                                ocrdma_get_db_addr(dev, pdid),
                                dev->nic_info.db_page_size);
        }
-
-       kfree(cq);
-       return 0;
 }
 
 static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
index d76aae7ed0d36970d85f92cd76522c9c33b6f3ec..32488da1b7520f16eb934739ee99c10723553a18 100644 (file)
@@ -71,11 +71,10 @@ int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
-                              const struct ib_cq_init_attr *attr,
-                              struct ib_udata *udata);
+int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                    struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
 struct ib_qp *ocrdma_create_qp(struct ib_pd *,
                               struct ib_qp_init_attr *attrs,
index 5ebf3c53b3fbf18bbb6a9cc0ee74325cfbf025a8..533157a2a3be09e754a8425650ae9d4f5d7bb9a0 100644 (file)
@@ -183,6 +183,10 @@ static void qedr_roce_register_device(struct qedr_dev *dev)
 }
 
 static const struct ib_device_ops qedr_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_QEDR,
+       .uverbs_abi_ver = QEDR_ABI_VERSION,
+
        .alloc_mr = qedr_alloc_mr,
        .alloc_pd = qedr_alloc_pd,
        .alloc_ucontext = qedr_alloc_ucontext,
@@ -220,6 +224,7 @@ static const struct ib_device_ops qedr_dev_ops = {
        .resize_cq = qedr_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
@@ -231,8 +236,6 @@ static int qedr_register_device(struct qedr_dev *dev)
 
        dev->ibdev.node_guid = dev->attr.node_guid;
        memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC));
-       dev->ibdev.owner = THIS_MODULE;
-       dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION;
 
        dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) |
                                     QEDR_UVERBS(QUERY_DEVICE) |
@@ -274,7 +277,6 @@ static int qedr_register_device(struct qedr_dev *dev)
        rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group);
        ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
 
-       dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
        rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1);
        if (rc)
                return rc;
index 3d7bde19838e7b7c6251f493af9fbb3ab7272153..27d90a84ea01c9b8d4b9b0c96ef47a390a8cd6ea 100644 (file)
@@ -159,54 +159,47 @@ int qedr_query_device(struct ib_device *ibdev,
        return 0;
 }
 
-#define QEDR_SPEED_SDR         (1)
-#define QEDR_SPEED_DDR         (2)
-#define QEDR_SPEED_QDR         (4)
-#define QEDR_SPEED_FDR10       (8)
-#define QEDR_SPEED_FDR         (16)
-#define QEDR_SPEED_EDR         (32)
-
 static inline void get_link_speed_and_width(int speed, u8 *ib_speed,
                                            u8 *ib_width)
 {
        switch (speed) {
        case 1000:
-               *ib_speed = QEDR_SPEED_SDR;
+               *ib_speed = IB_SPEED_SDR;
                *ib_width = IB_WIDTH_1X;
                break;
        case 10000:
-               *ib_speed = QEDR_SPEED_QDR;
+               *ib_speed = IB_SPEED_QDR;
                *ib_width = IB_WIDTH_1X;
                break;
 
        case 20000:
-               *ib_speed = QEDR_SPEED_DDR;
+               *ib_speed = IB_SPEED_DDR;
                *ib_width = IB_WIDTH_4X;
                break;
 
        case 25000:
-               *ib_speed = QEDR_SPEED_EDR;
+               *ib_speed = IB_SPEED_EDR;
                *ib_width = IB_WIDTH_1X;
                break;
 
        case 40000:
-               *ib_speed = QEDR_SPEED_QDR;
+               *ib_speed = IB_SPEED_QDR;
                *ib_width = IB_WIDTH_4X;
                break;
 
        case 50000:
-               *ib_speed = QEDR_SPEED_QDR;
-               *ib_width = IB_WIDTH_4X;
+               *ib_speed = IB_SPEED_HDR;
+               *ib_width = IB_WIDTH_1X;
                break;
 
        case 100000:
-               *ib_speed = QEDR_SPEED_EDR;
+               *ib_speed = IB_SPEED_EDR;
                *ib_width = IB_WIDTH_4X;
                break;
 
        default:
                /* Unsupported */
-               *ib_speed = QEDR_SPEED_SDR;
+               *ib_speed = IB_SPEED_SDR;
                *ib_width = IB_WIDTH_1X;
        }
 }
@@ -813,20 +806,20 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
        return 0;
 }
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_udata *udata)
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                  struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        struct qedr_ucontext *ctx = rdma_udata_to_drv_context(
                udata, struct qedr_ucontext, ibucontext);
        struct qed_rdma_destroy_cq_out_params destroy_oparams;
        struct qed_rdma_destroy_cq_in_params destroy_iparams;
        struct qedr_dev *dev = get_qedr_dev(ibdev);
        struct qed_rdma_create_cq_in_params params;
-       struct qedr_create_cq_ureq ureq;
+       struct qedr_create_cq_ureq ureq = {};
        int vector = attr->comp_vector;
        int entries = attr->cqe;
-       struct qedr_cq *cq;
+       struct qedr_cq *cq = get_qedr_cq(ibcq);
        int chain_entries;
        int page_cnt;
        u64 pbl_ptr;
@@ -841,18 +834,13 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
                DP_ERR(dev,
                       "create cq: the number of entries %d is too high. Must be equal or below %d.\n",
                       entries, QEDR_MAX_CQES);
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
        chain_entries = qedr_align_cq_entries(entries);
        chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
 
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
-
        if (udata) {
-               memset(&ureq, 0, sizeof(ureq));
                if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
                        DP_ERR(dev,
                               "create cq: problem copying data from user space\n");
@@ -930,7 +918,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
                 "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n",
                 cq->icid, cq, params.cq_size);
 
-       return &cq->ibcq;
+       return 0;
 
 err3:
        destroy_iparams.icid = cq->icid;
@@ -945,8 +933,7 @@ err1:
        if (udata)
                ib_umem_release(cq->q.umem);
 err0:
-       kfree(cq);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
@@ -962,14 +949,13 @@ int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
 #define QEDR_DESTROY_CQ_MAX_ITERATIONS         (10)
 #define QEDR_DESTROY_CQ_ITER_DURATION          (10)
 
-int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct qedr_dev *dev = get_qedr_dev(ibcq->device);
        struct qed_rdma_destroy_cq_out_params oparams;
        struct qed_rdma_destroy_cq_in_params iparams;
        struct qedr_cq *cq = get_qedr_cq(ibcq);
        int iter;
-       int rc;
 
        DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid);
 
@@ -977,13 +963,10 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
        /* GSIs CQs are handled by driver, so they don't exist in the FW */
        if (cq->cq_type == QEDR_CQ_TYPE_GSI)
-               goto done;
+               return;
 
        iparams.icid = cq->icid;
-       rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
-       if (rc)
-               return rc;
-
+       dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
        dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 
        if (udata) {
@@ -1014,27 +997,11 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
                iter--;
        }
 
-       if (oparams.num_cq_notif != cq->cnq_notif)
-               goto err;
-
        /* Note that we don't need to have explicit code to wait for the
         * completion of the event handler because it is invoked from the EQ.
         * Since the destroy CQ ramrod has also been received on the EQ we can
         * be certain that there's no event handler in process.
         */
-done:
-       cq->sig = ~cq->sig;
-
-       kfree(cq);
-
-       return 0;
-
-err:
-       DP_ERR(dev,
-              "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n",
-              cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif);
-
-       return -EINVAL;
 }
 
 static inline int get_gid_info_from_table(struct ib_qp *ibqp,
@@ -1605,12 +1572,10 @@ qedr_iwarp_populate_user_qp(struct qedr_dev *dev,
 
 static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 {
-       if (qp->usq.umem)
-               ib_umem_release(qp->usq.umem);
+       ib_umem_release(qp->usq.umem);
        qp->usq.umem = NULL;
 
-       if (qp->urq.umem)
-               ib_umem_release(qp->urq.umem);
+       ib_umem_release(qp->urq.umem);
        qp->urq.umem = NULL;
 }
 
@@ -2713,8 +2678,7 @@ int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
                qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table);
 
        /* it could be user registered memory. */
-       if (mr->umem)
-               ib_umem_release(mr->umem);
+       ib_umem_release(mr->umem);
 
        kfree(mr);
 
index 9328c80375efb9c60e93c90d5d36d71f7f7c5ca1..9aaa90283d6e36cd2b92d8cfcea9bbf3dcf59af4 100644 (file)
@@ -50,11 +50,10 @@ int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
-struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_udata *udata);
+int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                  struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs,
                             struct ib_udata *);
index a81905df2d0f3d0815c1c53761ee8b7d75b4e62e..8d0563ef5be174fb5d111a2ba78f7669a524b0ca 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 - 2017 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006 - 2012 QLogic Corporation.  * All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp,
        case IB_QPT_SMI:
        case IB_QPT_GSI:
        case IB_QPT_UD:
-               ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               ah = rvt_get_swqe_ah(wqe);
                if (wqe->length > (1 << ah->log_pmtu))
                        return -EINVAL;
                /* progress hint */
index 2ac4c67f5ba1ae18948bb4e45df3b6f0991e4d38..1d5e2d4ee257b3b0d2ccb49e8d2135d5722d722b 100644 (file)
@@ -921,20 +921,11 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
                rvt_add_retry_timer(qp);
 
        while (qp->s_last != qp->s_acked) {
-               u32 s_last;
-
                wqe = rvt_get_swqe_ptr(qp, qp->s_last);
                if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
                    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
                        break;
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               rvt_put_qp_swqe(qp, wqe);
-               rvt_qp_swqe_complete(qp,
+               rvt_qp_complete_swqe(qp,
                                     wqe,
                                     ib_qib_wc_opcode[wqe->wr.opcode],
                                     IB_WC_SUCCESS);
@@ -972,21 +963,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
         * is finished.
         */
        if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
-           qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
-               u32 s_last;
-
-               rvt_put_qp_swqe(qp, wqe);
-               s_last = qp->s_last;
-               if (++s_last >= qp->s_size)
-                       s_last = 0;
-               qp->s_last = s_last;
-               /* see post_send() */
-               barrier();
-               rvt_qp_swqe_complete(qp,
+           qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0)
+               rvt_qp_complete_swqe(qp,
                                     wqe,
                                     ib_qib_wc_opcode[wqe->wr.opcode],
                                     IB_WC_SUCCESS);
-       } else
+       else
                this_cpu_inc(*ibp->rvp.rc_delayed_comp);
 
        qp->s_retry = qp->s_retry_cnt;
@@ -1909,8 +1891,7 @@ send_last:
                wc.dlid_path_bits = 0;
                wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            ib_bth_is_solicited(ohdr));
+               rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
                break;
 
        case OP(RDMA_WRITE_FIRST):
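The deleted lines above are the open-coded completion path that rvt_qp_complete_swqe() now encapsulates: bump s_last with wraparound, order that store before the completion is generated, then release the SWQE. A minimal model of the circular-index advance, in plain C with invented names, looks like this:

    #include <stdio.h>

    /* Illustrative ring of send work requests; s_size entries, s_last is
     * the oldest slot that has not yet been completed. */
    struct ring {
        unsigned int s_last;
        unsigned int s_size;
    };

    /* Advance the consumer index with wraparound, mimicking the pattern
     * the driver used to open-code before generating the completion. */
    static unsigned int ring_advance(struct ring *r)
    {
        unsigned int next = r->s_last + 1;

        if (next >= r->s_size)
            next = 0;
        /* In the kernel a barrier orders this store against the reader in
         * post_send(); this single-threaded sketch needs no barrier. */
        r->s_last = next;
        return next;
    }

    int main(void)
    {
        struct ring r = { .s_last = 2, .s_size = 3 };

        printf("next=%u\n", ring_advance(&r));  /* wraps to 0 */
        printf("next=%u\n", ring_advance(&r));  /* 1 */
        return 0;
    }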
index 30c70ad0f4bf9c04c1d39603523d1c5ef8e85adc..e17b91e2c22a91022930dc6cbbc896500bf6663a 100644 (file)
@@ -400,8 +400,7 @@ last_imm:
                wc.dlid_path_bits = 0;
                wc.port_num = 0;
                /* Signal completion event if the solicited bit is set. */
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                            ib_bth_is_solicited(ohdr));
+               rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
                break;
 
        case OP(RDMA_WRITE_FIRST):
index 5cdedba2d164ee4ba8d1f37f3fb8aa6493f7bd83..93ca21347959f734b650ac2528ec10f1c7827225 100644 (file)
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2012 - 2019 Intel Corporation.  All rights reserved.
  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -63,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        enum ib_qp_type sqptype, dqptype;
 
        rcu_read_lock();
-       qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn);
+       qp = rvt_lookup_qpn(rdi, &ibp->rvp, rvt_get_swqe_remote_qpn(swqe));
        if (!qp) {
                ibp->rvp.n_pkt_drops++;
                goto drop;
@@ -80,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
                goto drop;
        }
 
-       ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(swqe);
        ppd = ppd_from_ibp(ibp);
 
        if (qp->ibqp.qp_num > 1) {
@@ -110,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        if (qp->ibqp.qp_num) {
                u32 qkey;
 
-               qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
-                       sqp->qkey : swqe->ud_wr.remote_qkey;
+               qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+                       sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
                if (unlikely(qkey != qp->qkey))
                        goto drop;
        }
@@ -203,15 +204,14 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        wc.qp = &qp->ibqp;
        wc.src_qp = sqp->ibqp.qp_num;
        wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
-               swqe->ud_wr.pkey_index : 0;
+               rvt_get_swqe_pkey_index(swqe) : 0;
        wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) &
                                ((1 << ppd->lmc) - 1));
        wc.sl = rdma_ah_get_sl(ah_attr);
        wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
        wc.port_num = qp->port_num;
        /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    swqe->wr.send_flags & IB_SEND_SOLICITED);
+       rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
        ibp->rvp.n_loop_pkts++;
 bail_unlock:
        spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -271,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
        /* Construct the header. */
        ibp = to_iport(qp->ibqp.device, qp->port_num);
        ppd = ppd_from_ibp(ibp);
-       ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+       ah_attr = rvt_get_swqe_ah_attr(wqe);
        if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
                if (rdma_ah_get_dlid(ah_attr) !=
                                be16_to_cpu(IB_LID_PERMISSIVE))
@@ -363,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
        bth0 |= extra_bytes << 20;
        bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
                qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
-                            wqe->ud_wr.pkey_index : qp->s_pkey_index);
+                            rvt_get_swqe_pkey_index(wqe) : qp->s_pkey_index);
        ohdr->bth[0] = cpu_to_be32(bth0);
        /*
         * Use the multicast QP if the destination LID is a multicast LID.
@@ -372,14 +372,15 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
                        be16_to_cpu(IB_MULTICAST_LID_BASE) &&
                rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ?
                cpu_to_be32(QIB_MULTICAST_QPN) :
-               cpu_to_be32(wqe->ud_wr.remote_qpn);
+               cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
        ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK);
        /*
         * Qkeys with the high order bit set mean use the
         * qkey from the QP context instead of the WR (see 10.2.5).
         */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
-                                        qp->qkey : wqe->ud_wr.remote_qkey);
+       ohdr->u.ud.deth[0] =
+               cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+                           rvt_get_swqe_remote_qkey(wqe));
        ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done:
@@ -573,8 +574,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
                dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
        wc.port_num = qp->port_num;
        /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    ib_bth_is_solicited(ohdr));
+       rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
        return;
 
 drop:
index f712fb7fa82f995b2bbef7c24ddc7bb29cada4f2..bfbfbb7e0ff461299520d3d072d388bd13ab0420 100644 (file)
 static void __qib_release_user_pages(struct page **p, size_t num_pages,
                                     int dirty)
 {
-       size_t i;
-
-       for (i = 0; i < num_pages; i++) {
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
+       if (dirty)
+               put_user_pages_dirty_lock(p, num_pages);
+       else
+               put_user_pages(p, num_pages);
 }
 
 /**
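Both qib hunks in this area replace per-page loops (set_page_dirty_lock() plus put_page()) with the bulk put_user_pages helpers introduced for pinned user memory. The toy program below only mirrors the shape of that change with a fabricated page type and invented helper names; it is not the kernel API.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct page { bool dirty; int refs; };

    /* Bulk helpers standing in for put_user_pages_dirty_lock() and
     * put_user_pages(): one call releases every page, optionally marking
     * each one dirty first. */
    static void release_pages_dirty(struct page **p, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            p[i]->dirty = true;
            p[i]->refs--;
        }
    }

    static void release_pages(struct page **p, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            p[i]->refs--;
    }

    /* Caller-side shape after the conversion: pick one bulk call instead
     * of branching on 'dirty' inside a per-page loop. */
    static void release_user_pages(struct page **p, size_t n, bool dirty)
    {
        if (dirty)
            release_pages_dirty(p, n);
        else
            release_pages(p, n);
    }

    int main(void)
    {
        struct page a = { .refs = 1 }, b = { .refs = 1 };
        struct page *list[] = { &a, &b };

        release_user_pages(list, 2, true);
        printf("a: dirty=%d refs=%d\n", a.dirty, a.refs);
        return 0;
    }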
index 0c204776263f2da7911126de46a37ff7219a3fcc..05190edc2611ef7dad6af79d7a550627c84486a9 100644 (file)
@@ -317,7 +317,7 @@ static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
                 * the caller can ignore this page.
                 */
                if (put) {
-                       put_page(page);
+                       put_user_page(page);
                } else {
                        /* coalesce case */
                        kunmap(page);
@@ -631,7 +631,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev,
                        kunmap(pkt->addr[i].page);
 
                if (pkt->addr[i].put_page)
-                       put_page(pkt->addr[i].page);
+                       put_user_page(pkt->addr[i].page);
                else
                        __free_page(pkt->addr[i].page);
        } else if (pkt->addr[i].kvaddr) {
@@ -706,7 +706,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
        /* if error, return all pages not managed by pkt */
 free_pages:
        while (i < j)
-               put_page(pages[i++]);
+               put_user_page(pages[i++]);
 
 done:
        return ret;
@@ -904,10 +904,11 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                }
 
                if (frag_size) {
-                       int pktsize, tidsmsize, n;
+                       int tidsmsize, n;
+                       size_t pktsize;
 
                        n = npages*((2*PAGE_SIZE/frag_size)+1);
-                       pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n;
+                       pktsize = struct_size(pkt, addr, n);
 
                        /*
                         * Determine if this is tid-sdma or just sdma.
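struct_size(pkt, addr, n) computes the storage for a structure that ends in a flexible array of n elements, and doing the arithmetic in size_t is why pktsize changes type from int in the hunk above. A self-contained userspace analogue of that sizing (without the overflow checking the kernel macro also performs) might look like:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct frag { void *kvaddr; size_t len; };

    /* Trailing flexible array, as in the sdma packet descriptor. */
    struct pkt {
        int naddr;
        struct frag addr[];
    };

    int main(void)
    {
        size_t n = 8;

        /* Rough equivalent of struct_size(pkt, addr, n): header plus n
         * trailing elements, computed in size_t so it cannot be
         * truncated to int. */
        size_t pktsize = sizeof(struct pkt) + n * sizeof(struct frag);
        struct pkt *p = malloc(pktsize);

        if (!p)
            return 1;
        p->naddr = (int)n;
        printf("allocated %zu bytes for %d frags\n", pktsize, p->naddr);
        free(p);
        return 0;
    }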
index 2c4e569ce43810362c85259db35b2499cb854d5a..33778d451b827f3b91a9025875f365e41587f499 100644 (file)
@@ -1480,6 +1480,9 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
 }
 
 static const struct ib_device_ops qib_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_QIB,
+
        .init_port = qib_create_port_files,
        .modify_device = qib_modify_device,
        .process_mad = qib_process_mad,
@@ -1543,7 +1546,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
        if (!ib_qib_sys_image_guid)
                ib_qib_sys_image_guid = ppd->guid;
 
-       ibdev->owner = THIS_MODULE;
        ibdev->node_guid = ppd->guid;
        ibdev->phys_port_cnt = dd->num_pports;
        ibdev->dev.parent = &dd->pcidev->dev;
@@ -1614,7 +1616,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
        rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group);
 
        ib_set_device_ops(ibdev, &qib_dev_ops);
-       ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB);
+       ret = rvt_register_device(&dd->verbs_dev.rdi);
        if (ret)
                goto err_tx;
 
index 525bf272671e6973afb13472263dd8f730c4eac3..84dd682d23341beb46c89211cfa3e4385ada5488 100644 (file)
@@ -61,6 +61,10 @@ struct usnic_ib_pd {
        struct usnic_uiom_pd            *umem_pd;
 };
 
+struct usnic_ib_cq {
+       struct ib_cq                    ibcq;
+};
+
 struct usnic_ib_mr {
        struct ib_mr                    ibmr;
        struct usnic_uiom_reg           *umem;
index 34c1f9d6c915013f75f826aafc895122e3e0b57a..03f54eb9404b0aa60c81677b7119b15c9aabc296 100644 (file)
@@ -329,6 +329,10 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str)
 }
 
 static const struct ib_device_ops usnic_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_USNIC,
+       .uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION,
+
        .alloc_pd = usnic_ib_alloc_pd,
        .alloc_ucontext = usnic_ib_alloc_ucontext,
        .create_cq = usnic_ib_create_cq,
@@ -350,6 +354,7 @@ static const struct ib_device_ops usnic_dev_ops = {
        .query_qp = usnic_ib_query_qp,
        .reg_user_mr = usnic_ib_reg_mr,
        INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext),
 };
 
@@ -384,12 +389,10 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 
        us_ibdev->pdev = dev;
        us_ibdev->netdev = pci_get_drvdata(dev);
-       us_ibdev->ib_dev.owner = THIS_MODULE;
        us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP;
        us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
        us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
        us_ibdev->ib_dev.dev.parent = &dev->dev;
-       us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION;
 
        us_ibdev->ib_dev.uverbs_cmd_mask =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
@@ -412,7 +415,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
 
        ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops);
 
-       us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC;
        rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group);
 
        ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1);
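usnic here, and pvrdma and qib elsewhere in this series, move module owner, driver id and uverbs ABI version out of per-device assignments and into the static const ib_device_ops table. Purely as an illustration of that "metadata lives with the ops" pattern, with invented structure and field names:

    #include <stdio.h>

    /* An ops table that also carries identity metadata, so a core helper
     * can copy it into every device created from this driver. */
    struct dev_ops {
        const char *owner;
        int driver_id;
        int abi_ver;
        int (*query)(void);
    };

    struct device {
        const char *owner;
        int driver_id;
        int abi_ver;
        const struct dev_ops *ops;
    };

    static int my_query(void) { return 0; }

    /* Before: each probe function assigned owner/driver_id/abi_ver by hand.
     * After: they live with the ops, set exactly once at definition time. */
    static const struct dev_ops my_ops = {
        .owner = "my_module",
        .driver_id = 7,
        .abi_ver = 3,
        .query = my_query,
    };

    static void set_device_ops(struct device *dev, const struct dev_ops *ops)
    {
        dev->ops = ops;
        dev->owner = ops->owner;
        dev->driver_id = ops->driver_id;
        dev->abi_ver = ops->abi_ver;
    }

    int main(void)
    {
        struct device dev = { 0 };

        set_device_ops(&dev, &my_ops);
        printf("%s id=%d abi=%d\n", dev.owner, dev.driver_id, dev.abi_ver);
        return 0;
    }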
index e9352750e029cf596921d368e9f963bce34a28d4..eeb07b245ef9bb9c735841199b5c1a3e1a628ada 100644 (file)
@@ -587,28 +587,18 @@ out_unlock:
        return status;
 }
 
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-                                const struct ib_cq_init_attr *attr,
-                                struct ib_udata *udata)
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                      struct ib_udata *udata)
 {
-       struct ib_cq *cq;
-
-       usnic_dbg("\n");
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq)
-               return ERR_PTR(-EBUSY);
+               return -EINVAL;
 
-       return cq;
+       return 0;
 }
 
-int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-       usnic_dbg("\n");
-       kfree(cq);
-       return 0;
+       return;
 }
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
index 028f322f8e9be35e5af752ed6842065f66d66af0..2aedf78c13cf23da5882f18a000a205a68f4c0b9 100644 (file)
@@ -58,10 +58,9 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                                int attr_mask, struct ib_udata *udata);
-struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
-                                const struct ib_cq_init_attr *attr,
-                                struct ib_udata *udata);
-int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                      struct ib_udata *udata);
+void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
                                u64 virt_addr, int access_flags,
                                struct ib_udata *udata);
index e312f522a66de7a3e333d931955474bcaf907916..0b0237d41613fc4cb61ba4f1cd02430aeda74490 100644 (file)
@@ -75,9 +75,10 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
                for_each_sg(chunk->page_list, sg, chunk->nents, i) {
                        page = sg_page(sg);
                        pa = sg_phys(sg);
-                       if (!PageDirty(page) && dirty)
-                               set_page_dirty_lock(page);
-                       put_page(page);
+                       if (dirty)
+                               put_user_pages_dirty_lock(&page, 1);
+                       else
+                               put_user_page(page);
                        usnic_dbg("pa: %pa\n", &pa);
                }
                kfree(chunk);
index 3c633ab58052847e4ba05da464f420a937750c2f..c142f5e7f25f82d0d163570c5eb5663e1172e54c 100644 (file)
@@ -456,7 +456,7 @@ static inline enum pvrdma_wr_opcode ib_wr_opcode_to_pvrdma(enum ib_wr_opcode op)
                return PVRDMA_WR_MASKED_ATOMIC_CMP_AND_SWP;
        case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
                return PVRDMA_WR_MASKED_ATOMIC_FETCH_AND_ADD;
-       case IB_WR_REG_SIG_MR:
+       case IB_WR_REG_MR_INTEGRITY:
                return PVRDMA_WR_REG_SIG_MR;
        default:
                return PVRDMA_WR_ERROR;
index d7deb19a2800cd15f3a89aeaedb0b6315dd64d11..7800e6930502de7da27e9cf7207fce03a2825125 100644 (file)
@@ -92,20 +92,19 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq,
 
 /**
  * pvrdma_create_cq - create completion queue
- * @ibdev: the device
+ * @ibcq: Allocated CQ
  * @attr: completion queue attributes
  * @udata: user data
  *
- * @return: ib_cq completion queue pointer on success,
- *          otherwise returns negative errno.
+ * @return: 0 on success
  */
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-                              const struct ib_cq_init_attr *attr,
-                              struct ib_udata *udata)
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                    struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        int entries = attr->cqe;
        struct pvrdma_dev *dev = to_vdev(ibdev);
-       struct pvrdma_cq *cq;
+       struct pvrdma_cq *cq = to_vcq(ibcq);
        int ret;
        int npages;
        unsigned long flags;
@@ -113,7 +112,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
        union pvrdma_cmd_resp rsp;
        struct pvrdma_cmd_create_cq *cmd = &req.create_cq;
        struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp;
-       struct pvrdma_create_cq_resp cq_resp = {0};
+       struct pvrdma_create_cq_resp cq_resp = {};
        struct pvrdma_create_cq ucmd;
        struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
                udata, struct pvrdma_ucontext, ibucontext);
@@ -122,16 +121,10 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
 
        entries = roundup_pow_of_two(entries);
        if (entries < 1 || entries > dev->dsr->caps.max_cqe)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq))
-               return ERR_PTR(-ENOMEM);
-
-       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               atomic_dec(&dev->num_cqs);
-               return ERR_PTR(-ENOMEM);
-       }
+               return -ENOMEM;
 
        cq->ibcq.cqe = entries;
        cq->is_kernel = !udata;
@@ -211,22 +204,19 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
                        dev_warn(&dev->pdev->dev,
                                 "failed to copy back udata\n");
                        pvrdma_destroy_cq(&cq->ibcq, udata);
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
                }
        }
 
-       return &cq->ibcq;
+       return 0;
 
 err_page_dir:
        pvrdma_page_dir_cleanup(dev, &cq->pdir);
 err_umem:
-       if (!cq->is_kernel)
-               ib_umem_release(cq->umem);
+       ib_umem_release(cq->umem);
 err_cq:
        atomic_dec(&dev->num_cqs);
-       kfree(cq);
-
-       return ERR_PTR(ret);
+       return ret;
 }
 
 static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
@@ -235,21 +225,17 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
                complete(&cq->free);
        wait_for_completion(&cq->free);
 
-       if (!cq->is_kernel)
-               ib_umem_release(cq->umem);
+       ib_umem_release(cq->umem);
 
        pvrdma_page_dir_cleanup(dev, &cq->pdir);
-       kfree(cq);
 }
 
 /**
  * pvrdma_destroy_cq - destroy completion queue
  * @cq: the completion queue to destroy.
  * @udata: user data or null for kernel object
- *
- * @return: 0 for success.
  */
-int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct pvrdma_cq *vcq = to_vcq(cq);
        union pvrdma_cmd_req req;
@@ -275,8 +261,6 @@ int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 
        pvrdma_free_cq(dev, vcq);
        atomic_dec(&dev->num_cqs);
-
-       return ret;
 }
 
 static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i)
index 40182297f87fd5b0447b94c483382583337a4e77..e580ae9cc55a52143362f512e369d7717beaffe3 100644 (file)
@@ -144,6 +144,10 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
 }
 
 static const struct ib_device_ops pvrdma_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_VMW_PVRDMA,
+       .uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION,
+
        .add_gid = pvrdma_add_gid,
        .alloc_mr = pvrdma_alloc_mr,
        .alloc_pd = pvrdma_alloc_pd,
@@ -178,6 +182,7 @@ static const struct ib_device_ops pvrdma_dev_ops = {
        .req_notify_cq = pvrdma_req_notify_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
 };
@@ -198,10 +203,8 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
        dev->ib_dev.node_guid = dev->dsr->caps.node_guid;
        dev->sys_image_guid = dev->dsr->caps.sys_image_guid;
        dev->flags = 0;
-       dev->ib_dev.owner = THIS_MODULE;
        dev->ib_dev.num_comp_vectors = 1;
        dev->ib_dev.dev.parent = &dev->pdev->dev;
-       dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION;
        dev->ib_dev.uverbs_cmd_mask =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -261,7 +264,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
                if (!dev->srq_tbl)
                        goto err_qp_free;
        }
-       dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA;
        ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1);
        if (ret)
                return ret;
index 65dc47ffb8f329fb75e9587667df99bd2c782c39..f3a3d22ee8d7343bd634f4c8cc60c36e7410c456 100644 (file)
@@ -290,8 +290,7 @@ int pvrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
                         "could not deregister mem region, error: %d\n", ret);
 
        pvrdma_page_dir_cleanup(dev, &mr->pdir);
-       if (mr->umem)
-               ib_umem_release(mr->umem);
+       ib_umem_release(mr->umem);
 
        kfree(mr->pages);
        kfree(mr);
index 0eaaead5baecbcdb7ebcad9650e484deb10a117b..bca6a58a442e1d2869e9bcc88c91d60380e7e518 100644 (file)
@@ -391,12 +391,8 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 err_pdir:
        pvrdma_page_dir_cleanup(dev, &qp->pdir);
 err_umem:
-       if (!qp->is_kernel) {
-               if (qp->rumem)
-                       ib_umem_release(qp->rumem);
-               if (qp->sumem)
-                       ib_umem_release(qp->sumem);
-       }
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 err_qp:
        kfree(qp);
        atomic_dec(&dev->num_qps);
@@ -429,12 +425,8 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp)
                complete(&qp->free);
        wait_for_completion(&qp->free);
 
-       if (!qp->is_kernel) {
-               if (qp->rumem)
-                       ib_umem_release(qp->rumem);
-               if (qp->sumem)
-                       ib_umem_release(qp->sumem);
-       }
+       ib_umem_release(qp->rumem);
+       ib_umem_release(qp->sumem);
 
        pvrdma_page_dir_cleanup(dev, &qp->pdir);
 
index 9d7b021e1c5991dbb83c396a35e552c4c736b9bb..e4a48f5c0c854d918cf0765d95c21b89c94e375f 100644 (file)
@@ -409,10 +409,9 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
                              u32 max_num_sg, struct ib_udata *udata);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                     int sg_nents, unsigned int *sg_offset);
-struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
-                              const struct ib_cq_init_attr *attr,
-                              struct ib_udata *udata);
-int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                    struct ib_udata *udata);
+void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
index ab48a9b60844e9424771245c328630236e5e1d71..68e0230f8f3113e841e0889aa94983cd8a828838 100644 (file)
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_INFINIBAND_RDMAVT)                += rdmavt/
 obj-$(CONFIG_RDMA_RXE)                 += rxe/
+obj-$(CONFIG_RDMA_SIW)                 += siw/
index 0e147b32cbe9f55a0210f527d9df28cdb998b256..fe99da0ff060f37b10e629b9b0efa42a38811ebc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
 
        rdma_copy_ah_attr(&ah->attr, ah_attr);
 
-       atomic_set(&ah->refcount, 0);
-
        if (dev->driver_f.notify_new_ah)
                dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah);
 
@@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
        struct rvt_ah *ah = ibah_to_rvtah(ibah);
        unsigned long flags;
 
-       WARN_ON_ONCE(atomic_read(&ah->refcount));
-
        spin_lock_irqsave(&dev->n_ahs_lock, flags);
        dev->n_ahs_allocated--;
        spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
index a06e6da7a0264a673bddd0641de97bc0c1a9c528..a85571a4cf57de47fb59537e164be695e67a78e7 100644 (file)
@@ -60,22 +60,39 @@ static struct workqueue_struct *comp_vector_wq;
  * @solicited: true if @entry is solicited
  *
  * This may be called with qp->s_lock held.
+ *
+ * Return: true on success, false if the cq is full.
  */
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 {
-       struct rvt_cq_wc *wc;
+       struct ib_uverbs_wc *uqueue = NULL;
+       struct ib_wc *kqueue = NULL;
+       struct rvt_cq_wc *u_wc = NULL;
+       struct rvt_k_cq_wc *k_wc = NULL;
        unsigned long flags;
        u32 head;
        u32 next;
+       u32 tail;
 
        spin_lock_irqsave(&cq->lock, flags);
 
+       if (cq->ip) {
+               u_wc = cq->queue;
+               uqueue = &u_wc->uqueue[0];
+               head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
+               tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
+       } else {
+               k_wc = cq->kqueue;
+               kqueue = &k_wc->kqueue[0];
+               head = k_wc->head;
+               tail = k_wc->tail;
+       }
+
        /*
-        * Note that the head pointer might be writable by user processes.
-        * Take care to verify it is a sane value.
+        * Note that the head pointer might be writable by
+        * user processes. Take care to verify it is a sane value.
         */
-       wc = cq->queue;
-       head = wc->head;
        if (head >= (unsigned)cq->ibcq.cqe) {
                head = cq->ibcq.cqe;
                next = 0;
@@ -83,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
                next = head + 1;
        }
 
-       if (unlikely(next == wc->tail)) {
+       if (unlikely(next == tail || cq->cq_full)) {
+               struct rvt_dev_info *rdi = cq->rdi;
+
+               if (!cq->cq_full)
+                       rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
+               cq->cq_full = true;
                spin_unlock_irqrestore(&cq->lock, flags);
                if (cq->ibcq.event_handler) {
                        struct ib_event ev;
@@ -93,30 +115,30 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
                        ev.event = IB_EVENT_CQ_ERR;
                        cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
                }
-               return;
+               return false;
        }
        trace_rvt_cq_enter(cq, entry, head);
-       if (cq->ip) {
-               wc->uqueue[head].wr_id = entry->wr_id;
-               wc->uqueue[head].status = entry->status;
-               wc->uqueue[head].opcode = entry->opcode;
-               wc->uqueue[head].vendor_err = entry->vendor_err;
-               wc->uqueue[head].byte_len = entry->byte_len;
-               wc->uqueue[head].ex.imm_data = entry->ex.imm_data;
-               wc->uqueue[head].qp_num = entry->qp->qp_num;
-               wc->uqueue[head].src_qp = entry->src_qp;
-               wc->uqueue[head].wc_flags = entry->wc_flags;
-               wc->uqueue[head].pkey_index = entry->pkey_index;
-               wc->uqueue[head].slid = ib_lid_cpu16(entry->slid);
-               wc->uqueue[head].sl = entry->sl;
-               wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-               wc->uqueue[head].port_num = entry->port_num;
+       if (uqueue) {
+               uqueue[head].wr_id = entry->wr_id;
+               uqueue[head].status = entry->status;
+               uqueue[head].opcode = entry->opcode;
+               uqueue[head].vendor_err = entry->vendor_err;
+               uqueue[head].byte_len = entry->byte_len;
+               uqueue[head].ex.imm_data = entry->ex.imm_data;
+               uqueue[head].qp_num = entry->qp->qp_num;
+               uqueue[head].src_qp = entry->src_qp;
+               uqueue[head].wc_flags = entry->wc_flags;
+               uqueue[head].pkey_index = entry->pkey_index;
+               uqueue[head].slid = ib_lid_cpu16(entry->slid);
+               uqueue[head].sl = entry->sl;
+               uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+               uqueue[head].port_num = entry->port_num;
                /* Make sure entry is written before the head index. */
-               smp_wmb();
+               RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
        } else {
-               wc->kqueue[head] = *entry;
+               kqueue[head] = *entry;
+               k_wc->head = next;
        }
-       wc->head = next;
 
        if (cq->notify == IB_CQ_NEXT_COMP ||
            (cq->notify == IB_CQ_SOLICITED &&
@@ -132,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
        }
 
        spin_unlock_irqrestore(&cq->lock, flags);
+       return true;
 }
 EXPORT_SYMBOL(rvt_cq_enter);
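The rewritten rvt_cq_enter() picks either the mmapped user queue or the kernel-only queue, refuses the entry and latches cq_full when the ring has no free slot, and only publishes the new head after the entry is written. Stripped of the user/kernel split and the locking, the bounded-ring insert can be modeled like this (illustrative types, not rdmavt code):

    #include <stdbool.h>
    #include <stdio.h>

    #define RING_CQE 4                      /* ring holds RING_CQE + 1 slots */

    struct cq_ring {
        unsigned int head, tail;
        bool full;                          /* latched, like cq->cq_full */
        int slot[RING_CQE + 1];
    };

    /* Insert one completion; false means the consumer has fallen behind and
     * the CQ is treated as being in error from now on. */
    static bool cq_enter(struct cq_ring *cq, int entry)
    {
        unsigned int head = cq->head, next;

        if (head >= RING_CQE) {             /* sanitize a possibly bogus index */
            head = RING_CQE;
            next = 0;
        } else {
            next = head + 1;
        }

        if (next == cq->tail || cq->full) {
            /* latch the error; the kernel path also raises IB_EVENT_CQ_ERR */
            cq->full = true;
            return false;
        }

        cq->slot[head] = entry;             /* write the entry ... */
        cq->head = next;                    /* ... then publish the new head */
        return true;
    }

    int main(void)
    {
        struct cq_ring cq = { 0 };

        for (int i = 0; i < 6; i++)
            printf("enter %d -> %s\n", i, cq_enter(&cq, i) ? "ok" : "full");
        return 0;
    }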
 
@@ -166,43 +189,38 @@ static void send_complete(struct work_struct *work)
 
 /**
  * rvt_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
+ * @ibcq: Allocated CQ
  * @attr: creation attributes
  * @udata: user data for libibverbs.so
  *
  * Called by ib_create_cq() in the generic verbs code.
  *
- * Return: pointer to the completion queue or negative errno values
- * for failure.
+ * Return: 0 on success
  */
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-                           const struct ib_cq_init_attr *attr,
-                           struct ib_udata *udata)
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata)
 {
+       struct ib_device *ibdev = ibcq->device;
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
-       struct rvt_cq *cq;
-       struct rvt_cq_wc *wc;
-       struct ib_cq *ret;
+       struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+       struct rvt_cq_wc *u_wc = NULL;
+       struct rvt_k_cq_wc *k_wc = NULL;
        u32 sz;
        unsigned int entries = attr->cqe;
        int comp_vector = attr->comp_vector;
+       int err;
 
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (entries < 1 || entries > rdi->dparms.props.max_cqe)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (comp_vector < 0)
                comp_vector = 0;
 
        comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;
 
-       /* Allocate the completion queue structure. */
-       cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
-       if (!cq)
-               return ERR_PTR(-ENOMEM);
-
        /*
         * Allocate the completion queue entries and head/tail pointers.
         * This is allocated separately so that it can be resized and
@@ -210,17 +228,18 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
         * We need to use vmalloc() in order to support mmap and large
         * numbers of entries.
         */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-       else
-               sz += sizeof(struct ib_wc) * (entries + 1);
-       wc = udata ?
-               vmalloc_user(sz) :
-               vzalloc_node(sz, rdi->dparms.node);
-       if (!wc) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_cq;
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
+               sz += sizeof(*u_wc);
+               u_wc = vmalloc_user(sz);
+               if (!u_wc)
+                       return -ENOMEM;
+       } else {
+               sz = sizeof(struct ib_wc) * (entries + 1);
+               sz += sizeof(*k_wc);
+               k_wc = vzalloc_node(sz, rdi->dparms.node);
+               if (!k_wc)
+                       return -ENOMEM;
        }
 
        /*
@@ -228,26 +247,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
         * See rvt_mmap() for details.
         */
        if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
-
-               cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc);
+               cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
                if (!cq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
+                       err = -ENOMEM;
                        goto bail_wc;
                }
 
                err = ib_copy_to_udata(udata, &cq->ip->offset,
                                       sizeof(cq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
+               if (err)
                        goto bail_ip;
-               }
        }
 
        spin_lock_irq(&rdi->n_cqs_lock);
        if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
                spin_unlock_irq(&rdi->n_cqs_lock);
-               ret = ERR_PTR(-ENOMEM);
+               err = -ENOMEM;
                goto bail_ip;
        }
 
@@ -277,21 +292,20 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
        cq->notify = RVT_CQ_NONE;
        spin_lock_init(&cq->lock);
        INIT_WORK(&cq->comptask, send_complete);
-       cq->queue = wc;
-
-       ret = &cq->ibcq;
+       if (u_wc)
+               cq->queue = u_wc;
+       else
+               cq->kqueue = k_wc;
 
        trace_rvt_create_cq(cq, attr);
-       goto done;
+       return 0;
 
 bail_ip:
        kfree(cq->ip);
 bail_wc:
-       vfree(wc);
-bail_cq:
-       kfree(cq);
-done:
-       return ret;
+       vfree(u_wc);
+       vfree(k_wc);
+       return err;
 }
 
 /**
@@ -300,10 +314,8 @@ done:
  * @udata: user data or NULL for kernel object
  *
  * Called by ib_destroy_cq() in the generic verbs code.
- *
- * Return: always 0
  */
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
        struct rvt_dev_info *rdi = cq->rdi;
@@ -316,9 +328,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
                kref_put(&cq->ip->ref, rvt_release_mmap_info);
        else
                vfree(cq->queue);
-       kfree(cq);
-
-       return 0;
 }
 
 /**
@@ -345,9 +354,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
        if (cq->notify != IB_CQ_NEXT_COMP)
                cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
 
-       if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-           cq->queue->head != cq->queue->tail)
-               ret = 1;
+       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+               if (cq->queue) {
+                       if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
+                               RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
+                               ret = 1;
+               } else {
+                       if (cq->kqueue->head != cq->kqueue->tail)
+                               ret = 1;
+               }
+       }
 
        spin_unlock_irqrestore(&cq->lock, flags);
 
@@ -363,12 +379,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 {
        struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-       struct rvt_cq_wc *old_wc;
-       struct rvt_cq_wc *wc;
        u32 head, tail, n;
        int ret;
        u32 sz;
        struct rvt_dev_info *rdi = cq->rdi;
+       struct rvt_cq_wc *u_wc = NULL;
+       struct rvt_cq_wc *old_u_wc = NULL;
+       struct rvt_k_cq_wc *k_wc = NULL;
+       struct rvt_k_cq_wc *old_k_wc = NULL;
 
        if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
                return -EINVAL;
@@ -376,17 +394,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
        /*
         * Need to use vmalloc() if we want to support large #s of entries.
         */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-       else
-               sz += sizeof(struct ib_wc) * (cqe + 1);
-       wc = udata ?
-               vmalloc_user(sz) :
-               vzalloc_node(sz, rdi->dparms.node);
-       if (!wc)
-               return -ENOMEM;
-
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
+               sz += sizeof(*u_wc);
+               u_wc = vmalloc_user(sz);
+               if (!u_wc)
+                       return -ENOMEM;
+       } else {
+               sz = sizeof(struct ib_wc) * (cqe + 1);
+               sz += sizeof(*k_wc);
+               k_wc = vzalloc_node(sz, rdi->dparms.node);
+               if (!k_wc)
+                       return -ENOMEM;
+       }
        /* Check that we can write the offset to mmap. */
        if (udata && udata->outlen >= sizeof(__u64)) {
                __u64 offset = 0;
@@ -401,11 +421,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
         * Make sure head and tail are sane since they
         * might be user writable.
         */
-       old_wc = cq->queue;
-       head = old_wc->head;
+       if (u_wc) {
+               old_u_wc = cq->queue;
+               head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
+               tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
+       } else {
+               old_k_wc = cq->kqueue;
+               head = old_k_wc->head;
+               tail = old_k_wc->tail;
+       }
+
        if (head > (u32)cq->ibcq.cqe)
                head = (u32)cq->ibcq.cqe;
-       tail = old_wc->tail;
        if (tail > (u32)cq->ibcq.cqe)
                tail = (u32)cq->ibcq.cqe;
        if (head < tail)
@@ -417,27 +444,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
                goto bail_unlock;
        }
        for (n = 0; tail != head; n++) {
-               if (cq->ip)
-                       wc->uqueue[n] = old_wc->uqueue[tail];
+               if (u_wc)
+                       u_wc->uqueue[n] = old_u_wc->uqueue[tail];
                else
-                       wc->kqueue[n] = old_wc->kqueue[tail];
+                       k_wc->kqueue[n] = old_k_wc->kqueue[tail];
                if (tail == (u32)cq->ibcq.cqe)
                        tail = 0;
                else
                        tail++;
        }
        cq->ibcq.cqe = cqe;
-       wc->head = n;
-       wc->tail = 0;
-       cq->queue = wc;
+       if (u_wc) {
+               RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
+               RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
+               cq->queue = u_wc;
+       } else {
+               k_wc->head = n;
+               k_wc->tail = 0;
+               cq->kqueue = k_wc;
+       }
        spin_unlock_irq(&cq->lock);
 
-       vfree(old_wc);
+       if (u_wc)
+               vfree(old_u_wc);
+       else
+               vfree(old_k_wc);
 
        if (cq->ip) {
                struct rvt_mmap_info *ip = cq->ip;
 
-               rvt_update_mmap_info(rdi, ip, sz, wc);
+               rvt_update_mmap_info(rdi, ip, sz, u_wc);
 
                /*
                 * Return the offset to mmap.
@@ -461,7 +497,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 bail_unlock:
        spin_unlock_irq(&cq->lock);
 bail_free:
-       vfree(wc);
+       vfree(u_wc);
+       vfree(k_wc);
+
        return ret;
 }
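rvt_resize_cq() now allocates whichever flavor of ring is needed, then compacts the live entries (from tail up to head, wrapping at the old ring size) into the front of the new ring and resets the indices. A compact model of that copy, ignoring the u_wc/k_wc split and the mmap bookkeeping, with invented names:

    #include <stdio.h>

    /* Copy the live region [tail, head) of a ring with 'old_cqe' usable
     * entries into the front of a new buffer; returns the number of entries
     * moved, which becomes the new head (the new tail is 0). */
    static unsigned int ring_compact(const int *old, unsigned int old_cqe,
                                     unsigned int head, unsigned int tail,
                                     int *new_buf, unsigned int new_cqe)
    {
        unsigned int n;

        /* sanitize possibly user-writable indices first */
        if (head > old_cqe)
            head = old_cqe;
        if (tail > old_cqe)
            tail = old_cqe;

        for (n = 0; tail != head; n++) {
            if (n >= new_cqe)               /* caller must size the new ring */
                break;
            new_buf[n] = old[tail];
            if (tail == old_cqe)
                tail = 0;
            else
                tail++;
        }
        return n;
    }

    int main(void)
    {
        int old[5] = { 10, 11, 12, 13, 14 };    /* cqe = 4, one spare slot */
        int bigger[9] = { 0 };
        unsigned int new_head = ring_compact(old, 4, 1, 3, bigger, 8);

        /* old indices 3, 4 and 0 (wrapping toward head=1) move to 0..n-1 */
        printf("moved %u entries: %d %d %d\n", new_head,
               bigger[0], bigger[1], bigger[2]);
        return 0;
    }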
 
@@ -479,7 +517,7 @@ bail_free:
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 {
        struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
-       struct rvt_cq_wc *wc;
+       struct rvt_k_cq_wc *wc;
        unsigned long flags;
        int npolled;
        u32 tail;
@@ -490,7 +528,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 
        spin_lock_irqsave(&cq->lock, flags);
 
-       wc = cq->queue;
+       wc = cq->kqueue;
        tail = wc->tail;
        if (tail > (u32)cq->ibcq.cqe)
                tail = (u32)cq->ibcq.cqe;
index 3ad6faf18ecb13a430ad3df44a82817c7f6f106e..5e26a2eb19a4c6d1e31f1b4d1e1cc1dd5bbbf44c 100644 (file)
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_cq.h>
 
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
-                           const struct ib_cq_init_attr *attr,
-                           struct ib_udata *udata);
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata);
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
index f48240f66b8fa188e97a3334af5282468f1eb79d..a6a39f01dca37112af1acdadc6bfd050d81c9b38 100644 (file)
@@ -562,8 +562,7 @@ int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
        if (ret)
                goto out;
        rvt_deinit_mregion(&mr->mr);
-       if (mr->umem)
-               ib_umem_release(mr->umem);
+       ib_umem_release(mr->umem);
        kfree(mr);
 out:
        return ret;
@@ -613,8 +612,8 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
        n = mapped_segs % RVT_SEGSZ;
        mr->mr.map[m]->segs[n].vaddr = (void *)addr;
        mr->mr.map[m]->segs[n].length = ps;
-       trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
        mr->mr.length += ps;
+       trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 
        return 0;
 }
@@ -643,6 +642,7 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
        mr->mr.iova = ibmr->iova;
        mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
        mr->mr.length = (size_t)ibmr->length;
+       trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset);
        return ret;
 }
 
index c5a50614a6c630b34a1ccbe6fea431b43bf04613..0b0a241c57ff37e2897fc7a63dc3b45e525ba6fb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -58,6 +58,8 @@
 #include "vt.h"
 #include "trace.h"
 
+#define RVT_RWQ_COUNT_THRESHOLD 16
+
 static void rvt_rc_timeout(struct timer_list *t);
 
 /*
@@ -802,6 +804,47 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
        }
 }
 
+/**
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
+ * @rq: receive queue data structure
+ * @size: number of request queue entries
+ * @node: The NUMA node
+ * @udata: true if user data is available, false otherwise
+ *
+ * Return: 0 on success, or -ENOMEM if memory allocation failed.
+ * This function is used by both shared receive
+ * queues and non-shared receive queues to allocate
+ * memory.
+ */
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+                struct ib_udata *udata)
+{
+       if (udata) {
+               rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
+               if (!rq->wq)
+                       goto bail;
+               /* need kwq with no buffers */
+               rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
+               if (!rq->kwq)
+                       goto bail;
+               rq->kwq->curr_wq = rq->wq->wq;
+       } else {
+               /* need kwq with buffers */
+               rq->kwq =
+                       vzalloc_node(sizeof(struct rvt_krwq) + size, node);
+               if (!rq->kwq)
+                       goto bail;
+               rq->kwq->curr_wq = rq->kwq->wq;
+       }
+
+       spin_lock_init(&rq->kwq->p_lock);
+       spin_lock_init(&rq->kwq->c_lock);
+       return 0;
+bail:
+       rvt_free_rq(rq);
+       return -ENOMEM;
+}
+
 /**
  * rvt_init_qp - initialize the QP state to the reset state
  * @qp: the QP to init or reinit
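rvt_alloc_rq(), added just above, centralizes the receive-queue allocation for both SRQs and QPs: with udata the entries live in a vmalloc_user() buffer that can be mmapped, and the kernel-side rvt_krwq is only a thin wrapper whose curr_wq points into it; without udata the krwq carries the buffer itself. A hedged userspace model of those two layouts (toy allocation, invented names, no locking):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Shared (potentially user-mapped) ring header followed by the buffer. */
    struct rwq  { unsigned int head, tail; char wq[]; };
    /* Kernel-private bookkeeping; curr_wq points at whichever buffer is live. */
    struct krwq { unsigned int head, tail; char *curr_wq; char wq[]; };

    struct rq { struct rwq *wq; struct krwq *kwq; };

    /* Mirror of the rvt_alloc_rq() split: with udata the entries live in the
     * shareable rwq and the krwq is only a thin wrapper; without udata the
     * krwq carries the buffer itself. */
    static int rq_alloc(struct rq *rq, size_t size, bool udata)
    {
        if (udata) {
            rq->wq = calloc(1, sizeof(*rq->wq) + size);
            rq->kwq = calloc(1, sizeof(*rq->kwq));
            if (!rq->wq || !rq->kwq)
                return -1;
            rq->kwq->curr_wq = rq->wq->wq;
        } else {
            rq->kwq = calloc(1, sizeof(*rq->kwq) + size);
            if (!rq->kwq)
                return -1;
            rq->kwq->curr_wq = rq->kwq->wq;
        }
        return 0;
    }

    int main(void)
    {
        struct rq a = { 0 }, b = { 0 };

        printf("user rq:   %d\n", rq_alloc(&a, 256, true));
        printf("kernel rq: %d\n", rq_alloc(&b, 256, false));
        free(a.wq); free(a.kwq); free(b.kwq);
        return 0;
    }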
@@ -852,10 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
        qp->s_tail_ack_queue = 0;
        qp->s_acked_ack_queue = 0;
        qp->s_num_rd_atomic = 0;
-       if (qp->r_rq.wq) {
-               qp->r_rq.wq->head = 0;
-               qp->r_rq.wq->tail = 0;
-       }
+       if (qp->r_rq.kwq)
+               qp->r_rq.kwq->count = qp->r_rq.size;
        qp->r_sge.num_sge = 0;
        atomic_set(&qp->s_reserved_used, 0);
 }
@@ -927,6 +968,61 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
                clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
 }
 
+/**
+ * get_allowed_ops - Given a QP type return the appropriate allowed OP
+ * @type: valid, supported, QP type
+ */
+static u8 get_allowed_ops(enum ib_qp_type type)
+{
+       return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
+               IB_OPCODE_UC : IB_OPCODE_UD;
+}
+
+/**
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static void free_ud_wq_attr(struct rvt_qp *qp)
+{
+       struct rvt_swqe *wqe;
+       int i;
+
+       for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+               wqe = rvt_get_swqe_ptr(qp, i);
+               kfree(wqe->ud_wr.attr);
+               wqe->ud_wr.attr = NULL;
+       }
+}
+
+/**
+ * alloc_ud_wq_attr - AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ * @node: Numa node for allocation
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
+{
+       struct rvt_swqe *wqe;
+       int i;
+
+       for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+               wqe = rvt_get_swqe_ptr(qp, i);
+               wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
+                                              GFP_KERNEL, node);
+               if (!wqe->ud_wr.attr) {
+                       free_ud_wq_attr(qp);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
 /**
  * rvt_create_qp - create a queue pair for a device
  * @ibpd: the protection domain who's device we create the queue pair for
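alloc_ud_wq_attr() and free_ud_wq_attr() above add a per-SWQE AH attribute cache for UD QPs, allocating one attribute per send slot and unwinding everything if any allocation fails. The same all-or-nothing loop, reduced to portable C with invented names:

    #include <stdio.h>
    #include <stdlib.h>

    struct entry { void *attr; };

    static void free_all(struct entry *e, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            free(e[i].attr);
            e[i].attr = NULL;
        }
    }

    /* Allocate one cached attribute per entry; on any failure, unwind every
     * allocation so the caller sees all-or-nothing, as alloc_ud_wq_attr()
     * does for UD queue pairs. */
    static int alloc_all(struct entry *e, size_t n, size_t attr_size)
    {
        for (size_t i = 0; i < n; i++) {
            e[i].attr = calloc(1, attr_size);
            if (!e[i].attr) {
                free_all(e, n);
                return -1;
            }
        }
        return 0;
    }

    int main(void)
    {
        struct entry wq[8] = { 0 };

        if (alloc_all(wq, 8, 64) == 0)
            printf("per-entry attribute cache ready\n");
        free_all(wq, 8);
        return 0;
    }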
@@ -989,9 +1085,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
        case IB_QPT_UC:
        case IB_QPT_RC:
        case IB_QPT_UD:
-               sz = sizeof(struct rvt_sge) *
-                       init_attr->cap.max_send_sge +
-                       sizeof(struct rvt_swqe);
+               sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
                swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
                if (!swq)
                        return ERR_PTR(-ENOMEM);
@@ -1011,6 +1105,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                                  rdi->dparms.node);
                if (!qp)
                        goto bail_swq;
+               qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
 
                RCU_INIT_POINTER(qp->next, NULL);
                if (init_attr->qp_type == IB_QPT_RC) {
@@ -1048,17 +1143,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                        qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
                        sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
                                sizeof(struct rvt_rwqe);
-                       if (udata)
-                               qp->r_rq.wq = vmalloc_user(
-                                               sizeof(struct rvt_rwq) +
-                                               qp->r_rq.size * sz);
-                       else
-                               qp->r_rq.wq = vzalloc_node(
-                                               sizeof(struct rvt_rwq) +
-                                               qp->r_rq.size * sz,
-                                               rdi->dparms.node);
-                       if (!qp->r_rq.wq)
+                       err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
+                                          rdi->dparms.node, udata);
+                       if (err) {
+                               ret = ERR_PTR(err);
                                goto bail_driver_priv;
+                       }
                }
 
                /*
@@ -1068,7 +1158,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                spin_lock_init(&qp->r_lock);
                spin_lock_init(&qp->s_hlock);
                spin_lock_init(&qp->s_lock);
-               spin_lock_init(&qp->r_rq.lock);
                atomic_set(&qp->refcount, 0);
                atomic_set(&qp->local_ops_pending, 0);
                init_waitqueue_head(&qp->wait);
@@ -1080,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                qp->s_max_sge = init_attr->cap.max_send_sge;
                if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
                        qp->s_flags = RVT_S_SIGNAL_REQ_WR;
+               err = alloc_ud_wq_attr(qp, rdi->dparms.node);
+               if (err) {
+                       ret = (ERR_PTR(err));
+                       goto bail_driver_priv;
+               }
 
                err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
                                init_attr->qp_type,
@@ -1172,28 +1266,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 
        ret = &qp->ibqp;
 
-       /*
-        * We have our QP and its good, now keep track of what types of opcodes
-        * can be processed on this QP. We do this by keeping track of what the
-        * 3 high order bits of the opcode are.
-        */
-       switch (init_attr->qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-       case IB_QPT_UD:
-               qp->allowed_ops = IB_OPCODE_UD;
-               break;
-       case IB_QPT_RC:
-               qp->allowed_ops = IB_OPCODE_RC;
-               break;
-       case IB_QPT_UC:
-               qp->allowed_ops = IB_OPCODE_UC;
-               break;
-       default:
-               ret = ERR_PTR(-EINVAL);
-               goto bail_ip;
-       }
-
        return ret;
 
 bail_ip:
@@ -1204,8 +1276,8 @@ bail_qpn:
        rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
 
 bail_rq_wq:
-       if (!qp->ip)
-               vfree(qp->r_rq.wq);
+       rvt_free_rq(&qp->r_rq);
+       free_ud_wq_attr(qp);
 
 bail_driver_priv:
        rdi->driver_f.qp_priv_free(rdi, qp);
@@ -1271,19 +1343,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
        }
        wc.status = IB_WC_WR_FLUSH_ERR;
 
-       if (qp->r_rq.wq) {
-               struct rvt_rwq *wq;
+       if (qp->r_rq.kwq) {
                u32 head;
                u32 tail;
-
-               spin_lock(&qp->r_rq.lock);
-
+               struct rvt_rwq *wq = NULL;
+               struct rvt_krwq *kwq = NULL;
+
+               spin_lock(&qp->r_rq.kwq->c_lock);
+               /* qp->ip is used to check whether there is a user buffer mmapped */
+               if (qp->ip) {
+                       wq = qp->r_rq.wq;
+                       head = RDMA_READ_UAPI_ATOMIC(wq->head);
+                       tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+               } else {
+                       kwq = qp->r_rq.kwq;
+                       head = kwq->head;
+                       tail = kwq->tail;
+               }
                /* sanity check pointers before trusting them */
-               wq = qp->r_rq.wq;
-               head = wq->head;
                if (head >= qp->r_rq.size)
                        head = 0;
-               tail = wq->tail;
                if (tail >= qp->r_rq.size)
                        tail = 0;
                while (tail != head) {
@@ -1292,9 +1371,11 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
                                tail = 0;
                        rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
                }
-               wq->tail = tail;
-
-               spin_unlock(&qp->r_rq.lock);
+               if (qp->ip)
+                       RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+               else
+                       kwq->tail = tail;
+               spin_unlock(&qp->r_rq.kwq->c_lock);
        } else if (qp->ibqp.event_handler) {
                ret = 1;
        }
@@ -1636,12 +1717,12 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
        if (qp->ip)
                kref_put(&qp->ip->ref, rvt_release_mmap_info);
-       else
-               vfree(qp->r_rq.wq);
+       kvfree(qp->r_rq.kwq);
        rdi->driver_f.qp_priv_free(rdi, qp);
        kfree(qp->s_ack_queue);
        rdma_destroy_ah_attr(&qp->remote_ah_attr);
        rdma_destroy_ah_attr(&qp->alt_ah_attr);
+       free_ud_wq_attr(qp);
        vfree(qp->s_wq);
        kfree(qp);
        return 0;
@@ -1723,7 +1804,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
                  const struct ib_recv_wr **bad_wr)
 {
        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
-       struct rvt_rwq *wq = qp->r_rq.wq;
+       struct rvt_krwq *wq = qp->r_rq.kwq;
        unsigned long flags;
        int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
                                !qp->ibqp.srq;
@@ -1744,12 +1825,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
                        return -EINVAL;
                }
 
-               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
                next = wq->head + 1;
                if (next >= qp->r_rq.size)
                        next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+               if (next == READ_ONCE(wq->tail)) {
+                       spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
                        *bad_wr = wr;
                        return -ENOMEM;
                }
@@ -1766,16 +1847,18 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
                        wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
                        wqe->wr_id = wr->wr_id;
                        wqe->num_sge = wr->num_sge;
-                       for (i = 0; i < wr->num_sge; i++)
-                               wqe->sg_list[i] = wr->sg_list[i];
+                       for (i = 0; i < wr->num_sge; i++) {
+                               wqe->sg_list[i].addr = wr->sg_list[i].addr;
+                               wqe->sg_list[i].length = wr->sg_list[i].length;
+                               wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+                       }
                        /*
                         * Make sure queue entry is written
                         * before the head index.
                         */
-                       smp_wmb();
-                       wq->head = next;
+                       smp_store_release(&wq->head, next);
                }
-               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+               spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
        }
        return 0;
 }
@@ -1856,10 +1939,9 @@ static inline int rvt_qp_is_avail(
 
        /* see rvt_qp_wqe_unreserve() */
        smp_mb__before_atomic();
-       reserved_used = atomic_read(&qp->s_reserved_used);
        if (unlikely(reserved_op)) {
                /* see rvt_qp_wqe_unreserve() */
-               smp_mb__before_atomic();
+               reserved_used = atomic_read(&qp->s_reserved_used);
                if (reserved_used >= rdi->dparms.reserved_operations)
                        return -ENOMEM;
                return 0;
@@ -1867,14 +1949,13 @@ static inline int rvt_qp_is_avail(
        /* non-reserved operations */
        if (likely(qp->s_avail))
                return 0;
-       slast = READ_ONCE(qp->s_last);
+       /* See rvt_qp_complete_swqe() */
+       slast = smp_load_acquire(&qp->s_last);
        if (qp->s_head >= slast)
                avail = qp->s_size - (qp->s_head - slast);
        else
                avail = slast - qp->s_head;
 
-       /* see rvt_qp_wqe_unreserve() */
-       smp_mb__before_atomic();
        reserved_used = atomic_read(&qp->s_reserved_used);
        avail =  avail - 1 -
                (rdi->dparms.reserved_operations - reserved_used);
@@ -2011,10 +2092,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
         */
        log_pmtu = qp->log_pmtu;
        if (qp->allowed_ops == IB_OPCODE_UD) {
-               struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
+               struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
 
                log_pmtu = ah->log_pmtu;
-               atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+               rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
        }
 
        if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
@@ -2059,7 +2140,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
 
 bail_inval_free_ref:
        if (qp->allowed_ops == IB_OPCODE_UD)
-               atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+               rdma_destroy_ah_attr(wqe->ud_wr.attr);
 bail_inval_free:
        /* release mr holds */
        while (j) {
@@ -2145,7 +2226,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                      const struct ib_recv_wr **bad_wr)
 {
        struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
-       struct rvt_rwq *wq;
+       struct rvt_krwq *wq;
        unsigned long flags;
 
        for (; wr; wr = wr->next) {
@@ -2158,13 +2239,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                        return -EINVAL;
                }
 
-               spin_lock_irqsave(&srq->rq.lock, flags);
-               wq = srq->rq.wq;
+               spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
+               wq = srq->rq.kwq;
                next = wq->head + 1;
                if (next >= srq->rq.size)
                        next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&srq->rq.lock, flags);
+               if (next == READ_ONCE(wq->tail)) {
+                       spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
                        *bad_wr = wr;
                        return -ENOMEM;
                }
@@ -2172,16 +2253,34 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
                wqe->wr_id = wr->wr_id;
                wqe->num_sge = wr->num_sge;
-               for (i = 0; i < wr->num_sge; i++)
-                       wqe->sg_list[i] = wr->sg_list[i];
+               for (i = 0; i < wr->num_sge; i++) {
+                       wqe->sg_list[i].addr = wr->sg_list[i].addr;
+                       wqe->sg_list[i].length = wr->sg_list[i].length;
+                       wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+               }
                /* Make sure queue entry is written before the head index. */
-               smp_wmb();
-               wq->head = next;
-               spin_unlock_irqrestore(&srq->rq.lock, flags);
+               smp_store_release(&wq->head, next);
+               spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
        }
        return 0;
 }
 
+/*
+ * rvt used the internal kernel struct as part of its ABI; for now, make sure
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
+ * user struct to a kernel struct.
+ */
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
+{
+       BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
+                    offsetof(struct rvt_wqe_sge, addr));
+       BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
+                    offsetof(struct rvt_wqe_sge, length));
+       BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
+                    offsetof(struct rvt_wqe_sge, lkey));
+       return (struct ib_sge *)sge;
+}
+
 /*
  * Validate a RWQE and fill in the SGE state.
  * Return 1 if OK.
@@ -2205,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
                        continue;
                /* Check LKEY */
                ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-                                 NULL, &wqe->sg_list[i],
+                                 NULL, rvt_cast_sge(&wqe->sg_list[i]),
                                  IB_ACCESS_LOCAL_WRITE);
                if (unlikely(ret <= 0))
                        goto bad_lkey;
@@ -2233,6 +2332,50 @@ bad_lkey:
        return 0;
 }
 
+/**
+ * get_count - count the number of request work queue entries
+ * in the circular buffer
+ * @rq: data structure for the request queue
+ * @tail: tail index of the circular buffer
+ * @head: head index of the circular buffer
+ *
+ * Return - number of entries currently in the circular buffer
+ */
+static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head)
+{
+       u32 count;
+
+       count = head;
+
+       if (count >= rq->size)
+               count = 0;
+       if (count < tail)
+               count += rq->size - tail;
+       else
+               count -= tail;
+
+       return count;
+}
+
+/**
+ * get_rvt_head - get the head index of the circular buffer
+ * @rq: data structure for the request queue
+ * @ip: mmap info pointer; non-NULL when the queue is mapped to user space
+ *
+ * Return - head index value
+ */
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
+{
+       u32 head;
+
+       if (ip)
+               head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
+       else
+               head = rq->kwq->head;
+
+       return head;
+}
+
 /**
  * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
  * @qp: the QP
@@ -2247,39 +2390,54 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
 {
        unsigned long flags;
        struct rvt_rq *rq;
+       struct rvt_krwq *kwq = NULL;
        struct rvt_rwq *wq;
        struct rvt_srq *srq;
        struct rvt_rwqe *wqe;
        void (*handler)(struct ib_event *, void *);
        u32 tail;
+       u32 head;
        int ret;
+       void *ip = NULL;
 
        if (qp->ibqp.srq) {
                srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
                handler = srq->ibsrq.event_handler;
                rq = &srq->rq;
+               ip = srq->ip;
        } else {
                srq = NULL;
                handler = NULL;
                rq = &qp->r_rq;
+               ip = qp->ip;
        }
 
-       spin_lock_irqsave(&rq->lock, flags);
+       spin_lock_irqsave(&rq->kwq->c_lock, flags);
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
                ret = 0;
                goto unlock;
        }
+       kwq = rq->kwq;
+       if (ip) {
+               wq = rq->wq;
+               tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+       } else {
+               tail = kwq->tail;
+       }
 
-       wq = rq->wq;
-       tail = wq->tail;
        /* Validate tail before using it since it is user writable. */
        if (tail >= rq->size)
                tail = 0;
-       if (unlikely(tail == wq->head)) {
+
+       if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
+               head = get_rvt_head(rq, ip);
+               kwq->count = get_count(rq, tail, head);
+       }
+       if (unlikely(kwq->count == 0)) {
                ret = 0;
                goto unlock;
        }
-       /* Make sure entry is read after head index is read. */
+       /* Make sure entry is read after the count is read. */
        smp_rmb();
        wqe = rvt_get_rwqe_ptr(rq, tail);
        /*
@@ -2289,43 +2447,41 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
         */
        if (++tail >= rq->size)
                tail = 0;
-       wq->tail = tail;
+       if (ip)
+               RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+       else
+               kwq->tail = tail;
        if (!wr_id_only && !init_sge(qp, wqe)) {
                ret = -1;
                goto unlock;
        }
        qp->r_wr_id = wqe->wr_id;
 
+       kwq->count--;
        ret = 1;
        set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
        if (handler) {
-               u32 n;
-
                /*
                 * Validate head pointer value and compute
                 * the number of remaining WQEs.
                 */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-                       goto bail;
+               if (kwq->count < srq->limit) {
+                       kwq->count = get_count(rq, tail, get_rvt_head(rq, ip));
+                       if (kwq->count < srq->limit) {
+                               struct ib_event ev;
+
+                               srq->limit = 0;
+                               spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+                               ev.device = qp->ibqp.device;
+                               ev.element.srq = qp->ibqp.srq;
+                               ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                               handler(&ev, srq->ibsrq.srq_context);
+                               goto bail;
+                       }
                }
        }
 unlock:
-       spin_unlock_irqrestore(&rq->lock, flags);
+       spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
 bail:
        return ret;
 }
@@ -2667,27 +2823,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
                       enum ib_wc_status status)
 {
        u32 old_last, last;
-       struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+       struct rvt_dev_info *rdi;
 
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
                return;
+       rdi = ib_to_rvt(qp->ibqp.device);
 
-       last = qp->s_last;
-       old_last = last;
-       trace_rvt_qp_send_completion(qp, wqe, last);
-       if (++last >= qp->s_size)
-               last = 0;
-       trace_rvt_qp_send_completion(qp, wqe, last);
-       qp->s_last = last;
-       /* See post_send() */
-       barrier();
-       rvt_put_qp_swqe(qp, wqe);
-
-       rvt_qp_swqe_complete(qp,
-                            wqe,
-                            rdi->wc_opcode[wqe->wr.opcode],
-                            status);
-
+       old_last = qp->s_last;
+       trace_rvt_qp_send_completion(qp, wqe, old_last);
+       last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
+                                   status);
        if (qp->s_acked == old_last)
                qp->s_acked = last;
        if (qp->s_cur == old_last)
@@ -3021,8 +3166,7 @@ do_write:
        wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
        wc.port_num = 1;
        /* Signal completion event if the solicited bit is set. */
-       rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-                    wqe->wr.send_flags & IB_SEND_SOLICITED);
+       rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
        spin_unlock_irqrestore(&qp->r_lock, flags);
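Editorial sketch for the qp.c hunks above: the receive queue moves onto a kernel-owned ring (struct rvt_krwq) with separate producer (p_lock) and consumer (c_lock) locks, where the producer publishes wq->head with smp_store_release() and the opposite index is read with READ_ONCE()/smp_load_acquire(). A minimal user-space illustration of that single-producer/single-consumer ordering, using C11 atomics in place of the kernel barriers (the rq_ring type and rq_push/rq_pop names are illustrative, not part of the patch):

/*
 * Illustrative SPSC ring: the producer writes the entry, then publishes
 * the new head with a release store; the consumer reads the head with an
 * acquire load before touching the entry. This mirrors the
 * smp_store_release()/READ_ONCE() pairing in the hunks above.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define RING_SIZE 64

struct rq_ring {
        _Atomic unsigned int head;              /* written by the producer only */
        _Atomic unsigned int tail;              /* written by the consumer only */
        int slot[RING_SIZE];
};

static bool rq_push(struct rq_ring *r, int v)   /* producer side */
{
        unsigned int head = atomic_load_explicit(&r->head, memory_order_relaxed);
        unsigned int next = (head + 1) % RING_SIZE;

        if (next == atomic_load_explicit(&r->tail, memory_order_acquire))
                return false;                   /* ring is full */
        r->slot[head] = v;                      /* write the entry ... */
        atomic_store_explicit(&r->head, next, memory_order_release);
        return true;                            /* ... then publish the head */
}

static bool rq_pop(struct rq_ring *r, int *v)   /* consumer side */
{
        unsigned int tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

        if (tail == atomic_load_explicit(&r->head, memory_order_acquire))
                return false;                   /* ring is empty */
        *v = r->slot[tail];                     /* entry is visible after the acquire */
        atomic_store_explicit(&r->tail, (tail + 1) % RING_SIZE, memory_order_release);
        return true;
}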
index 6db1619389b09c240335e05824a47787f64d79e0..2cdba1283bf6da6f8f1cab2649a6e4efe1226008 100644 (file)
@@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                      const struct ib_recv_wr **bad_wr);
 int rvt_wss_init(struct rvt_dev_info *rdi);
 void rvt_wss_exit(struct rvt_dev_info *rdi);
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+                struct ib_udata *udata);
 #endif          /* DEF_RVTQP_H */
index 09f0cf538be6927385a3c7d88a3cec1e9dfc63d8..890d7b760d2ec6b5b0e63ff2368db7ba3d71b4d8 100644 (file)
@@ -104,26 +104,33 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp)
        } else {
                u32 min, max, x;
                u32 credits;
-               struct rvt_rwq *wq = qp->r_rq.wq;
                u32 head;
                u32 tail;
 
-               /* sanity check pointers before trusting them */
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               /*
-                * Compute the number of credits available (RWQEs).
-                * There is a small chance that the pair of reads are
-                * not atomic, which is OK, since the fuzziness is
-                * resolved as further ACKs go out.
-                */
-               credits = head - tail;
-               if ((int)credits < 0)
-                       credits += qp->r_rq.size;
+               credits = READ_ONCE(qp->r_rq.kwq->count);
+               if (credits == 0) {
+                       /* sanity check pointers before trusting them */
+                       if (qp->ip) {
+                               head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+                               tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+                       } else {
+                               head = READ_ONCE(qp->r_rq.kwq->head);
+                               tail = READ_ONCE(qp->r_rq.kwq->tail);
+                       }
+                       if (head >= qp->r_rq.size)
+                               head = 0;
+                       if (tail >= qp->r_rq.size)
+                               tail = 0;
+                       /*
+                        * Compute the number of credits available (RWQEs).
+                        * There is a small chance that the pair of reads are
+                        * not atomic, which is OK, since the fuzziness is
+                        * resolved as further ACKs go out.
+                        */
+                       credits = head - tail;
+                       if ((int)credits < 0)
+                               credits += qp->r_rq.size;
+               }
                /*
                 * Binary search the credit table to find the code to
                 * use.
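As a worked example of the wrap-around arithmetic in rvt_compute_aeth() above (the fallback path taken only when the cached kwq->count is zero): with r_rq.size = 8, head = 2 and tail = 6, credits = 2 - 6 wraps to -4 when viewed as a signed value, so the queue size is added back, giving 4 available RWQEs. The helper below restates that calculation on its own; it is a sketch for illustration, not part of the patch:

/* Stand-alone restatement of the credit computation shown above. */
static unsigned int ring_credits(unsigned int head, unsigned int tail,
                                 unsigned int size)
{
        unsigned int credits = head - tail;     /* may wrap below zero */

        if ((int)credits < 0)
                credits += size;                /* undo the wrap-around */
        return credits;                         /* e.g. head 2, tail 6, size 8 -> 4 */
}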
index 8d6b3e76425579f0cae9641aa1557aa12bd6abd3..24fef021d51dbb825135d01c6840339ed6a805ee 100644 (file)
@@ -52,7 +52,7 @@
 
 #include "srq.h"
 #include "vt.h"
-
+#include "qp.h"
 /**
  * rvt_driver_srq_init - init srq resources on a per driver basis
  * @rdi: rvt dev structure
@@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
        srq->rq.max_sge = srq_init_attr->attr.max_sge;
        sz = sizeof(struct ib_sge) * srq->rq.max_sge +
                sizeof(struct rvt_rwqe);
-       srq->rq.wq = udata ?
-               vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) :
-               vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz,
-                            dev->dparms.node);
-       if (!srq->rq.wq) {
+       if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz,
+                        dev->dparms.node, udata)) {
                ret = -ENOMEM;
                goto bail_srq;
        }
@@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
 bail_ip:
        kfree(srq->ip);
 bail_wq:
-       vfree(srq->rq.wq);
+       rvt_free_rq(&srq->rq);
 bail_srq:
        return ret;
 }
@@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 {
        struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
        struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
-       struct rvt_rwq *wq;
+       struct rvt_rq tmp_rq = {};
        int ret = 0;
 
        if (attr_mask & IB_SRQ_MAX_WR) {
-               struct rvt_rwq *owq;
+               struct rvt_krwq *okwq = NULL;
+               struct rvt_rwq *owq = NULL;
                struct rvt_rwqe *p;
                u32 sz, size, n, head, tail;
 
@@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                    ((attr_mask & IB_SRQ_LIMIT) ?
                     attr->srq_limit : srq->limit) > attr->max_wr)
                        return -EINVAL;
-
                sz = sizeof(struct rvt_rwqe) +
                        srq->rq.max_sge * sizeof(struct ib_sge);
                size = attr->max_wr + 1;
-               wq = udata ?
-                       vmalloc_user(sizeof(struct rvt_rwq) + size * sz) :
-                       vzalloc_node(sizeof(struct rvt_rwq) + size * sz,
-                                    dev->dparms.node);
-               if (!wq)
+               if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node,
+                                udata))
                        return -ENOMEM;
-
                /* Check that we can write the offset to mmap. */
                if (udata && udata->inlen >= sizeof(__u64)) {
                        __u64 offset_addr;
@@ -213,14 +206,20 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                                goto bail_free;
                }
 
-               spin_lock_irq(&srq->rq.lock);
+               spin_lock_irq(&srq->rq.kwq->c_lock);
                /*
                 * validate head and tail pointer values and compute
                 * the number of remaining WQEs.
                 */
-               owq = srq->rq.wq;
-               head = owq->head;
-               tail = owq->tail;
+               if (udata) {
+                       owq = srq->rq.wq;
+                       head = RDMA_READ_UAPI_ATOMIC(owq->head);
+                       tail = RDMA_READ_UAPI_ATOMIC(owq->tail);
+               } else {
+                       okwq = srq->rq.kwq;
+                       head = okwq->head;
+                       tail = okwq->tail;
+               }
                if (head >= srq->rq.size || tail >= srq->rq.size) {
                        ret = -EINVAL;
                        goto bail_unlock;
@@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                        goto bail_unlock;
                }
                n = 0;
-               p = wq->wq;
+               p = tmp_rq.kwq->curr_wq;
                while (tail != head) {
                        struct rvt_rwqe *wqe;
                        int i;
@@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                        if (++tail >= srq->rq.size)
                                tail = 0;
                }
-               srq->rq.wq = wq;
+               srq->rq.kwq = tmp_rq.kwq;
+               if (udata) {
+                       srq->rq.wq = tmp_rq.wq;
+                       RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n);
+                       RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0);
+               } else {
+                       tmp_rq.kwq->head = n;
+                       tmp_rq.kwq->tail = 0;
+               }
                srq->rq.size = size;
-               wq->head = n;
-               wq->tail = 0;
                if (attr_mask & IB_SRQ_LIMIT)
                        srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
+               spin_unlock_irq(&srq->rq.kwq->c_lock);
 
                vfree(owq);
+               kvfree(okwq);
 
                if (srq->ip) {
                        struct rvt_mmap_info *ip = srq->ip;
                        struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device);
                        u32 s = sizeof(struct rvt_rwq) + size * sz;
 
-                       rvt_update_mmap_info(dev, ip, s, wq);
+                       rvt_update_mmap_info(dev, ip, s, tmp_rq.wq);
 
                        /*
                         * Return the offset to mmap.
@@ -289,19 +295,19 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                        spin_unlock_irq(&dev->pending_lock);
                }
        } else if (attr_mask & IB_SRQ_LIMIT) {
-               spin_lock_irq(&srq->rq.lock);
+               spin_lock_irq(&srq->rq.kwq->c_lock);
                if (attr->srq_limit >= srq->rq.size)
                        ret = -EINVAL;
                else
                        srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
+               spin_unlock_irq(&srq->rq.kwq->c_lock);
        }
        return ret;
 
 bail_unlock:
-       spin_unlock_irq(&srq->rq.lock);
+       spin_unlock_irq(&srq->rq.kwq->c_lock);
 bail_free:
-       vfree(wq);
+       rvt_free_rq(&tmp_rq);
        return ret;
 }
 
@@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
        spin_unlock(&dev->n_srqs_lock);
        if (srq->ip)
                kref_put(&srq->ip->ref, rvt_release_mmap_info);
-       else
-               vfree(srq->rq.wq);
+       kvfree(srq->rq.kwq);
 }
index 976e482930a3f5e6cc6f4006093088b1f74fc85e..95b8a0e3b8bdb52cdcb7b99665ca4f57bc49cbbe 100644 (file)
@@ -54,6 +54,8 @@
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_mr.h>
 
+#include "mr.h"
+
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_mr
 DECLARE_EVENT_CLASS(
@@ -64,8 +66,12 @@ DECLARE_EVENT_CLASS(
                RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device))
                __field(void *, vaddr)
                __field(struct page *, page)
+               __field(u64, iova)
+               __field(u64, user_base)
                __field(size_t, len)
+               __field(size_t, length)
                __field(u32, lkey)
+               __field(u32, offset)
                __field(u16, m)
                __field(u16, n)
        ),
@@ -73,18 +79,28 @@ DECLARE_EVENT_CLASS(
                RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device));
                __entry->vaddr = v;
                __entry->page = virt_to_page(v);
+               __entry->iova = mr->iova;
+               __entry->user_base = mr->user_base;
+               __entry->lkey = mr->lkey;
                __entry->m = m;
                __entry->n = n;
                __entry->len = len;
+               __entry->length = mr->length;
+               __entry->offset = mr->offset;
        ),
        TP_printk(
-               "[%s] vaddr %p page %p m %u n %u len %ld",
+               "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u",
                __get_str(dev),
-               __entry->vaddr,
+               __entry->lkey,
+               __entry->iova,
+               __entry->user_base,
+               __entry->length,
+               (unsigned long long)__entry->vaddr,
                __entry->page,
                __entry->m,
                __entry->n,
-               __entry->len
+               __entry->len,
+               __entry->offset
        )
 );
 
@@ -165,6 +181,40 @@ DEFINE_EVENT(
        TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
        TP_ARGS(sge, isge));
 
+TRACE_EVENT(
+       rvt_map_mr_sg,
+       TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset),
+       TP_ARGS(ibmr, sg_nents, sg_offset),
+       TP_STRUCT__entry(
+               RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+               __field(u64, iova)
+               __field(u64, ibmr_iova)
+               __field(u64, user_base)
+               __field(u64, ibmr_length)
+               __field(int, sg_nents)
+               __field(uint, sg_offset)
+       ),
+       TP_fast_assign(
+               RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+               __entry->ibmr_iova = ibmr->iova;
+               __entry->iova = to_imr(ibmr)->mr.iova;
+               __entry->user_base = to_imr(ibmr)->mr.user_base;
+               __entry->ibmr_length = to_imr(ibmr)->mr.length;
+               __entry->sg_nents = sg_nents;
+               __entry->sg_offset = sg_offset ? *sg_offset : 0;
+       ),
+       TP_printk(
+               "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u",
+               __get_str(dev),
+               __entry->ibmr_iova,
+               __entry->iova,
+               __entry->user_base,
+               __entry->ibmr_length,
+               __entry->sg_nents,
+               __entry->sg_offset
+       )
+);
+
 #endif /* __RVT_TRACE_MR_H */
 
 #undef TRACE_INCLUDE_PATH
index 9546a837a8ac41dce833d811af9534bce70908c9..18da1e1ea9797c850bbde2d780c7cfc0d0f1ebd9 100644 (file)
@@ -382,6 +382,8 @@ enum {
 };
 
 static const struct ib_device_ops rvt_dev_ops = {
+       .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION,
+
        .alloc_fmr = rvt_alloc_fmr,
        .alloc_mr = rvt_alloc_mr,
        .alloc_pd = rvt_alloc_pd,
@@ -427,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = {
        .unmap_fmr = rvt_unmap_fmr,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
@@ -530,7 +533,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
  *
  * Return: 0 on success otherwise an errno.
  */
-int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
+int rvt_register_device(struct rvt_dev_info *rdi)
 {
        int ret = 0, i;
 
@@ -600,7 +603,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
         * exactly which functions rdmavt supports, nor do they know the ABI
         * version, so we do all of this sort of stuff here.
         */
-       rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION;
        rdi->ibdev.uverbs_cmd_mask =
                (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
                (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
@@ -636,7 +638,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
        if (!rdi->ibdev.num_comp_vectors)
                rdi->ibdev.num_comp_vectors = 1;
 
-       rdi->ibdev.driver_id = driver_id;
        /* We are now good to announce we exist */
        ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
        if (ret) {
index 0675ea6c3872b702d30d677dbc9edcc6083c6d9f..d19ff817c2c716780521468c1a1832de18956e00 100644 (file)
                     fmt, \
                     ##__VA_ARGS__)
 
+#define rvt_pr_err_ratelimited(rdi, fmt, ...) \
+       __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \
+                                rvt_get_ibdev_name(rdi), \
+                                fmt, \
+                                ##__VA_ARGS__)
+
 #define __rvt_pr_info(pdev, name, fmt, ...) \
        dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
@@ -87,6 +93,9 @@
 #define __rvt_pr_err(pdev, name, fmt, ...) \
        dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
 
+#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
+       dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
 static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num)
 {
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
index 00eb99d3df8666b94633ab901433a2e470be27ac..116cafc9afcf601a2a040831668947be785c49ed 100644 (file)
@@ -558,7 +558,7 @@ int rxe_completer(void *arg)
 {
        struct rxe_qp *qp = (struct rxe_qp *)arg;
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
-       struct rxe_send_wqe *wqe = wqe;
+       struct rxe_send_wqe *wqe = NULL;
        struct sk_buff *skb = NULL;
        struct rxe_pkt_info *pkt = NULL;
        enum comp_state state;
index f501f72489d84fba5ad881d862fa5fb1e0ab7d93..ea6a819b716750869087c82f56fe86467579a2dd 100644 (file)
@@ -96,8 +96,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg)
        struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem);
        int i;
 
-       if (mem->umem)
-               ib_umem_release(mem->umem);
+       ib_umem_release(mem->umem);
 
        if (mem->map) {
                for (i = 0; i < mem->num_map; i++)
index 56cf18af016a0db8aa72e417539f18e14920304c..fbcbac52290b4f23b5413510264f9ea319f20786 100644 (file)
@@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
        [RXE_TYPE_CQ] = {
                .name           = "rxe-cq",
                .size           = sizeof(struct rxe_cq),
+               .flags          = RXE_POOL_NO_ALLOC,
                .cleanup        = rxe_cq_cleanup,
        },
        [RXE_TYPE_MR] = {
index aca9f60f9b214460b68ac17e522009fb3752a9e1..1cbfbd98eb221804e9424ecabbe267f6ade8c82f 100644 (file)
@@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
                        qp->resp.va = reth_va(pkt);
                        qp->resp.rkey = reth_rkey(pkt);
                        qp->resp.resid = reth_len(pkt);
+                       qp->resp.length = reth_len(pkt);
                }
                access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
                                                     : IB_ACCESS_REMOTE_WRITE;
@@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
                                pkt->mask & RXE_WRITE_MASK) ?
                                        IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
                wc->vendor_err = 0;
-               wc->byte_len = wqe->dma.length - wqe->dma.resid;
+               wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
+                               pkt->mask & RXE_WRITE_MASK) ?
+                                       qp->resp.length : wqe->dma.length - wqe->dma.resid;
 
                /* fields after byte_len are different between kernel and user
                 * space
index 8c3e2a18cfe4072a7695297365d4241166487a55..4ebdfcf4d33e3800de9e7884f7869659735590cd 100644 (file)
@@ -778,55 +778,43 @@ err1:
        return err;
 }
 
-static struct ib_cq *rxe_create_cq(struct ib_device *dev,
-                                  const struct ib_cq_init_attr *attr,
-                                  struct ib_udata *udata)
+static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+                        struct ib_udata *udata)
 {
        int err;
+       struct ib_device *dev = ibcq->device;
        struct rxe_dev *rxe = to_rdev(dev);
-       struct rxe_cq *cq;
+       struct rxe_cq *cq = to_rcq(ibcq);
        struct rxe_create_cq_resp __user *uresp = NULL;
 
        if (udata) {
                if (udata->outlen < sizeof(*uresp))
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
                uresp = udata->outbuf;
        }
 
        if (attr->flags)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector);
        if (err)
-               goto err1;
-
-       cq = rxe_alloc(&rxe->cq_pool);
-       if (!cq) {
-               err = -ENOMEM;
-               goto err1;
-       }
+               return err;
 
        err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
                               uresp);
        if (err)
-               goto err2;
-
-       return &cq->ibcq;
+               return err;
 
-err2:
-       rxe_drop_ref(cq);
-err1:
-       return ERR_PTR(err);
+       return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem);
 }
 
-static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct rxe_cq *cq = to_rcq(ibcq);
 
        rxe_cq_disable(cq);
 
        rxe_drop_ref(cq);
-       return 0;
 }
 
 static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
@@ -1111,6 +1099,10 @@ static int rxe_enable_driver(struct ib_device *ib_dev)
 }
 
 static const struct ib_device_ops rxe_dev_ops = {
+       .owner = THIS_MODULE,
+       .driver_id = RDMA_DRIVER_RXE,
+       .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
+
        .alloc_hw_stats = rxe_ib_alloc_hw_stats,
        .alloc_mr = rxe_alloc_mr,
        .alloc_pd = rxe_alloc_pd,
@@ -1157,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = {
        .resize_cq = rxe_resize_cq,
 
        INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
@@ -1170,7 +1163,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 
        strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
 
-       dev->owner = THIS_MODULE;
        dev->node_type = RDMA_NODE_IB_CA;
        dev->phys_port_cnt = 1;
        dev->num_comp_vectors = num_possible_cpus();
@@ -1182,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
        dma_coerce_mask_and_coherent(&dev->dev,
                                     dma_get_required_mask(&dev->dev));
 
-       dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
        dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
            | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
            | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
@@ -1230,7 +1221,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
        rxe->tfm = tfm;
 
        rdma_set_device_sysfs_group(dev, &rxe_attr_group);
-       dev->driver_id = RDMA_DRIVER_RXE;
        err = ib_register_device(dev, ibdev_name);
        if (err)
                pr_warn("%s failed with error %d\n", __func__, err);
index e8be7f44e3beb8b12b4297cfb6d1733c1e061d0e..5c4b2239129cc5ce96aa7ad07652c448b02d794c 100644 (file)
@@ -85,8 +85,8 @@ struct rxe_cqe {
 };
 
 struct rxe_cq {
-       struct rxe_pool_entry   pelem;
        struct ib_cq            ibcq;
+       struct rxe_pool_entry   pelem;
        struct rxe_queue        *queue;
        spinlock_t              cq_lock;
        u8                      notify;
@@ -213,6 +213,7 @@ struct rxe_resp_info {
        struct rxe_mem          *mr;
        u32                     resid;
        u32                     rkey;
+       u32                     length;
        u64                     atomic_orig;
 
        /* SRQ only */
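The rxe_create_cq()/rxe_destroy_cq() rework and the struct rxe_cq field reorder above follow the tree-wide move to core-allocated objects: with INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq) the core allocates the driver structure itself, which requires the embedded ib_cq to sit at offset zero, hence ibcq moving ahead of pelem. A minimal sketch of the container_of conversion this pattern relies on (all example_ names are illustrative; rxe's own helper is to_rcq()):

#include <stddef.h>

/* container_of() as used throughout the kernel, expanded here so the
 * sketch is self-contained. */
#define example_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct example_ib_cq { int cqe; };

struct example_rxe_cq {
        struct example_ib_cq ibcq;      /* must stay the first member so the
                                         * core-allocated object converts back
                                         * to the driver structure */
        int notify;
};

static inline struct example_rxe_cq *
example_to_rcq(struct example_ib_cq *ibcq)
{
        return example_container_of(ibcq, struct example_rxe_cq, ibcq);
}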
diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig
new file mode 100644 (file)
index 0000000..dace276
--- /dev/null
@@ -0,0 +1,18 @@
+config RDMA_SIW
+       tristate "Software RDMA over TCP/IP (iWARP) driver"
+       depends on INET && INFINIBAND && LIBCRC32C && 64BIT
+       select DMA_VIRT_OPS
+       help
+       This driver implements the iWARP RDMA transport over
+       the Linux TCP/IP network stack. It enables a system with a
+       standard Ethernet adapter to interoperate with an iWARP
+       adapter or with another system running the SIW driver.
+       (See also RXE, which is a similar software driver for RoCE.)
+
+       The driver interfaces with the Linux RDMA stack and
+       implements both a kernel and user space RDMA verbs API.
+       The user space verbs API requires a support
+       library named libsiw which is loaded by the generic user
+       space verbs API, libibverbs. To implement RDMA over
+       TCP/IP, the driver further interfaces with the Linux
+       in-kernel TCP socket layer.
diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile
new file mode 100644 (file)
index 0000000..f5f7e38
--- /dev/null
@@ -0,0 +1,11 @@
+obj-$(CONFIG_RDMA_SIW) += siw.o
+
+siw-y := \
+       siw_cm.o \
+       siw_cq.o \
+       siw_main.o \
+       siw_mem.o \
+       siw_qp.o \
+       siw_qp_tx.o \
+       siw_qp_rx.o \
+       siw_verbs.o
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
new file mode 100644 (file)
index 0000000..e8a04d9
--- /dev/null
@@ -0,0 +1,380 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define RDMAP_VERSION 1
+#define DDP_VERSION 1
+#define MPA_REVISION_1 1
+#define MPA_REVISION_2 2
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_IRD_ORD_MASK 0x3fff
+
+struct mpa_rr_params {
+       __be16 bits;
+       __be16 pd_len;
+};
+
+/*
+ * MPA request/response header bits & fields
+ */
+enum {
+       MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000),
+       MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
+       MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
+       MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
+       MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
+       MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+       __u8 key[16];
+       struct mpa_rr_params params;
+};
+
+static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+       *bits = (*bits & ~MPA_RR_MASK_REVISION) |
+               (cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+       __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+       return be16_to_cpu(rev);
+}
+
+enum mpa_v2_ctrl {
+       MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000),
+       MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000),
+       MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000),
+       MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000),
+       MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000),
+       MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff)
+};
+
+struct mpa_v2_data {
+       __be16 ird;
+       __be16 ord;
+};
+
+struct mpa_marker {
+       __be16 rsvd;
+       __be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+       __u8 pad[4];
+       __be32 crc;
+};
+
+#define MPA_HDR_SIZE 2
+#define MPA_CRC_SIZE 4
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+       __be16 mpa_len;
+       __be16 ddp_rdmap_ctrl;
+};
+
+/*
+ * DDP/RDMAP Hdr bits & fields
+ */
+enum {
+       DDP_FLAG_TAGGED = cpu_to_be16(0x8000),
+       DDP_FLAG_LAST = cpu_to_be16(0x4000),
+       DDP_MASK_RESERVED = cpu_to_be16(0x3C00),
+       DDP_MASK_VERSION = cpu_to_be16(0x0300),
+       RDMAP_MASK_VERSION = cpu_to_be16(0x00C0),
+       RDMAP_MASK_RESERVED = cpu_to_be16(0x0030),
+       RDMAP_MASK_OPCODE = cpu_to_be16(0x000f)
+};
+
+static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
+{
+       return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
+}
+
+static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+       ctrl->ddp_rdmap_ctrl =
+               (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) |
+               (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
+{
+       __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
+
+       return be16_to_cpu(ver) >> 6;
+}
+
+static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+       ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) |
+                              (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
+{
+       return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
+}
+
+static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode)
+{
+       ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) |
+                              (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE);
+}
+
+struct iwarp_rdma_write {
+       struct iwarp_ctrl ctrl;
+       __be32 sink_stag;
+       __be64 sink_to;
+};
+
+struct iwarp_rdma_rreq {
+       struct iwarp_ctrl ctrl;
+       __be32 rsvd;
+       __be32 ddp_qn;
+       __be32 ddp_msn;
+       __be32 ddp_mo;
+       __be32 sink_stag;
+       __be64 sink_to;
+       __be32 read_size;
+       __be32 source_stag;
+       __be64 source_to;
+};
+
+struct iwarp_rdma_rresp {
+       struct iwarp_ctrl ctrl;
+       __be32 sink_stag;
+       __be64 sink_to;
+};
+
+struct iwarp_send {
+       struct iwarp_ctrl ctrl;
+       __be32 rsvd;
+       __be32 ddp_qn;
+       __be32 ddp_msn;
+       __be32 ddp_mo;
+};
+
+struct iwarp_send_inv {
+       struct iwarp_ctrl ctrl;
+       __be32 inval_stag;
+       __be32 ddp_qn;
+       __be32 ddp_msn;
+       __be32 ddp_mo;
+};
+
+struct iwarp_terminate {
+       struct iwarp_ctrl ctrl;
+       __be32 rsvd;
+       __be32 ddp_qn;
+       __be32 ddp_msn;
+       __be32 ddp_mo;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __be32 layer : 4;
+       __be32 etype : 4;
+       __be32 ecode : 8;
+       __be32 flag_m : 1;
+       __be32 flag_d : 1;
+       __be32 flag_r : 1;
+       __be32 reserved : 13;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __be32 reserved : 13;
+       __be32 flag_r : 1;
+       __be32 flag_d : 1;
+       __be32 flag_m : 1;
+       __be32 ecode : 8;
+       __be32 etype : 4;
+       __be32 layer : 4;
+#else
+#error "undefined byte order"
+#endif
+};
+
+/*
+ * Terminate Hdr bits & fields
+ */
+enum {
+       TERM_MASK_LAYER = cpu_to_be32(0xf0000000),
+       TERM_MASK_ETYPE = cpu_to_be32(0x0f000000),
+       TERM_MASK_ECODE = cpu_to_be32(0x00ff0000),
+       TERM_FLAG_M = cpu_to_be32(0x00008000),
+       TERM_FLAG_D = cpu_to_be32(0x00004000),
+       TERM_FLAG_R = cpu_to_be32(0x00002000),
+       TERM_MASK_RESVD = cpu_to_be32(0x00001fff)
+};
+
+static inline u8 __rdmap_term_layer(struct iwarp_terminate *term)
+{
+       return term->layer;
+}
+
+static inline void __rdmap_term_set_layer(struct iwarp_terminate *term,
+                                         u8 layer)
+{
+       term->layer = layer & 0xf;
+}
+
+static inline u8 __rdmap_term_etype(struct iwarp_terminate *term)
+{
+       return term->etype;
+}
+
+static inline void __rdmap_term_set_etype(struct iwarp_terminate *term,
+                                         u8 etype)
+{
+       term->etype = etype & 0xf;
+}
+
+static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term)
+{
+       return term->ecode;
+}
+
+static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term,
+                                         u8 ecode)
+{
+       term->ecode = ecode;
+}
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+       struct iwarp_ctrl ctrl;
+       __be32 rsvd;
+       __be32 ddp_qn;
+       __be32 ddp_msn;
+       __be32 ddp_mo;
+};
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+       struct iwarp_ctrl ctrl;
+       __be32 ddp_stag;
+       __be64 ddp_to;
+};
+
+union iwarp_hdr {
+       struct iwarp_ctrl ctrl;
+       struct iwarp_ctrl_untagged c_untagged;
+       struct iwarp_ctrl_tagged c_tagged;
+       struct iwarp_rdma_write rwrite;
+       struct iwarp_rdma_rreq rreq;
+       struct iwarp_rdma_rresp rresp;
+       struct iwarp_terminate terminate;
+       struct iwarp_send send;
+       struct iwarp_send_inv send_inv;
+};
+
+enum term_elayer {
+       TERM_ERROR_LAYER_RDMAP = 0x00,
+       TERM_ERROR_LAYER_DDP = 0x01,
+       TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */
+};
+
+enum ddp_etype {
+       DDP_ETYPE_CATASTROPHIC = 0x0,
+       DDP_ETYPE_TAGGED_BUF = 0x1,
+       DDP_ETYPE_UNTAGGED_BUF = 0x2,
+       DDP_ETYPE_RSVD = 0x3
+};
+
+enum ddp_ecode {
+       /* unspecified, set to zero */
+       DDP_ECODE_CATASTROPHIC = 0x00,
+       /* Tagged Buffer Errors */
+       DDP_ECODE_T_INVALID_STAG = 0x00,
+       DDP_ECODE_T_BASE_BOUNDS = 0x01,
+       DDP_ECODE_T_STAG_NOT_ASSOC = 0x02,
+       DDP_ECODE_T_TO_WRAP = 0x03,
+       DDP_ECODE_T_VERSION = 0x04,
+       /* Untagged Buffer Errors */
+       DDP_ECODE_UT_INVALID_QN = 0x01,
+       DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02,
+       DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03,
+       DDP_ECODE_UT_INVALID_MO = 0x04,
+       DDP_ECODE_UT_MSG_TOOLONG = 0x05,
+       DDP_ECODE_UT_VERSION = 0x06
+};
+
+enum rdmap_untagged_qn {
+       RDMAP_UNTAGGED_QN_SEND = 0,
+       RDMAP_UNTAGGED_QN_RDMA_READ = 1,
+       RDMAP_UNTAGGED_QN_TERMINATE = 2,
+       RDMAP_UNTAGGED_QN_COUNT = 3
+};
+
+enum rdmap_etype {
+       RDMAP_ETYPE_CATASTROPHIC = 0x0,
+       RDMAP_ETYPE_REMOTE_PROTECTION = 0x1,
+       RDMAP_ETYPE_REMOTE_OPERATION = 0x2
+};
+
+enum rdmap_ecode {
+       RDMAP_ECODE_INVALID_STAG = 0x00,
+       RDMAP_ECODE_BASE_BOUNDS = 0x01,
+       RDMAP_ECODE_ACCESS_RIGHTS = 0x02,
+       RDMAP_ECODE_STAG_NOT_ASSOC = 0x03,
+       RDMAP_ECODE_TO_WRAP = 0x04,
+       RDMAP_ECODE_VERSION = 0x05,
+       RDMAP_ECODE_OPCODE = 0x06,
+       RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07,
+       RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08,
+       RDMAP_ECODE_CANNOT_INVALIDATE = 0x09,
+       RDMAP_ECODE_UNSPECIFIED = 0xff
+};
+
+enum llp_ecode {
+       LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */
+       LLP_ECODE_RECEIVED_CRC = 0x02,
+       LLP_ECODE_FPDU_START = 0x03,
+       LLP_ECODE_INVALID_REQ_RESP = 0x04,
+
+       /* Errors for Enhanced Connection Establishment only */
+       LLP_ECODE_LOCAL_CATASTROPHIC = 0x05,
+       LLP_ECODE_INSUFFICIENT_IRD = 0x06,
+       LLP_ECODE_NO_MATCHING_RTR = 0x07
+};
+
+enum llp_etype { LLP_ETYPE_MPA = 0x00 };
+
+enum rdma_opcode {
+       RDMAP_RDMA_WRITE = 0x0,
+       RDMAP_RDMA_READ_REQ = 0x1,
+       RDMAP_RDMA_READ_RESP = 0x2,
+       RDMAP_SEND = 0x3,
+       RDMAP_SEND_INVAL = 0x4,
+       RDMAP_SEND_SE = 0x5,
+       RDMAP_SEND_SE_INVAL = 0x6,
+       RDMAP_TERMINATE = 0x7,
+       RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1
+};
+
+#endif
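The accessors above pack the DDP/RDMAP control bits into the big-endian ddp_rdmap_ctrl word. A small usage sketch for building the common control word of an outgoing untagged SEND FPDU (the function name is illustrative; siw's real header setup lives in its transmit path, added later in the siw sources):

/* Illustrative only: initialize the generic control word of a SEND FPDU
 * using the helpers and constants defined in iwarp.h above. */
static void example_init_send_ctrl(struct iwarp_send *hdr)
{
        hdr->ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST;       /* untagged, last DDP segment */
        __ddp_set_version(&hdr->ctrl, DDP_VERSION);
        __rdmap_set_version(&hdr->ctrl, RDMAP_VERSION);
        __rdmap_set_opcode(&hdr->ctrl, RDMAP_SEND);
}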
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
new file mode 100644 (file)
index 0000000..03fd7b2
--- /dev/null
@@ -0,0 +1,745 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+
+#include <rdma/siw-abi.h>
+#include "iwarp.h"
+
+#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */
+#define SIW_VENDORT_PART_ID 0
+#define SIW_MAX_QP (1024 * 100)
+#define SIW_MAX_QP_WR (1024 * 32)
+#define SIW_MAX_ORD_QP 128
+#define SIW_MAX_IRD_QP 128
+#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */
+#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */
+#define SIW_MAX_CQ (1024 * 100)
+#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR (SIW_MAX_QP * 10)
+#define SIW_MAX_PD SIW_MAX_QP
+#define SIW_MAX_MW 0 /* to be set if MW's are supported */
+#define SIW_MAX_FMR SIW_MAX_MR
+#define SIW_MAX_SRQ SIW_MAX_QP
+#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10)
+#define SIW_MAX_CONTEXT SIW_MAX_PD
+
+/* Min number of bytes for using zero copy transmit */
+#define SENDPAGE_THRESH PAGE_SIZE
+
+/* Maximum number of frames which can be sent in one SQ processing cycle */
+#define SQ_USER_MAXBURST 100
+
+/* Maximum number of consecutive IRQ elements which get served
+ * if the SQ has pending work. Prevents starving local SQ processing
+ * by serving peer Read Requests.
+ */
+#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4
+
+struct siw_dev_cap {
+       int max_qp;
+       int max_qp_wr;
+       int max_ord; /* max. outbound read queue depth */
+       int max_ird; /* max. inbound read queue depth */
+       int max_sge;
+       int max_sge_rd;
+       int max_cq;
+       int max_cqe;
+       int max_mr;
+       int max_pd;
+       int max_mw;
+       int max_fmr;
+       int max_srq;
+       int max_srq_wr;
+       int max_srq_sge;
+};
+
+struct siw_pd {
+       struct ib_pd base_pd;
+};
+
+struct siw_device {
+       struct ib_device base_dev;
+       struct net_device *netdev;
+       struct siw_dev_cap attrs;
+
+       u32 vendor_part_id;
+       int numa_node;
+
+       /* physical port state (only one port per device) */
+       enum ib_port_state state;
+
+       spinlock_t lock;
+
+       struct xarray qp_xa;
+       struct xarray mem_xa;
+
+       struct list_head cep_list;
+       struct list_head qp_list;
+
+       /* active objects statistics to enforce limits */
+       atomic_t num_qp;
+       atomic_t num_cq;
+       atomic_t num_pd;
+       atomic_t num_mr;
+       atomic_t num_srq;
+       atomic_t num_ctx;
+
+       struct work_struct netdev_down;
+};
+
+struct siw_uobj {
+       void *addr;
+       u32 size;
+};
+
+struct siw_ucontext {
+       struct ib_ucontext base_ucontext;
+       struct siw_device *sdev;
+
+       /* xarray of user mappable objects */
+       struct xarray xa;
+       u32 uobj_nextkey;
+};
+
+/*
+ * The RDMA core does not define LOCAL_READ access, which is always
+ * enabled implicitly.
+ */
+#define IWARP_ACCESS_MASK                                      \
+       (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |       \
+        IB_ACCESS_REMOTE_READ)
+
+/*
+ * siw presentation of user memory registered as source
+ * or target of RDMA operations.
+ */
+
+struct siw_page_chunk {
+       struct page **plist;
+};
+
+struct siw_umem {
+       struct siw_page_chunk *page_chunk;
+       int num_pages;
+       bool writable;
+       u64 fp_addr; /* First page base address */
+       struct mm_struct *owning_mm;
+};
+
+struct siw_pble {
+       u64 addr; /* Address of assigned user buffer */
+       u64 size; /* Size of this entry */
+       u64 pbl_off; /* Total offset from start of PBL */
+};
+
+struct siw_pbl {
+       unsigned int num_buf;
+       unsigned int max_buf;
+       struct siw_pble pbe[1];
+};
+
+struct siw_mr;
+
+/*
+ * Generic memory representation for registered siw memory.
+ * Memory lookup always via higher 24 bit of STag (STag index).
+ */
+struct siw_mem {
+       struct siw_device *sdev;
+       struct kref ref;
+       u64 va; /* VA of memory */
+       u64 len; /* length of the memory buffer in bytes */
+       u32 stag; /* iWarp memory access steering tag */
+       u8 stag_valid; /* VALID or INVALID */
+       u8 is_pbl; /* PBL or user space mem */
+       u8 is_mw; /* Memory Region or Memory Window */
+       enum ib_access_flags perms; /* local/remote READ & WRITE */
+       union {
+               struct siw_umem *umem;
+               struct siw_pbl *pbl;
+               void *mem_obj;
+       };
+       struct ib_pd *pd;
+};
+
+struct siw_mr {
+       struct ib_mr base_mr;
+       struct siw_mem *mem;
+       struct rcu_head rcu;
+};
+
+/*
+ * Error codes for local or remote
+ * access to registered memory
+ */
+enum siw_access_state {
+       E_ACCESS_OK,
+       E_STAG_INVALID,
+       E_BASE_BOUNDS,
+       E_ACCESS_PERM,
+       E_PD_MISMATCH
+};
+
+enum siw_wr_state {
+       SIW_WR_IDLE,
+       SIW_WR_QUEUED, /* processing has not started yet */
+       SIW_WR_INPROGRESS /* initiated processing of the WR */
+};
+
+/* The WQE currently being processed (RX or TX) */
+struct siw_wqe {
+       /* Copy of applications SQE or RQE */
+       union {
+               struct siw_sqe sqe;
+               struct siw_rqe rqe;
+       };
+       struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */
+       enum siw_wr_state wr_status;
+       enum siw_wc_status wc_status;
+       u32 bytes; /* total bytes to process */
+       u32 processed; /* bytes processed */
+};
+
+struct siw_cq {
+       struct ib_cq base_cq;
+       spinlock_t lock;
+       u64 *notify;
+       struct siw_cqe *queue;
+       u32 cq_put;
+       u32 cq_get;
+       u32 num_cqe;
+       bool kernel_verbs;
+       u32 xa_cq_index; /* mmap information for CQE array */
+       u32 id; /* For debugging only */
+};
+
+enum siw_qp_state {
+       SIW_QP_STATE_IDLE,
+       SIW_QP_STATE_RTR,
+       SIW_QP_STATE_RTS,
+       SIW_QP_STATE_CLOSING,
+       SIW_QP_STATE_TERMINATE,
+       SIW_QP_STATE_ERROR,
+       SIW_QP_STATE_COUNT
+};
+
+enum siw_qp_flags {
+       SIW_RDMA_BIND_ENABLED = (1 << 0),
+       SIW_RDMA_WRITE_ENABLED = (1 << 1),
+       SIW_RDMA_READ_ENABLED = (1 << 2),
+       SIW_SIGNAL_ALL_WR = (1 << 3),
+       SIW_MPA_CRC = (1 << 4),
+       SIW_QP_IN_DESTROY = (1 << 5)
+};
+
+enum siw_qp_attr_mask {
+       SIW_QP_ATTR_STATE = (1 << 0),
+       SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1),
+       SIW_QP_ATTR_LLP_HANDLE = (1 << 2),
+       SIW_QP_ATTR_ORD = (1 << 3),
+       SIW_QP_ATTR_IRD = (1 << 4),
+       SIW_QP_ATTR_SQ_SIZE = (1 << 5),
+       SIW_QP_ATTR_RQ_SIZE = (1 << 6),
+       SIW_QP_ATTR_MPA = (1 << 7)
+};
+
+struct siw_srq {
+       struct ib_srq base_srq;
+       spinlock_t lock;
+       u32 max_sge;
+       u32 limit; /* low watermark for async event */
+       struct siw_rqe *recvq;
+       u32 rq_put;
+       u32 rq_get;
+       u32 num_rqe; /* max # of wqe's allowed */
+       u32 xa_srq_index; /* mmap information for SRQ array */
+       char armed; /* inform user if limit hit */
+       char kernel_verbs; /* '1' if kernel client */
+};
+
+struct siw_qp_attrs {
+       enum siw_qp_state state;
+       u32 sq_size;
+       u32 rq_size;
+       u32 orq_size;
+       u32 irq_size;
+       u32 sq_max_sges;
+       u32 rq_max_sges;
+       enum siw_qp_flags flags;
+
+       struct socket *sk;
+};
+
+enum siw_tx_ctx {
+       SIW_SEND_HDR, /* start or continue sending HDR */
+       SIW_SEND_DATA, /* start or continue sending DDP payload */
+       SIW_SEND_TRAILER, /* start or continue sending TRAILER */
+       SIW_SEND_SHORT_FPDU /* send whole FPDU hdr|data|trailer at once */
+};
+
+enum siw_rx_state {
+       SIW_GET_HDR, /* await new hdr or within hdr */
+       SIW_GET_DATA_START, /* start of inbound DDP payload */
+       SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */
+       SIW_GET_TRAILER /* await new trailer or within trailer */
+};
+
+struct siw_rx_stream {
+       struct sk_buff *skb;
+       int skb_new; /* pending unread bytes in skb */
+       int skb_offset; /* offset in skb */
+       int skb_copied; /* processed bytes in skb */
+
+       union iwarp_hdr hdr;
+       struct mpa_trailer trailer;
+
+       enum siw_rx_state state;
+
+       /*
+        * For each FPDU, main RX loop runs through 3 stages:
+        * Receiving protocol headers, placing DDP payload and receiving
+        * trailer information (CRC + possibly padding).
+        * Next two variables keep state on receive status of the
+        * current FPDU part (hdr, data, trailer).
+        */
+       int fpdu_part_rcvd; /* bytes in pkt part copied */
+       int fpdu_part_rem; /* bytes in pkt part not seen */
+
+       /*
+        * Next expected DDP MSN for each QN +
+        * expected steering tag +
+        * expected DDP tagged offset (all in host byte order)
+        */
+       u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+       u32 ddp_stag;
+       u64 ddp_to;
+       u32 inval_stag; /* Stag to be invalidated */
+
+       struct shash_desc *mpa_crc_hd;
+       u8 rx_suspend : 1;
+       u8 pad : 2; /* # of pad bytes expected */
+       u8 rdmap_op : 4; /* opcode of current frame */
+};
+
+struct siw_rx_fpdu {
+       /*
+        * Local destination memory of inbound RDMA operation.
+        * Valid, according to wqe->wr_status
+        */
+       struct siw_wqe wqe_active;
+
+       unsigned int pbl_idx; /* Index into current PBL */
+       unsigned int sge_idx; /* current sge in rx */
+       unsigned int sge_off; /* already rcvd in curr. sge */
+
+       char first_ddp_seg; /* this is the first DDP seg */
+       char more_ddp_segs; /* more DDP segs expected */
+       u8 prev_rdmap_op : 4; /* opcode of prev frame */
+};
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficiently.
+ */
+struct siw_send_pkt {
+       struct iwarp_send send;
+       __be32 crc;
+};
+
+struct siw_write_pkt {
+       struct iwarp_rdma_write write;
+       __be32 crc;
+};
+
+struct siw_rreq_pkt {
+       struct iwarp_rdma_rreq rreq;
+       __be32 crc;
+};
+
+struct siw_rresp_pkt {
+       struct iwarp_rdma_rresp rresp;
+       __be32 crc;
+};
+
+struct siw_iwarp_tx {
+       union {
+               union iwarp_hdr hdr;
+
+               /* Generic part of FPDU header */
+               struct iwarp_ctrl ctrl;
+               struct iwarp_ctrl_untagged c_untagged;
+               struct iwarp_ctrl_tagged c_tagged;
+
+               /* FPDU headers */
+               struct iwarp_rdma_write rwrite;
+               struct iwarp_rdma_rreq rreq;
+               struct iwarp_rdma_rresp rresp;
+               struct iwarp_terminate terminate;
+               struct iwarp_send send;
+               struct iwarp_send_inv send_inv;
+
+               /* complete short FPDUs */
+               struct siw_send_pkt send_pkt;
+               struct siw_write_pkt write_pkt;
+               struct siw_rreq_pkt rreq_pkt;
+               struct siw_rresp_pkt rresp_pkt;
+       } pkt;
+
+       struct mpa_trailer trailer;
+       /* DDP MSN for untagged messages */
+       u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+       enum siw_tx_ctx state;
+       u16 ctrl_len; /* ddp+rdmap hdr */
+       u16 ctrl_sent;
+       int burst;
+       int bytes_unsent; /* ddp payload bytes */
+
+       struct shash_desc *mpa_crc_hd;
+
+       u8 do_crc : 1; /* do crc for segment */
+       u8 use_sendpage : 1; /* send w/o copy */
+       u8 tx_suspend : 1; /* stop sending DDP segs. */
+       u8 pad : 2; /* # pad in current fpdu */
+       u8 orq_fence : 1; /* ORQ full or Send fenced */
+       u8 in_syscall : 1; /* TX out of user context */
+       u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
+       u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+
+       u16 fpdu_len; /* len of FPDU to tx */
+       unsigned int tcp_seglen; /* remaining tcp seg space */
+
+       struct siw_wqe wqe_active;
+
+       int pbl_idx; /* Index into current PBL */
+       int sge_idx; /* current sge in tx */
+       u32 sge_off; /* already sent in curr. sge */
+};
+
+struct siw_qp {
+       struct siw_device *sdev;
+       struct ib_qp *ib_qp;
+       struct kref ref;
+       u32 qp_num;
+       struct list_head devq;
+       int tx_cpu;
+       bool kernel_verbs;
+       struct siw_qp_attrs attrs;
+
+       struct siw_cep *cep;
+       struct rw_semaphore state_lock;
+
+       struct ib_pd *pd;
+       struct siw_cq *scq;
+       struct siw_cq *rcq;
+       struct siw_srq *srq;
+
+       struct siw_iwarp_tx tx_ctx; /* Transmit context */
+       spinlock_t sq_lock;
+       struct siw_sqe *sendq; /* send queue element array */
+       uint32_t sq_get; /* consumer index into sq array */
+       uint32_t sq_put; /* kernel prod. index into sq array */
+       struct llist_node tx_list;
+
+       struct siw_sqe *orq; /* outbound read queue element array */
+       spinlock_t orq_lock;
+       uint32_t orq_get; /* consumer index into orq array */
+       uint32_t orq_put; /* shared producer index for ORQ */
+
+       struct siw_rx_stream rx_stream;
+       struct siw_rx_fpdu *rx_fpdu;
+       struct siw_rx_fpdu rx_tagged;
+       struct siw_rx_fpdu rx_untagged;
+       spinlock_t rq_lock;
+       struct siw_rqe *recvq; /* recv queue element array */
+       uint32_t rq_get; /* consumer index into rq array */
+       uint32_t rq_put; /* kernel prod. index into rq array */
+
+       struct siw_sqe *irq; /* inbound read queue element array */
+       uint32_t irq_get; /* consumer index into irq array */
+       uint32_t irq_put; /* producer index into irq array */
+       int irq_burst;
+
+       struct { /* information to be carried in TERMINATE pkt, if valid */
+               u8 valid;
+               u8 in_tx;
+               u8 layer : 4, etype : 4;
+               u8 ecode;
+       } term_info;
+       u32 xa_sq_index; /* mmap information for SQE array */
+       u32 xa_rq_index; /* mmap information for RQE array */
+       struct rcu_head rcu;
+};
+
+struct siw_base_qp {
+       struct ib_qp base_qp;
+       struct siw_qp *qp;
+};
+
+/* helper macros */
+#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream)
+#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx)
+#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active)
+#define rx_wqe(rctx) (&(rctx)->wqe_active)
+#define rx_mem(rctx) ((rctx)->wqe_active.mem[0])
+#define tx_type(wqe) ((wqe)->sqe.opcode)
+#define rx_type(wqe) ((wqe)->rqe.opcode)
+#define tx_flags(wqe) ((wqe)->sqe.flags)
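+
+/*
+ * Usage sketch (illustrative only): the currently active TX WQE and
+ * its opcode are typically accessed as
+ *     struct siw_wqe *wqe = tx_wqe(qp);
+ *     if (tx_type(wqe) == SIW_OP_READ) ...
+ */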
+
+struct iwarp_msg_info {
+       int hdr_len;
+       struct iwarp_ctrl ctrl;
+       int (*rx_data)(struct siw_qp *qp);
+};
+
+/* Global siw parameters. Currently set in siw_main.c */
+extern const bool zcopy_tx;
+extern const bool try_gso;
+extern const bool loopback_enabled;
+extern const bool mpa_crc_required;
+extern const bool mpa_crc_strict;
+extern const bool siw_tcp_nagle;
+extern u_char mpa_version;
+extern const bool peer_to_peer;
+extern struct task_struct *siw_tx_thread[];
+
+extern struct crypto_shash *siw_crypto_shash;
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+/* QP general functions */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr,
+                 enum siw_qp_attr_mask mask);
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl);
+void siw_qp_llp_close(struct siw_qp *qp);
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule);
+void siw_send_terminate(struct siw_qp *qp);
+
+void siw_qp_get_ref(struct ib_qp *qp);
+void siw_qp_put_ref(struct ib_qp *qp);
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp);
+void siw_free_qp(struct kref *ref);
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+                       u8 etype, u8 ecode, int in_tx);
+enum ddp_ecode siw_tagged_error(enum siw_access_state state);
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state);
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe);
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+                    enum siw_wc_status status);
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+                    u32 inval_stag, enum siw_wc_status status);
+void siw_qp_llp_data_ready(struct sock *sk);
+void siw_qp_llp_write_space(struct sock *sk);
+
+/* QP TX path functions */
+int siw_run_sq(void *arg);
+int siw_qp_sq_process(struct siw_qp *qp);
+int siw_sq_start(struct siw_qp *qp);
+int siw_activate_tx(struct siw_qp *qp);
+void siw_stop_tx_thread(int nr_cpu);
+int siw_get_tx_cpu(struct siw_device *sdev);
+void siw_put_tx_cpu(int cpu);
+
+/* QP RX path functions */
+int siw_proc_send(struct siw_qp *qp);
+int siw_proc_rreq(struct siw_qp *qp);
+int siw_proc_rresp(struct siw_qp *qp);
+int siw_proc_write(struct siw_qp *qp);
+int siw_proc_terminate(struct siw_qp *qp);
+
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+                   unsigned int off, size_t len);
+
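+/*
+ * Select the FPDU receive context for the given RDMAP opcode:
+ * RDMA WRITE and READ RESPONSE carry tagged DDP payload and use the
+ * tagged context, all other opcodes use the untagged context.
+ */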
+static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode)
+{
+       if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP)
+               qp->rx_fpdu = &qp->rx_tagged;
+       else
+               qp->rx_fpdu = &qp->rx_untagged;
+
+       qp->rx_stream.rdmap_op = opcode;
+}
+
+static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx)
+{
+       return container_of(base_ctx, struct siw_ucontext, base_ucontext);
+}
+
+static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp)
+{
+       return container_of(base_qp, struct siw_base_qp, base_qp);
+}
+
+static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp)
+{
+       return to_siw_base_qp(base_qp)->qp;
+}
+
+static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq)
+{
+       return container_of(base_cq, struct siw_cq, base_cq);
+}
+
+static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq)
+{
+       return container_of(base_srq, struct siw_srq, base_srq);
+}
+
+static inline struct siw_device *to_siw_dev(struct ib_device *base_dev)
+{
+       return container_of(base_dev, struct siw_device, base_dev);
+}
+
+static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr)
+{
+       return container_of(base_mr, struct siw_mr, base_mr);
+}
+
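+/*
+ * Look up a QP by ID in the per-device xarray. Taking the reference
+ * via kref_get_unless_zero() under rcu_read_lock() guards against
+ * returning a QP whose last reference is concurrently being dropped.
+ */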
+static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id)
+{
+       struct siw_qp *qp;
+
+       rcu_read_lock();
+       qp = xa_load(&sdev->qp_xa, id);
+       if (likely(qp && kref_get_unless_zero(&qp->ref))) {
+               rcu_read_unlock();
+               return qp;
+       }
+       rcu_read_unlock();
+       return NULL;
+}
+
+static inline u32 qp_id(struct siw_qp *qp)
+{
+       return qp->qp_num;
+}
+
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+       kref_get(&qp->ref);
+}
+
+static inline void siw_qp_put(struct siw_qp *qp)
+{
+       kref_put(&qp->ref, siw_free_qp);
+}
+
+static inline int siw_sq_empty(struct siw_qp *qp)
+{
+       struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+       return READ_ONCE(sqe->flags) == 0;
+}
+
+static inline struct siw_sqe *sq_get_next(struct siw_qp *qp)
+{
+       struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+       if (READ_ONCE(sqe->flags) & SIW_WQE_VALID)
+               return sqe;
+
+       return NULL;
+}
+
+static inline struct siw_sqe *orq_get_current(struct siw_qp *qp)
+{
+       return &qp->orq[qp->orq_get % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp)
+{
+       return &qp->orq[qp->orq_put % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
+{
+       struct siw_sqe *orq_e = orq_get_tail(qp);
+
+       if (orq_e && READ_ONCE(orq_e->flags) == 0)
+               return orq_e;
+
+       return NULL;
+}
+
+static inline int siw_orq_empty(struct siw_qp *qp)
+{
+       return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0;
+}
+
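+/*
+ * Reserve the next free slot of the inbound read queue (IRQ),
+ * advancing the producer index, or return NULL if the queue is full.
+ */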
+static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp)
+{
+       struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size];
+
+       if (READ_ONCE(irq_e->flags) == 0) {
+               qp->irq_put++;
+               return irq_e;
+       }
+       return NULL;
+}
+
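+/*
+ * CRC32C helpers plugged into __skb_checksum() via skb_checksum_ops,
+ * which allows computing the MPA CRC while walking possibly
+ * non-linear skb data (see siw_crc_skb() below).
+ */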
+static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum)
+{
+       return (__force __wsum)crc32c((__force __u32)sum, buff, len);
+}
+
+static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset,
+                                     int len)
+{
+       return (__force __wsum)__crc32c_le_combine((__force __u32)csum,
+                                                  (__force __u32)csum2, len);
+}
+
+static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
+{
+       const struct skb_checksum_ops siw_cs_ops = {
+               .update = siw_csum_update,
+               .combine = siw_csum_combine,
+       };
+       __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd);
+
+       crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc,
+                            &siw_cs_ops);
+       *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc;
+}
+
+#define siw_dbg(ibdev, fmt, ...)                                               \
+       ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_qp(qp, fmt, ...)                                               \
+       ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \
+                 ##__VA_ARGS__)
+
+#define siw_dbg_cq(cq, fmt, ...)                                               \
+       ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__,     \
+                 ##__VA_ARGS__)
+
+#define siw_dbg_pd(pd, fmt, ...)                                               \
+       ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__,         \
+                 ##__VA_ARGS__)
+
+#define siw_dbg_mem(mem, fmt, ...)                                             \
+       ibdev_dbg(&mem->sdev->base_dev,                                        \
+                 "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_cep(cep, fmt, ...)                                             \
+       ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt,                  \
+                 cep, __func__, ##__VA_ARGS__)
+
+void siw_cq_flush(struct siw_cq *cq);
+void siw_sq_flush(struct siw_qp *qp);
+void siw_rq_flush(struct siw_qp *qp);
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
new file mode 100644 (file)
index 0000000..a7cde98
--- /dev/null
@@ -0,0 +1,2070 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Fredy Neeser */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * Set to any combination of
+ * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
+ */
+static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
+static const bool relaxed_ird_negotiation = true;
+
+static void siw_cm_llp_state_change(struct sock *s);
+static void siw_cm_llp_data_ready(struct sock *s);
+static void siw_cm_llp_write_space(struct sock *s);
+static void siw_cm_llp_error_report(struct sock *s);
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+                        int status);
+
+static void siw_sk_assign_cm_upcalls(struct sock *sk)
+{
+       write_lock_bh(&sk->sk_callback_lock);
+       sk->sk_state_change = siw_cm_llp_state_change;
+       sk->sk_data_ready = siw_cm_llp_data_ready;
+       sk->sk_write_space = siw_cm_llp_write_space;
+       sk->sk_error_report = siw_cm_llp_error_report;
+       write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_save_upcalls(struct sock *sk)
+{
+       struct siw_cep *cep = sk_to_cep(sk);
+
+       write_lock_bh(&sk->sk_callback_lock);
+       cep->sk_state_change = sk->sk_state_change;
+       cep->sk_data_ready = sk->sk_data_ready;
+       cep->sk_write_space = sk->sk_write_space;
+       cep->sk_error_report = sk->sk_error_report;
+       write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
+{
+       sk->sk_state_change = cep->sk_state_change;
+       sk->sk_data_ready = cep->sk_data_ready;
+       sk->sk_write_space = cep->sk_write_space;
+       sk->sk_error_report = cep->sk_error_report;
+       sk->sk_user_data = NULL;
+}
+
+static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
+{
+       struct socket *s = cep->sock;
+       struct sock *sk = s->sk;
+
+       write_lock_bh(&sk->sk_callback_lock);
+
+       qp->attrs.sk = s;
+       sk->sk_data_ready = siw_qp_llp_data_ready;
+       sk->sk_write_space = siw_qp_llp_write_space;
+
+       write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_socket_disassoc(struct socket *s)
+{
+       struct sock *sk = s->sk;
+       struct siw_cep *cep;
+
+       if (sk) {
+               write_lock_bh(&sk->sk_callback_lock);
+               cep = sk_to_cep(sk);
+               if (cep) {
+                       siw_sk_restore_upcalls(sk, cep);
+                       siw_cep_put(cep);
+               } else {
+                       pr_warn("siw: cannot restore sk callbacks: no ep\n");
+               }
+               write_unlock_bh(&sk->sk_callback_lock);
+       } else {
+               pr_warn("siw: cannot restore sk callbacks: no sk\n");
+       }
+}
+
+static void siw_rtr_data_ready(struct sock *sk)
+{
+       struct siw_cep *cep;
+       struct siw_qp *qp = NULL;
+       read_descriptor_t rd_desc;
+
+       read_lock(&sk->sk_callback_lock);
+
+       cep = sk_to_cep(sk);
+       if (!cep) {
+               WARN(1, "No connection endpoint\n");
+               goto out;
+       }
+       qp = sk_to_qp(sk);
+
+       memset(&rd_desc, 0, sizeof(rd_desc));
+       rd_desc.arg.data = qp;
+       rd_desc.count = 1;
+
+       tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+       /*
+        * Check if first frame was successfully processed.
+        * Signal connection full establishment if yes.
+        * Failed data processing would have already scheduled
+        * connection drop.
+        */
+       if (!qp->rx_stream.rx_suspend)
+               siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+out:
+       read_unlock(&sk->sk_callback_lock);
+       if (qp)
+               siw_qp_socket_assoc(cep, qp);
+}
+
+static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
+{
+       struct sock *sk = cep->sock->sk;
+
+       write_lock_bh(&sk->sk_callback_lock);
+       sk->sk_data_ready = siw_rtr_data_ready;
+       sk->sk_write_space = siw_qp_llp_write_space;
+       write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
+{
+       cep->sock = s;
+       siw_cep_get(cep);
+       s->sk->sk_user_data = cep;
+
+       siw_sk_save_upcalls(s->sk);
+       siw_sk_assign_cm_upcalls(s->sk);
+}
+
+static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
+{
+       struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+       unsigned long flags;
+
+       if (!cep)
+               return NULL;
+
+       INIT_LIST_HEAD(&cep->listenq);
+       INIT_LIST_HEAD(&cep->devq);
+       INIT_LIST_HEAD(&cep->work_freelist);
+
+       kref_init(&cep->ref);
+       cep->state = SIW_EPSTATE_IDLE;
+       init_waitqueue_head(&cep->waitq);
+       spin_lock_init(&cep->lock);
+       cep->sdev = sdev;
+       cep->enhanced_rdma_conn_est = false;
+
+       spin_lock_irqsave(&sdev->lock, flags);
+       list_add_tail(&cep->devq, &sdev->cep_list);
+       spin_unlock_irqrestore(&sdev->lock, flags);
+
+       siw_dbg_cep(cep, "new endpoint\n");
+       return cep;
+}
+
+static void siw_cm_free_work(struct siw_cep *cep)
+{
+       struct list_head *w, *tmp;
+       struct siw_cm_work *work;
+
+       list_for_each_safe(w, tmp, &cep->work_freelist) {
+               work = list_entry(w, struct siw_cm_work, list);
+               list_del(&work->list);
+               kfree(work);
+       }
+}
+
+static void siw_cancel_mpatimer(struct siw_cep *cep)
+{
+       spin_lock_bh(&cep->lock);
+       if (cep->mpa_timer) {
+               if (cancel_delayed_work(&cep->mpa_timer->work)) {
+                       siw_cep_put(cep);
+                       kfree(cep->mpa_timer); /* not needed again */
+               }
+               cep->mpa_timer = NULL;
+       }
+       spin_unlock_bh(&cep->lock);
+}
+
+static void siw_put_work(struct siw_cm_work *work)
+{
+       INIT_LIST_HEAD(&work->list);
+       spin_lock_bh(&work->cep->lock);
+       list_add(&work->list, &work->cep->work_freelist);
+       spin_unlock_bh(&work->cep->lock);
+}
+
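+/*
+ * siw_cep_set_inuse()/siw_cep_set_free() serialize access to a CEP:
+ * only one context operates on an endpoint at a time, waiting
+ * (interruptibly, with retry) until the endpoint becomes free.
+ */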
+static void siw_cep_set_inuse(struct siw_cep *cep)
+{
+       unsigned long flags;
+       int rv;
+retry:
+       spin_lock_irqsave(&cep->lock, flags);
+
+       if (cep->in_use) {
+               spin_unlock_irqrestore(&cep->lock, flags);
+               rv = wait_event_interruptible(cep->waitq, !cep->in_use);
+               if (signal_pending(current))
+                       flush_signals(current);
+               goto retry;
+       } else {
+               cep->in_use = 1;
+               spin_unlock_irqrestore(&cep->lock, flags);
+       }
+}
+
+static void siw_cep_set_free(struct siw_cep *cep)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cep->lock, flags);
+       cep->in_use = 0;
+       spin_unlock_irqrestore(&cep->lock, flags);
+
+       wake_up(&cep->waitq);
+}
+
+static void __siw_cep_dealloc(struct kref *ref)
+{
+       struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
+       struct siw_device *sdev = cep->sdev;
+       unsigned long flags;
+
+       WARN_ON(cep->listen_cep);
+
+       /* kfree(NULL) is safe */
+       kfree(cep->mpa.pdata);
+       spin_lock_bh(&cep->lock);
+       if (!list_empty(&cep->work_freelist))
+               siw_cm_free_work(cep);
+       spin_unlock_bh(&cep->lock);
+
+       spin_lock_irqsave(&sdev->lock, flags);
+       list_del(&cep->devq);
+       spin_unlock_irqrestore(&sdev->lock, flags);
+
+       siw_dbg_cep(cep, "free endpoint\n");
+       kfree(cep);
+}
+
+static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
+{
+       struct siw_cm_work *work = NULL;
+
+       spin_lock_bh(&cep->lock);
+       if (!list_empty(&cep->work_freelist)) {
+               work = list_entry(cep->work_freelist.next, struct siw_cm_work,
+                                 list);
+               list_del_init(&work->list);
+       }
+       spin_unlock_bh(&cep->lock);
+       return work;
+}
+
+static int siw_cm_alloc_work(struct siw_cep *cep, int num)
+{
+       struct siw_cm_work *work;
+
+       while (num--) {
+               work = kmalloc(sizeof(*work), GFP_KERNEL);
+               if (!work) {
+                       if (!(list_empty(&cep->work_freelist)))
+                               siw_cm_free_work(cep);
+                       return -ENOMEM;
+               }
+               work->cep = cep;
+               INIT_LIST_HEAD(&work->list);
+               list_add(&work->list, &cep->work_freelist);
+       }
+       return 0;
+}
+
+/*
+ * siw_cm_upcall()
+ *
+ * Upcall to IWCM to inform about async connection events
+ */
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+                        int status)
+{
+       struct iw_cm_event event;
+       struct iw_cm_id *id;
+
+       memset(&event, 0, sizeof(event));
+       event.status = status;
+       event.event = reason;
+
+       if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+               event.provider_data = cep;
+               id = cep->listen_cep->cm_id;
+       } else {
+               id = cep->cm_id;
+       }
+       /* Signal IRD and ORD */
+       if (reason == IW_CM_EVENT_ESTABLISHED ||
+           reason == IW_CM_EVENT_CONNECT_REPLY) {
+               /* Signal negotiated IRD/ORD values we will use */
+               event.ird = cep->ird;
+               event.ord = cep->ord;
+       } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+               event.ird = cep->ord;
+               event.ord = cep->ird;
+       }
+       /* Signal private data and address information */
+       if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+           reason == IW_CM_EVENT_CONNECT_REPLY) {
+               u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+               if (pd_len) {
+                       /*
+                        * hand over MPA private data
+                        */
+                       event.private_data_len = pd_len;
+                       event.private_data = cep->mpa.pdata;
+
+                       /* Hide MPA V2 IRD/ORD control */
+                       if (cep->enhanced_rdma_conn_est) {
+                               event.private_data_len -=
+                                       sizeof(struct mpa_v2_data);
+                               event.private_data +=
+                                       sizeof(struct mpa_v2_data);
+                       }
+               }
+               getname_local(cep->sock, &event.local_addr);
+               getname_peer(cep->sock, &event.remote_addr);
+       }
+       siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n",
+                   cep->qp ? qp_id(cep->qp) : -1, id, reason, status);
+
+       return id->event_handler(id, &event);
+}
+
+/*
+ * siw_qp_cm_drop()
+ *
+ * Drops established LLP connection if present and not already
+ * scheduled for dropping. Called from user context, SQ workqueue
+ * or receive IRQ. Caller signals if socket can be immediately
+ * closed (basically, if not in IRQ).
+ */
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
+{
+       struct siw_cep *cep = qp->cep;
+
+       qp->rx_stream.rx_suspend = 1;
+       qp->tx_ctx.tx_suspend = 1;
+
+       if (!qp->cep)
+               return;
+
+       if (schedule) {
+               siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
+       } else {
+               siw_cep_set_inuse(cep);
+
+               if (cep->state == SIW_EPSTATE_CLOSED) {
+                       siw_dbg_cep(cep, "already closed\n");
+                       goto out;
+               }
+               siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
+
+               if (qp->term_info.valid)
+                       siw_send_terminate(qp);
+
+               if (cep->cm_id) {
+                       switch (cep->state) {
+                       case SIW_EPSTATE_AWAIT_MPAREP:
+                               siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+                                             -EINVAL);
+                               break;
+
+                       case SIW_EPSTATE_RDMA_MODE:
+                               siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+                               break;
+
+                       case SIW_EPSTATE_IDLE:
+                       case SIW_EPSTATE_LISTENING:
+                       case SIW_EPSTATE_CONNECTING:
+                       case SIW_EPSTATE_AWAIT_MPAREQ:
+                       case SIW_EPSTATE_RECVD_MPAREQ:
+                       case SIW_EPSTATE_CLOSED:
+                       default:
+                               break;
+                       }
+                       cep->cm_id->rem_ref(cep->cm_id);
+                       cep->cm_id = NULL;
+                       siw_cep_put(cep);
+               }
+               cep->state = SIW_EPSTATE_CLOSED;
+
+               if (cep->sock) {
+                       siw_socket_disassoc(cep->sock);
+                       /*
+                        * Immediately close socket
+                        */
+                       sock_release(cep->sock);
+                       cep->sock = NULL;
+               }
+               if (cep->qp) {
+                       cep->qp = NULL;
+                       siw_qp_put(qp);
+               }
+out:
+               siw_cep_set_free(cep);
+       }
+}
+
+void siw_cep_put(struct siw_cep *cep)
+{
+       WARN_ON(kref_read(&cep->ref) < 1);
+       kref_put(&cep->ref, __siw_cep_dealloc);
+}
+
+void siw_cep_get(struct siw_cep *cep)
+{
+       kref_get(&cep->ref);
+}
+
+/*
+ * Expects params->pd_len in host byte order
+ */
+static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
+{
+       struct socket *s = cep->sock;
+       struct mpa_rr *rr = &cep->mpa.hdr;
+       struct kvec iov[3];
+       struct msghdr msg;
+       int rv;
+       int iovec_num = 0;
+       int mpa_len;
+
+       memset(&msg, 0, sizeof(msg));
+
+       iov[iovec_num].iov_base = rr;
+       iov[iovec_num].iov_len = sizeof(*rr);
+       mpa_len = sizeof(*rr);
+
+       if (cep->enhanced_rdma_conn_est) {
+               iovec_num++;
+               iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
+               iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
+               mpa_len += sizeof(cep->mpa.v2_ctrl);
+       }
+       if (pd_len) {
+               iovec_num++;
+               iov[iovec_num].iov_base = (char *)pdata;
+               iov[iovec_num].iov_len = pd_len;
+               mpa_len += pd_len;
+       }
+       if (cep->enhanced_rdma_conn_est)
+               pd_len += sizeof(cep->mpa.v2_ctrl);
+
+       rr->params.pd_len = cpu_to_be16(pd_len);
+
+       rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
+
+       return rv < 0 ? rv : 0;
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if the complete MPA Request/Reply header, including any
+ * private data, was received. Returns -EAGAIN if the header was only
+ * partially received, or a negative error code otherwise.
+ *
+ * Context: May be called in process context only
+ */
+static int siw_recv_mpa_rr(struct siw_cep *cep)
+{
+       struct mpa_rr *hdr = &cep->mpa.hdr;
+       struct socket *s = cep->sock;
+       u16 pd_len;
+       int rcvd, to_rcv;
+
+       if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+               rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
+                                 sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
+                                 0);
+               if (rcvd <= 0)
+                       return -ECONNABORTED;
+
+               cep->mpa.bytes_rcvd += rcvd;
+
+               if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
+                       return -EAGAIN;
+
+               if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
+                       return -EPROTO;
+       }
+       pd_len = be16_to_cpu(hdr->params.pd_len);
+
+       /*
+        * At least the MPA Request/Reply header (frame not including
+        * private data) has been received.
+        * Receive (or continue receiving) any private data.
+        */
+       to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
+
+       if (!to_rcv) {
+               /*
+                * We must have hdr->params.pd_len == 0 and thus received a
+                * complete MPA Request/Reply frame.
+                * Check against peer protocol violation.
+                */
+               u32 word;
+
+               rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
+               if (rcvd == -EAGAIN)
+                       return 0;
+
+               if (rcvd == 0) {
+                       siw_dbg_cep(cep, "peer EOF\n");
+                       return -EPIPE;
+               }
+               if (rcvd < 0) {
+                       siw_dbg_cep(cep, "error: %d\n", rcvd);
+                       return rcvd;
+               }
+               siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
+
+               return -EPROTO;
+       }
+
+       /*
+        * At this point, we must have hdr->params.pd_len != 0.
+        * A private data buffer gets allocated if hdr->params.pd_len != 0.
+        */
+       if (!cep->mpa.pdata) {
+               cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+               if (!cep->mpa.pdata)
+                       return -ENOMEM;
+       }
+       rcvd = ksock_recv(
+               s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+               to_rcv + 4, MSG_DONTWAIT);
+
+       if (rcvd < 0)
+               return rcvd;
+
+       if (rcvd > to_rcv)
+               return -EPROTO;
+
+       cep->mpa.bytes_rcvd += rcvd;
+
+       if (to_rcv == rcvd) {
+               siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
+               return 0;
+       }
+       return -EAGAIN;
+}
+
+/*
+ * siw_proc_mpareq()
+ *
+ * Read MPA Request from socket and signal new connection to IWCM
+ * if success. Caller must hold lock on corresponding listening CEP.
+ */
+static int siw_proc_mpareq(struct siw_cep *cep)
+{
+       struct mpa_rr *req;
+       int version, rv;
+       u16 pd_len;
+
+       rv = siw_recv_mpa_rr(cep);
+       if (rv)
+               return rv;
+
+       req = &cep->mpa.hdr;
+
+       version = __mpa_rr_revision(req->params.bits);
+       pd_len = be16_to_cpu(req->params.pd_len);
+
+       if (version > MPA_REVISION_2)
+               /* allow for 0, 1, and 2 only */
+               return -EPROTO;
+
+       if (memcmp(req->key, MPA_KEY_REQ, 16))
+               return -EPROTO;
+
+       /* Prepare for sending MPA reply */
+       memcpy(req->key, MPA_KEY_REP, 16);
+
+       if (version == MPA_REVISION_2 &&
+           (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
+               /*
+                * MPA version 2 must signal IRD/ORD values and P2P mode
+                * in private data if header flag MPA_RR_FLAG_ENHANCED
+                * is set.
+                */
+               if (pd_len < sizeof(struct mpa_v2_data))
+                       goto reject_conn;
+
+               cep->enhanced_rdma_conn_est = true;
+       }
+
+       /* MPA Markers: currently not supported. Marker TX to be added. */
+       if (req->params.bits & MPA_RR_FLAG_MARKERS)
+               goto reject_conn;
+
+       if (req->params.bits & MPA_RR_FLAG_CRC) {
+               /*
+                * RFC 5044, page 27: CRC MUST be used if peer requests it.
+                * siw specific: the 'mpa_crc_strict' module parameter
+                * causes a connection requesting CRC to be rejected if
+                * CRC is locally disabled ('mpa_crc_required' off).
+                */
+               if (!mpa_crc_required && mpa_crc_strict)
+                       goto reject_conn;
+
+               /* Enable CRC if requested by module parameter */
+               if (mpa_crc_required)
+                       req->params.bits |= MPA_RR_FLAG_CRC;
+       }
+       if (cep->enhanced_rdma_conn_est) {
+               struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+
+               /*
+                * Peer requested ORD becomes requested local IRD,
+                * peer requested IRD becomes requested local ORD.
+                * IRD and ORD get limited by global maximum values.
+                */
+               cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+               cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
+               cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+               cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
+
+               /* May get overwritten by locally negotiated values */
+               cep->mpa.v2_ctrl.ird = htons(cep->ird);
+               cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+               /*
+                * Support for peer sent zero length Write or Read to
+                * let local side enter RTS. Writes are preferred.
+                * Sends would require pre-posting a Receive and are
+                * not supported.
+                * Propose zero length Write if none of Read and Write
+                * is indicated.
+                */
+               if (v2->ird & MPA_V2_PEER_TO_PEER) {
+                       cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+
+                       if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
+                               cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+                       else if (v2->ord & MPA_V2_RDMA_READ_RTR)
+                               cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
+                       else
+                               cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+               }
+       }
+
+       cep->state = SIW_EPSTATE_RECVD_MPAREQ;
+
+       /* Keep reference until IWCM accepts/rejects */
+       siw_cep_get(cep);
+       rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+       if (rv)
+               siw_cep_put(cep);
+
+       return rv;
+
+reject_conn:
+       siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
+                   req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+                   mpa_crc_required, mpa_crc_strict,
+                   req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+       req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+       req->params.bits |= MPA_RR_FLAG_REJECT;
+
+       if (!mpa_crc_required && mpa_crc_strict)
+               req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+       if (pd_len)
+               kfree(cep->mpa.pdata);
+
+       cep->mpa.pdata = NULL;
+
+       siw_send_mpareqrep(cep, NULL, 0);
+
+       return -EOPNOTSUPP;
+}
+
+static int siw_proc_mpareply(struct siw_cep *cep)
+{
+       struct siw_qp_attrs qp_attrs;
+       enum siw_qp_attr_mask qp_attr_mask;
+       struct siw_qp *qp = cep->qp;
+       struct mpa_rr *rep;
+       int rv;
+       u16 rep_ord;
+       u16 rep_ird;
+       bool ird_insufficient = false;
+       enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
+
+       rv = siw_recv_mpa_rr(cep);
+       if (rv != -EAGAIN)
+               siw_cancel_mpatimer(cep);
+       if (rv)
+               goto out_err;
+
+       rep = &cep->mpa.hdr;
+
+       if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
+               /* allow for 0, 1, and 2 only */
+               rv = -EPROTO;
+               goto out_err;
+       }
+       if (memcmp(rep->key, MPA_KEY_REP, 16)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
+                                  LLP_ECODE_INVALID_REQ_RESP, 0);
+               siw_send_terminate(qp);
+               rv = -EPROTO;
+               goto out_err;
+       }
+       if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+               siw_dbg_cep(cep, "got mpa reject\n");
+               siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+
+               return -ECONNRESET;
+       }
+       if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
+               siw_dbg_cep(cep, "peer allows GSO on TX\n");
+               qp->tx_ctx.gso_seg_limit = 0;
+       }
+       if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+           (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
+           (mpa_crc_strict && !mpa_crc_required &&
+            (rep->params.bits & MPA_RR_FLAG_CRC))) {
+               siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
+                           rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+                           mpa_crc_required, mpa_crc_strict,
+                           rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+               siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+
+               return -EINVAL;
+       }
+       if (cep->enhanced_rdma_conn_est) {
+               struct mpa_v2_data *v2;
+
+               if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
+                   !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
+                       /*
+                        * Protocol failure: The responder MUST reply with
+                        * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
+                        */
+                       siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
+                                   __mpa_rr_revision(rep->params.bits),
+                                   rep->params.bits & MPA_RR_FLAG_ENHANCED ?
+                                           1 :
+                                           0);
+
+                       siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+                                     -ECONNRESET);
+                       return -EINVAL;
+               }
+               v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+               rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+               rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+
+               if (cep->ird < rep_ord &&
+                   (relaxed_ird_negotiation == false ||
+                    rep_ord > cep->sdev->attrs.max_ird)) {
+                       siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
+                                   cep->ird, rep_ord,
+                                   cep->sdev->attrs.max_ord);
+                       ird_insufficient = true;
+               }
+               if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
+                       siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
+                                   rep_ird);
+                       ird_insufficient = true;
+               }
+               /*
+                * Always report negotiated peer values to user,
+                * even if IRD/ORD negotiation failed
+                */
+               cep->ird = rep_ord;
+               cep->ord = rep_ird;
+
+               if (ird_insufficient) {
+                       /*
+                        * If the initiator IRD is insufficient for the
+                        * responder ORD, send a TERM.
+                        */
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+                                          LLP_ETYPE_MPA,
+                                          LLP_ECODE_INSUFFICIENT_IRD, 0);
+                       siw_send_terminate(qp);
+                       rv = -ENOMEM;
+                       goto out_err;
+               }
+               if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
+                       mpa_p2p_mode =
+                               cep->mpa.v2_ctrl_req.ord &
+                               (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
+
+               /*
+                * Check if we requested P2P mode, and if peer agrees
+                */
+               if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+                       if ((mpa_p2p_mode & v2->ord) == 0) {
+                               /*
+                                * We requested RTR mode(s), but the peer
+                                * did not pick any mode we support.
+                                */
+                               siw_dbg_cep(cep,
+                                           "rtr mode:  req %2x, got %2x\n",
+                                           mpa_p2p_mode,
+                                           v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+                                                      MPA_V2_RDMA_READ_RTR));
+
+                               siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+                                                  LLP_ETYPE_MPA,
+                                                  LLP_ECODE_NO_MATCHING_RTR,
+                                                  0);
+                               siw_send_terminate(qp);
+                               rv = -EPROTO;
+                               goto out_err;
+                       }
+                       mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+                                                 MPA_V2_RDMA_READ_RTR);
+               }
+       }
+       memset(&qp_attrs, 0, sizeof(qp_attrs));
+
+       if (rep->params.bits & MPA_RR_FLAG_CRC)
+               qp_attrs.flags = SIW_MPA_CRC;
+
+       qp_attrs.irq_size = cep->ird;
+       qp_attrs.orq_size = cep->ord;
+       qp_attrs.sk = cep->sock;
+       qp_attrs.state = SIW_QP_STATE_RTS;
+
+       qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+                      SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
+
+       /* Move socket RX/TX under QP control */
+       down_write(&qp->state_lock);
+       if (qp->attrs.state > SIW_QP_STATE_RTR) {
+               rv = -EINVAL;
+               up_write(&qp->state_lock);
+               goto out_err;
+       }
+       rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
+
+       siw_qp_socket_assoc(cep, qp);
+
+       up_write(&qp->state_lock);
+
+       /* Send extra RDMA frame to trigger peer RTS if negotiated */
+       if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+               rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
+               if (rv)
+                       goto out_err;
+       }
+       if (!rv) {
+               rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+               if (!rv)
+                       cep->state = SIW_EPSTATE_RDMA_MODE;
+
+               return 0;
+       }
+
+out_err:
+       siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+
+       return rv;
+}
+
+/*
+ * siw_accept_newconn - accept an incoming pending connection
+ *
+ */
+static void siw_accept_newconn(struct siw_cep *cep)
+{
+       struct socket *s = cep->sock;
+       struct socket *new_s = NULL;
+       struct siw_cep *new_cep = NULL;
+       int rv = 0; /* debug only. should disappear */
+
+       if (cep->state != SIW_EPSTATE_LISTENING)
+               goto error;
+
+       new_cep = siw_cep_alloc(cep->sdev);
+       if (!new_cep)
+               goto error;
+
+       /*
+        * Allocate 4 work elements: enough for concurrent handling
+        * of local + peer close events, MPA header processing and
+        * MPA timeout.
+        */
+       if (siw_cm_alloc_work(new_cep, 4) != 0)
+               goto error;
+
+       /*
+        * Copy saved socket callbacks from listening CEP
+        * and assign new socket with new CEP
+        */
+       new_cep->sk_state_change = cep->sk_state_change;
+       new_cep->sk_data_ready = cep->sk_data_ready;
+       new_cep->sk_write_space = cep->sk_write_space;
+       new_cep->sk_error_report = cep->sk_error_report;
+
+       rv = kernel_accept(s, &new_s, O_NONBLOCK);
+       if (rv != 0) {
+               /*
+                * Connection already aborted by peer?
+                */
+               siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
+               goto error;
+       }
+       new_cep->sock = new_s;
+       siw_cep_get(new_cep);
+       new_s->sk->sk_user_data = new_cep;
+
+       siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s);
+
+       if (siw_tcp_nagle == false) {
+               int val = 1;
+
+               rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY,
+                                      (char *)&val, sizeof(val));
+               if (rv) {
+                       siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv);
+                       goto error;
+               }
+       }
+       new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
+
+       rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
+       if (rv)
+               goto error;
+       /*
+        * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
+        */
+       new_cep->listen_cep = cep;
+       siw_cep_get(cep);
+
+       if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+               /*
+                * MPA REQ already queued
+                */
+               siw_dbg_cep(cep, "immediate mpa request\n");
+
+               siw_cep_set_inuse(new_cep);
+               rv = siw_proc_mpareq(new_cep);
+               siw_cep_set_free(new_cep);
+
+               if (rv != -EAGAIN) {
+                       siw_cep_put(cep);
+                       new_cep->listen_cep = NULL;
+                       if (rv)
+                               goto error;
+               }
+       }
+       return;
+
+error:
+       if (new_cep)
+               siw_cep_put(new_cep);
+
+       if (new_s) {
+               siw_socket_disassoc(new_s);
+               sock_release(new_s);
+               new_cep->sock = NULL;
+       }
+       siw_dbg_cep(cep, "error %d\n", rv);
+}
+
+static void siw_cm_work_handler(struct work_struct *w)
+{
+       struct siw_cm_work *work;
+       struct siw_cep *cep;
+       int release_cep = 0, rv = 0;
+
+       work = container_of(w, struct siw_cm_work, work.work);
+       cep = work->cep;
+
+       siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
+                   cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state);
+
+       siw_cep_set_inuse(cep);
+
+       switch (work->type) {
+       case SIW_CM_WORK_ACCEPT:
+               siw_accept_newconn(cep);
+               break;
+
+       case SIW_CM_WORK_READ_MPAHDR:
+               if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+                       if (cep->listen_cep) {
+                               siw_cep_set_inuse(cep->listen_cep);
+
+                               if (cep->listen_cep->state ==
+                                   SIW_EPSTATE_LISTENING)
+                                       rv = siw_proc_mpareq(cep);
+                               else
+                                       rv = -EFAULT;
+
+                               siw_cep_set_free(cep->listen_cep);
+
+                               if (rv != -EAGAIN) {
+                                       siw_cep_put(cep->listen_cep);
+                                       cep->listen_cep = NULL;
+                                       if (rv)
+                                               siw_cep_put(cep);
+                               }
+                       }
+               } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+                       rv = siw_proc_mpareply(cep);
+               } else {
+                       /*
+                        * CEP already moved out of MPA handshake.
+                        * Any connection management has already been done.
+                        * Silently ignore the MPA packet.
+                        */
+                       if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+                               cep->sock->sk->sk_data_ready(cep->sock->sk);
+                               siw_dbg_cep(cep, "already in RDMA mode");
+                       } else {
+                               siw_dbg_cep(cep, "out of state: %d\n",
+                                           cep->state);
+                       }
+               }
+               if (rv && rv != -EAGAIN)
+                       release_cep = 1;
+               break;
+
+       case SIW_CM_WORK_CLOSE_LLP:
+               /*
+                * QP scheduled LLP close
+                */
+               if (cep->qp && cep->qp->term_info.valid)
+                       siw_send_terminate(cep->qp);
+
+               if (cep->cm_id)
+                       siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+
+               release_cep = 1;
+               break;
+
+       case SIW_CM_WORK_PEER_CLOSE:
+               if (cep->cm_id) {
+                       if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+                               /*
+                                * MPA reply not received, but connection drop
+                                */
+                               siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+                                             -ECONNRESET);
+                       } else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+                               /*
+                                * NOTE: IW_CM_EVENT_DISCONNECT is given just
+                                *       to transition IWCM into CLOSING.
+                                */
+                               siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+                               siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+                       }
+                       /*
+                        * for other states there is no connection
+                        * known to the IWCM.
+                        */
+               } else {
+                       if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
+                               /*
+                                * Wait for the ulp/CM to call accept/reject
+                                */
+                               siw_dbg_cep(cep,
+                                           "mpa req recvd, wait for ULP\n");
+                       } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+                               /*
+                                * Socket close before MPA request received.
+                                */
+                               siw_dbg_cep(cep, "no mpareq: drop listener\n");
+                               siw_cep_put(cep->listen_cep);
+                               cep->listen_cep = NULL;
+                       }
+               }
+               release_cep = 1;
+               break;
+
+       case SIW_CM_WORK_MPATIMEOUT:
+               cep->mpa_timer = NULL;
+
+               if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+                       /*
+                        * MPA request timed out:
+                        * Hide any partially received private data and signal
+                        * timeout
+                        */
+                       cep->mpa.hdr.params.pd_len = 0;
+
+                       if (cep->cm_id)
+                               siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+                                             -ETIMEDOUT);
+                       release_cep = 1;
+
+               } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+                       /*
+                        * No MPA request received after peer TCP stream setup.
+                        */
+                       if (cep->listen_cep) {
+                               siw_cep_put(cep->listen_cep);
+                               cep->listen_cep = NULL;
+                       }
+                       release_cep = 1;
+               }
+               break;
+
+       default:
+               WARN(1, "Undefined CM work type: %d\n", work->type);
+       }
+       if (release_cep) {
+               siw_dbg_cep(cep,
+                           "release: timer=%s, QP[%u], id 0x%p\n",
+                           cep->mpa_timer ? "y" : "n",
+                           cep->qp ? qp_id(cep->qp) : -1, cep->cm_id);
+
+               siw_cancel_mpatimer(cep);
+
+               cep->state = SIW_EPSTATE_CLOSED;
+
+               if (cep->qp) {
+                       struct siw_qp *qp = cep->qp;
+                       /*
+                        * Serialize a potential race with application
+                        * closing the QP and calling siw_qp_cm_drop()
+                        */
+                       siw_qp_get(qp);
+                       siw_cep_set_free(cep);
+
+                       siw_qp_llp_close(qp);
+                       siw_qp_put(qp);
+
+                       siw_cep_set_inuse(cep);
+                       cep->qp = NULL;
+                       siw_qp_put(qp);
+               }
+               if (cep->sock) {
+                       siw_socket_disassoc(cep->sock);
+                       sock_release(cep->sock);
+                       cep->sock = NULL;
+               }
+               if (cep->cm_id) {
+                       cep->cm_id->rem_ref(cep->cm_id);
+                       cep->cm_id = NULL;
+                       siw_cep_put(cep);
+               }
+       }
+       siw_cep_set_free(cep);
+       siw_put_work(work);
+       siw_cep_put(cep);
+}
+
+static struct workqueue_struct *siw_cm_wq;
+
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
+{
+       struct siw_cm_work *work = siw_get_work(cep);
+       unsigned long delay = 0;
+
+       if (!work) {
+               siw_dbg_cep(cep, "failed with no work available\n");
+               return -ENOMEM;
+       }
+       work->type = type;
+       work->cep = cep;
+
+       siw_cep_get(cep);
+
+       INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
+
+       if (type == SIW_CM_WORK_MPATIMEOUT) {
+               cep->mpa_timer = work;
+
+               if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
+                       delay = MPAREQ_TIMEOUT;
+               else
+                       delay = MPAREP_TIMEOUT;
+       }
+       siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n",
+                   cep->qp ? qp_id(cep->qp) : -1, type, work, delay);
+
+       queue_delayed_work(siw_cm_wq, &work->work, delay);
+
+       return 0;
+}
+
+static void siw_cm_llp_data_ready(struct sock *sk)
+{
+       struct siw_cep *cep;
+
+       read_lock(&sk->sk_callback_lock);
+
+       cep = sk_to_cep(sk);
+       if (!cep) {
+               WARN_ON(1);
+               goto out;
+       }
+       siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+       switch (cep->state) {
+       case SIW_EPSTATE_RDMA_MODE:
+               /* fall through */
+       case SIW_EPSTATE_LISTENING:
+               break;
+
+       case SIW_EPSTATE_AWAIT_MPAREQ:
+               /* fall through */
+       case SIW_EPSTATE_AWAIT_MPAREP:
+               siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
+               break;
+
+       default:
+               siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
+               break;
+       }
+out:
+       read_unlock(&sk->sk_callback_lock);
+}
+
+static void siw_cm_llp_write_space(struct sock *sk)
+{
+       struct siw_cep *cep = sk_to_cep(sk);
+
+       if (cep)
+               siw_dbg_cep(cep, "state: %d\n", cep->state);
+}
+
+static void siw_cm_llp_error_report(struct sock *sk)
+{
+       struct siw_cep *cep = sk_to_cep(sk);
+
+       if (cep) {
+               siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
+                           sk->sk_err, sk->sk_state, cep->state);
+               cep->sk_error_report(sk);
+       }
+}
+
+static void siw_cm_llp_state_change(struct sock *sk)
+{
+       struct siw_cep *cep;
+       void (*orig_state_change)(struct sock *s);
+
+       read_lock(&sk->sk_callback_lock);
+
+       cep = sk_to_cep(sk);
+       if (!cep) {
+               /* endpoint already disassociated */
+               read_unlock(&sk->sk_callback_lock);
+               return;
+       }
+       orig_state_change = cep->sk_state_change;
+
+       siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+       switch (sk->sk_state) {
+       case TCP_ESTABLISHED:
+               /*
+                * handle accepting socket as special case where only
+                * new connection is possible
+                */
+               siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
+               break;
+
+       case TCP_CLOSE:
+       case TCP_CLOSE_WAIT:
+               if (cep->qp)
+                       cep->qp->tx_ctx.tx_suspend = 1;
+               siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
+               break;
+
+       default:
+               siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
+       }
+       read_unlock(&sk->sk_callback_lock);
+       orig_state_change(sk);
+}
+
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+                             struct sockaddr *raddr)
+{
+       int rv, flags = 0, s_val = 1;
+       size_t size = laddr->sa_family == AF_INET ?
+               sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+       /*
+        * Make address available again asap.
+        */
+       rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+                              sizeof(s_val));
+       if (rv < 0)
+               return rv;
+
+       rv = s->ops->bind(s, laddr, size);
+       if (rv < 0)
+               return rv;
+
+       rv = s->ops->connect(s, raddr, size, flags);
+
+       return rv < 0 ? rv : 0;
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+       struct siw_device *sdev = to_siw_dev(id->device);
+       struct siw_qp *qp;
+       struct siw_cep *cep = NULL;
+       struct socket *s = NULL;
+       struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
+                       *raddr = (struct sockaddr *)&id->remote_addr;
+       bool p2p_mode = peer_to_peer, v4 = true;
+       u16 pd_len = params->private_data_len;
+       int version = mpa_version, rv;
+
+       if (pd_len > MPA_MAX_PRIVDATA)
+               return -EINVAL;
+
+       if (params->ird > sdev->attrs.max_ird ||
+           params->ord > sdev->attrs.max_ord)
+               return -ENOMEM;
+
+       if (laddr->sa_family == AF_INET6)
+               v4 = false;
+       else if (laddr->sa_family != AF_INET)
+               return -EAFNOSUPPORT;
+
+       /*
+        * Respect any iwarp port mapping: Use mapped remote address
+        * if valid. Local address must not be mapped, since siw
+        * uses kernel TCP stack.
+        */
+       if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
+            to_sockaddr_in6(id->remote_addr).sin6_port != 0)
+               raddr = (struct sockaddr *)&id->m_remote_addr;
+
+       qp = siw_qp_id2obj(sdev, params->qpn);
+       if (!qp) {
+               WARN(1, "[QP %u] does not exist\n", params->qpn);
+               rv = -EINVAL;
+               goto error;
+       }
+       if (v4)
+               siw_dbg_qp(qp,
+                          "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n",
+                          id, pd_len,
+                          &((struct sockaddr_in *)(laddr))->sin_addr,
+                          ntohs(((struct sockaddr_in *)(laddr))->sin_port),
+                          &((struct sockaddr_in *)(raddr))->sin_addr,
+                          ntohs(((struct sockaddr_in *)(raddr))->sin_port));
+       else
+               siw_dbg_qp(qp,
+                          "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n",
+                          id, pd_len,
+                          &((struct sockaddr_in6 *)(laddr))->sin6_addr,
+                          ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port),
+                          &((struct sockaddr_in6 *)(raddr))->sin6_addr,
+                          ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port));
+
+       rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
+       if (rv < 0)
+               goto error;
+
+       /*
+        * NOTE: For simplification, connect() is called in blocking
+        * mode. Might be reconsidered for async connection setup at
+        * TCP level.
+        */
+       rv = kernel_bindconnect(s, laddr, raddr);
+       if (rv != 0) {
+               siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
+               goto error;
+       }
+       if (siw_tcp_nagle == false) {
+               int val = 1;
+
+               rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val,
+                                      sizeof(val));
+               if (rv) {
+                       siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv);
+                       goto error;
+               }
+       }
+       cep = siw_cep_alloc(sdev);
+       if (!cep) {
+               rv = -ENOMEM;
+               goto error;
+       }
+       siw_cep_set_inuse(cep);
+
+       /* Associate QP with CEP */
+       siw_cep_get(cep);
+       qp->cep = cep;
+
+       /* siw_qp_get(qp) already done by QP lookup */
+       cep->qp = qp;
+
+       id->add_ref(id);
+       cep->cm_id = id;
+
+       /*
+        * Allocate 4 work elements: enough to allow concurrent
+        * handling of local and peer close events, MPA header
+        * processing and MPA timeout.
+        */
+       rv = siw_cm_alloc_work(cep, 4);
+       if (rv != 0) {
+               rv = -ENOMEM;
+               goto error;
+       }
+       cep->ird = params->ird;
+       cep->ord = params->ord;
+
+       if (p2p_mode && cep->ord == 0)
+               cep->ord = 1;
+
+       cep->state = SIW_EPSTATE_CONNECTING;
+
+       /*
+        * Associate CEP with socket
+        */
+       siw_cep_socket_assoc(cep, s);
+
+       cep->state = SIW_EPSTATE_AWAIT_MPAREP;
+
+       /*
+        * Set MPA Request bits: CRC if required, no MPA Markers,
+        * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
+        */
+       cep->mpa.hdr.params.bits = 0;
+       if (version > MPA_REVISION_2) {
+               pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
+               version = MPA_REVISION_2;
+               /* Adjust also module parameter */
+               mpa_version = MPA_REVISION_2;
+       }
+       __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
+
+       if (try_gso)
+               cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+
+       if (mpa_crc_required)
+               cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
+
+       /*
+        * If MPA version == 2:
+        * o Include ORD and IRD.
+        * o Indicate peer-to-peer mode, if required by module
+        *   parameter 'peer_to_peer'.
+        */
+       if (version == MPA_REVISION_2) {
+               cep->enhanced_rdma_conn_est = true;
+               cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
+
+               cep->mpa.v2_ctrl.ird = htons(cep->ird);
+               cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+               if (p2p_mode) {
+                       cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+                       cep->mpa.v2_ctrl.ord |= rtr_type;
+               }
+               /* Remember own P2P mode requested */
+               cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
+               cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
+       }
+       memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
+
+       rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
+       /*
+        * Reset private data.
+        */
+       cep->mpa.hdr.params.pd_len = 0;
+
+       if (rv >= 0) {
+               rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
+               if (!rv) {
+                       siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id,
+                                   qp_id(qp));
+                       siw_cep_set_free(cep);
+                       return 0;
+               }
+       }
+error:
+       siw_dbg_qp(qp, "failed: %d\n", rv);
+
+       if (cep) {
+               siw_socket_disassoc(s);
+               sock_release(s);
+               cep->sock = NULL;
+
+               cep->qp = NULL;
+
+               cep->cm_id = NULL;
+               id->rem_ref(id);
+               siw_cep_put(cep);
+
+               qp->cep = NULL;
+               siw_cep_put(cep);
+
+               cep->state = SIW_EPSTATE_CLOSED;
+
+               siw_cep_set_free(cep);
+
+               siw_cep_put(cep);
+
+       } else if (s) {
+               sock_release(s);
+       }
+       siw_qp_put(qp);
+
+       return rv;
+}
+
+/*
+ * siw_accept - Let SoftiWARP accept an RDMA connection request
+ *
+ * @id:                New connection management id to be used for accepted
+ *             connection request
+ * @params:    Connection parameters provided by ULP for accepting connection
+ *
+ * Transition QP to RTS state, associate new CM id @id with accepted CEP
+ * and get prepared for TCP input by installing socket callbacks.
+ * Then send MPA Reply and generate the "connection established" event.
+ * Socket callbacks must be installed before sending MPA Reply, because
+ * the latter may cause a first RDMA message to arrive from the RDMA Initiator
+ * side very quickly, at which time the socket callbacks must be ready.
+ */
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+       struct siw_device *sdev = to_siw_dev(id->device);
+       struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+       struct siw_qp *qp;
+       struct siw_qp_attrs qp_attrs;
+       int rv, max_priv_data = MPA_MAX_PRIVDATA;
+       bool wait_for_peer_rts = false;
+
+       siw_cep_set_inuse(cep);
+       siw_cep_put(cep);
+
+       /* Free lingering inbound private data */
+       if (cep->mpa.hdr.params.pd_len) {
+               cep->mpa.hdr.params.pd_len = 0;
+               kfree(cep->mpa.pdata);
+               cep->mpa.pdata = NULL;
+       }
+       siw_cancel_mpatimer(cep);
+
+       if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+               siw_dbg_cep(cep, "id 0x%p: out of state\n", id);
+
+               siw_cep_set_free(cep);
+               siw_cep_put(cep);
+
+               return -ECONNRESET;
+       }
+       qp = siw_qp_id2obj(sdev, params->qpn);
+       if (!qp) {
+               WARN(1, "[QP %d] does not exist\n", params->qpn);
+               siw_cep_set_free(cep);
+               siw_cep_put(cep);
+
+               return -EINVAL;
+       }
+       down_write(&qp->state_lock);
+       if (qp->attrs.state > SIW_QP_STATE_RTR) {
+               rv = -EINVAL;
+               up_write(&qp->state_lock);
+               goto error;
+       }
+       siw_dbg_cep(cep, "id 0x%p\n", id);
+
+       if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
+               siw_dbg_cep(cep, "peer allows GSO on TX\n");
+               qp->tx_ctx.gso_seg_limit = 0;
+       }
+       if (params->ord > sdev->attrs.max_ord ||
+           params->ird > sdev->attrs.max_ird) {
+               siw_dbg_cep(
+                       cep,
+                       "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n",
+                       id, qp_id(qp), params->ord, sdev->attrs.max_ord,
+                       params->ird, sdev->attrs.max_ird);
+               rv = -EINVAL;
+               up_write(&qp->state_lock);
+               goto error;
+       }
+       if (cep->enhanced_rdma_conn_est)
+               max_priv_data -= sizeof(struct mpa_v2_data);
+
+       if (params->private_data_len > max_priv_data) {
+               siw_dbg_cep(
+                       cep,
+                       "id 0x%p, [QP %u]: private data length: %d (max %d)\n",
+                       id, qp_id(qp), params->private_data_len, max_priv_data);
+               rv = -EINVAL;
+               up_write(&qp->state_lock);
+               goto error;
+       }
+       if (cep->enhanced_rdma_conn_est) {
+               if (params->ord > cep->ord) {
+                       if (relaxed_ird_negotiation) {
+                               params->ord = cep->ord;
+                       } else {
+                               cep->ird = params->ird;
+                               cep->ord = params->ord;
+                               rv = -EINVAL;
+                               up_write(&qp->state_lock);
+                               goto error;
+                       }
+               }
+               if (params->ird < cep->ird) {
+                       if (relaxed_ird_negotiation &&
+                           cep->ird <= sdev->attrs.max_ird)
+                               params->ird = cep->ird;
+                       else {
+                               rv = -ENOMEM;
+                               up_write(&qp->state_lock);
+                               goto error;
+                       }
+               }
+               if (cep->mpa.v2_ctrl.ord &
+                   (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
+                       wait_for_peer_rts = true;
+               /*
+                * Signal back negotiated IRD and ORD values
+                */
+               cep->mpa.v2_ctrl.ord =
+                       htons(params->ord & MPA_IRD_ORD_MASK) |
+                       (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
+               cep->mpa.v2_ctrl.ird =
+                       htons(params->ird & MPA_IRD_ORD_MASK) |
+                       (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
+       }
+       cep->ird = params->ird;
+       cep->ord = params->ord;
+
+       cep->cm_id = id;
+       id->add_ref(id);
+
+       memset(&qp_attrs, 0, sizeof(qp_attrs));
+       qp_attrs.orq_size = cep->ord;
+       qp_attrs.irq_size = cep->ird;
+       qp_attrs.sk = cep->sock;
+       if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
+               qp_attrs.flags = SIW_MPA_CRC;
+       qp_attrs.state = SIW_QP_STATE_RTS;
+
+       siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp));
+
+       /* Associate QP with CEP */
+       siw_cep_get(cep);
+       qp->cep = cep;
+
+       /* siw_qp_get(qp) already done by QP lookup */
+       cep->qp = qp;
+
+       cep->state = SIW_EPSTATE_RDMA_MODE;
+
+       /* Move socket RX/TX under QP control */
+       rv = siw_qp_modify(qp, &qp_attrs,
+                          SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+                                  SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
+                                  SIW_QP_ATTR_MPA);
+       up_write(&qp->state_lock);
+
+       if (rv)
+               goto error;
+
+       siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n",
+                   id, qp_id(qp), params->private_data_len);
+
+       rv = siw_send_mpareqrep(cep, params->private_data,
+                               params->private_data_len);
+       if (rv != 0)
+               goto error;
+
+       if (wait_for_peer_rts) {
+               siw_sk_assign_rtr_upcalls(cep);
+       } else {
+               siw_qp_socket_assoc(cep, qp);
+               rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+               if (rv)
+                       goto error;
+       }
+       siw_cep_set_free(cep);
+
+       return 0;
+error:
+       siw_socket_disassoc(cep->sock);
+       sock_release(cep->sock);
+       cep->sock = NULL;
+
+       cep->state = SIW_EPSTATE_CLOSED;
+
+       if (cep->cm_id) {
+               cep->cm_id->rem_ref(id);
+               cep->cm_id = NULL;
+       }
+       if (qp->cep) {
+               siw_cep_put(cep);
+               qp->cep = NULL;
+       }
+       cep->qp = NULL;
+       siw_qp_put(qp);
+
+       siw_cep_set_free(cep);
+       siw_cep_put(cep);
+
+       return rv;
+}
+
+/*
+ * siw_reject()
+ *
+ * Local connection reject case. Send private data back to peer,
+ * close connection and dereference connection id.
+ */
+int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
+{
+       struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+
+       siw_cep_set_inuse(cep);
+       siw_cep_put(cep);
+
+       siw_cancel_mpatimer(cep);
+
+       if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+               siw_dbg_cep(cep, "id 0x%p: out of state\n", id);
+
+               siw_cep_set_free(cep);
+               siw_cep_put(cep); /* put last reference */
+
+               return -ECONNRESET;
+       }
+       siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state,
+                   pd_len);
+
+       if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
+               cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+               siw_send_mpareqrep(cep, pdata, pd_len);
+       }
+       siw_socket_disassoc(cep->sock);
+       sock_release(cep->sock);
+       cep->sock = NULL;
+
+       cep->state = SIW_EPSTATE_CLOSED;
+
+       siw_cep_set_free(cep);
+       siw_cep_put(cep);
+
+       return 0;
+}
+
+static int siw_listen_address(struct iw_cm_id *id, int backlog,
+                             struct sockaddr *laddr, int addr_family)
+{
+       struct socket *s;
+       struct siw_cep *cep = NULL;
+       struct siw_device *sdev = to_siw_dev(id->device);
+       int rv = 0, s_val;
+
+       rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+       if (rv < 0)
+               return rv;
+
+       /*
+        * Allow binding local port when still in TIME_WAIT from last close.
+        */
+       s_val = 1;
+       rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+                              sizeof(s_val));
+       if (rv) {
+               siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv);
+               goto error;
+       }
+       rv = s->ops->bind(s, laddr, addr_family == AF_INET ?
+                                   sizeof(struct sockaddr_in) :
+                                   sizeof(struct sockaddr_in6));
+       if (rv) {
+               siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv);
+               goto error;
+       }
+       cep = siw_cep_alloc(sdev);
+       if (!cep) {
+               rv = -ENOMEM;
+               goto error;
+       }
+       siw_cep_socket_assoc(cep, s);
+
+       rv = siw_cm_alloc_work(cep, backlog);
+       if (rv) {
+               siw_dbg(id->device,
+                       "id 0x%p: alloc_work error %d, backlog %d\n", id,
+                       rv, backlog);
+               goto error;
+       }
+       rv = s->ops->listen(s, backlog);
+       if (rv) {
+               siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv);
+               goto error;
+       }
+       cep->cm_id = id;
+       id->add_ref(id);
+
+       /*
+        * In case of a wildcard rdma_listen on a multi-homed device,
+        * a listener's IWCM id is associated with more than one listening CEP.
+        *
+        * We currently use id->provider_data in three different ways:
+        *
+        * o For a listener's IWCM id, id->provider_data points to
+        *   the list_head of the list of listening CEPs.
+        *   Uses: siw_create_listen(), siw_destroy_listen()
+        *
+        * o For each accepted passive-side IWCM id, id->provider_data
+        *   points to the CEP itself. This is a consequence of
+        *   - siw_cm_upcall() setting event.provider_data = cep and
+        *   - the IWCM's cm_conn_req_handler() setting provider_data of the
+        *     new passive-side IWCM id equal to event.provider_data
+        *   Uses: siw_accept(), siw_reject()
+        *
+        * o For an active-side IWCM id, id->provider_data is not used at all.
+        *
+        */
+       if (!id->provider_data) {
+               id->provider_data =
+                       kmalloc(sizeof(struct list_head), GFP_KERNEL);
+               if (!id->provider_data) {
+                       rv = -ENOMEM;
+                       goto error;
+               }
+               INIT_LIST_HEAD((struct list_head *)id->provider_data);
+       }
+       list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+       cep->state = SIW_EPSTATE_LISTENING;
+
+       if (addr_family == AF_INET)
+               siw_dbg(id->device, "Listen at laddr %pI4 %u\n",
+                       &(((struct sockaddr_in *)laddr)->sin_addr),
+                       ((struct sockaddr_in *)laddr)->sin_port);
+       else
+               siw_dbg(id->device, "Listen at laddr %pI6 %u\n",
+                       &(((struct sockaddr_in6 *)laddr)->sin6_addr),
+                       ((struct sockaddr_in6 *)laddr)->sin6_port);
+
+       return 0;
+
+error:
+       siw_dbg(id->device, "failed: %d\n", rv);
+
+       if (cep) {
+               siw_cep_set_inuse(cep);
+
+               if (cep->cm_id) {
+                       cep->cm_id->rem_ref(cep->cm_id);
+                       cep->cm_id = NULL;
+               }
+               cep->sock = NULL;
+               siw_socket_disassoc(s);
+               cep->state = SIW_EPSTATE_CLOSED;
+
+               siw_cep_set_free(cep);
+               siw_cep_put(cep);
+       }
+       sock_release(s);
+
+       return rv;
+}
+
+static void siw_drop_listeners(struct iw_cm_id *id)
+{
+       struct list_head *p, *tmp;
+
+       /*
+        * In case of a wildcard rdma_listen on a multi-homed device,
+        * a listener's IWCM id is associated with more than one listening CEP.
+        */
+       list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+               struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
+
+               list_del(p);
+
+               siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id,
+                           cep->state);
+
+               siw_cep_set_inuse(cep);
+
+               if (cep->cm_id) {
+                       cep->cm_id->rem_ref(cep->cm_id);
+                       cep->cm_id = NULL;
+               }
+               if (cep->sock) {
+                       siw_socket_disassoc(cep->sock);
+                       sock_release(cep->sock);
+                       cep->sock = NULL;
+               }
+               cep->state = SIW_EPSTATE_CLOSED;
+               siw_cep_set_free(cep);
+               siw_cep_put(cep);
+       }
+}
+
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * Listens on the socket addresses id->local_addr and id->remote_addr.
+ *
+ * If the listener's @id provides a specific local IP address, at most one
+ * listening socket is created and associated with @id.
+ *
+ * If the listener's @id provides the wildcard (zero) local IP address,
+ * a separate listen is performed for each local IP address of the device
+ * by creating a listening socket and binding to that local IP address.
+ *
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
+{
+       struct net_device *dev = to_siw_dev(id->device)->netdev;
+       int rv = 0, listeners = 0;
+
+       siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog);
+
+       /*
+        * For each address attached to the interface, create a
+        * listening socket, if id->local_addr is the wildcard
+        * IP address or matches that interface address.
+        */
+       if (id->local_addr.ss_family == AF_INET) {
+               struct in_device *in_dev = in_dev_get(dev);
+               struct sockaddr_in s_laddr, *s_raddr;
+               const struct in_ifaddr *ifa;
+
+               memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr));
+               s_raddr = (struct sockaddr_in *)&id->remote_addr;
+
+               siw_dbg(id->device,
+                       "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n",
+                       id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port),
+                       &s_raddr->sin_addr, ntohs(s_raddr->sin_port));
+
+               rtnl_lock();
+               in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+                       if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
+                           s_laddr.sin_addr.s_addr == ifa->ifa_address) {
+                               s_laddr.sin_addr.s_addr = ifa->ifa_address;
+
+                               rv = siw_listen_address(id, backlog,
+                                               (struct sockaddr *)&s_laddr,
+                                               AF_INET);
+                               if (!rv)
+                                       listeners++;
+                       }
+               }
+               rtnl_unlock();
+               in_dev_put(in_dev);
+       } else if (id->local_addr.ss_family == AF_INET6) {
+               struct inet6_dev *in6_dev = in6_dev_get(dev);
+               struct inet6_ifaddr *ifp;
+               struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr),
+                       *s_raddr = &to_sockaddr_in6(id->remote_addr);
+
+               siw_dbg(id->device,
+                       "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n",
+                       id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port),
+                       &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port));
+
+               read_lock_bh(&in6_dev->lock);
+               list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+                       struct sockaddr_in6 bind_addr;
+
+                       if (ipv6_addr_any(&s_laddr->sin6_addr) ||
+                           ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) {
+                               bind_addr.sin6_family = AF_INET6;
+                               bind_addr.sin6_port = s_laddr->sin6_port;
+                               bind_addr.sin6_flowinfo = 0;
+                               bind_addr.sin6_addr = ifp->addr;
+                               bind_addr.sin6_scope_id = dev->ifindex;
+
+                               rv = siw_listen_address(id, backlog,
+                                               (struct sockaddr *)&bind_addr,
+                                               AF_INET6);
+                               if (!rv)
+                                       listeners++;
+                       }
+               }
+               read_unlock_bh(&in6_dev->lock);
+
+               in6_dev_put(in6_dev);
+       } else {
+               return -EAFNOSUPPORT;
+       }
+       if (listeners)
+               rv = 0;
+       else if (!rv)
+               rv = -EINVAL;
+
+       siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK");
+
+       return rv;
+}
+
+int siw_destroy_listen(struct iw_cm_id *id)
+{
+       siw_dbg(id->device, "id 0x%p\n", id);
+
+       if (!id->provider_data) {
+               siw_dbg(id->device, "id 0x%p: no cep(s)\n", id);
+               return 0;
+       }
+       siw_drop_listeners(id);
+       kfree(id->provider_data);
+       id->provider_data = NULL;
+
+       return 0;
+}
+
+int siw_cm_init(void)
+{
+       /*
+        * create_singlethread_workqueue() for strict ordering
+        */
+       siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
+       if (!siw_cm_wq)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void siw_cm_exit(void)
+{
+       if (siw_cm_wq) {
+               flush_workqueue(siw_cm_wq);
+               destroy_workqueue(siw_cm_wq);
+       }
+}
diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h
new file mode 100644 (file)
index 0000000..8c59cb3
--- /dev/null
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/*          Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef _SIW_CM_H
+#define _SIW_CM_H
+
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+
+enum siw_cep_state {
+       SIW_EPSTATE_IDLE = 1,
+       SIW_EPSTATE_LISTENING,
+       SIW_EPSTATE_CONNECTING,
+       SIW_EPSTATE_AWAIT_MPAREQ,
+       SIW_EPSTATE_RECVD_MPAREQ,
+       SIW_EPSTATE_AWAIT_MPAREP,
+       SIW_EPSTATE_RDMA_MODE,
+       SIW_EPSTATE_CLOSED
+};
+
+struct siw_mpa_info {
+       struct mpa_rr hdr; /* peer mpa hdr in host byte order */
+       struct mpa_v2_data v2_ctrl;
+       struct mpa_v2_data v2_ctrl_req;
+       char *pdata;
+       int bytes_rcvd;
+};
+
+struct siw_device;
+
+struct siw_cep {
+       struct iw_cm_id *cm_id;
+       struct siw_device *sdev;
+       struct list_head devq;
+       spinlock_t lock;
+       struct kref ref;
+       int in_use;
+       wait_queue_head_t waitq;
+       enum siw_cep_state state;
+
+       struct list_head listenq;
+       struct siw_cep *listen_cep;
+
+       struct siw_qp *qp;
+       struct socket *sock;
+
+       struct siw_cm_work *mpa_timer;
+       struct list_head work_freelist;
+
+       struct siw_mpa_info mpa;
+       int ord;
+       int ird;
+       bool enhanced_rdma_conn_est;
+
+       /* Saved upcalls of socket */
+       void (*sk_state_change)(struct sock *sk);
+       void (*sk_data_ready)(struct sock *sk);
+       void (*sk_write_space)(struct sock *sk);
+       void (*sk_error_report)(struct sock *sk);
+};
+
+/*
+ * Connection initiator waits 10 seconds to receive an
+ * MPA reply after sending out an MPA request. The responder waits
+ * 5 seconds for an MPA request to arrive after a new TCP connection
+ * has been set up.
+ */
+#define MPAREQ_TIMEOUT (HZ * 10)
+#define MPAREP_TIMEOUT (HZ * 5)
+
+enum siw_work_type {
+       SIW_CM_WORK_ACCEPT = 1,
+       SIW_CM_WORK_READ_MPAHDR,
+       SIW_CM_WORK_CLOSE_LLP, /* close socket */
+       SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+       SIW_CM_WORK_MPATIMEOUT
+};
+
+struct siw_cm_work {
+       struct delayed_work work;
+       struct list_head list;
+       enum siw_work_type type;
+       struct siw_cep *cep;
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
+#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a)))
+
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+       return s->ops->getname(s, (struct sockaddr *)a, 1);
+}
+
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+       return s->ops->getname(s, (struct sockaddr *)a, 0);
+}
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+                            int flags)
+{
+       struct kvec iov = { buf, size };
+       struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
+
+       return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm);
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int siw_reject(struct iw_cm_id *id, const void *data, u8 len);
+int siw_create_listen(struct iw_cm_id *id, int backlog);
+int siw_destroy_listen(struct iw_cm_id *id);
+
+void siw_cep_get(struct siw_cep *cep);
+void siw_cep_put(struct siw_cep *cep);
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type);
+
+int siw_cm_init(void);
+void siw_cm_exit(void);
+
+/*
+ * TCP socket interface
+ */
+#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp)
+#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data))
+
+#endif
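
The ksock_recv() helper above is how the CM code pulls MPA protocol bytes
off the in-kernel TCP socket. As a rough illustration of its use, the
following sketch reads the fixed-size MPA request/reply header
non-blockingly into cep->mpa.hdr, tracking progress in cep->mpa.bytes_rcvd.
It is illustrative only and simplified (the driver's real header parsing
also has to deal with trailing private data):

	static int read_mpa_hdr_sketch(struct siw_cep *cep)
	{
		int to_rcv = sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd;
		char *buf = (char *)&cep->mpa.hdr + cep->mpa.bytes_rcvd;
		int rcvd = ksock_recv(cep->sock, buf, to_rcv, MSG_DONTWAIT);

		if (rcvd <= 0)
			return rcvd ? rcvd : -EPIPE;

		cep->mpa.bytes_rcvd += rcvd;
		if (cep->mpa.bytes_rcvd < (int)sizeof(struct mpa_rr))
			return -EAGAIN;	/* header still incomplete */
		return 0;
	}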
diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
new file mode 100644 (file)
index 0000000..e381ae9
--- /dev/null
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+
+static int map_wc_opcode[SIW_NUM_OPCODES] = {
+       [SIW_OP_WRITE] = IB_WC_RDMA_WRITE,
+       [SIW_OP_SEND] = IB_WC_SEND,
+       [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND,
+       [SIW_OP_READ] = IB_WC_RDMA_READ,
+       [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ,
+       [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP,
+       [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+       [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV,
+       [SIW_OP_REG_MR] = IB_WC_REG_MR,
+       [SIW_OP_RECEIVE] = IB_WC_RECV,
+       [SIW_OP_READ_RESPONSE] = -1 /* not used */
+};
+
+static struct {
+       enum siw_wc_status siw;
+       enum ib_wc_status ib;
+} map_cqe_status[SIW_NUM_WC_STATUS] = {
+       { SIW_WC_SUCCESS, IB_WC_SUCCESS },
+       { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR },
+       { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR },
+       { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
+       { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
+       { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
+       { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
+       { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
+       { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
+       { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR }
+};
+
+/*
+ * Reap one CQE from the CQ. Only used by kernel clients
+ * during normal CQ operation. Might also be called during CQ
+ * flush for a user-mapped CQE array.
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
+{
+       struct siw_cqe *cqe;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       cqe = &cq->queue[cq->cq_get % cq->num_cqe];
+       if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
+               memset(wc, 0, sizeof(*wc));
+               wc->wr_id = cqe->id;
+               wc->status = map_cqe_status[cqe->status].ib;
+               wc->opcode = map_wc_opcode[cqe->opcode];
+               wc->byte_len = cqe->bytes;
+
+               /*
+                * During CQ flush, user-land CQEs may also get
+                * reaped here. Those do not hold a QP reference
+                * and do not qualify for memory extension verbs.
+                */
+               if (likely(cq->kernel_verbs)) {
+                       if (cqe->flags & SIW_WQE_REM_INVAL) {
+                               wc->ex.invalidate_rkey = cqe->inval_stag;
+                               wc->wc_flags = IB_WC_WITH_INVALIDATE;
+                       }
+                       wc->qp = cqe->base_qp;
+                       siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n",
+                                  cq->cq_get % cq->num_cqe, cqe->opcode,
+                                  cqe->flags, (void *)cqe->id);
+               }
+               WRITE_ONCE(cqe->flags, 0);
+               cq->cq_get++;
+
+               spin_unlock_irqrestore(&cq->lock, flags);
+
+               return 1;
+       }
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       return 0;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Flush all CQ elements.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+       struct ib_wc wc;
+
+       while (siw_reap_cqe(cq, &wc))
+               ;
+}
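
For context, siw_reap_cqe() is the building block for CQ polling: a kernel
consumer calls it repeatedly until it returns 0. A minimal sketch of such a
poll loop (illustrative only; roughly what a poll_cq verb built on this
helper would look like):

	static int poll_cq_sketch(struct siw_cq *cq, int num_entries,
				  struct ib_wc *wc)
	{
		int i;

		/* reap at most num_entries completions into wc[] */
		for (i = 0; i < num_entries; i++) {
			if (!siw_reap_cqe(cq, wc + i))
				break;
		}
		return i;
	}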
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
new file mode 100644 (file)
index 0000000..f55c4e8
--- /dev/null
@@ -0,0 +1,685 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/kthread.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+
+MODULE_AUTHOR("Bernard Metzler");
+MODULE_DESCRIPTION("Software iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* transmit from user buffer, if possible */
+const bool zcopy_tx = true;
+
+/* Restrict usage of GSO, if hardware peer iwarp is unable to process
+ * large packets. try_gso = true lets siw try to use local GSO,
+ * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
+ */
+const bool try_gso;
+
+/* Attach siw also with loopback devices */
+const bool loopback_enabled = true;
+
+/* We try to negotiate CRC on, if true */
+const bool mpa_crc_required;
+
+/* MPA CRC on/off enforced */
+const bool mpa_crc_strict;
+
+/* Control TCP_NODELAY socket option */
+const bool siw_tcp_nagle;
+
+/* Select MPA version to be used during connection setup */
+u_char mpa_version = MPA_REVISION_2;
+
+/* Selects MPA P2P mode (additional handshake during connection
+ * setup), if true.
+ */
+const bool peer_to_peer;
+
+struct task_struct *siw_tx_thread[NR_CPUS];
+struct crypto_shash *siw_crypto_shash;
+
+static int siw_device_register(struct siw_device *sdev, const char *name)
+{
+       struct ib_device *base_dev = &sdev->base_dev;
+       static int dev_id = 1;
+       int rv;
+
+       rv = ib_register_device(base_dev, name);
+       if (rv) {
+               pr_warn("siw: device registration error %d\n", rv);
+               return rv;
+       }
+       sdev->vendor_part_id = dev_id++;
+
+       siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);
+
+       return 0;
+}
+
+static void siw_device_cleanup(struct ib_device *base_dev)
+{
+       struct siw_device *sdev = to_siw_dev(base_dev);
+
+       xa_destroy(&sdev->qp_xa);
+       xa_destroy(&sdev->mem_xa);
+}
+
+static int siw_create_tx_threads(void)
+{
+       int cpu, assigned = 0;
+
+       for_each_online_cpu(cpu) {
+               /* Skip HT cores */
+               if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
+                       continue;
+
+               siw_tx_thread[cpu] =
+                       kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
+                                      "siw_tx/%d", cpu);
+               if (IS_ERR(siw_tx_thread[cpu])) {
+                       siw_tx_thread[cpu] = NULL;
+                       continue;
+               }
+               kthread_bind(siw_tx_thread[cpu], cpu);
+
+               wake_up_process(siw_tx_thread[cpu]);
+               assigned++;
+       }
+       return assigned;
+}
+
+static int siw_dev_qualified(struct net_device *netdev)
+{
+       /*
+        * Additional hardware support can be added here
+        * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
+        * <linux/if_arp.h> for type identifiers.
+        */
+       if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
+           (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
+               return 1;
+
+       return 0;
+}
+
+static DEFINE_PER_CPU(atomic_t, siw_use_cnt);
+
+static struct {
+       struct cpumask **tx_valid_cpus;
+       int num_nodes;
+} siw_cpu_info;
+
+static int siw_init_cpulist(void)
+{
+       int i, num_nodes = num_possible_nodes();
+
+       memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
+
+       siw_cpu_info.num_nodes = num_nodes;
+
+       siw_cpu_info.tx_valid_cpus =
+               kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
+       if (!siw_cpu_info.tx_valid_cpus) {
+               siw_cpu_info.num_nodes = 0;
+               return -ENOMEM;
+       }
+       for (i = 0; i < siw_cpu_info.num_nodes; i++) {
+               siw_cpu_info.tx_valid_cpus[i] =
+                       kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+               if (!siw_cpu_info.tx_valid_cpus[i])
+                       goto out_err;
+
+               cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
+       }
+       for_each_possible_cpu(i)
+               cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
+
+       return 0;
+
+out_err:
+       siw_cpu_info.num_nodes = 0;
+       /* free all node cpumasks allocated so far, including index 0 */
+       while (--i >= 0) {
+               kfree(siw_cpu_info.tx_valid_cpus[i]);
+               siw_cpu_info.tx_valid_cpus[i] = NULL;
+       }
+       kfree(siw_cpu_info.tx_valid_cpus);
+       siw_cpu_info.tx_valid_cpus = NULL;
+
+       return -ENOMEM;
+}
+
+static void siw_destroy_cpulist(void)
+{
+       int i = 0;
+
+       while (i < siw_cpu_info.num_nodes)
+               kfree(siw_cpu_info.tx_valid_cpus[i++]);
+
+       kfree(siw_cpu_info.tx_valid_cpus);
+}
+
+/*
+ * Choose CPU with least number of active QP's from NUMA node of
+ * TX interface.
+ */
+int siw_get_tx_cpu(struct siw_device *sdev)
+{
+       const struct cpumask *tx_cpumask;
+       int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;
+
+       if (node < 0)
+               tx_cpumask = cpu_online_mask;
+       else
+               tx_cpumask = siw_cpu_info.tx_valid_cpus[node];
+
+       num_cpus = cpumask_weight(tx_cpumask);
+       if (!num_cpus) {
+               /* no CPU on this NUMA node */
+               tx_cpumask = cpu_online_mask;
+               num_cpus = cpumask_weight(tx_cpumask);
+       }
+       if (!num_cpus)
+               goto out;
+
+       cpu = cpumask_first(tx_cpumask);
+
+       for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
+            i++, cpu = cpumask_next(cpu, tx_cpumask)) {
+               int usage;
+
+               /* Skip any cores which have no TX thread */
+               if (!siw_tx_thread[cpu])
+                       continue;
+
+               usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
+               if (usage <= min_use) {
+                       tx_cpu = cpu;
+                       min_use = usage;
+               }
+       }
+       siw_dbg(&sdev->base_dev,
+               "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);
+
+out:
+       if (tx_cpu >= 0)
+               atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
+       else
+               pr_warn("siw: no tx cpu found\n");
+
+       return tx_cpu;
+}
+
+void siw_put_tx_cpu(int cpu)
+{
+       atomic_dec(&per_cpu(siw_use_cnt, cpu));
+}
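
A QP typically claims a TX CPU once and releases it again when it goes away,
so that the per-CPU usage counters above stay balanced. A hedged sketch of
that pairing (the tx_cpu member name is assumed for this illustration):

	/* illustrative only: claim a TX CPU for a new QP ... */
	static int qp_claim_tx_cpu_sketch(struct siw_device *sdev,
					  struct siw_qp *qp)
	{
		qp->tx_cpu = siw_get_tx_cpu(sdev);
		return qp->tx_cpu < 0 ? -EIO : 0;
	}

	/* ... and release it when the QP is destroyed */
	static void qp_release_tx_cpu_sketch(struct siw_qp *qp)
	{
		if (qp->tx_cpu >= 0) {
			siw_put_tx_cpu(qp->tx_cpu);
			qp->tx_cpu = -1;
		}
	}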
+
+static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
+{
+       struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
+
+       if (qp) {
+               /*
+                * siw_qp_id2obj() increments object reference count
+                */
+               siw_qp_put(qp);
+               return qp->ib_qp;
+       }
+       return NULL;
+}
+
+static void siw_verbs_sq_flush(struct ib_qp *base_qp)
+{
+       struct siw_qp *qp = to_siw_qp(base_qp);
+
+       down_write(&qp->state_lock);
+       siw_sq_flush(qp);
+       up_write(&qp->state_lock);
+}
+
+static void siw_verbs_rq_flush(struct ib_qp *base_qp)
+{
+       struct siw_qp *qp = to_siw_qp(base_qp);
+
+       down_write(&qp->state_lock);
+       siw_rq_flush(qp);
+       up_write(&qp->state_lock);
+}
+
+static const struct ib_device_ops siw_device_ops = {
+       .owner = THIS_MODULE,
+       .uverbs_abi_ver = SIW_ABI_VERSION,
+       .driver_id = RDMA_DRIVER_SIW,
+
+       .alloc_mr = siw_alloc_mr,
+       .alloc_pd = siw_alloc_pd,
+       .alloc_ucontext = siw_alloc_ucontext,
+       .create_cq = siw_create_cq,
+       .create_qp = siw_create_qp,
+       .create_srq = siw_create_srq,
+       .dealloc_driver = siw_device_cleanup,
+       .dealloc_pd = siw_dealloc_pd,
+       .dealloc_ucontext = siw_dealloc_ucontext,
+       .dereg_mr = siw_dereg_mr,
+       .destroy_cq = siw_destroy_cq,
+       .destroy_qp = siw_destroy_qp,
+       .destroy_srq = siw_destroy_srq,
+       .drain_rq = siw_verbs_rq_flush,
+       .drain_sq = siw_verbs_sq_flush,
+       .get_dma_mr = siw_get_dma_mr,
+       .get_port_immutable = siw_get_port_immutable,
+       .iw_accept = siw_accept,
+       .iw_add_ref = siw_qp_get_ref,
+       .iw_connect = siw_connect,
+       .iw_create_listen = siw_create_listen,
+       .iw_destroy_listen = siw_destroy_listen,
+       .iw_get_qp = siw_get_base_qp,
+       .iw_reject = siw_reject,
+       .iw_rem_ref = siw_qp_put_ref,
+       .map_mr_sg = siw_map_mr_sg,
+       .mmap = siw_mmap,
+       .modify_qp = siw_verbs_modify_qp,
+       .modify_srq = siw_modify_srq,
+       .poll_cq = siw_poll_cq,
+       .post_recv = siw_post_receive,
+       .post_send = siw_post_send,
+       .post_srq_recv = siw_post_srq_recv,
+       .query_device = siw_query_device,
+       .query_gid = siw_query_gid,
+       .query_pkey = siw_query_pkey,
+       .query_port = siw_query_port,
+       .query_qp = siw_query_qp,
+       .query_srq = siw_query_srq,
+       .req_notify_cq = siw_req_notify_cq,
+       .reg_user_mr = siw_reg_user_mr,
+
+       INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
+       INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
+       INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
+};
+
+static struct siw_device *siw_device_create(struct net_device *netdev)
+{
+       struct siw_device *sdev = NULL;
+       struct ib_device *base_dev;
+       struct device *parent = netdev->dev.parent;
+       int rv;
+
+       if (!parent) {
+               /*
+                * The loopback device has no parent device,
+                * so it appears as a top-level device. To support
+                * loopback device connectivity, take this device
+                * as the parent device. Skip all other devices
+                * w/o parent device.
+                */
+               if (netdev->type != ARPHRD_LOOPBACK) {
+                       pr_warn("siw: device %s error: no parent device\n",
+                               netdev->name);
+                       return NULL;
+               }
+               parent = &netdev->dev;
+       }
+       sdev = ib_alloc_device(siw_device, base_dev);
+       if (!sdev)
+               return NULL;
+
+       base_dev = &sdev->base_dev;
+
+       sdev->netdev = netdev;
+
+       if (netdev->type != ARPHRD_LOOPBACK) {
+               memcpy(&base_dev->node_guid, netdev->dev_addr, 6);
+       } else {
+               /*
+                * The loopback device does not have a HW address,
+                * but the connection management lib expects gid != 0
+                */
+               size_t gidlen = min_t(size_t, strlen(base_dev->name), 6);
+
+               memcpy(&base_dev->node_guid, base_dev->name, gidlen);
+       }
+       base_dev->uverbs_cmd_mask =
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+               (1ull << IB_USER_VERBS_CMD_REG_MR) |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+               (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+               (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+               (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+       base_dev->node_type = RDMA_NODE_RNIC;
+       memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
+              sizeof(SIW_NODE_DESC_COMMON));
+
+       /*
+        * Current model (one-to-one device association):
+        * One Softiwarp device per net_device or, equivalently,
+        * per physical port.
+        */
+       base_dev->phys_port_cnt = 1;
+       base_dev->dev.parent = parent;
+       base_dev->dev.dma_ops = &dma_virt_ops;
+       base_dev->num_comp_vectors = num_possible_cpus();
+
+       ib_set_device_ops(base_dev, &siw_device_ops);
+       rv = ib_device_set_netdev(base_dev, netdev, 1);
+       if (rv)
+               goto error;
+
+       memcpy(base_dev->iw_ifname, netdev->name,
+              sizeof(base_dev->iw_ifname));
+
+       /* Disable TCP port mapping */
+       base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;
+
+       sdev->attrs.max_qp = SIW_MAX_QP;
+       sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
+       sdev->attrs.max_ord = SIW_MAX_ORD_QP;
+       sdev->attrs.max_ird = SIW_MAX_IRD_QP;
+       sdev->attrs.max_sge = SIW_MAX_SGE;
+       sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
+       sdev->attrs.max_cq = SIW_MAX_CQ;
+       sdev->attrs.max_cqe = SIW_MAX_CQE;
+       sdev->attrs.max_mr = SIW_MAX_MR;
+       sdev->attrs.max_pd = SIW_MAX_PD;
+       sdev->attrs.max_mw = SIW_MAX_MW;
+       sdev->attrs.max_fmr = SIW_MAX_FMR;
+       sdev->attrs.max_srq = SIW_MAX_SRQ;
+       sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
+       sdev->attrs.max_srq_sge = SIW_MAX_SGE;
+
+       xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
+       xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
+
+       INIT_LIST_HEAD(&sdev->cep_list);
+       INIT_LIST_HEAD(&sdev->qp_list);
+
+       atomic_set(&sdev->num_ctx, 0);
+       atomic_set(&sdev->num_srq, 0);
+       atomic_set(&sdev->num_qp, 0);
+       atomic_set(&sdev->num_cq, 0);
+       atomic_set(&sdev->num_mr, 0);
+       atomic_set(&sdev->num_pd, 0);
+
+       sdev->numa_node = dev_to_node(parent);
+       spin_lock_init(&sdev->lock);
+
+       return sdev;
+error:
+       ib_dealloc_device(base_dev);
+
+       return NULL;
+}
+
+/*
+ * Network link becomes unavailable. Mark all
+ * affected QP's accordingly.
+ */
+static void siw_netdev_down(struct work_struct *work)
+{
+       struct siw_device *sdev =
+               container_of(work, struct siw_device, netdev_down);
+
+       struct siw_qp_attrs qp_attrs;
+       struct list_head *pos, *tmp;
+
+       memset(&qp_attrs, 0, sizeof(qp_attrs));
+       qp_attrs.state = SIW_QP_STATE_ERROR;
+
+       list_for_each_safe(pos, tmp, &sdev->qp_list) {
+               struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);
+
+               down_write(&qp->state_lock);
+               WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
+               up_write(&qp->state_lock);
+       }
+       ib_device_put(&sdev->base_dev);
+}
+
+static void siw_device_goes_down(struct siw_device *sdev)
+{
+       if (ib_device_try_get(&sdev->base_dev)) {
+               INIT_WORK(&sdev->netdev_down, siw_netdev_down);
+               schedule_work(&sdev->netdev_down);
+       }
+}
+
+static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
+                           void *arg)
+{
+       struct net_device *netdev = netdev_notifier_info_to_dev(arg);
+       struct ib_device *base_dev;
+       struct siw_device *sdev;
+
+       dev_dbg(&netdev->dev, "siw: event %lu\n", event);
+
+       if (dev_net(netdev) != &init_net)
+               return NOTIFY_OK;
+
+       base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+       if (!base_dev)
+               return NOTIFY_OK;
+
+       sdev = to_siw_dev(base_dev);
+
+       switch (event) {
+       case NETDEV_UP:
+               sdev->state = IB_PORT_ACTIVE;
+               siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
+               break;
+
+       case NETDEV_GOING_DOWN:
+               siw_device_goes_down(sdev);
+               break;
+
+       case NETDEV_DOWN:
+               sdev->state = IB_PORT_DOWN;
+               siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
+               break;
+
+       case NETDEV_REGISTER:
+               /*
+                * Device registration is now handled only by
+                * rdma netlink commands, so it should be impossible
+                * to end up here with a valid siw device.
+                */
+               siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
+               break;
+
+       case NETDEV_UNREGISTER:
+               ib_unregister_device_queued(&sdev->base_dev);
+               break;
+
+       case NETDEV_CHANGEADDR:
+               siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
+               break;
+       /*
+        * Todo: Below netdev events are currently not handled.
+        */
+       case NETDEV_CHANGEMTU:
+       case NETDEV_CHANGE:
+               break;
+
+       default:
+               break;
+       }
+       ib_device_put(&sdev->base_dev);
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block siw_netdev_nb = {
+       .notifier_call = siw_netdev_event,
+};
+
+static int siw_newlink(const char *basedev_name, struct net_device *netdev)
+{
+       struct ib_device *base_dev;
+       struct siw_device *sdev = NULL;
+       int rv = -ENOMEM;
+
+       if (!siw_dev_qualified(netdev))
+               return -EINVAL;
+
+       base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+       if (base_dev) {
+               ib_device_put(base_dev);
+               return -EEXIST;
+       }
+       sdev = siw_device_create(netdev);
+       if (sdev) {
+               dev_dbg(&netdev->dev, "siw: new device\n");
+
+               if (netif_running(netdev) && netif_carrier_ok(netdev))
+                       sdev->state = IB_PORT_ACTIVE;
+               else
+                       sdev->state = IB_PORT_DOWN;
+
+               rv = siw_device_register(sdev, basedev_name);
+               if (rv)
+                       ib_dealloc_device(&sdev->base_dev);
+       }
+       return rv;
+}
+
+static struct rdma_link_ops siw_link_ops = {
+       .type = "siw",
+       .newlink = siw_newlink,
+};
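
With the rdma_link_ops registered under type "siw", a siw device is created
from user space through the rdma netlink interface rather than automatically
for every net_device. For example, using the rdma tool (link and interface
names are placeholders):

	rdma link add siw0 type siw netdev eth0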
+
+/*
+ * siw_init_module - Initialize Softiwarp module and register with netdev
+ *                   subsystem.
+ */
+static __init int siw_init_module(void)
+{
+       int rv;
+       int nr_cpu;
+
+       if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
+               pr_info("siw: sendpage threshold too small: %u\n",
+                       (int)SENDPAGE_THRESH);
+               rv = -EINVAL;
+               goto out_error;
+       }
+       rv = siw_init_cpulist();
+       if (rv)
+               goto out_error;
+
+       rv = siw_cm_init();
+       if (rv)
+               goto out_error;
+
+       if (!siw_create_tx_threads()) {
+               pr_info("siw: Could not start any TX thread\n");
+               rv = -ENOMEM;
+               goto out_error;
+       }
+       /*
+        * Locate the CRC32 algorithm. If unavailable, fail
+        * loading siw only if use of CRC is required.
+        */
+       siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
+       if (IS_ERR(siw_crypto_shash)) {
+               pr_info("siw: Loading CRC32c failed: %ld\n",
+                       PTR_ERR(siw_crypto_shash));
+               siw_crypto_shash = NULL;
+               if (mpa_crc_required) {
+                       rv = -EOPNOTSUPP;
+                       goto out_error;
+               }
+       }
+       rv = register_netdevice_notifier(&siw_netdev_nb);
+       if (rv)
+               goto out_error;
+
+       rdma_link_register(&siw_link_ops);
+
+       pr_info("SoftiWARP attached\n");
+       return 0;
+
+out_error:
+       for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
+               if (siw_tx_thread[nr_cpu]) {
+                       siw_stop_tx_thread(nr_cpu);
+                       siw_tx_thread[nr_cpu] = NULL;
+               }
+       }
+       if (siw_crypto_shash)
+               crypto_free_shash(siw_crypto_shash);
+
+       pr_info("SoftiWARP attach failed. Error: %d\n", rv);
+
+       siw_cm_exit();
+       siw_destroy_cpulist();
+
+       return rv;
+}
+
+static void __exit siw_exit_module(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               if (siw_tx_thread[cpu]) {
+                       siw_stop_tx_thread(cpu);
+                       siw_tx_thread[cpu] = NULL;
+               }
+       }
+       unregister_netdevice_notifier(&siw_netdev_nb);
+       rdma_link_unregister(&siw_link_ops);
+       ib_unregister_driver(RDMA_DRIVER_SIW);
+
+       siw_cm_exit();
+
+       siw_destroy_cpulist();
+
+       if (siw_crypto_shash)
+               crypto_free_shash(siw_crypto_shash);
+
+       pr_info("SoftiWARP detached\n");
+}
+
+module_init(siw_init_module);
+module_exit(siw_exit_module);
+
+MODULE_ALIAS_RDMA_LINK("siw");
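
Since device registration is driven entirely through the rdma netlink "newlink"
path above, a siw device is instantiated from user space rather than at module
load time. Assuming the iproute2 'rdma' utility with link support, attaching
siw to an existing net_device would look roughly like

    # rdma link add siw0 type siw netdev eth0

where "siw0" becomes the basedev_name passed to siw_newlink(), "eth0" is the
qualifying net_device, and MODULE_ALIAS_RDMA_LINK("siw") lets the kernel
autoload the module when such a request arrives (the device and netdev names
here are only illustrative).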
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
new file mode 100644 (file)
index 0000000..67171c8
--- /dev/null
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "siw.h"
+#include "siw_mem.h"
+
+/*
+ * STag lookup is based on its index part only (24 bits).
+ * The code avoids the special STag of zero and tries to randomize
+ * STag values between 1 and SIW_STAG_MAX_INDEX.
+ */
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
+{
+       struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+       u32 id, next;
+
+       get_random_bytes(&next, 4);
+       next &= 0x00ffffff;
+
+       if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
+           GFP_KERNEL) < 0)
+               return -ENOMEM;
+
+       /* Set the STag index part */
+       m->stag = id << 8;
+
+       siw_dbg_mem(m, "new MEM object\n");
+
+       return 0;
+}
+
+/*
+ * siw_mem_id2obj()
+ *
+ * Resolves memory from an STag given by its index part. Might be called from:
+ * o process context, before sending out of an SGL, or
+ * o softirq context, when resolving target memory
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
+{
+       struct siw_mem *mem;
+
+       rcu_read_lock();
+       mem = xa_load(&sdev->mem_xa, stag_index);
+       if (likely(mem && kref_get_unless_zero(&mem->ref))) {
+               rcu_read_unlock();
+               return mem;
+       }
+       rcu_read_unlock();
+
+       return NULL;
+}
+
+static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
+                          bool dirty)
+{
+       struct page **p = chunk->plist;
+
+       while (num_pages--) {
+               if (!PageDirty(*p) && dirty)
+                       put_user_pages_dirty_lock(p, 1);
+               else
+                       put_user_page(*p);
+               p++;
+       }
+}
+
+void siw_umem_release(struct siw_umem *umem, bool dirty)
+{
+       struct mm_struct *mm_s = umem->owning_mm;
+       int i, num_pages = umem->num_pages;
+
+       for (i = 0; num_pages; i++) {
+               int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
+
+               siw_free_plist(&umem->page_chunk[i], to_free,
+                              umem->writable && dirty);
+               kfree(umem->page_chunk[i].plist);
+               num_pages -= to_free;
+       }
+       atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+
+       mmdrop(mm_s);
+       kfree(umem->page_chunk);
+       kfree(umem);
+}
+
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+                  u64 start, u64 len, int rights)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+       struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+       u32 id, next;
+
+       if (!mem)
+               return -ENOMEM;
+
+       mem->mem_obj = mem_obj;
+       mem->stag_valid = 0;
+       mem->sdev = sdev;
+       mem->va = start;
+       mem->len = len;
+       mem->pd = pd;
+       mem->perms = rights & IWARP_ACCESS_MASK;
+       kref_init(&mem->ref);
+
+       mr->mem = mem;
+
+       get_random_bytes(&next, 4);
+       next &= 0x00ffffff;
+
+       if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
+           GFP_KERNEL) < 0) {
+               kfree(mem);
+               return -ENOMEM;
+       }
+       /* Set the STag index part */
+       mem->stag = id << 8;
+       mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
+
+       return 0;
+}
+
+void siw_mr_drop_mem(struct siw_mr *mr)
+{
+       struct siw_mem *mem = mr->mem, *found;
+
+       mem->stag_valid = 0;
+
+       /* make STag invalid visible asap */
+       smp_mb();
+
+       found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
+       WARN_ON(found != mem);
+       siw_mem_put(mem);
+}
+
+void siw_free_mem(struct kref *ref)
+{
+       struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
+
+       siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
+
+       if (!mem->is_mw && mem->mem_obj) {
+               if (mem->is_pbl == 0)
+                       siw_umem_release(mem->umem, true);
+               else
+                       kfree(mem->pbl);
+       }
+       kfree(mem);
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STag state, access permissions and
+ * address range for memory object.
+ *
+ * @pd:                Protection Domain memory should belong to
+ * @mem:       memory to be checked
+ * @addr:      starting addr of mem
+ * @perms:     requested access permissions
+ * @len:       len of memory interval to be checked
+ *
+ */
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+                 enum ib_access_flags perms, int len)
+{
+       if (!mem->stag_valid) {
+               siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
+               return -E_STAG_INVALID;
+       }
+       if (mem->pd != pd) {
+               siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
+               return -E_PD_MISMATCH;
+       }
+       /*
+        * check access permissions
+        */
+       if ((mem->perms & perms) < perms) {
+               siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
+                          mem->perms, perms);
+               return -E_ACCESS_PERM;
+       }
+       /*
+        * Check if access falls into valid memory interval.
+        */
+       if (addr < mem->va || addr + len > mem->va + mem->len) {
+               siw_dbg_pd(pd, "MEM interval len %d\n", len);
+               siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
+                          (unsigned long long)addr,
+                          (unsigned long long)(addr + len));
+               siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
+                          (unsigned long long)mem->va,
+                          (unsigned long long)(mem->va + mem->len),
+                          mem->stag);
+
+               return -E_BASE_BOUNDS;
+       }
+       return E_ACCESS_OK;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd:                Protection Domain memory should belong to
+ * @sge:       SGE to be checked
+ * @mem:       location of memory reference within array
+ * @perms:     requested access permissions
+ * @off:       starting offset in SGE
+ * @len:       len of memory interval to be checked
+ *
+ * NOTE: This function references the SGE's memory object (mem->obj)
+ * if not yet done. The new reference is kept if the check succeeds and
+ * released if it fails. If mem->obj is already valid, no new
+ * lookup is done and the memory is not released if the check fails.
+ */
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
+                 enum ib_access_flags perms, u32 off, int len)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mem *new = NULL;
+       int rv = E_ACCESS_OK;
+
+       if (len + off > sge->length) {
+               rv = -E_BASE_BOUNDS;
+               goto fail;
+       }
+       if (*mem == NULL) {
+               new = siw_mem_id2obj(sdev, sge->lkey >> 8);
+               if (unlikely(!new)) {
+                       siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
+                       rv = -E_STAG_INVALID;
+                       goto fail;
+               }
+               *mem = new;
+       }
+       /* Check if user re-registered with different STag key */
+       if (unlikely((*mem)->stag != sge->lkey)) {
+               siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
+               rv = -E_STAG_INVALID;
+               goto fail;
+       }
+       rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
+       if (unlikely(rv))
+               goto fail;
+
+       return 0;
+
+fail:
+       if (new) {
+               *mem = NULL;
+               siw_mem_put(new);
+       }
+       return rv;
+}
+
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
+{
+       switch (op) {
+       case SIW_OP_SEND:
+       case SIW_OP_WRITE:
+       case SIW_OP_SEND_WITH_IMM:
+       case SIW_OP_SEND_REMOTE_INV:
+       case SIW_OP_READ:
+       case SIW_OP_READ_LOCAL_INV:
+               if (!(wqe->sqe.flags & SIW_WQE_INLINE))
+                       siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
+               break;
+
+       case SIW_OP_RECEIVE:
+               siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
+               break;
+
+       case SIW_OP_READ_RESPONSE:
+               siw_unref_mem_sgl(wqe->mem, 1);
+               break;
+
+       default:
+               /*
+                * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
+                * do not hold memory references
+                */
+               break;
+       }
+}
+
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
+       int rv = 0;
+
+       if (unlikely(!mem)) {
+               siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
+               return -EINVAL;
+       }
+       if (unlikely(mem->pd != pd)) {
+               siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
+               rv = -EACCES;
+               goto out;
+       }
+       /*
+        * Per RDMA verbs definition, an STag may already be in invalid
+        * state if invalidation is requested. So no state check here.
+        */
+       mem->stag_valid = 0;
+
+       siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
+out:
+       siw_mem_put(mem);
+       return rv;
+}
+
+/*
+ * Gets the physical address backed by a PBL element. The address is
+ * referenced by a linear byte offset into the list of variably sized
+ * PB elements. Optionally provides the remaining length within the
+ * current element, and the current PBL index for later resumption at
+ * the same element.
+ */
+u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
+{
+       int i = idx ? *idx : 0;
+
+       while (i < pbl->num_buf) {
+               struct siw_pble *pble = &pbl->pbe[i];
+
+               if (pble->pbl_off + pble->size > off) {
+                       u64 pble_off = off - pble->pbl_off;
+
+                       if (len)
+                               *len = pble->size - pble_off;
+                       if (idx)
+                               *idx = i;
+
+                       return pble->addr + pble_off;
+               }
+               i++;
+       }
+       if (len)
+               *len = 0;
+       return 0;
+}
+
+struct siw_pbl *siw_pbl_alloc(u32 num_buf)
+{
+       struct siw_pbl *pbl;
+       int buf_size = sizeof(*pbl);
+
+       if (num_buf == 0)
+               return ERR_PTR(-EINVAL);
+
+       buf_size += ((num_buf - 1) * sizeof(struct siw_pble));
+
+       pbl = kzalloc(buf_size, GFP_KERNEL);
+       if (!pbl)
+               return ERR_PTR(-ENOMEM);
+
+       pbl->max_buf = num_buf;
+
+       return pbl;
+}
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
+{
+       struct siw_umem *umem;
+       struct mm_struct *mm_s;
+       u64 first_page_va;
+       unsigned long mlock_limit;
+       unsigned int foll_flags = FOLL_WRITE;
+       int num_pages, num_chunks, i, rv = 0;
+
+       if (!can_do_mlock())
+               return ERR_PTR(-EPERM);
+
+       if (!len)
+               return ERR_PTR(-EINVAL);
+
+       first_page_va = start & PAGE_MASK;
+       num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
+       num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
+
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!umem)
+               return ERR_PTR(-ENOMEM);
+
+       mm_s = current->mm;
+       umem->owning_mm = mm_s;
+       umem->writable = writable;
+
+       mmgrab(mm_s);
+
+       if (!writable)
+               foll_flags |= FOLL_FORCE;
+
+       down_read(&mm_s->mmap_sem);
+
+       mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
+               rv = -ENOMEM;
+               goto out_sem_up;
+       }
+       umem->fp_addr = first_page_va;
+
+       umem->page_chunk =
+               kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
+       if (!umem->page_chunk) {
+               rv = -ENOMEM;
+               goto out_sem_up;
+       }
+       for (i = 0; num_pages; i++) {
+               int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
+
+               umem->page_chunk[i].plist =
+                       kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
+               if (!umem->page_chunk[i].plist) {
+                       rv = -ENOMEM;
+                       goto out_sem_up;
+               }
+               got = 0;
+               while (nents) {
+                       struct page **plist = &umem->page_chunk[i].plist[got];
+
+                       rv = get_user_pages(first_page_va, nents,
+                                           foll_flags | FOLL_LONGTERM,
+                                           plist, NULL);
+                       if (rv < 0)
+                               goto out_sem_up;
+
+                       umem->num_pages += rv;
+                       atomic64_add(rv, &mm_s->pinned_vm);
+                       first_page_va += rv * PAGE_SIZE;
+                       nents -= rv;
+                       got += rv;
+               }
+               num_pages -= got;
+       }
+out_sem_up:
+       up_read(&mm_s->mmap_sem);
+
+       if (rv > 0)
+               return umem;
+
+       siw_umem_release(umem, false);
+
+       return ERR_PTR(rv);
+}
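
The STag handling above packs the 24-bit xarray id into the upper bits of the
32-bit STag, with the lowest byte reserved as a key that this driver leaves at
zero. A minimal sketch, not part of the patch and using made-up helper names,
of that layout:

    #include <stdint.h>

    /* Build an STag from an xarray id and a key byte; siw_mem_add()
     * and siw_mr_add_mem() effectively use key == 0.
     */
    static inline uint32_t stag_compose(uint32_t id, uint8_t key)
    {
            return (id << 8) | key;
    }

    /* Recover the index part, as done by callers of siw_mem_id2obj(),
     * e.g. siw_invalidate_stag() passing stag >> 8.
     */
    static inline uint32_t stag_index(uint32_t stag)
    {
            return stag >> 8;
    }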
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
new file mode 100644 (file)
index 0000000..f43daf2
--- /dev/null
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_MEM_H
+#define _SIW_MEM_H
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
+void siw_umem_release(struct siw_umem *umem, bool dirty);
+struct siw_pbl *siw_pbl_alloc(u32 num_buf);
+u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+                 enum ib_access_flags perms, int len);
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
+                 struct siw_mem *mem[], enum ib_access_flags perms,
+                 u32 off, int len);
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+                  u64 start, u64 len, int rights);
+void siw_mr_drop_mem(struct siw_mr *mr);
+void siw_free_mem(struct kref *ref);
+
+static inline void siw_mem_put(struct siw_mem *mem)
+{
+       kref_put(&mem->ref, siw_free_mem);
+}
+
+static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
+{
+       return container_of(m, struct siw_mr, mem);
+}
+
+static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
+{
+       while (num_sge) {
+               if (*mem == NULL)
+                       break;
+
+               siw_mem_put(*mem);
+               *mem = NULL;
+               mem++;
+               num_sge--;
+       }
+}
+
+#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
+#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT)
+#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1))
+#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *))
+
+/*
+ * siw_get_upage()
+ *
+ * Get page pointer for address on given umem.
+ *
+ * @umem: two dimensional list of page pointers
+ * @addr: user virtual address
+ */
+static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
+{
+       unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT,
+                    chunk_idx = page_idx >> CHUNK_SHIFT,
+                    page_in_chunk = page_idx & ~CHUNK_MASK;
+
+       if (likely(page_idx < umem->num_pages))
+               return umem->page_chunk[chunk_idx].plist[page_in_chunk];
+
+       return NULL;
+}
+#endif
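
The chunking macros above split a pinned user buffer into chunks of
PAGES_PER_CHUNK = 1 << CHUNK_SHIFT = 512 page pointers each. A worked example
of siw_get_upage()'s arithmetic, assuming 4 KiB pages (PAGE_SHIFT == 12) and an
address 2 MiB past umem->fp_addr:

    page_idx      = 2 MiB >> 12          = 512
    chunk_idx     = 512 >> CHUNK_SHIFT   = 1
    page_in_chunk = 512 & ~CHUNK_MASK    = 0

so the page pointer lives at umem->page_chunk[1].plist[0]; correspondingly,
siw_umem_get() allocates (num_pages >> CHUNK_SHIFT) + 1 chunks, e.g. 3 chunks
for a 1300-page mapping.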
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
new file mode 100644 (file)
index 0000000..11383d9
--- /dev/null
@@ -0,0 +1,1322 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/llist.h>
+#include <asm/barrier.h>
+#include <net/tcp.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+       [SIW_QP_STATE_IDLE] = "IDLE",
+       [SIW_QP_STATE_RTR] = "RTR",
+       [SIW_QP_STATE_RTS] = "RTS",
+       [SIW_QP_STATE_CLOSING] = "CLOSING",
+       [SIW_QP_STATE_TERMINATE] = "TERMINATE",
+       [SIW_QP_STATE_ERROR] = "ERROR"
+};
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep the order of the initializers. All MPA
+ * lengths are initialized to the minimum packet size.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
+       { /* RDMAP_RDMA_WRITE */
+         .hdr_len = sizeof(struct iwarp_rdma_write),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+                                cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_RDMA_WRITE),
+         .rx_data = siw_proc_write },
+       { /* RDMAP_RDMA_READ_REQ */
+         .hdr_len = sizeof(struct iwarp_rdma_rreq),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_RDMA_READ_REQ),
+         .rx_data = siw_proc_rreq },
+       { /* RDMAP_RDMA_READ_RESP */
+         .hdr_len = sizeof(struct iwarp_rdma_rresp),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+                                cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_RDMA_READ_RESP),
+         .rx_data = siw_proc_rresp },
+       { /* RDMAP_SEND */
+         .hdr_len = sizeof(struct iwarp_send),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_SEND),
+         .rx_data = siw_proc_send },
+       { /* RDMAP_SEND_INVAL */
+         .hdr_len = sizeof(struct iwarp_send_inv),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_SEND_INVAL),
+         .rx_data = siw_proc_send },
+       { /* RDMAP_SEND_SE */
+         .hdr_len = sizeof(struct iwarp_send),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_SEND_SE),
+         .rx_data = siw_proc_send },
+       { /* RDMAP_SEND_SE_INVAL */
+         .hdr_len = sizeof(struct iwarp_send_inv),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_SEND_SE_INVAL),
+         .rx_data = siw_proc_send },
+       { /* RDMAP_TERMINATE */
+         .hdr_len = sizeof(struct iwarp_terminate),
+         .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+         .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+                                cpu_to_be16(RDMAP_VERSION << 6) |
+                                cpu_to_be16(RDMAP_TERMINATE),
+         .rx_data = siw_proc_terminate }
+};
+
+void siw_qp_llp_data_ready(struct sock *sk)
+{
+       struct siw_qp *qp;
+
+       read_lock(&sk->sk_callback_lock);
+
+       if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
+               goto done;
+
+       qp = sk_to_qp(sk);
+
+       if (likely(!qp->rx_stream.rx_suspend &&
+                  down_read_trylock(&qp->state_lock))) {
+               read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
+
+               if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+                       /*
+                        * Implements data receive operation during
+                        * socket callback. TCP gracefully catches
+                        * the case where there is nothing to receive
+                        * (not calling siw_tcp_rx_data() then).
+                        */
+                       tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+               up_read(&qp->state_lock);
+       } else {
+               siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
+                          qp->rx_stream.rx_suspend);
+       }
+done:
+       read_unlock(&sk->sk_callback_lock);
+}
+
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+       siw_dbg_qp(qp, "enter llp close, state = %s\n",
+                  siw_qp_state_to_string[qp->attrs.state]);
+
+       down_write(&qp->state_lock);
+
+       qp->rx_stream.rx_suspend = 1;
+       qp->tx_ctx.tx_suspend = 1;
+       qp->attrs.sk = NULL;
+
+       switch (qp->attrs.state) {
+       case SIW_QP_STATE_RTS:
+       case SIW_QP_STATE_RTR:
+       case SIW_QP_STATE_IDLE:
+       case SIW_QP_STATE_TERMINATE:
+               qp->attrs.state = SIW_QP_STATE_ERROR;
+               break;
+       /*
+        * SIW_QP_STATE_CLOSING:
+        *
+        * This is a forced close. Shall the QP be moved to
+        * ERROR or IDLE?
+        */
+       case SIW_QP_STATE_CLOSING:
+               if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+                       qp->attrs.state = SIW_QP_STATE_ERROR;
+               else
+                       qp->attrs.state = SIW_QP_STATE_IDLE;
+               break;
+
+       default:
+               siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
+                          siw_qp_state_to_string[qp->attrs.state]);
+               break;
+       }
+       siw_sq_flush(qp);
+       siw_rq_flush(qp);
+
+       /*
+        * Dereference closing CEP
+        */
+       if (qp->cep) {
+               siw_cep_put(qp->cep);
+               qp->cep = NULL;
+       }
+
+       up_write(&qp->state_lock);
+
+       siw_dbg_qp(qp, "llp close exit: state %s\n",
+                  siw_qp_state_to_string[qp->attrs.state]);
+}
+
+/*
+ * Socket callback routine informing about newly available send space.
+ * Schedules SQ work for processing SQ items.
+ */
+void siw_qp_llp_write_space(struct sock *sk)
+{
+       struct siw_cep *cep = sk_to_cep(sk);
+
+       cep->sk_write_space(sk);
+
+       if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+               (void)siw_sq_start(cep->qp);
+}
+
+static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
+{
+       irq_size = roundup_pow_of_two(irq_size);
+       orq_size = roundup_pow_of_two(orq_size);
+
+       qp->attrs.irq_size = irq_size;
+       qp->attrs.orq_size = orq_size;
+
+       qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
+       if (!qp->irq) {
+               siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
+               qp->attrs.irq_size = 0;
+               return -ENOMEM;
+       }
+       qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
+       if (!qp->orq) {
+               siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
+               qp->attrs.orq_size = 0;
+               qp->attrs.irq_size = 0;
+               vfree(qp->irq);
+               return -ENOMEM;
+       }
+       siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
+       return 0;
+}
+
+static int siw_qp_enable_crc(struct siw_qp *qp)
+{
+       struct siw_rx_stream *c_rx = &qp->rx_stream;
+       struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+       int size = crypto_shash_descsize(siw_crypto_shash) +
+                       sizeof(struct shash_desc);
+
+       if (siw_crypto_shash == NULL)
+               return -ENOENT;
+
+       c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+       c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+       if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
+               kfree(c_tx->mpa_crc_hd);
+               kfree(c_rx->mpa_crc_hd);
+               c_tx->mpa_crc_hd = NULL;
+               c_rx->mpa_crc_hd = NULL;
+               return -ENOMEM;
+       }
+       c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
+       c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
+
+       return 0;
+}
+
+/*
+ * Send a non-signalled READ or WRITE to the peer side as negotiated
+ * with the MPAv2 P2P setup protocol. The work request is only created
+ * as the current active WR and does not consume Send Queue space.
+ *
+ * Caller must hold QP state lock.
+ */
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
+{
+       struct siw_wqe *wqe = tx_wqe(qp);
+       unsigned long flags;
+       int rv = 0;
+
+       spin_lock_irqsave(&qp->sq_lock, flags);
+
+       if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+               spin_unlock_irqrestore(&qp->sq_lock, flags);
+               return -EIO;
+       }
+       memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+
+       wqe->wr_status = SIW_WR_QUEUED;
+       wqe->sqe.flags = 0;
+       wqe->sqe.num_sge = 1;
+       wqe->sqe.sge[0].length = 0;
+       wqe->sqe.sge[0].laddr = 0;
+       wqe->sqe.sge[0].lkey = 0;
+       /*
+        * While it must not be checked for inbound zero length
+        * READ/WRITE, some HW may treat STag 0 specially.
+        */
+       wqe->sqe.rkey = 1;
+       wqe->sqe.raddr = 0;
+       wqe->processed = 0;
+
+       if (ctrl & MPA_V2_RDMA_WRITE_RTR)
+               wqe->sqe.opcode = SIW_OP_WRITE;
+       else if (ctrl & MPA_V2_RDMA_READ_RTR) {
+               struct siw_sqe *rreq;
+
+               wqe->sqe.opcode = SIW_OP_READ;
+
+               spin_lock(&qp->orq_lock);
+
+               rreq = orq_get_free(qp);
+               if (rreq) {
+                       siw_read_to_orq(rreq, &wqe->sqe);
+                       qp->orq_put++;
+               } else
+                       rv = -EIO;
+
+               spin_unlock(&qp->orq_lock);
+       } else
+               rv = -EINVAL;
+
+       if (rv)
+               wqe->wr_status = SIW_WR_IDLE;
+
+       spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+       if (!rv)
+               rv = siw_sq_start(qp);
+
+       return rv;
+}
+
+/*
+ * Map memory access error to DDP tagged error
+ */
+enum ddp_ecode siw_tagged_error(enum siw_access_state state)
+{
+       switch (state) {
+       case E_STAG_INVALID:
+               return DDP_ECODE_T_INVALID_STAG;
+       case E_BASE_BOUNDS:
+               return DDP_ECODE_T_BASE_BOUNDS;
+       case E_PD_MISMATCH:
+               return DDP_ECODE_T_STAG_NOT_ASSOC;
+       case E_ACCESS_PERM:
+               /*
+                * RFC 5041 (DDP) lacks an ecode for insufficient access
+                * permissions. 'Invalid STag' seems to be the closest
+                * match, though.
+                */
+               return DDP_ECODE_T_INVALID_STAG;
+       default:
+               WARN_ON(1);
+               return DDP_ECODE_T_INVALID_STAG;
+       }
+}
+
+/*
+ * Map memory access error to RDMAP protection error
+ */
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
+{
+       switch (state) {
+       case E_STAG_INVALID:
+               return RDMAP_ECODE_INVALID_STAG;
+       case E_BASE_BOUNDS:
+               return RDMAP_ECODE_BASE_BOUNDS;
+       case E_PD_MISMATCH:
+               return RDMAP_ECODE_STAG_NOT_ASSOC;
+       case E_ACCESS_PERM:
+               return RDMAP_ECODE_ACCESS_RIGHTS;
+       default:
+               return RDMAP_ECODE_UNSPECIFIED;
+       }
+}
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
+                       u8 ecode, int in_tx)
+{
+       if (!qp->term_info.valid) {
+               memset(&qp->term_info, 0, sizeof(qp->term_info));
+               qp->term_info.layer = layer;
+               qp->term_info.etype = etype;
+               qp->term_info.ecode = ecode;
+               qp->term_info.in_tx = in_tx;
+               qp->term_info.valid = 1;
+       }
+       siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
+                  layer, etype, ecode, in_tx ? "yes" : "no");
+}
+
+/*
+ * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
+ * Sending TERMINATE messages is best effort - such messages
+ * can only be sent if the QP is still connected and does
+ * not have another outbound message in progress, i.e. the
+ * TERMINATE message must not interfere with an incomplete current
+ * transmit operation.
+ */
+void siw_send_terminate(struct siw_qp *qp)
+{
+       struct kvec iov[3];
+       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+       struct iwarp_terminate *term = NULL;
+       union iwarp_hdr *err_hdr = NULL;
+       struct socket *s = qp->attrs.sk;
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       union iwarp_hdr *rx_hdr = &srx->hdr;
+       u32 crc = 0;
+       int num_frags, len_terminate, rv;
+
+       if (!qp->term_info.valid)
+               return;
+
+       qp->term_info.valid = 0;
+
+       if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
+               siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
+                          tx_type(tx_wqe(qp)));
+               return;
+       }
+       if (!s && qp->cep)
+               /* QP not yet in RTS. Take socket from connection end point */
+               s = qp->cep->sock;
+
+       if (!s) {
+               siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
+               return;
+       }
+
+       term = kzalloc(sizeof(*term), GFP_KERNEL);
+       if (!term)
+               return;
+
+       term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
+       term->ddp_mo = 0;
+       term->ddp_msn = cpu_to_be32(1);
+
+       iov[0].iov_base = term;
+       iov[0].iov_len = sizeof(*term);
+
+       if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
+           ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
+            (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
+               err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
+               if (!err_hdr) {
+                       kfree(term);
+                       return;
+               }
+       }
+       memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+              sizeof(struct iwarp_ctrl));
+
+       __rdmap_term_set_layer(term, qp->term_info.layer);
+       __rdmap_term_set_etype(term, qp->term_info.etype);
+       __rdmap_term_set_ecode(term, qp->term_info.ecode);
+
+       switch (qp->term_info.layer) {
+       case TERM_ERROR_LAYER_RDMAP:
+               if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
+                       /* No additional DDP/RDMAP header to be included */
+                       break;
+
+               if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
+                       /*
+                        * Complete RDMAP frame will get attached, and
+                        * DDP segment length is valid
+                        */
+                       term->flag_m = 1;
+                       term->flag_d = 1;
+                       term->flag_r = 1;
+
+                       if (qp->term_info.in_tx) {
+                               struct iwarp_rdma_rreq *rreq;
+                               struct siw_wqe *wqe = tx_wqe(qp);
+
+                               /* Inbound RREQ error, detected during
+                                * RRESP creation. Take state from
+                                * current TX work queue element to
+                                * reconstruct the peer's RREQ.
+                                */
+                               rreq = (struct iwarp_rdma_rreq *)err_hdr;
+
+                               memcpy(&rreq->ctrl,
+                                      &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+                                      sizeof(struct iwarp_ctrl));
+
+                               rreq->rsvd = 0;
+                               rreq->ddp_qn =
+                                       htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+
+                               /* Provide RREQ's MSN as kept aside */
+                               rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
+
+                               rreq->ddp_mo = htonl(wqe->processed);
+                               rreq->sink_stag = htonl(wqe->sqe.rkey);
+                               rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
+                               rreq->read_size = htonl(wqe->sqe.sge[0].length);
+                               rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
+                               rreq->source_to =
+                                       cpu_to_be64(wqe->sqe.sge[0].laddr);
+
+                               iov[1].iov_base = rreq;
+                               iov[1].iov_len = sizeof(*rreq);
+
+                               rx_hdr = (union iwarp_hdr *)rreq;
+                       } else {
+                               /* Take RDMAP/DDP information from
+                                * current (failed) inbound frame.
+                                */
+                               iov[1].iov_base = rx_hdr;
+
+                               if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
+                                   RDMAP_RDMA_READ_REQ)
+                                       iov[1].iov_len =
+                                               sizeof(struct iwarp_rdma_rreq);
+                               else /* SEND type */
+                                       iov[1].iov_len =
+                                               sizeof(struct iwarp_send);
+                       }
+               } else {
+                       /* Do not report DDP hdr information if packet
+                        * layout is unknown
+                        */
+                       if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
+                           (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
+                               break;
+
+                       iov[1].iov_base = rx_hdr;
+
+                       /* Only DDP frame will get attached */
+                       if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+                               iov[1].iov_len =
+                                       sizeof(struct iwarp_rdma_write);
+                       else
+                               iov[1].iov_len = sizeof(struct iwarp_send);
+
+                       term->flag_m = 1;
+                       term->flag_d = 1;
+               }
+               term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
+               break;
+
+       case TERM_ERROR_LAYER_DDP:
+               /* Report error encountered while DDP processing.
+                * This can only happen as a result of inbound
+                * DDP processing
+                */
+
+               /* Do not report DDP hdr information if packet
+                * layout is unknown
+                */
+               if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
+                    (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
+                   ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
+                    (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
+                       break;
+
+               iov[1].iov_base = rx_hdr;
+
+               if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+                       iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
+               else
+                       iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
+
+               term->flag_m = 1;
+               term->flag_d = 1;
+               break;
+
+       default:
+               break;
+       }
+       if (term->flag_m || term->flag_d || term->flag_r) {
+               iov[2].iov_base = &crc;
+               iov[2].iov_len = sizeof(crc);
+               len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
+               num_frags = 3;
+       } else {
+               iov[1].iov_base = &crc;
+               iov[1].iov_len = sizeof(crc);
+               len_terminate = sizeof(*term) + MPA_CRC_SIZE;
+               num_frags = 2;
+       }
+
+       /* Adjust DDP Segment Length parameter, if valid */
+       if (term->flag_m) {
+               u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
+               enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
+
+               real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
+               rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
+       }
+
+       term->ctrl.mpa_len =
+               cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
+       if (qp->tx_ctx.mpa_crc_hd) {
+               crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
+               if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+                                       (u8 *)iov[0].iov_base,
+                                       iov[0].iov_len))
+                       goto out;
+
+               if (num_frags == 3) {
+                       if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+                                               (u8 *)iov[1].iov_base,
+                                               iov[1].iov_len))
+                               goto out;
+               }
+               crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
+       }
+
+       rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
+       siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
+                  rv == len_terminate ? "success" : "failure",
+                  __rdmap_term_layer(term), __rdmap_term_etype(term),
+                  __rdmap_term_ecode(term), rv);
+out:
+       kfree(term);
+       kfree(err_hdr);
+}
+
+/*
+ * Handle all attrs other than state
+ */
+static void siw_qp_modify_nonstate(struct siw_qp *qp,
+                                  struct siw_qp_attrs *attrs,
+                                  enum siw_qp_attr_mask mask)
+{
+       if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+               if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+                       qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+               else
+                       qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+               if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+                       qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+               else
+                       qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+               if (attrs->flags & SIW_RDMA_READ_ENABLED)
+                       qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+               else
+                       qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
+       }
+}
+
+static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
+                                     struct siw_qp_attrs *attrs,
+                                     enum siw_qp_attr_mask mask)
+{
+       int rv = 0;
+
+       switch (attrs->state) {
+       case SIW_QP_STATE_RTS:
+               if (attrs->flags & SIW_MPA_CRC) {
+                       rv = siw_qp_enable_crc(qp);
+                       if (rv)
+                               break;
+               }
+               if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+                       siw_dbg_qp(qp, "no socket\n");
+                       rv = -EINVAL;
+                       break;
+               }
+               if (!(mask & SIW_QP_ATTR_MPA)) {
+                       siw_dbg_qp(qp, "no MPA\n");
+                       rv = -EINVAL;
+                       break;
+               }
+               /*
+                * Initialize iWARP TX state
+                */
+               qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+               qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+               qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+               /*
+                * Initialize iWARP RX state
+                */
+               qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+               qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+               qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+               /*
+                * init IRD free queue, caller has already checked
+                * limits.
+                */
+               rv = siw_qp_readq_init(qp, attrs->irq_size,
+                                      attrs->orq_size);
+               if (rv)
+                       break;
+
+               qp->attrs.sk = attrs->sk;
+               qp->attrs.state = SIW_QP_STATE_RTS;
+
+               siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
+                          attrs->flags & SIW_MPA_CRC ? "y" : "n",
+                          qp->attrs.orq_size, qp->attrs.irq_size);
+               break;
+
+       case SIW_QP_STATE_ERROR:
+               siw_rq_flush(qp);
+               qp->attrs.state = SIW_QP_STATE_ERROR;
+               if (qp->cep) {
+                       siw_cep_put(qp->cep);
+                       qp->cep = NULL;
+               }
+               break;
+
+       default:
+               break;
+       }
+       return rv;
+}
+
+static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
+                                    struct siw_qp_attrs *attrs)
+{
+       int drop_conn = 0;
+
+       switch (attrs->state) {
+       case SIW_QP_STATE_CLOSING:
+               /*
+                * Verbs: move to IDLE if SQ and ORQ are empty.
+                * Move to ERROR otherwise. But first of all we must
+                * close the connection. So we keep CLOSING or ERROR
+                * as a transient state, schedule connection drop work
+                * and wait for the socket state change upcall to
+                * come back closed.
+                */
+               if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
+                       qp->attrs.state = SIW_QP_STATE_CLOSING;
+               } else {
+                       qp->attrs.state = SIW_QP_STATE_ERROR;
+                       siw_sq_flush(qp);
+               }
+               siw_rq_flush(qp);
+
+               drop_conn = 1;
+               break;
+
+       case SIW_QP_STATE_TERMINATE:
+               qp->attrs.state = SIW_QP_STATE_TERMINATE;
+
+               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                  RDMAP_ETYPE_CATASTROPHIC,
+                                  RDMAP_ECODE_UNSPECIFIED, 1);
+               drop_conn = 1;
+               break;
+
+       case SIW_QP_STATE_ERROR:
+               /*
+                * This is an emergency close.
+                *
+                * Any in-progress transmit operation will get
+                * cancelled.
+                * This will likely result in a protocol failure,
+                * if a TX operation is in transit. The caller
+                * could wait unconditionally to give the current
+                * operation a chance to complete.
+                * Especially, how to handle the non-empty IRQ case?
+                * The peer was asking for a data transfer at a valid
+                * point in time.
+                */
+               siw_sq_flush(qp);
+               siw_rq_flush(qp);
+               qp->attrs.state = SIW_QP_STATE_ERROR;
+               drop_conn = 1;
+               break;
+
+       default:
+               break;
+       }
+       return drop_conn;
+}
+
+static void siw_qp_nextstate_from_term(struct siw_qp *qp,
+                                      struct siw_qp_attrs *attrs)
+{
+       switch (attrs->state) {
+       case SIW_QP_STATE_ERROR:
+               siw_rq_flush(qp);
+               qp->attrs.state = SIW_QP_STATE_ERROR;
+
+               if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+                       siw_sq_flush(qp);
+               break;
+
+       default:
+               break;
+       }
+}
+
+static int siw_qp_nextstate_from_close(struct siw_qp *qp,
+                                      struct siw_qp_attrs *attrs)
+{
+       int rv = 0;
+
+       switch (attrs->state) {
+       case SIW_QP_STATE_IDLE:
+               WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
+               qp->attrs.state = SIW_QP_STATE_IDLE;
+               break;
+
+       case SIW_QP_STATE_CLOSING:
+               /*
+                * The LLP may already have moved the QP to CLOSING
+                * due to a graceful peer close initiation.
+                */
+               break;
+
+       case SIW_QP_STATE_ERROR:
+               /*
+                * QP was moved to CLOSING by LLP event
+                * not yet seen by user.
+                */
+               qp->attrs.state = SIW_QP_STATE_ERROR;
+
+               if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+                       siw_sq_flush(qp);
+
+               siw_rq_flush(qp);
+               break;
+
+       default:
+               siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+                          siw_qp_state_to_string[qp->attrs.state],
+                          siw_qp_state_to_string[attrs->state]);
+
+               rv = -ECONNABORTED;
+       }
+       return rv;
+}
+
+/*
+ * Caller must hold qp->state_lock
+ */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+                 enum siw_qp_attr_mask mask)
+{
+       int drop_conn = 0, rv = 0;
+
+       if (!mask)
+               return 0;
+
+       siw_dbg_qp(qp, "state: %s => %s\n",
+                  siw_qp_state_to_string[qp->attrs.state],
+                  siw_qp_state_to_string[attrs->state]);
+
+       if (mask != SIW_QP_ATTR_STATE)
+               siw_qp_modify_nonstate(qp, attrs, mask);
+
+       if (!(mask & SIW_QP_ATTR_STATE))
+               return 0;
+
+       switch (qp->attrs.state) {
+       case SIW_QP_STATE_IDLE:
+       case SIW_QP_STATE_RTR:
+               rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
+               break;
+
+       case SIW_QP_STATE_RTS:
+               drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
+               break;
+
+       case SIW_QP_STATE_TERMINATE:
+               siw_qp_nextstate_from_term(qp, attrs);
+               break;
+
+       case SIW_QP_STATE_CLOSING:
+               siw_qp_nextstate_from_close(qp, attrs);
+               break;
+       default:
+               break;
+       }
+       if (drop_conn)
+               siw_qp_cm_drop(qp, 0);
+
+       return rv;
+}
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
+{
+       rreq->id = sqe->id;
+       rreq->opcode = sqe->opcode;
+       rreq->sge[0].laddr = sqe->sge[0].laddr;
+       rreq->sge[0].length = sqe->sge[0].length;
+       rreq->sge[0].lkey = sqe->sge[0].lkey;
+       rreq->sge[1].lkey = sqe->sge[1].lkey;
+       rreq->flags = sqe->flags | SIW_WQE_VALID;
+       rreq->num_sge = 1;
+}
+
+/*
+ * Must be called with the SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the active IRQ will not be served once qp->irq_burst reaches
+ * SIW_IRQ_MAXBURST_SQ_ACTIVE, if the SQ has pending work.
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+       struct siw_sqe *irqe, *sqe;
+       struct siw_wqe *wqe = tx_wqe(qp);
+       int rv = 1;
+
+       irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+       if (irqe->flags & SIW_WQE_VALID) {
+               sqe = sq_get_next(qp);
+
+               /*
+                * Avoid local WQE processing starvation in case
+                * of constant inbound READ request stream
+                */
+               if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+                       qp->irq_burst = 0;
+                       goto skip_irq;
+               }
+               memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+               wqe->wr_status = SIW_WR_QUEUED;
+
+               /* start READ RESPONSE */
+               wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+               wqe->sqe.flags = 0;
+               if (irqe->num_sge) {
+                       wqe->sqe.num_sge = 1;
+                       wqe->sqe.sge[0].length = irqe->sge[0].length;
+                       wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+                       wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+               } else {
+                       wqe->sqe.num_sge = 0;
+               }
+
+               /* Retain original RREQ's message sequence number for
+                * potential error reporting cases.
+                */
+               wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+               wqe->sqe.rkey = irqe->rkey;
+               wqe->sqe.raddr = irqe->raddr;
+
+               wqe->processed = 0;
+               qp->irq_get++;
+
+               /* mark current IRQ entry free */
+               smp_store_mb(irqe->flags, 0);
+
+               goto out;
+       }
+       sqe = sq_get_next(qp);
+       if (sqe) {
+skip_irq:
+               memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+               wqe->wr_status = SIW_WR_QUEUED;
+
+               /* First copy SQE to kernel private memory */
+               memcpy(&wqe->sqe, sqe, sizeof(*sqe));
+
+               if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+                       rv = -EINVAL;
+                       goto out;
+               }
+               if (wqe->sqe.flags & SIW_WQE_INLINE) {
+                       if (wqe->sqe.opcode != SIW_OP_SEND &&
+                           wqe->sqe.opcode != SIW_OP_WRITE) {
+                               rv = -EINVAL;
+                               goto out;
+                       }
+                       if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+                               rv = -EINVAL;
+                               goto out;
+                       }
+                       wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
+                       wqe->sqe.sge[0].lkey = 0;
+                       wqe->sqe.num_sge = 1;
+               }
+               if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+                       /* A READ cannot be fenced */
+                       if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+                                    wqe->sqe.opcode ==
+                                            SIW_OP_READ_LOCAL_INV)) {
+                               siw_dbg_qp(qp, "cannot fence read\n");
+                               rv = -EINVAL;
+                               goto out;
+                       }
+                       spin_lock(&qp->orq_lock);
+
+                       if (!siw_orq_empty(qp)) {
+                               qp->tx_ctx.orq_fence = 1;
+                               rv = 0;
+                       }
+                       spin_unlock(&qp->orq_lock);
+
+               } else if (wqe->sqe.opcode == SIW_OP_READ ||
+                          wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+                       struct siw_sqe *rreq;
+
+                       wqe->sqe.num_sge = 1;
+
+                       spin_lock(&qp->orq_lock);
+
+                       rreq = orq_get_free(qp);
+                       if (rreq) {
+                               /*
+                                * Make an immediate copy in ORQ to be ready
+                                * to process loopback READ reply
+                                */
+                               siw_read_to_orq(rreq, &wqe->sqe);
+                               qp->orq_put++;
+                       } else {
+                               qp->tx_ctx.orq_fence = 1;
+                               rv = 0;
+                       }
+                       spin_unlock(&qp->orq_lock);
+               }
+
+               /* Clear SQE, can be re-used by application */
+               smp_store_mb(sqe->flags, 0);
+               qp->sq_get++;
+       } else {
+               rv = 0;
+       }
+out:
+       if (unlikely(rv < 0)) {
+               siw_dbg_qp(qp, "error %d\n", rv);
+               wqe->wr_status = SIW_WR_IDLE;
+       }
+       return rv;
+}
+
+/*
+ * Check if current CQ state qualifies for calling CQ completion
+ * handler. Must be called with CQ lock held.
+ */
+static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
+{
+       u64 cq_notify;
+
+       if (!cq->base_cq.comp_handler)
+               return false;
+
+       cq_notify = READ_ONCE(*cq->notify);
+
+       if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
+           ((cq_notify & SIW_NOTIFY_SOLICITED) &&
+            (flags & SIW_WQE_SOLICITED))) {
+               /* dis-arm CQ */
+               smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
+
+               return true;
+       }
+       return false;
+}
+
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+                    enum siw_wc_status status)
+{
+       struct siw_cq *cq = qp->scq;
+       int rv = 0;
+
+       if (cq) {
+               u32 sqe_flags = sqe->flags;
+               struct siw_cqe *cqe;
+               u32 idx;
+               unsigned long flags;
+
+               spin_lock_irqsave(&cq->lock, flags);
+
+               idx = cq->cq_put % cq->num_cqe;
+               cqe = &cq->queue[idx];
+
+               if (!READ_ONCE(cqe->flags)) {
+                       bool notify;
+
+                       cqe->id = sqe->id;
+                       cqe->opcode = sqe->opcode;
+                       cqe->status = status;
+                       cqe->imm_data = 0;
+                       cqe->bytes = bytes;
+
+                       if (cq->kernel_verbs)
+                               cqe->base_qp = qp->ib_qp;
+                       else
+                               cqe->qp_id = qp_id(qp);
+
+                       /* mark CQE valid for application */
+                       WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+                       /* recycle SQE */
+                       smp_store_mb(sqe->flags, 0);
+
+                       cq->cq_put++;
+                       notify = siw_cq_notify_now(cq, sqe_flags);
+
+                       spin_unlock_irqrestore(&cq->lock, flags);
+
+                       if (notify) {
+                               siw_dbg_cq(cq, "Call completion handler\n");
+                               cq->base_cq.comp_handler(&cq->base_cq,
+                                               cq->base_cq.cq_context);
+                       }
+               } else {
+                       spin_unlock_irqrestore(&cq->lock, flags);
+                       rv = -ENOMEM;
+                       siw_cq_event(cq, IB_EVENT_CQ_ERR);
+               }
+       } else {
+               /* recycle SQE */
+               smp_store_mb(sqe->flags, 0);
+       }
+       return rv;
+}
+
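+/*
+ * siw_rqe_complete()
+ *
+ * Post a work completion for a Receive Queue element to the QP's
+ * receive CQ, analogous to siw_sqe_complete(). For kernel consumers,
+ * a remote STag invalidation is reported via the SIW_WQE_REM_INVAL
+ * CQE flag together with the invalidated STag.
+ */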
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+                    u32 inval_stag, enum siw_wc_status status)
+{
+       struct siw_cq *cq = qp->rcq;
+       int rv = 0;
+
+       if (cq) {
+               struct siw_cqe *cqe;
+               u32 idx;
+               unsigned long flags;
+
+               spin_lock_irqsave(&cq->lock, flags);
+
+               idx = cq->cq_put % cq->num_cqe;
+               cqe = &cq->queue[idx];
+
+               if (!READ_ONCE(cqe->flags)) {
+                       bool notify;
+                       u8 cqe_flags = SIW_WQE_VALID;
+
+                       cqe->id = rqe->id;
+                       cqe->opcode = SIW_OP_RECEIVE;
+                       cqe->status = status;
+                       cqe->imm_data = 0;
+                       cqe->bytes = bytes;
+
+                       if (cq->kernel_verbs) {
+                               cqe->base_qp = qp->ib_qp;
+                               if (inval_stag) {
+                                       cqe_flags |= SIW_WQE_REM_INVAL;
+                                       cqe->inval_stag = inval_stag;
+                               }
+                       } else {
+                               cqe->qp_id = qp_id(qp);
+                       }
+                       /* mark CQE valid for application */
+                       WRITE_ONCE(cqe->flags, cqe_flags);
+                       /* recycle RQE */
+                       smp_store_mb(rqe->flags, 0);
+
+                       cq->cq_put++;
+                       notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
+
+                       spin_unlock_irqrestore(&cq->lock, flags);
+
+                       if (notify) {
+                               siw_dbg_cq(cq, "Call completion handler\n");
+                               cq->base_cq.comp_handler(&cq->base_cq,
+                                               cq->base_cq.cq_context);
+                       }
+               } else {
+                       spin_unlock_irqrestore(&cq->lock, flags);
+                       rv = -ENOMEM;
+                       siw_cq_event(cq, IB_EVENT_CQ_ERR);
+               }
+       } else {
+               /* recycle RQE */
+               smp_store_mb(rqe->flags, 0);
+       }
+       return rv;
+}
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORQ entries to CQ.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+       struct siw_sqe *sqe;
+       struct siw_wqe *wqe = tx_wqe(qp);
+       int async_event = 0;
+
+       /*
+        * Start with completing any work currently on the ORQ
+        */
+       while (qp->attrs.orq_size) {
+               sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
+               if (!READ_ONCE(sqe->flags))
+                       break;
+
+               if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+                       break;
+
+               WRITE_ONCE(sqe->flags, 0);
+               qp->orq_get++;
+       }
+       /*
+        * Flush an in-progress WQE if present
+        */
+       if (wqe->wr_status != SIW_WR_IDLE) {
+               siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
+                          tx_type(wqe), wqe->wr_status);
+
+               siw_wqe_put_mem(wqe, tx_type(wqe));
+
+               if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
+                   ((tx_type(wqe) != SIW_OP_READ &&
+                     tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
+                    wqe->wr_status == SIW_WR_QUEUED))
+                       /*
+                        * An in-progress Read Request is already in
+                        * the ORQ
+                        */
+                       siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+                                        SIW_WC_WR_FLUSH_ERR);
+
+               wqe->wr_status = SIW_WR_IDLE;
+       }
+       /*
+        * Flush the Send Queue
+        */
+       while (qp->attrs.sq_size) {
+               sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+               if (!READ_ONCE(sqe->flags))
+                       break;
+
+               async_event = 1;
+               if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+                       /*
+                        * Shall IB_EVENT_SQ_DRAINED be suppressed if work
+                        * completion fails?
+                        */
+                       break;
+
+               WRITE_ONCE(sqe->flags, 0);
+               qp->sq_get++;
+       }
+       if (async_event)
+               siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to CQ. Also
+ * takes care of pending active tagged and untagged
+ * inbound transfers, which have target memory
+ * referenced.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+       struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
+
+       /*
+        * Flush an in-progress untagged operation if present
+        */
+       if (wqe->wr_status != SIW_WR_IDLE) {
+               siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
+                          rx_type(wqe), wqe->wr_status);
+
+               siw_wqe_put_mem(wqe, rx_type(wqe));
+
+               if (rx_type(wqe) == SIW_OP_RECEIVE) {
+                       siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
+                                        0, SIW_WC_WR_FLUSH_ERR);
+               } else if (rx_type(wqe) != SIW_OP_READ &&
+                          rx_type(wqe) != SIW_OP_READ_RESPONSE &&
+                          rx_type(wqe) != SIW_OP_WRITE) {
+                       siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
+               }
+               wqe->wr_status = SIW_WR_IDLE;
+       }
+       wqe = &qp->rx_tagged.wqe_active;
+
+       if (wqe->wr_status != SIW_WR_IDLE) {
+               siw_wqe_put_mem(wqe, rx_type(wqe));
+               wqe->wr_status = SIW_WR_IDLE;
+       }
+       /*
+        * Flush the Receive Queue
+        */
+       while (qp->attrs.rq_size) {
+               struct siw_rqe *rqe =
+                       &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+
+               if (!READ_ONCE(rqe->flags))
+                       break;
+
+               if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+                       break;
+
+               WRITE_ONCE(rqe->flags, 0);
+               qp->rq_get++;
+       }
+}
+
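+/*
+ * siw_qp_add()
+ *
+ * Register a new QP with the device: allocate a QP number from the
+ * device's QP XArray and take the initial QP reference.
+ */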
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
+{
+       int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b,
+                         GFP_KERNEL);
+
+       if (!rv) {
+               kref_init(&qp->ref);
+               qp->sdev = sdev;
+               qp->qp_num = qp->ib_qp->qp_num;
+               siw_dbg_qp(qp, "new QP\n");
+       }
+       return rv;
+}
+
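+/*
+ * siw_free_qp()
+ *
+ * Final QP destruction after the last reference is dropped: releases
+ * an attached connection endpoint, removes the QP from the device
+ * XArray and device list, frees all work queues and defers freeing
+ * the QP structure to an RCU grace period.
+ */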
+void siw_free_qp(struct kref *ref)
+{
+       struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
+       struct siw_device *sdev = qp->sdev;
+       unsigned long flags;
+
+       if (qp->cep)
+               siw_cep_put(qp->cep);
+
+       found = xa_erase(&sdev->qp_xa, qp_id(qp));
+       WARN_ON(found != qp);
+       spin_lock_irqsave(&sdev->lock, flags);
+       list_del(&qp->devq);
+       spin_unlock_irqrestore(&sdev->lock, flags);
+
+       vfree(qp->sendq);
+       vfree(qp->recvq);
+       vfree(qp->irq);
+       vfree(qp->orq);
+
+       siw_put_tx_cpu(qp->tx_cpu);
+
+       atomic_dec(&sdev->num_qp);
+       siw_dbg_qp(qp, "free QP\n");
+       kfree_rcu(qp, rcu);
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644 (file)
index 0000000..f87657a
--- /dev/null
@@ -0,0 +1,1458 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ *
+ * @srx:       Receive Context
+ * @umem:      siw representation of target memory
+ * @dest_addr: user virtual address
+ * @len:       number of bytes to place
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+                      u64 dest_addr, int len)
+{
+       int copied = 0;
+
+       while (len) {
+               struct page *p;
+               int pg_off, bytes, rv;
+               void *dest;
+
+               p = siw_get_upage(umem, dest_addr);
+               if (unlikely(!p)) {
+                       pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
+                               __func__, qp_id(rx_qp(srx)),
+                               (void *)dest_addr, (void *)umem->fp_addr);
+                       /* siw internal error */
+                       srx->skb_copied += copied;
+                       srx->skb_new -= copied;
+
+                       return -EFAULT;
+               }
+               pg_off = dest_addr & ~PAGE_MASK;
+               bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+               siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
+
+               dest = kmap_atomic(p);
+               rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+                                  bytes);
+
+               if (unlikely(rv)) {
+                       kunmap_atomic(dest);
+                       srx->skb_copied += copied;
+                       srx->skb_new -= copied;
+
+                       pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+                               qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+                       return -EFAULT;
+               }
+               if (srx->mpa_crc_hd) {
+                       if (rx_qp(srx)->kernel_verbs) {
+                               crypto_shash_update(srx->mpa_crc_hd,
+                                       (u8 *)(dest + pg_off), bytes);
+                               kunmap_atomic(dest);
+                       } else {
+                               kunmap_atomic(dest);
+                               /*
+                                * Do CRC on original, not target buffer.
+                                * Some user land applications may
+                                * concurrently write the target buffer,
+                                * which would yield a broken CRC.
+                                * Walking the skb twice is very inefficient.
+                                * Folding the CRC into skb_copy_bits()
+                                * would be much better, but is currently
+                                * not supported.
+                                */
+                               siw_crc_skb(srx, bytes);
+                       }
+               } else {
+                       kunmap_atomic(dest);
+               }
+               srx->skb_offset += bytes;
+               copied += bytes;
+               len -= bytes;
+               dest_addr += bytes;
+               pg_off = 0;
+       }
+       srx->skb_copied += copied;
+       srx->skb_new -= copied;
+
+       return copied;
+}
+
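+/*
+ * siw_rx_kva()
+ *
+ * Receive @len bytes from the current skb position into the kernel
+ * virtual address @kva, updating the MPA CRC (if enabled) and the
+ * receive stream counters.
+ */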
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+       int rv;
+
+       siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
+
+       rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+       if (unlikely(rv)) {
+               pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
+                       qp_id(rx_qp(srx)), __func__, len, kva, rv);
+
+               return rv;
+       }
+       if (srx->mpa_crc_hd)
+               crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+       srx->skb_offset += len;
+       srx->skb_copied += len;
+       srx->skb_new -= len;
+
+       return len;
+}
+
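+/*
+ * siw_rx_pbl()
+ *
+ * Receive data into memory described by a physical buffer list:
+ * resolve the target chunk by chunk via siw_pbl_get_buffer() and
+ * copy each chunk with siw_rx_kva(). Returns the number of bytes
+ * placed.
+ */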
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+                     struct siw_mem *mem, u64 addr, int len)
+{
+       struct siw_pbl *pbl = mem->pbl;
+       u64 offset = addr - mem->va;
+       int copied = 0;
+
+       while (len) {
+               int bytes;
+               u64 buf_addr =
+                       siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
+               if (!buf_addr)
+                       break;
+
+               bytes = min(bytes, len);
+               if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
+                       copied += bytes;
+                       offset += bytes;
+                       len -= bytes;
+               } else {
+                       break;
+               }
+       }
+       return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values and update expected values for potential next
+ * fragment.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+                               struct siw_rx_fpdu *frx)
+{
+       struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
+       struct siw_wqe *wqe = &frx->wqe_active;
+       enum ddp_ecode ecode;
+
+       u32 sink_stag = be32_to_cpu(rresp->sink_stag);
+       u64 sink_to = be64_to_cpu(rresp->sink_to);
+
+       if (frx->first_ddp_seg) {
+               srx->ddp_stag = wqe->sqe.sge[0].lkey;
+               srx->ddp_to = wqe->sqe.sge[0].laddr;
+               frx->pbl_idx = 0;
+       }
+       /* Below checks extend beyond the semantics of DDP, and
+        * into RDMAP:
+        * We check if the read response matches exactly the
+        * read request which was sent to the remote peer to
+        * trigger this read response. RFC5040/5041 do not
+        * always have a proper error code for the detected
+        * error cases. We choose 'base or bounds error' for
+        * cases where the inbound STag is valid, but offset
+        * or length do not match our response receive state.
+        */
+       if (unlikely(srx->ddp_stag != sink_stag)) {
+               pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+                       qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
+               ecode = DDP_ECODE_T_INVALID_STAG;
+               goto error;
+       }
+       if (unlikely(srx->ddp_to != sink_to)) {
+               pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+                       qp_id(rx_qp(srx)), (unsigned long long)sink_to,
+                       (unsigned long long)srx->ddp_to);
+               ecode = DDP_ECODE_T_BASE_BOUNDS;
+               goto error;
+       }
+       if (unlikely(!frx->more_ddp_segs &&
+                    (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
+               pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+                       qp_id(rx_qp(srx)),
+                       wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+               ecode = DDP_ECODE_T_BASE_BOUNDS;
+               goto error;
+       }
+       return 0;
+error:
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_TAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values and update expected values for potential next
+ * fragment
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+                               struct siw_rx_fpdu *frx)
+{
+       struct iwarp_rdma_write *write = &srx->hdr.rwrite;
+       enum ddp_ecode ecode;
+
+       u32 sink_stag = be32_to_cpu(write->sink_stag);
+       u64 sink_to = be64_to_cpu(write->sink_to);
+
+       if (frx->first_ddp_seg) {
+               srx->ddp_stag = sink_stag;
+               srx->ddp_to = sink_to;
+               frx->pbl_idx = 0;
+       } else {
+               if (unlikely(srx->ddp_stag != sink_stag)) {
+                       pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+                               qp_id(rx_qp(srx)), sink_stag,
+                               srx->ddp_stag);
+                       ecode = DDP_ECODE_T_INVALID_STAG;
+                       goto error;
+               }
+               if (unlikely(srx->ddp_to != sink_to)) {
+                       pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+                               qp_id(rx_qp(srx)),
+                               (unsigned long long)sink_to,
+                               (unsigned long long)srx->ddp_to);
+                       ecode = DDP_ECODE_T_BASE_BOUNDS;
+                       goto error;
+               }
+       }
+       return 0;
+error:
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_TAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+                              struct siw_rx_fpdu *frx)
+{
+       struct iwarp_send_inv *send = &srx->hdr.send_inv;
+       struct siw_wqe *wqe = &frx->wqe_active;
+       enum ddp_ecode ecode;
+
+       u32 ddp_msn = be32_to_cpu(send->ddp_msn);
+       u32 ddp_mo = be32_to_cpu(send->ddp_mo);
+       u32 ddp_qn = be32_to_cpu(send->ddp_qn);
+
+       if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
+               pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+                       qp_id(rx_qp(srx)), ddp_qn);
+               ecode = DDP_ECODE_UT_INVALID_QN;
+               goto error;
+       }
+       if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+               pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+                       qp_id(rx_qp(srx)), ddp_msn,
+                       srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+               ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+               goto error;
+       }
+       if (unlikely(ddp_mo != wqe->processed)) {
+               pr_warn("siw: [QP %u], send mo: %u != %u\n",
+                       qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
+               ecode = DDP_ECODE_UT_INVALID_MO;
+               goto error;
+       }
+       if (frx->first_ddp_seg) {
+               /* initialize user memory write position */
+               frx->sge_idx = 0;
+               frx->sge_off = 0;
+               frx->pbl_idx = 0;
+
+               /* only valid for SEND_INV and SEND_SE_INV operations */
+               srx->inval_stag = be32_to_cpu(send->inval_stag);
+       }
+       if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+               siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+                          wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+               wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+               ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+               goto error;
+       }
+       return 0;
+error:
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
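+/*
+ * siw_rqe_get()
+ *
+ * Fetch the next valid receive queue element, either from an attached
+ * SRQ or from the QP's receive queue, and stage it as the active
+ * untagged RX work queue element. If an armed SRQ drops below its
+ * limit, an SRQ limit event is generated.
+ */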
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+       struct siw_rqe *rqe;
+       struct siw_srq *srq;
+       struct siw_wqe *wqe = NULL;
+       bool srq_event = false;
+       unsigned long flags;
+
+       srq = qp->srq;
+       if (srq) {
+               spin_lock_irqsave(&srq->lock, flags);
+               if (unlikely(!srq->num_rqe))
+                       goto out;
+
+               rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+       } else {
+               if (unlikely(!qp->recvq))
+                       goto out;
+
+               rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+       }
+       if (likely(rqe->flags == SIW_WQE_VALID)) {
+               int num_sge = rqe->num_sge;
+
+               if (likely(num_sge <= SIW_MAX_SGE)) {
+                       int i = 0;
+
+                       wqe = rx_wqe(&qp->rx_untagged);
+                       rx_type(wqe) = SIW_OP_RECEIVE;
+                       wqe->wr_status = SIW_WR_INPROGRESS;
+                       wqe->bytes = 0;
+                       wqe->processed = 0;
+
+                       wqe->rqe.id = rqe->id;
+                       wqe->rqe.num_sge = num_sge;
+
+                       while (i < num_sge) {
+                               wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+                               wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+                               wqe->rqe.sge[i].length = rqe->sge[i].length;
+                               wqe->bytes += wqe->rqe.sge[i].length;
+                               wqe->mem[i] = NULL;
+                               i++;
+                       }
+                       /* can be re-used by appl */
+                       smp_store_mb(rqe->flags, 0);
+               } else {
+                       siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+                       if (srq)
+                               spin_unlock_irqrestore(&srq->lock, flags);
+                       return NULL;
+               }
+               if (!srq) {
+                       qp->rq_get++;
+               } else {
+                       if (srq->armed) {
+                               /* Test SRQ limit */
+                               u32 off = (srq->rq_get + srq->limit) %
+                                         srq->num_rqe;
+                               struct siw_rqe *rqe2 = &srq->recvq[off];
+
+                               if (!(rqe2->flags & SIW_WQE_VALID)) {
+                                       srq->armed = 0;
+                                       srq_event = true;
+                               }
+                       }
+                       srq->rq_get++;
+               }
+       }
+out:
+       if (srq) {
+               spin_unlock_irqrestore(&srq->lock, flags);
+               if (srq_event)
+                       siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+       }
+       return wqe;
+}
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ *     0:       reached the end of a DDP segment
+ *     -EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_untagged;
+       struct siw_wqe *wqe;
+       u32 data_bytes; /* all data bytes available */
+       u32 rcvd_bytes; /* sum of data bytes rcvd */
+       int rv = 0;
+
+       if (frx->first_ddp_seg) {
+               wqe = siw_rqe_get(qp);
+               if (unlikely(!wqe)) {
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_UNTAGGED_BUF,
+                                          DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+                       return -ENOENT;
+               }
+       } else {
+               wqe = rx_wqe(frx);
+       }
+       if (srx->state == SIW_GET_DATA_START) {
+               rv = siw_send_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+               if (!srx->fpdu_part_rem) /* zero length SEND */
+                       return 0;
+       }
+       data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+       rcvd_bytes = 0;
+
+       /* A zero length SEND will skip below loop */
+       while (data_bytes) {
+               struct ib_pd *pd;
+               struct siw_mem **mem, *mem_p;
+               struct siw_sge *sge;
+               u32 sge_bytes; /* data bytes avail for SGE */
+
+               sge = &wqe->rqe.sge[frx->sge_idx];
+
+               if (!sge->length) {
+                       /* just skip empty sge's */
+                       frx->sge_idx++;
+                       frx->sge_off = 0;
+                       frx->pbl_idx = 0;
+                       continue;
+               }
+               sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+               mem = &wqe->mem[frx->sge_idx];
+
+               /*
+                * check with QP's PD if no SRQ present, SRQ's PD otherwise
+                */
+               pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+               rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+                                  frx->sge_off, sge_bytes);
+               if (unlikely(rv)) {
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_CATASTROPHIC,
+                                          DDP_ECODE_CATASTROPHIC, 0);
+
+                       siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+                       break;
+               }
+               mem_p = *mem;
+               if (mem_p->mem_obj == NULL)
+                       rv = siw_rx_kva(srx,
+                                       (void *)(sge->laddr + frx->sge_off),
+                                       sge_bytes);
+               else if (!mem_p->is_pbl)
+                       rv = siw_rx_umem(srx, mem_p->umem,
+                                        sge->laddr + frx->sge_off, sge_bytes);
+               else
+                       rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+                                       sge->laddr + frx->sge_off, sge_bytes);
+
+               if (unlikely(rv != sge_bytes)) {
+                       wqe->processed += rcvd_bytes;
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_CATASTROPHIC,
+                                          DDP_ECODE_CATASTROPHIC, 0);
+                       return -EINVAL;
+               }
+               frx->sge_off += rv;
+
+               if (frx->sge_off == sge->length) {
+                       frx->sge_idx++;
+                       frx->sge_off = 0;
+                       frx->pbl_idx = 0;
+               }
+               data_bytes -= rv;
+               rcvd_bytes += rv;
+
+               srx->fpdu_part_rem -= rv;
+               srx->fpdu_part_rcvd += rv;
+       }
+       wqe->processed += rcvd_bytes;
+
+       if (!srx->fpdu_part_rem)
+               return 0;
+
+       return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *     0:       reached the end of a DDP segment
+ *     -EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_tagged;
+       struct siw_mem *mem;
+       int bytes, rv;
+
+       if (srx->state == SIW_GET_DATA_START) {
+               if (!srx->fpdu_part_rem) /* zero length WRITE */
+                       return 0;
+
+               rv = siw_write_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+       }
+       bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+       if (frx->first_ddp_seg) {
+               struct siw_wqe *wqe = rx_wqe(frx);
+
+               rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+               if (unlikely(!rx_mem(frx))) {
+                       siw_dbg_qp(qp,
+                                  "sink stag not found/invalid, stag 0x%08x\n",
+                                  srx->ddp_stag);
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_TAGGED_BUF,
+                                          DDP_ECODE_T_INVALID_STAG, 0);
+                       return -EINVAL;
+               }
+               wqe->rqe.num_sge = 1;
+               rx_type(wqe) = SIW_OP_WRITE;
+               wqe->wr_status = SIW_WR_INPROGRESS;
+       }
+       mem = rx_mem(frx);
+
+       /*
+        * Check if application re-registered memory with different
+        * key field of STag.
+        */
+       if (unlikely(mem->stag != srx->ddp_stag)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_TAGGED_BUF,
+                                  DDP_ECODE_T_INVALID_STAG, 0);
+               return -EINVAL;
+       }
+       rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
+                          IB_ACCESS_REMOTE_WRITE, bytes);
+       if (unlikely(rv)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+                                  0);
+
+               siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+               return -EINVAL;
+       }
+
+       if (mem->mem_obj == NULL)
+               rv = siw_rx_kva(srx,
+                               (void *)(srx->ddp_to + srx->fpdu_part_rcvd),
+                               bytes);
+       else if (!mem->is_pbl)
+               rv = siw_rx_umem(srx, mem->umem,
+                                srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+       else
+               rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
+                               srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+
+       if (unlikely(rv != bytes)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_CATASTROPHIC,
+                                  DDP_ECODE_CATASTROPHIC, 0);
+               return -EINVAL;
+       }
+       srx->fpdu_part_rem -= rv;
+       srx->fpdu_part_rcvd += rv;
+
+       if (!srx->fpdu_part_rem) {
+               srx->ddp_to += srx->fpdu_part_rcvd;
+               return 0;
+       }
+       return -EAGAIN;
+}
+
+/*
+ * Inbound RREQs cannot carry user data.
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+
+       if (!srx->fpdu_part_rem)
+               return 0;
+
+       pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+               be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+       return -EPROTO;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *     0:      success,
+ *             failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+       struct siw_wqe *tx_work = tx_wqe(qp);
+       struct siw_sqe *resp;
+
+       uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+                laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+       uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+                lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+                rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+                msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+       int run_sq = 1, rv = 0;
+       unsigned long flags;
+
+       if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_UNTAGGED_BUF,
+                                  DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+               return -EPROTO;
+       }
+       spin_lock_irqsave(&qp->sq_lock, flags);
+
+       if (tx_work->wr_status == SIW_WR_IDLE) {
+               /*
+                * immediately schedule READ response w/o
+                * consuming IRQ entry: IRQ must be empty.
+                */
+               tx_work->processed = 0;
+               tx_work->mem[0] = NULL;
+               tx_work->wr_status = SIW_WR_QUEUED;
+               resp = &tx_work->sqe;
+       } else {
+               resp = irq_alloc_free(qp);
+               run_sq = 0;
+       }
+       if (likely(resp)) {
+               resp->opcode = SIW_OP_READ_RESPONSE;
+
+               resp->sge[0].length = length;
+               resp->sge[0].laddr = laddr;
+               resp->sge[0].lkey = lkey;
+
+               /* Keep aside message sequence number for potential
+                * error reporting during Read Response generation.
+                */
+               resp->sge[1].length = msn;
+
+               resp->raddr = raddr;
+               resp->rkey = rkey;
+               resp->num_sge = length ? 1 : 0;
+
+               /* RRESP now valid as current TX wqe or placed into IRQ */
+               smp_store_mb(resp->flags, SIW_WQE_VALID);
+       } else {
+               pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
+                       qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
+
+               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                  RDMAP_ETYPE_REMOTE_OPERATION,
+                                  RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+               rv = -EPROTO;
+       }
+
+       spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+       if (run_sq)
+               rv = siw_sq_start(qp);
+
+       return rv;
+}
+
+/*
+ * Only called at start of Read.Response processing.
+ * Transfer pending Read from tip of ORQ into current rx wqe,
+ * but keep ORQ entry valid until Read.Response processing done.
+ * No Queue locking needed.
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+       struct siw_sqe *orqe;
+       struct siw_wqe *wqe = NULL;
+
+       /* make sure ORQ indices are current */
+       smp_mb();
+
+       orqe = orq_get_current(qp);
+       if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
+               /* RRESP is a TAGGED RDMAP operation */
+               wqe = rx_wqe(&qp->rx_tagged);
+               wqe->sqe.id = orqe->id;
+               wqe->sqe.opcode = orqe->opcode;
+               wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+               wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+               wqe->sqe.sge[0].length = orqe->sge[0].length;
+               wqe->sqe.flags = orqe->flags;
+               wqe->sqe.num_sge = 1;
+               wqe->bytes = orqe->sge[0].length;
+               wqe->processed = 0;
+               wqe->mem[0] = NULL;
+               /* make sure WQE is completely written before valid */
+               smp_wmb();
+               wqe->wr_status = SIW_WR_INPROGRESS;
+
+               return 0;
+       }
+       return -EPROTO;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESPs (suspending/resuming
+ * current receive processing)
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_tagged;
+       struct siw_wqe *wqe = rx_wqe(frx);
+       struct siw_mem **mem, *mem_p;
+       struct siw_sge *sge;
+       int bytes, rv;
+
+       if (frx->first_ddp_seg) {
+               if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+                       pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+                               qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+                       rv = -EPROTO;
+                       goto error_term;
+               }
+               /*
+                * fetch pending RREQ from orq
+                */
+               rv = siw_orqe_start_rx(qp);
+               if (rv) {
+                       pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
+                               qp_id(qp), qp->orq_get % qp->attrs.orq_size);
+                       goto error_term;
+               }
+               rv = siw_rresp_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+       } else {
+               if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+                       pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+                               qp_id(qp), wqe->wr_status);
+                       rv = -EPROTO;
+                       goto error_term;
+               }
+       }
+       if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+               return 0;
+
+       sge = wqe->sqe.sge; /* there is only one */
+       mem = &wqe->mem[0];
+
+       if (!(*mem)) {
+               /*
+                * check target memory which resolves memory on first fragment
+                */
+               rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+                                  wqe->bytes);
+               if (unlikely(rv)) {
+                       siw_dbg_qp(qp, "target mem check: %d\n", rv);
+                       wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_TAGGED_BUF,
+                                          siw_tagged_error(-rv), 0);
+
+                       siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+                       return -EINVAL;
+               }
+       }
+       mem_p = *mem;
+
+       bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+       if (mem_p->mem_obj == NULL)
+               rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed),
+                               bytes);
+       else if (!mem_p->is_pbl)
+               rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
+                                bytes);
+       else
+               rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+                               sge->laddr + wqe->processed, bytes);
+       if (rv != bytes) {
+               wqe->wc_status = SIW_WC_GENERAL_ERR;
+               rv = -EINVAL;
+               goto error_term;
+       }
+       srx->fpdu_part_rem -= rv;
+       srx->fpdu_part_rcvd += rv;
+       wqe->processed += rv;
+
+       if (!srx->fpdu_part_rem) {
+               srx->ddp_to += srx->fpdu_part_rcvd;
+               return 0;
+       }
+       return -EAGAIN;
+
+error_term:
+       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+                          DDP_ECODE_CATASTROPHIC, 0);
+       return rv;
+}
+
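+/*
+ * siw_proc_terminate()
+ *
+ * Process an inbound TERMINATE message: report the peer-provided
+ * error layer, type and code and, if present and not network
+ * fragmented, the attached copy of the offending header. Processing
+ * always ends with closing the connection (-ECONNRESET).
+ */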
+int siw_proc_terminate(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct sk_buff *skb = srx->skb;
+       struct iwarp_terminate *term = &srx->hdr.terminate;
+       union iwarp_hdr term_info;
+       u8 *infop = (u8 *)&term_info;
+       enum rdma_opcode op;
+       u16 to_copy = sizeof(struct iwarp_ctrl);
+
+       pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+               __rdmap_term_layer(term), __rdmap_term_etype(term),
+               __rdmap_term_ecode(term));
+
+       if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
+           be32_to_cpu(term->ddp_msn) !=
+                   qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
+           be32_to_cpu(term->ddp_mo) != 0) {
+               pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+                       be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+                       be32_to_cpu(term->ddp_mo));
+               return -ECONNRESET;
+       }
+       /*
+        * Receive remaining pieces of TERM if indicated
+        */
+       if (!term->flag_m)
+               return -ECONNRESET;
+
+       /* Do not attempt to reassemble a network fragmented
+        * TERM message
+        */
+       if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+               return -ECONNRESET;
+
+       memset(infop, 0, sizeof(term_info));
+
+       skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+       op = __rdmap_get_opcode(&term_info.ctrl);
+       if (op >= RDMAP_TERMINATE)
+               goto out;
+
+       infop += to_copy;
+       srx->skb_offset += to_copy;
+       srx->skb_new -= to_copy;
+       srx->skb_copied += to_copy;
+       srx->fpdu_part_rcvd += to_copy;
+       srx->fpdu_part_rem -= to_copy;
+
+       to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
+
+       /* Again, no network fragmented TERM's */
+       if (to_copy + MPA_CRC_SIZE > srx->skb_new)
+               return -ECONNRESET;
+
+       skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+       if (term->flag_r) {
+               siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+                          op, be16_to_cpu(term_info.ctrl.mpa_len),
+                          term->flag_m ? "valid" : "invalid");
+       } else if (term->flag_d) {
+               siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+                          op, be16_to_cpu(term_info.ctrl.mpa_len),
+                          term->flag_m ? "valid" : "invalid");
+       }
+out:
+       srx->skb_new -= to_copy;
+       srx->skb_offset += to_copy;
+       srx->skb_copied += to_copy;
+       srx->fpdu_part_rcvd += to_copy;
+       srx->fpdu_part_rem -= to_copy;
+
+       return -ECONNRESET;
+}
+
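+/*
+ * siw_get_trailer()
+ *
+ * Consume the FPDU trailer (pad bytes and MPA CRC). If CRC checking
+ * is enabled, the received CRC is compared against the locally
+ * computed value; a mismatch sets up an MPA layer terminate error
+ * and fails the FPDU with -EINVAL.
+ */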
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+       struct sk_buff *skb = srx->skb;
+       u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
+       __wsum crc_in, crc_own = 0;
+
+       siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+                  srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+       if (srx->skb_new < srx->fpdu_part_rem)
+               return -EAGAIN;
+
+       skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
+
+       if (srx->mpa_crc_hd && srx->pad)
+               crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
+
+       srx->skb_new -= srx->fpdu_part_rem;
+       srx->skb_offset += srx->fpdu_part_rem;
+       srx->skb_copied += srx->fpdu_part_rem;
+
+       if (!srx->mpa_crc_hd)
+               return 0;
+
+       /*
+        * CRC32 is computed, transmitted and received directly in NBO,
+        * so there's never a reason to convert byte order.
+        */
+       crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+       crc_in = (__force __wsum)srx->trailer.crc;
+
+       if (unlikely(crc_in != crc_own)) {
+               pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+                       crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+               siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+                                  LLP_ETYPE_MPA,
+                                  LLP_ECODE_RECEIVED_CRC, 0);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
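+/*
+ * siw_get_hdr()
+ *
+ * Assemble the DDP/RDMAP header of the next FPDU, which may arrive
+ * split over multiple skb fragments. Validates DDP and RDMAP version
+ * fields and the opcode, restarts MPA CRC computation for the new
+ * FPDU and tracks whether the segment starts or continues an RDMAP
+ * message.
+ */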
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+       struct sk_buff *skb = srx->skb;
+       struct siw_qp *qp = rx_qp(srx);
+       struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+       struct siw_rx_fpdu *frx;
+       u8 opcode;
+       int bytes;
+
+       if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+               /*
+                * copy a minimum sized (tagged) DDP frame control part
+                */
+               bytes = min_t(int, srx->skb_new,
+                             MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+               skb_copy_bits(skb, srx->skb_offset,
+                             (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+               srx->fpdu_part_rcvd += bytes;
+
+               srx->skb_new -= bytes;
+               srx->skb_offset += bytes;
+               srx->skb_copied += bytes;
+
+               if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+                       return -EAGAIN;
+
+               if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+                       enum ddp_etype etype;
+                       enum ddp_ecode ecode;
+
+                       pr_warn("siw: received ddp version unsupported %d\n",
+                               __ddp_get_version(c_hdr));
+
+                       if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+                               etype = DDP_ETYPE_TAGGED_BUF;
+                               ecode = DDP_ECODE_T_VERSION;
+                       } else {
+                               etype = DDP_ETYPE_UNTAGGED_BUF;
+                               ecode = DDP_ECODE_UT_VERSION;
+                       }
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                                          etype, ecode, 0);
+                       return -EINVAL;
+               }
+               if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+                       pr_warn("siw: received rdmap version unsupported %d\n",
+                               __rdmap_get_version(c_hdr));
+
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+                                          RDMAP_ETYPE_REMOTE_OPERATION,
+                                          RDMAP_ECODE_VERSION, 0);
+                       return -EINVAL;
+               }
+               opcode = __rdmap_get_opcode(c_hdr);
+
+               if (opcode > RDMAP_TERMINATE) {
+                       pr_warn("siw: received unknown packet type %u\n",
+                               opcode);
+
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+                                          RDMAP_ETYPE_REMOTE_OPERATION,
+                                          RDMAP_ECODE_OPCODE, 0);
+                       return -EINVAL;
+               }
+               siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
+       } else {
+               opcode = __rdmap_get_opcode(c_hdr);
+       }
+       set_rx_fpdu_context(qp, opcode);
+       frx = qp->rx_fpdu;
+
+       /*
+        * Figure out len of current hdr: variable length of
+        * iwarp hdr may force us to copy hdr information in
+        * two steps. Only tagged DDP messages are already
+        * completely received.
+        */
+       if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+               bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
+
+               if (srx->skb_new < bytes)
+                       return -EAGAIN;
+
+               skb_copy_bits(skb, srx->skb_offset,
+                             (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+               srx->fpdu_part_rcvd += bytes;
+
+               srx->skb_new -= bytes;
+               srx->skb_offset += bytes;
+               srx->skb_copied += bytes;
+       }
+
+       /*
+        * DDP/RDMAP header receive completed. Check if the current
+        * DDP segment starts a new RDMAP message or continues a previously
+        * started RDMAP message.
+        *
+        * Alternating reception of DDP segments (or FPDUs) from incomplete
+        * tagged and untagged RDMAP messages is supported, as long as
+        * the current tagged or untagged message gets eventually completed
+        * w/o intersection from another message of the same type
+        * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+        * but not by a READ RESPONSE etc.
+        */
+       if (srx->mpa_crc_hd) {
+               /*
+                * Restart CRC computation
+                */
+               crypto_shash_init(srx->mpa_crc_hd);
+               crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+                                   srx->fpdu_part_rcvd);
+       }
+       if (frx->more_ddp_segs) {
+               frx->first_ddp_seg = 0;
+               if (frx->prev_rdmap_op != opcode) {
+                       pr_warn("siw: packet intersection: %u : %u\n",
+                               frx->prev_rdmap_op, opcode);
+                       /*
+                        * The last inbound RDMA operation of same type
+                        * (tagged or untagged) is left unfinished.
+                        * To complete it in error, make it the current
+                        * operation again, even with the header already
+                        * overwritten. For error handling, only the opcode
+                        * and current rx context are relevant.
+                        */
+                       set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+                       __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+                       return -EPROTO;
+               }
+       } else {
+               frx->prev_rdmap_op = opcode;
+               frx->first_ddp_seg = 1;
+       }
+       frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+       return 0;
+}
+
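+/*
+ * siw_check_tx_fence()
+ *
+ * Called after a Read Response completed: free the consumed ORQ
+ * entry and, if transmission was fenced on an outstanding read,
+ * resume Send Queue processing once the fence condition clears.
+ */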
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+       struct siw_wqe *tx_waiting = tx_wqe(qp);
+       struct siw_sqe *rreq;
+       int resume_tx = 0, rv = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->orq_lock, flags);
+
+       rreq = orq_get_current(qp);
+
+       /* free current orq entry */
+       WRITE_ONCE(rreq->flags, 0);
+
+       if (qp->tx_ctx.orq_fence) {
+               if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+                       pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+                               qp_id(qp), tx_waiting->wr_status);
+                       rv = -EPROTO;
+                       goto out;
+               }
+               /* resume SQ processing */
+               if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+                   tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+                       rreq = orq_get_tail(qp);
+                       if (unlikely(!rreq)) {
+                               pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+                               rv = -EPROTO;
+                               goto out;
+                       }
+                       siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+                       qp->orq_put++;
+                       qp->tx_ctx.orq_fence = 0;
+                       resume_tx = 1;
+
+               } else if (siw_orq_empty(qp)) {
+                       qp->tx_ctx.orq_fence = 0;
+                       resume_tx = 1;
+               } else {
+                       pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
+                               qp_id(qp), qp->orq_get, qp->orq_put);
+                       rv = -EPROTO;
+               }
+       }
+       qp->orq_get++;
+out:
+       spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+       if (resume_tx)
+               rv = siw_sq_start(qp);
+
+       return rv;
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segments, or abort processing after encountering an error case.
+ *
+ *   o SENDs + RRESPs need completion,
+ *   o RREQs need READ RESPONSE initialization,
+ *   o WRITEs need memory dereferencing
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+       enum siw_wc_status wc_status = wqe->wc_status;
+       u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+       int rv = 0;
+
+       switch (opcode) {
+       case RDMAP_SEND_SE:
+       case RDMAP_SEND_SE_INVAL:
+               wqe->rqe.flags |= SIW_WQE_SOLICITED;
+               /* Fall through */
+
+       case RDMAP_SEND:
+       case RDMAP_SEND_INVAL:
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+               if (error != 0 && wc_status == SIW_WC_SUCCESS)
+                       wc_status = SIW_WC_GENERAL_ERR;
+               /*
+                * Handle STag invalidation request
+                */
+               if (wc_status == SIW_WC_SUCCESS &&
+                   (opcode == RDMAP_SEND_INVAL ||
+                    opcode == RDMAP_SEND_SE_INVAL)) {
+                       rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+                       if (rv) {
+                               siw_init_terminate(
+                                       qp, TERM_ERROR_LAYER_RDMAP,
+                                       rv == -EACCES ?
+                                               RDMAP_ETYPE_REMOTE_PROTECTION :
+                                               RDMAP_ETYPE_REMOTE_OPERATION,
+                                       RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+                               wc_status = SIW_WC_REM_INV_REQ_ERR;
+                       }
+                       rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+                                             rv ? 0 : srx->inval_stag,
+                                             wc_status);
+               } else {
+                       rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+                                             0, wc_status);
+               }
+               siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+               break;
+
+       case RDMAP_RDMA_READ_RESP:
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               if (error != 0) {
+                       if ((srx->state == SIW_GET_HDR &&
+                            qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+                               /* possible RREQ in ORQ left untouched */
+                               break;
+
+                       if (wc_status == SIW_WC_SUCCESS)
+                               wc_status = SIW_WC_GENERAL_ERR;
+               } else if (qp->kernel_verbs &&
+                          rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+                       /*
+                        * Handle any STag invalidation request
+                        */
+                       rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+                       if (rv) {
+                               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                                  RDMAP_ETYPE_CATASTROPHIC,
+                                                  RDMAP_ECODE_UNSPECIFIED, 0);
+
+                               if (wc_status == SIW_WC_SUCCESS) {
+                                       wc_status = SIW_WC_GENERAL_ERR;
+                                       error = rv;
+                               }
+                       }
+               }
+               /*
+                * All errors turn the wqe into signalled.
+                */
+               if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+                       rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+                                             wc_status);
+               siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+               if (!error)
+                       rv = siw_check_tx_fence(qp);
+               else
+                       /* Disable current ORQ element */
+                       WRITE_ONCE(orq_get_current(qp)->flags, 0);
+               break;
+
+       case RDMAP_RDMA_READ_REQ:
+               if (!error) {
+                       rv = siw_init_rresp(qp, srx);
+                       srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+               }
+               break;
+
+       case RDMAP_RDMA_WRITE:
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               /*
+                * Free the reference to the memory object, if one is
+                * attached to the receive context (inbound WRITE).
+                * A zero-length WRITE is allowed, but it creates no
+                * memory reference.
+                */
+               if (rx_mem(&qp->rx_tagged)) {
+                       siw_mem_put(rx_mem(&qp->rx_tagged));
+                       rx_mem(&qp->rx_tagged) = NULL;
+               }
+               break;
+
+       default:
+               break;
+       }
+       wqe->wr_status = SIW_WR_IDLE;
+
+       return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * @rd_desc:   read descriptor
+ * @skb:       socket buffer
+ * @off:       offset in skb
+ * @len:       skb->len - offset : payload in skb
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+                   unsigned int off, size_t len)
+{
+       struct siw_qp *qp = rd_desc->arg.data;
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       int rv;
+
+       srx->skb = skb;
+       srx->skb_new = skb->len - off;
+       srx->skb_offset = off;
+       srx->skb_copied = 0;
+
+       siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+       while (srx->skb_new) {
+               int run_completion = 1;
+
+               if (unlikely(srx->rx_suspend)) {
+                       /* Do not process any more data */
+                       srx->skb_copied += srx->skb_new;
+                       break;
+               }
+               switch (srx->state) {
+               case SIW_GET_HDR:
+                       rv = siw_get_hdr(srx);
+                       if (!rv) {
+                               srx->fpdu_part_rem =
+                                       be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+                                       srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
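+                               /*
+                                * Remaining FPDU bytes are padded to a
+                                * 4 byte boundary before the trailing
+                                * CRC, e.g. 5 bytes left -> 3 pad bytes.
+                                */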
+                               if (srx->fpdu_part_rem)
+                                       srx->pad = -srx->fpdu_part_rem & 0x3;
+                               else
+                                       srx->pad = 0;
+
+                               srx->state = SIW_GET_DATA_START;
+                               srx->fpdu_part_rcvd = 0;
+                       }
+                       break;
+
+               case SIW_GET_DATA_MORE:
+                       /*
+                        * Another data fragment of the same DDP segment.
+                        * Setting first_ddp_seg = 0 avoids repeating
+                        * initializations that shall occur only once per
+                        * DDP segment.
+                        */
+                       qp->rx_fpdu->first_ddp_seg = 0;
+                       /* Fall through */
+
+               case SIW_GET_DATA_START:
+                       /*
+                        * Headers will be checked by the opcode-specific
+                        * data receive function below.
+                        */
+                       rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+                       if (!rv) {
+                               int mpa_len =
+                                       be16_to_cpu(srx->hdr.ctrl.mpa_len)
+                                       + MPA_HDR_SIZE;
+
+                               srx->fpdu_part_rem = (-mpa_len & 0x3)
+                                                     + MPA_CRC_SIZE;
+                               srx->fpdu_part_rcvd = 0;
+                               srx->state = SIW_GET_TRAILER;
+                       } else {
+                               if (unlikely(rv == -ECONNRESET))
+                                       run_completion = 0;
+                               else
+                                       srx->state = SIW_GET_DATA_MORE;
+                       }
+                       break;
+
+               case SIW_GET_TRAILER:
+                       /*
+                        * read CRC + any padding
+                        */
+                       rv = siw_get_trailer(qp, srx);
+                       if (likely(!rv)) {
+                               /*
+                                * FPDU completed.
+                                * complete RDMAP message if last fragment
+                                */
+                               srx->state = SIW_GET_HDR;
+                               srx->fpdu_part_rcvd = 0;
+
+                               if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+                                     DDP_FLAG_LAST))
+                                       /* more frags */
+                                       break;
+
+                               rv = siw_rdmap_complete(qp, 0);
+                               run_completion = 0;
+                       }
+                       break;
+
+               default:
+                       pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+                       rv = -EPROTO;
+                       run_completion = 0;
+               }
+               if (unlikely(rv != 0 && rv != -EAGAIN)) {
+                       if ((srx->state > SIW_GET_HDR ||
+                            qp->rx_fpdu->more_ddp_segs) && run_completion)
+                               siw_rdmap_complete(qp, rv);
+
+                       siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+                                  srx->state);
+
+                       siw_qp_cm_drop(qp, 1);
+
+                       break;
+               }
+               if (rv) {
+                       siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+                                  srx->state, srx->fpdu_part_rem);
+                       break;
+               }
+       }
+       return srx->skb_copied;
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
new file mode 100644 (file)
index 0000000..43020d2
--- /dev/null
@@ -0,0 +1,1269 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
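+/*
+ * Maximum payload size eligible for inline transmission right behind
+ * the iWARP header: the room left in the packet buffer after a SEND
+ * header, rounded down to a multiple of 8 bytes.
+ */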
+#define MAX_HDR_INLINE                                 \
+       (((uint32_t)(sizeof(struct siw_rreq_pkt) -      \
+                    sizeof(struct iwarp_send))) & 0xF8)
+
+static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
+{
+       struct siw_pbl *pbl = mem->pbl;
+       u64 offset = addr - mem->va;
+       u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
+
+       if (paddr)
+               return virt_to_page(paddr);
+
+       return NULL;
+}
+
+/*
+ * Copy short payload to the provided destination payload address.
+ * Returns the number of bytes copied, a negative errno on fault, or
+ * MAX_HDR_INLINE + 1 if the payload does not qualify for inline
+ * transmission (not exactly one SGE, or longer than MAX_HDR_INLINE).
+ */
+static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr)
+{
+       struct siw_wqe *wqe = &c_tx->wqe_active;
+       struct siw_sge *sge = &wqe->sqe.sge[0];
+       u32 bytes = sge->length;
+
+       if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
+               return MAX_HDR_INLINE + 1;
+
+       if (!bytes)
+               return 0;
+
+       if (tx_flags(wqe) & SIW_WQE_INLINE) {
+               memcpy((void *)paddr, &wqe->sqe.sge[1], bytes);
+       } else {
+               struct siw_mem *mem = wqe->mem[0];
+
+               if (!mem->mem_obj) {
+                       /* Kernel client using kva */
+                       memcpy((void *)paddr, (void *)sge->laddr, bytes);
+               } else if (c_tx->in_syscall) {
+                       if (copy_from_user((void *)paddr,
+                                          (const void __user *)sge->laddr,
+                                          bytes))
+                               return -EFAULT;
+               } else {
+                       unsigned int off = sge->laddr & ~PAGE_MASK;
+                       struct page *p;
+                       char *buffer;
+                       int pbl_idx = 0;
+
+                       if (!mem->is_pbl)
+                               p = siw_get_upage(mem->umem, sge->laddr);
+                       else
+                               p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
+
+                       if (unlikely(!p))
+                               return -EFAULT;
+
+                       buffer = kmap_atomic(p);
+
+                       if (likely(PAGE_SIZE - off >= bytes)) {
+                               memcpy((void *)paddr, buffer + off, bytes);
+                               kunmap_atomic(buffer);
+                       } else {
+                               unsigned long part = bytes - (PAGE_SIZE - off);
+
+                               memcpy((void *)paddr, buffer + off, part);
+                               kunmap_atomic(buffer);
+
+                               if (!mem->is_pbl)
+                                       p = siw_get_upage(mem->umem,
+                                                         sge->laddr + part);
+                               else
+                                       p = siw_get_pblpage(mem,
+                                                           sge->laddr + part,
+                                                           &pbl_idx);
+                               if (unlikely(!p))
+                                       return -EFAULT;
+
+                               buffer = kmap_atomic(p);
+                               memcpy((void *)(paddr + part), buffer,
+                                      bytes - part);
+                               kunmap_atomic(buffer);
+                       }
+               }
+       }
+       return (int)bytes;
+}
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+       struct siw_wqe *wqe = &c_tx->wqe_active;
+       char *crc = NULL;
+       int data = 0;
+
+       switch (tx_type(wqe)) {
+       case SIW_OP_READ:
+       case SIW_OP_READ_LOCAL_INV:
+               memcpy(&c_tx->pkt.ctrl,
+                      &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+                      sizeof(struct iwarp_ctrl));
+
+               c_tx->pkt.rreq.rsvd = 0;
+               c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+               c_tx->pkt.rreq.ddp_msn =
+                       htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+               c_tx->pkt.rreq.ddp_mo = 0;
+               c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
+               c_tx->pkt.rreq.sink_to =
+                       cpu_to_be64(wqe->sqe.sge[0].laddr);
+               c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
+               c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
+               c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
+
+               c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+               crc = (char *)&c_tx->pkt.rreq_pkt.crc;
+               break;
+
+       case SIW_OP_SEND:
+               if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+                       memcpy(&c_tx->pkt.ctrl,
+                              &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+                              sizeof(struct iwarp_ctrl));
+               else
+                       memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
+                              sizeof(struct iwarp_ctrl));
+
+               c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+               c_tx->pkt.send.ddp_msn =
+                       htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+               c_tx->pkt.send.ddp_mo = 0;
+
+               c_tx->pkt.send_inv.inval_stag = 0;
+
+               c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+               crc = (char *)&c_tx->pkt.send_pkt.crc;
+               data = siw_try_1seg(c_tx, (u64)crc);
+               break;
+
+       case SIW_OP_SEND_REMOTE_INV:
+               if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+                       memcpy(&c_tx->pkt.ctrl,
+                              &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
+                              sizeof(struct iwarp_ctrl));
+               else
+                       memcpy(&c_tx->pkt.ctrl,
+                              &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
+                              sizeof(struct iwarp_ctrl));
+
+               c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+               c_tx->pkt.send.ddp_msn =
+                       htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+               c_tx->pkt.send.ddp_mo = 0;
+
+               c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
+
+               c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
+
+               crc = (char *)&c_tx->pkt.send_pkt.crc;
+               data = siw_try_1seg(c_tx, (u64)crc);
+               break;
+
+       case SIW_OP_WRITE:
+               memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+                      sizeof(struct iwarp_ctrl));
+
+               c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
+               c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
+               c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+               crc = (char *)&c_tx->pkt.write_pkt.crc;
+               data = siw_try_1seg(c_tx, (u64)crc);
+               break;
+
+       case SIW_OP_READ_RESPONSE:
+               memcpy(&c_tx->pkt.ctrl,
+                      &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+                      sizeof(struct iwarp_ctrl));
+
+               /* values in network byte order */
+               c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
+               c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
+
+               c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+               crc = (char *)&c_tx->pkt.write_pkt.crc;
+               data = siw_try_1seg(c_tx, (u64)crc);
+               break;
+
+       default:
+               siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
+               return -EOPNOTSUPP;
+       }
+       if (unlikely(data < 0))
+               return data;
+
+       c_tx->ctrl_sent = 0;
+
+       if (data <= MAX_HDR_INLINE) {
+               if (data) {
+                       wqe->processed = data;
+
+                       c_tx->pkt.ctrl.mpa_len =
+                               htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
+
+                       /* Add pad, if needed */
+                       data += -(int)data & 0x3;
+                       /* advance CRC location after payload */
+                       crc += data;
+                       c_tx->ctrl_len += data;
+
+                       if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+                               c_tx->pkt.c_untagged.ddp_mo = 0;
+                       else
+                               c_tx->pkt.c_tagged.ddp_to =
+                                       cpu_to_be64(wqe->sqe.raddr);
+               }
+
+               *(u32 *)crc = 0;
+               /*
+                * Do complete CRC if enabled and short packet
+                */
+               if (c_tx->mpa_crc_hd) {
+                       crypto_shash_init(c_tx->mpa_crc_hd);
+                       if (crypto_shash_update(c_tx->mpa_crc_hd,
+                                               (u8 *)&c_tx->pkt,
+                                               c_tx->ctrl_len))
+                               return -EINVAL;
+                       crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
+               }
+               c_tx->ctrl_len += MPA_CRC_SIZE;
+
+               return PKT_COMPLETE;
+       }
+       c_tx->ctrl_len += MPA_CRC_SIZE;
+       c_tx->sge_idx = 0;
+       c_tx->sge_off = 0;
+       c_tx->pbl_idx = 0;
+
+       /*
+        * Allow direct sending out of the user buffer if the WR is
+        * non-signalled and the payload is over the threshold.
+        * Per RDMA verbs, the application should not change the send buffer
+        * until the work has completed. In iWarp, work completion is only
+        * local delivery to TCP. TCP may reuse the buffer for
+        * retransmission. Changing unsent data also breaks the CRC,
+        * if applied.
+        */
+       if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
+           !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
+               c_tx->use_sendpage = 1;
+       else
+               c_tx->use_sendpage = 0;
+
+       return PKT_FRAGMENTED;
+}
+
+/*
+ * Send out one complete control-type FPDU, or the header of an FPDU
+ * carrying data. Used for fixed-size packets like Read.Requests or
+ * zero-length SENDs, WRITEs and READ.Responses, or for header-only
+ * transmission.
+ */
+static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+                             int flags)
+{
+       struct msghdr msg = { .msg_flags = flags };
+       struct kvec iov = { .iov_base =
+                                   (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+                           .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
+
+       int rv = kernel_sendmsg(s, &msg, &iov, 1,
+                               c_tx->ctrl_len - c_tx->ctrl_sent);
+
+       if (rv >= 0) {
+               c_tx->ctrl_sent += rv;
+
+               if (c_tx->ctrl_sent == c_tx->ctrl_len)
+                       rv = 0;
+               else
+                       rv = -EAGAIN;
+       }
+       return rv;
+}
+
+/*
+ * 0copy TCP transmit interface: Use do_tcp_sendpages.
+ *
+ * Using sendpage to push page by page appears to be less efficient
+ * than using sendmsg, even if data are copied.
+ *
+ * A general performance limitation might be the extra four bytes
+ * trailer checksum segment to be pushed after user data.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
+                            size_t size)
+{
+       struct sock *sk = s->sk;
+       int i = 0, rv = 0, sent = 0,
+           flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
+
+       while (size) {
+               size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
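+               /* Remainder fits into current page: drop MSG_SENDPAGE_NOTLAST */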
+               if (size + offset <= PAGE_SIZE)
+                       flags = MSG_MORE | MSG_DONTWAIT;
+
+               tcp_rate_check_app_limited(sk);
+try_page_again:
+               lock_sock(sk);
+               rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
+               release_sock(sk);
+
+               if (rv > 0) {
+                       size -= rv;
+                       sent += rv;
+                       if (rv != bytes) {
+                               offset += rv;
+                               bytes -= rv;
+                               goto try_page_again;
+                       }
+                       offset = 0;
+               } else {
+                       if (rv == -EAGAIN || rv == 0)
+                               break;
+                       return rv;
+               }
+               i++;
+       }
+       return sent;
+}
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes a list of pages to the TCP socket. If the pages stem from
+ * multiple SGEs, all referenced pages of each SGE are pushed in one
+ * shot.
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+                       struct siw_sge *sge, unsigned int offset,
+                       unsigned int size)
+{
+       int i = 0, sent = 0, rv;
+       int sge_bytes = min(sge->length - offset, size);
+
+       offset = (sge->laddr + offset) & ~PAGE_MASK;
+
+       while (sent != size) {
+               rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
+               if (rv >= 0) {
+                       sent += rv;
+                       if (size == sent || sge_bytes > rv)
+                               break;
+
+                       i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+                       sge++;
+                       sge_bytes = min(sge->length, size - sent);
+                       offset = sge->laddr & ~PAGE_MASK;
+               } else {
+                       sent = rv;
+                       break;
+               }
+       }
+       return sent;
+}
+
+#define MAX_TRAILER (MPA_CRC_SIZE + 4)
+
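+/*
+ * Undo kmap() for the pages referenced by the iov array built in
+ * siw_tx_hdt(). If the first iov slot held the FPDU header, the
+ * corresponding page_array entry was never mapped and is skipped.
+ */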
+static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps)
+{
+       if (hdr_len) {
+               ++pages;
+               --num_maps;
+       }
+       while (num_maps-- > 0) {
+               kunmap(*pages);
+               pages++;
+       }
+}
+
+/*
+ * siw_tx_hdt() tries to push a complete packet to TCP where all
+ * packet fragments are referenced by the elements of one iovec.
+ * For the data portion, each involved page must be referenced by
+ * one extra element. All SGE data can be non-aligned to page
+ * boundaries. Two more elements reference the iWARP header
+ * and trailer:
+ * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + 2 * (SIW_MAX_SGE - 1) + HDR + TRL
+ */
+#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
+
+/*
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+       struct siw_wqe *wqe = &c_tx->wqe_active;
+       struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
+       struct kvec iov[MAX_ARRAY];
+       struct page *page_array[MAX_ARRAY];
+       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+
+       int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
+       unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
+                    sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
+                    pbl_idx = c_tx->pbl_idx;
+
+       if (c_tx->state == SIW_SEND_HDR) {
+               if (c_tx->use_sendpage) {
+                       rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
+                       if (rv)
+                               goto done;
+
+                       c_tx->state = SIW_SEND_DATA;
+               } else {
+                       iov[0].iov_base =
+                               (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+                       iov[0].iov_len = hdr_len =
+                               c_tx->ctrl_len - c_tx->ctrl_sent;
+                       seg = 1;
+               }
+       }
+
+       wqe->processed += data_len;
+
+       while (data_len) { /* walk the list of SGE's */
+               unsigned int sge_len = min(sge->length - sge_off, data_len);
+               unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
+               struct siw_mem *mem;
+
+               if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
+                       mem = wqe->mem[sge_idx];
+                       if (!mem->mem_obj)
+                               is_kva = 1;
+               } else {
+                       is_kva = 1;
+               }
+               if (is_kva && !c_tx->use_sendpage) {
+                       /*
+                        * tx from kernel virtual address: either inline data
+                        * or memory region with assigned kernel buffer
+                        */
+                       iov[seg].iov_base = (void *)(sge->laddr + sge_off);
+                       iov[seg].iov_len = sge_len;
+
+                       if (do_crc)
+                               crypto_shash_update(c_tx->mpa_crc_hd,
+                                                   iov[seg].iov_base,
+                                                   sge_len);
+                       sge_off += sge_len;
+                       data_len -= sge_len;
+                       seg++;
+                       goto sge_done;
+               }
+
+               while (sge_len) {
+                       size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
+
+                       if (!is_kva) {
+                               struct page *p;
+
+                               if (mem->is_pbl)
+                                       p = siw_get_pblpage(
+                                               mem, sge->laddr + sge_off,
+                                               &pbl_idx);
+                               else
+                                       p = siw_get_upage(mem->umem,
+                                                         sge->laddr + sge_off);
+                               if (unlikely(!p)) {
+                                       if (hdr_len)
+                                               seg--;
+                                       if (!c_tx->use_sendpage && seg) {
+                                               siw_unmap_pages(page_array,
+                                                               hdr_len, seg);
+                                       }
+                                       wqe->processed -= c_tx->bytes_unsent;
+                                       rv = -EFAULT;
+                                       goto done_crc;
+                               }
+                               page_array[seg] = p;
+
+                               if (!c_tx->use_sendpage) {
+                                       iov[seg].iov_base = kmap(p) + fp_off;
+                                       iov[seg].iov_len = plen;
+                                       if (do_crc)
+                                               crypto_shash_update(
+                                                       c_tx->mpa_crc_hd,
+                                                       iov[seg].iov_base,
+                                                       plen);
+                               } else if (do_crc)
+                                       crypto_shash_update(
+                                               c_tx->mpa_crc_hd,
+                                               page_address(p) + fp_off,
+                                               plen);
+                       } else {
+                               u64 pa = ((sge->laddr + sge_off) & PAGE_MASK);
+
+                               page_array[seg] = virt_to_page(pa);
+                               if (do_crc)
+                                       crypto_shash_update(
+                                               c_tx->mpa_crc_hd,
+                                               (void *)(sge->laddr + sge_off),
+                                               plen);
+                       }
+
+                       sge_len -= plen;
+                       sge_off += plen;
+                       data_len -= plen;
+                       fp_off = 0;
+
+                       if (++seg > (int)MAX_ARRAY) {
+                               siw_dbg_qp(tx_qp(c_tx), "too many fragments\n");
+                               if (!is_kva && !c_tx->use_sendpage) {
+                                       siw_unmap_pages(page_array, hdr_len,
+                                                       seg - 1);
+                               }
+                               wqe->processed -= c_tx->bytes_unsent;
+                               rv = -EMSGSIZE;
+                               goto done_crc;
+                       }
+               }
+sge_done:
+               /* Update SGE variables at end of SGE */
+               if (sge_off == sge->length &&
+                   (data_len != 0 || wqe->processed < wqe->bytes)) {
+                       sge_idx++;
+                       sge++;
+                       sge_off = 0;
+               }
+       }
+       /*
+        * Trailer: up to 4 pad bytes followed by the MPA CRC; the iov
+        * starts at the first pad byte actually needed.
+        */
+       if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+               iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+               iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+       } else {
+               iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+               iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+       }
+
+       if (c_tx->pad) {
+               *(u32 *)c_tx->trailer.pad = 0;
+               if (do_crc)
+                       crypto_shash_update(c_tx->mpa_crc_hd,
+                               (u8 *)&c_tx->trailer.crc - c_tx->pad,
+                               c_tx->pad);
+       }
+       if (!c_tx->mpa_crc_hd)
+               c_tx->trailer.crc = 0;
+       else if (do_crc)
+               crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+       data_len = c_tx->bytes_unsent;
+
+       if (c_tx->use_sendpage) {
+               rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
+                                 c_tx->sge_off, data_len);
+               if (rv == data_len) {
+                       rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+                       if (rv > 0)
+                               rv += data_len;
+                       else
+                               rv = data_len;
+               }
+       } else {
+               rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+                                   hdr_len + data_len + trl_len);
+               if (!is_kva)
+                       siw_unmap_pages(page_array, hdr_len, seg);
+       }
+       if (rv < (int)hdr_len) {
+               /* Not even complete hdr pushed or negative rv */
+               wqe->processed -= data_len;
+               if (rv >= 0) {
+                       c_tx->ctrl_sent += rv;
+                       rv = -EAGAIN;
+               }
+               goto done_crc;
+       }
+       rv -= hdr_len;
+
+       if (rv >= (int)data_len) {
+               /* all user data pushed to TCP or no data to push */
+               if (data_len > 0 && wqe->processed < wqe->bytes) {
+                       /* Save the current state for next tx */
+                       c_tx->sge_idx = sge_idx;
+                       c_tx->sge_off = sge_off;
+                       c_tx->pbl_idx = pbl_idx;
+               }
+               rv -= data_len;
+
+               if (rv == trl_len) /* all pushed */
+                       rv = 0;
+               else {
+                       c_tx->state = SIW_SEND_TRAILER;
+                       c_tx->ctrl_len = MAX_TRAILER;
+                       c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+                       c_tx->bytes_unsent = 0;
+                       rv = -EAGAIN;
+               }
+
+       } else if (data_len > 0) {
+               /* Maybe some user data pushed to TCP */
+               c_tx->state = SIW_SEND_DATA;
+               wqe->processed -= data_len - rv;
+
+               if (rv) {
+                       /*
+                        * Some bytes out. Recompute tx state based
+                        * on old state and bytes pushed
+                        */
+                       unsigned int sge_unsent;
+
+                       c_tx->bytes_unsent -= rv;
+                       sge = &wqe->sqe.sge[c_tx->sge_idx];
+                       sge_unsent = sge->length - c_tx->sge_off;
+
+                       while (sge_unsent <= rv) {
+                               rv -= sge_unsent;
+                               c_tx->sge_idx++;
+                               c_tx->sge_off = 0;
+                               sge++;
+                               sge_unsent = sge->length;
+                       }
+                       c_tx->sge_off += rv;
+               }
+               rv = -EAGAIN;
+       }
+done_crc:
+       c_tx->do_crc = 0;
+done:
+       return rv;
+}
+
+static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
+                                    struct socket *s)
+{
+       struct tcp_sock *tp = tcp_sk(s->sk);
+
+       if (tp->gso_segs) {
+               if (c_tx->gso_seg_limit == 0)
+                       c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
+               else
+                       c_tx->tcp_seglen =
+                               tp->mss_cache *
+                               min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
+       } else {
+               c_tx->tcp_seglen = tp->mss_cache;
+       }
+       /* Loopback may give odd numbers; round down to a multiple of 8 */
+       c_tx->tcp_seglen &= 0xfffffff8;
+}
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Prepares the transmit context to send out one FPDU if the FPDU will
+ * contain user data and the data are not immediate data.
+ * Computes the maximum FPDU length to fill up the TCP MSS if possible.
+ *
+ * @qp:                QP from which to transmit
+ * @wqe:       Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ *       to avoid header misalignment due to send pausing within
+ *       fpdu transmission
+ */
+static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+       struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+       int data_len;
+
+       c_tx->ctrl_len =
+               iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
+       c_tx->ctrl_sent = 0;
+
+       /*
+        * Update target buffer offset if any
+        */
+       if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+               /* Untagged message */
+               c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+       else /* Tagged message */
+               c_tx->pkt.c_tagged.ddp_to =
+                       cpu_to_be64(wqe->sqe.raddr + wqe->processed);
+
+       data_len = wqe->bytes - wqe->processed;
+       if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
+               /* Trim DDP payload to fit into current TCP segment */
+               data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
+               c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
+               c_tx->pad = 0;
+       } else {
+               c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
+               c_tx->pad = -data_len & 0x3;
+       }
+       c_tx->bytes_unsent = data_len;
+
+       c_tx->pkt.ctrl.mpa_len =
+               htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);
+
+       /*
+        * Init MPA CRC computation
+        */
+       if (c_tx->mpa_crc_hd) {
+               crypto_shash_init(c_tx->mpa_crc_hd);
+               crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
+                                   c_tx->ctrl_len);
+               c_tx->do_crc = 1;
+       }
+}
+
+/*
+ * siw_check_sgl_tx()
+ *
+ * Check permissions for a list of SGE's (SGL).
+ * A successful check will have all memory referenced
+ * for transmission resolved and assigned to the WQE.
+ *
+ * @pd:                Protection Domain SGL should belong to
+ * @wqe:       WQE to be checked
+ * @perms:     requested access permissions
+ *
+ */
+
+static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
+                           enum ib_access_flags perms)
+{
+       struct siw_sge *sge = &wqe->sqe.sge[0];
+       int i, len, num_sge = wqe->sqe.num_sge;
+
+       if (unlikely(num_sge > SIW_MAX_SGE))
+               return -EINVAL;
+
+       for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
+               /*
+                * rdma verbs: do not check stag for a zero length sge
+                */
+               if (sge->length) {
+                       int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
+                                              sge->length);
+
+                       if (unlikely(rv != E_ACCESS_OK))
+                               return rv;
+               }
+               len += sge->length;
+       }
+       return len;
+}
+
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+       struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+       struct socket *s = qp->attrs.sk;
+       int rv = 0, burst_len = qp->tx_ctx.burst;
+       enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
+
+       if (unlikely(wqe->wr_status == SIW_WR_IDLE))
+               return 0;
+
+       if (!burst_len)
+               burst_len = SQ_USER_MAXBURST;
+
+       if (wqe->wr_status == SIW_WR_QUEUED) {
+               if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
+                       if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
+                               wqe->sqe.num_sge = 1;
+
+                       if (tx_type(wqe) != SIW_OP_READ &&
+                           tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
+                               /*
+                                * Reference memory to be tx'd w/o checking
+                                * access for LOCAL_READ permission, since
+                                * not defined in RDMA core.
+                                */
+                               rv = siw_check_sgl_tx(qp->pd, wqe, 0);
+                               if (rv < 0) {
+                                       if (tx_type(wqe) ==
+                                           SIW_OP_READ_RESPONSE)
+                                               ecode = siw_rdmap_error(-rv);
+                                       rv = -EINVAL;
+                                       goto tx_error;
+                               }
+                               wqe->bytes = rv;
+                       } else {
+                               wqe->bytes = 0;
+                       }
+               } else {
+                       wqe->bytes = wqe->sqe.sge[0].length;
+                       if (!qp->kernel_verbs) {
+                               if (wqe->bytes > SIW_MAX_INLINE) {
+                                       rv = -EINVAL;
+                                       goto tx_error;
+                               }
+                               wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
+                       }
+               }
+               wqe->wr_status = SIW_WR_INPROGRESS;
+               wqe->processed = 0;
+
+               siw_update_tcpseg(c_tx, s);
+
+               rv = siw_qp_prepare_tx(c_tx);
+               if (rv == PKT_FRAGMENTED) {
+                       c_tx->state = SIW_SEND_HDR;
+                       siw_prepare_fpdu(qp, wqe);
+               } else if (rv == PKT_COMPLETE) {
+                       c_tx->state = SIW_SEND_SHORT_FPDU;
+               } else {
+                       goto tx_error;
+               }
+       }
+
+next_segment:
+       siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
+                  tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
+                  wqe->sqe.id);
+
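+       /* Burst credit exhausted: -EINPROGRESS lets the caller reschedule the SQ */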
+       if (--burst_len == 0) {
+               rv = -EINPROGRESS;
+               goto tx_done;
+       }
+       if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+               enum siw_opcode tx_type = tx_type(wqe);
+               unsigned int msg_flags;
+
+               if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
+                       /*
+                        * End the current TCP segment if the SQ runs
+                        * empty, siw_tcp_nagle is not set, or we will
+                        * bail out soon because no burst credit is left.
+                        */
+                       msg_flags = MSG_DONTWAIT;
+               else
+                       msg_flags = MSG_DONTWAIT | MSG_MORE;
+
+               rv = siw_tx_ctrl(c_tx, s, msg_flags);
+
+               if (!rv && tx_type != SIW_OP_READ &&
+                   tx_type != SIW_OP_READ_LOCAL_INV)
+                       wqe->processed = wqe->bytes;
+
+               goto tx_done;
+
+       } else {
+               rv = siw_tx_hdt(c_tx, s);
+       }
+       if (!rv) {
+               /*
+                * One segment sent. Processing completed if last
+                * segment; do next segment otherwise.
+                */
+               if (unlikely(c_tx->tx_suspend)) {
+                       /*
+                        * Verbs, 6.4.: Try to stop sending after a full
+                        * DDP segment if the connection goes down
+                        * (== peer half-close)
+                        */
+                       rv = -ECONNABORTED;
+                       goto tx_done;
+               }
+               if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
+                       siw_dbg_qp(qp, "WQE completed\n");
+                       goto tx_done;
+               }
+               c_tx->state = SIW_SEND_HDR;
+
+               siw_update_tcpseg(c_tx, s);
+
+               siw_prepare_fpdu(qp, wqe);
+               goto next_segment;
+       }
+tx_done:
+       qp->tx_ctx.burst = burst_len;
+       return rv;
+
+tx_error:
+       if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
+               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                  RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
+       else
+               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                  RDMAP_ETYPE_CATASTROPHIC,
+                                  RDMAP_ECODE_UNSPECIFIED, 1);
+       return rv;
+}
+
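+/*
+ * siw_fastreg_mr()
+ *
+ * Validate a fast-register work request and mark the target STag valid.
+ * The memory object is looked up by the STag index (rkey >> 8); the
+ * low 8 bits hold the user-chosen key part, which may have changed.
+ */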
+static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
+{
+       struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr;
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey  >> 8);
+       int rv = 0;
+
+       siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
+
+       if (unlikely(!mem || !base_mr)) {
+               pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+               return -EINVAL;
+       }
+       if (unlikely(base_mr->rkey >> 8 != sqe->rkey  >> 8)) {
+               pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
+               rv = -EINVAL;
+               goto out;
+       }
+       if (unlikely(mem->pd != pd)) {
+               pr_warn("siw: fastreg: PD mismatch\n");
+               rv = -EINVAL;
+               goto out;
+       }
+       if (unlikely(mem->stag_valid)) {
+               pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
+               rv = -EINVAL;
+               goto out;
+       }
+       /* Refresh STag since user may have changed key part */
+       mem->stag = sqe->rkey;
+       mem->perms = sqe->access;
+
+       siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n",
+                   mem->va, base_mr->iova);
+       mem->va = base_mr->iova;
+       mem->stag_valid = 1;
+out:
+       siw_mem_put(mem);
+       return rv;
+}
+
+static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+       int rv;
+
+       switch (tx_type(wqe)) {
+       case SIW_OP_REG_MR:
+               rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
+               break;
+
+       case SIW_OP_INVAL_STAG:
+               rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
+               break;
+
+       default:
+               rv = -EINVAL;
+       }
+       return rv;
+}
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context. Processing in
+ * user context is limited to non-kernel verbs users.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * Note:
+ * An outbound RREQ can be satisfied by the corresponding RRESP
+ * _before_ it gets assigned to the ORQ. This happens regularly
+ * in the RDMA READ via loopback case. Since both the outbound RREQ and
+ * the inbound RRESP can be handled by the same CPU, locking the ORQ
+ * is deadlock prone and thus not an option. With that, the
+ * RREQ gets assigned to the ORQ _before_ being sent - see
+ * siw_activate_tx() - and pulled back in case of send failure.
+ */
+int siw_qp_sq_process(struct siw_qp *qp)
+{
+       struct siw_wqe *wqe = tx_wqe(qp);
+       enum siw_opcode tx_type;
+       unsigned long flags;
+       int rv = 0;
+
+       siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
+
+next_wqe:
+       /*
+        * Stop QP processing if SQ state changed
+        */
+       if (unlikely(qp->tx_ctx.tx_suspend)) {
+               siw_dbg_qp(qp, "tx suspended\n");
+               goto done;
+       }
+       tx_type = tx_type(wqe);
+
+       if (tx_type <= SIW_OP_READ_RESPONSE)
+               rv = siw_qp_sq_proc_tx(qp, wqe);
+       else
+               rv = siw_qp_sq_proc_local(qp, wqe);
+
+       if (!rv) {
+               /*
+                * WQE processing done
+                */
+               switch (tx_type) {
+               case SIW_OP_SEND:
+               case SIW_OP_SEND_REMOTE_INV:
+               case SIW_OP_WRITE:
+                       siw_wqe_put_mem(wqe, tx_type);
+                       /* Fall through */
+
+               case SIW_OP_INVAL_STAG:
+               case SIW_OP_REG_MR:
+                       if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
+                               siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+                                                SIW_WC_SUCCESS);
+                       break;
+
+               case SIW_OP_READ:
+               case SIW_OP_READ_LOCAL_INV:
+                       /*
+                        * already enqueued to the ORQ
+                        */
+                       break;
+
+               case SIW_OP_READ_RESPONSE:
+                       siw_wqe_put_mem(wqe, tx_type);
+                       break;
+
+               default:
+                       WARN(1, "undefined WQE type %d\n", tx_type);
+                       rv = -EINVAL;
+                       goto done;
+               }
+
+               spin_lock_irqsave(&qp->sq_lock, flags);
+               wqe->wr_status = SIW_WR_IDLE;
+               rv = siw_activate_tx(qp);
+               spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+               if (rv <= 0)
+                       goto done;
+
+               goto next_wqe;
+
+       } else if (rv == -EAGAIN) {
+               siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
+                          qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+                          qp->tx_ctx.bytes_unsent);
+               rv = 0;
+               goto done;
+       } else if (rv == -EINPROGRESS) {
+               rv = siw_sq_start(qp);
+               goto done;
+       } else {
+               /*
+                * WQE processing failed.
+                * Verbs 8.3.2:
+                * o It turns any WQE into a signalled WQE.
+                * o Local catastrophic error must be surfaced
+                * o QP must be moved into Terminate state: done by code
+                *   doing socket state change processing
+                *
+                * o TODO: Termination message must be sent.
+                * o TODO: Implement more precise work completion errors,
+                *         see enum ib_wc_status in ib_verbs.h
+                */
+               siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
+                          tx_type(wqe), rv);
+
+               spin_lock_irqsave(&qp->sq_lock, flags);
+               /*
+                * RREQ may have already been completed by inbound RRESP!
+                */
+               if (tx_type == SIW_OP_READ ||
+                   tx_type == SIW_OP_READ_LOCAL_INV) {
+                       /* Cleanup pending entry in ORQ */
+                       qp->orq_put--;
+                       qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
+               }
+               spin_unlock_irqrestore(&qp->sq_lock, flags);
+               /*
+                * immediately suspends further TX processing
+                */
+               if (!qp->tx_ctx.tx_suspend)
+                       siw_qp_cm_drop(qp, 0);
+
+               switch (tx_type) {
+               case SIW_OP_SEND:
+               case SIW_OP_SEND_REMOTE_INV:
+               case SIW_OP_SEND_WITH_IMM:
+               case SIW_OP_WRITE:
+               case SIW_OP_READ:
+               case SIW_OP_READ_LOCAL_INV:
+                       siw_wqe_put_mem(wqe, tx_type);
+                       /* Fall through */
+
+               case SIW_OP_INVAL_STAG:
+               case SIW_OP_REG_MR:
+                       siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+                                        SIW_WC_LOC_QP_OP_ERR);
+
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+
+                       break;
+
+               case SIW_OP_READ_RESPONSE:
+                       siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
+
+                       siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
+
+                       siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
+
+                       break;
+
+               default:
+                       WARN(1, "undefined WQE type %d\n", tx_type);
+                       rv = -EINVAL;
+               }
+               wqe->wr_status = SIW_WR_IDLE;
+       }
+done:
+       return rv;
+}
+
+static void siw_sq_resume(struct siw_qp *qp)
+{
+       if (down_read_trylock(&qp->state_lock)) {
+               if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
+                          !qp->tx_ctx.tx_suspend)) {
+                       int rv = siw_qp_sq_process(qp);
+
+                       up_read(&qp->state_lock);
+
+                       if (unlikely(rv < 0)) {
+                               siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
+
+                               if (!qp->tx_ctx.tx_suspend)
+                                       siw_qp_cm_drop(qp, 0);
+                       }
+               } else {
+                       up_read(&qp->state_lock);
+               }
+       } else {
+               siw_dbg_qp(qp, "Resume SQ while QP locked\n");
+       }
+       siw_qp_put(qp);
+}
+
+struct tx_task_t {
+       struct llist_head active;
+       wait_queue_head_t waiting;
+};
+
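+/*
+ * Per-CPU list of QPs with pending SQ work, plus the waitqueue the
+ * tx kthread of that CPU sleeps on.
+ */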
+static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);
+
+void siw_stop_tx_thread(int nr_cpu)
+{
+       kthread_stop(siw_tx_thread[nr_cpu]);
+       wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting);
+}
+
+int siw_run_sq(void *data)
+{
+       const int nr_cpu = (unsigned int)(long)data;
+       struct llist_node *active;
+       struct siw_qp *qp;
+       struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
+
+       init_llist_head(&tx_task->active);
+       init_waitqueue_head(&tx_task->waiting);
+
+       while (1) {
+               struct llist_node *fifo_list = NULL;
+
+               wait_event_interruptible(tx_task->waiting,
+                                        !llist_empty(&tx_task->active) ||
+                                                kthread_should_stop());
+
+               if (kthread_should_stop())
+                       break;
+
+               active = llist_del_all(&tx_task->active);
+               /*
+                * llist_del_all returns a list with newest entry first.
+                * Re-order list for fairness among QP's.
+                */
+               while (active) {
+                       struct llist_node *tmp = active;
+
+                       active = llist_next(active);
+                       tmp->next = fifo_list;
+                       fifo_list = tmp;
+               }
+               while (fifo_list) {
+                       qp = container_of(fifo_list, struct siw_qp, tx_list);
+                       fifo_list = llist_next(fifo_list);
+                       qp->tx_list.next = NULL;
+
+                       siw_sq_resume(qp);
+               }
+       }
+       active = llist_del_all(&tx_task->active);
+       if (active) {
+               llist_for_each_entry(qp, active, tx_list) {
+                       qp->tx_list.next = NULL;
+                       siw_sq_resume(qp);
+               }
+       }
+       return 0;
+}
+
+int siw_sq_start(struct siw_qp *qp)
+{
+       if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+               return 0;
+
+       if (unlikely(!cpu_online(qp->tx_cpu))) {
+               siw_put_tx_cpu(qp->tx_cpu);
+               qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
+               if (qp->tx_cpu < 0) {
+                       pr_warn("siw: no tx cpu available\n");
+
+                       return -EIO;
+               }
+       }
+       siw_qp_get(qp);
+
+       llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);
+
+       wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);
+
+       return 0;
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
new file mode 100644 (file)
index 0000000..32dc79d
--- /dev/null
@@ -0,0 +1,1760 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
+       [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
+       [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
+       [IB_QPS_RTR] = SIW_QP_STATE_RTR,
+       [IB_QPS_RTS] = SIW_QP_STATE_RTS,
+       [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
+       [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
+       [IB_QPS_ERR] = SIW_QP_STATE_ERROR
+};
+
+static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
+       [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
+       [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
+       [IB_QPS_ERR] = "ERR"
+};
+
+static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
+{
+       struct siw_uobj *uobj;
+       struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
+       u32 key;
+
+       uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
+       if (!uobj)
+               return SIW_INVAL_UOBJ_KEY;
+
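+       /*
+        * Cyclically allocate a new key; the key is later passed back
+        * by user space as the mmap offset and resolved in siw_mmap().
+        */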
+       if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
+                           GFP_KERNEL) < 0) {
+               kfree(uobj);
+               return SIW_INVAL_UOBJ_KEY;
+       }
+       uobj->size = PAGE_ALIGN(size);
+       uobj->addr = vaddr;
+
+       return key;
+}
+
+static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
+                                    unsigned long off, u32 size)
+{
+       struct siw_uobj *uobj = xa_load(&uctx->xa, off);
+
+       if (uobj && uobj->size == size)
+               return uobj;
+
+       return NULL;
+}
+
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+       struct siw_ucontext *uctx = to_siw_ctx(ctx);
+       struct siw_uobj *uobj;
+       unsigned long off = vma->vm_pgoff;
+       int size = vma->vm_end - vma->vm_start;
+       int rv = -EINVAL;
+
+       /*
+        * Must be page aligned
+        */
+       if (vma->vm_start & (PAGE_SIZE - 1)) {
+               pr_warn("siw: mmap not page aligned\n");
+               goto out;
+       }
+       uobj = siw_get_uobj(uctx, off, size);
+       if (!uobj) {
+               siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
+                       off, size);
+               goto out;
+       }
+       rv = remap_vmalloc_range(vma, uobj->addr, 0);
+       if (rv)
+               pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
+out:
+       return rv;
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(base_ctx->device);
+       struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
+       struct siw_uresp_alloc_ctx uresp = {};
+       int rv;
+
+       if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
+       ctx->uobj_nextkey = 0;
+       ctx->sdev = sdev;
+
+       uresp.dev_id = sdev->vendor_part_id;
+
+       if (udata->outlen < sizeof(uresp)) {
+               rv = -EINVAL;
+               goto err_out;
+       }
+       rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+       if (rv)
+               goto err_out;
+
+       siw_dbg(base_ctx->device, "success. now %d context(s)\n",
+               atomic_read(&sdev->num_ctx));
+
+       return 0;
+
+err_out:
+       atomic_dec(&sdev->num_ctx);
+       siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
+               atomic_read(&sdev->num_ctx));
+
+       return rv;
+}
+
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
+{
+       struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
+       void *entry;
+       unsigned long index;
+
+       /*
+        * Make sure all user mmap objects are gone. Since QP, CQ
+        * and SRQ destroy routines destroy related objects, nothing
+        * should be found here.
+        */
+       xa_for_each(&uctx->xa, index, entry) {
+               kfree(xa_erase(&uctx->xa, index));
+               pr_warn("siw: dropping orphaned uobj at %lu\n", index);
+       }
+       xa_destroy(&uctx->xa);
+       atomic_dec(&uctx->sdev->num_ctx);
+}
+
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+                    struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(base_dev);
+
+       if (udata->inlen || udata->outlen)
+               return -EINVAL;
+
+       memset(attr, 0, sizeof(*attr));
+
+       /* Revisit atomic caps if RFC 7306 gets supported */
+       attr->atomic_cap = 0;
+       attr->device_cap_flags =
+               IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
+       attr->max_cq = sdev->attrs.max_cq;
+       attr->max_cqe = sdev->attrs.max_cqe;
+       attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
+       attr->max_fmr = sdev->attrs.max_fmr;
+       attr->max_mr = sdev->attrs.max_mr;
+       attr->max_mw = sdev->attrs.max_mw;
+       attr->max_mr_size = ~0ull;
+       attr->max_pd = sdev->attrs.max_pd;
+       attr->max_qp = sdev->attrs.max_qp;
+       attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
+       attr->max_qp_rd_atom = sdev->attrs.max_ord;
+       attr->max_qp_wr = sdev->attrs.max_qp_wr;
+       attr->max_recv_sge = sdev->attrs.max_sge;
+       attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
+       attr->max_send_sge = sdev->attrs.max_sge;
+       attr->max_sge_rd = sdev->attrs.max_sge_rd;
+       attr->max_srq = sdev->attrs.max_srq;
+       attr->max_srq_sge = sdev->attrs.max_srq_sge;
+       attr->max_srq_wr = sdev->attrs.max_srq_wr;
+       attr->page_size_cap = PAGE_SIZE;
+       attr->vendor_id = SIW_VENDOR_ID;
+       attr->vendor_part_id = sdev->vendor_part_id;
+
+       memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
+
+       return 0;
+}
+
+int siw_query_port(struct ib_device *base_dev, u8 port,
+                  struct ib_port_attr *attr)
+{
+       struct siw_device *sdev = to_siw_dev(base_dev);
+
+       memset(attr, 0, sizeof(*attr));
+
+       attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+       attr->active_speed = 2;
+       attr->active_width = 2;
+       attr->gid_tbl_len = 1;
+       attr->max_msg_sz = -1;
+       attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+       attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3;
+       attr->pkey_tbl_len = 1;
+       attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+       attr->state = sdev->state;
+       /*
+        * All zero
+        *
+        * attr->lid = 0;
+        * attr->bad_pkey_cntr = 0;
+        * attr->qkey_viol_cntr = 0;
+        * attr->sm_lid = 0;
+        * attr->lmc = 0;
+        * attr->max_vl_num = 0;
+        * attr->sm_sl = 0;
+        * attr->subnet_timeout = 0;
+        * attr->init_type_reply = 0;
+        */
+       return 0;
+}
+
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+                          struct ib_port_immutable *port_immutable)
+{
+       struct ib_port_attr attr;
+       int rv = siw_query_port(base_dev, port, &attr);
+
+       if (rv)
+               return rv;
+
+       port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       port_immutable->gid_tbl_len = attr.gid_tbl_len;
+       port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+       return 0;
+}
+
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
+{
+       /* Report the default pkey */
+       *pkey = 0xffff;
+       return 0;
+}
+
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+                 union ib_gid *gid)
+{
+       struct siw_device *sdev = to_siw_dev(base_dev);
+
+       /* subnet_prefix == interface_id == 0; */
+       memset(gid, 0, sizeof(*gid));
+       memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
+
+       return 0;
+}
+
+int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+
+       if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
+               atomic_dec(&sdev->num_pd);
+               return -ENOMEM;
+       }
+       siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));
+
+       return 0;
+}
+
+void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+
+       siw_dbg_pd(pd, "free PD\n");
+       atomic_dec(&sdev->num_pd);
+}
+
+void siw_qp_get_ref(struct ib_qp *base_qp)
+{
+       siw_qp_get(to_siw_qp(base_qp));
+}
+
+void siw_qp_put_ref(struct ib_qp *base_qp)
+{
+       siw_qp_put(to_siw_qp(base_qp));
+}
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @pd:                Protection Domain
+ * @attrs:     Initial QP attributes.
+ * @udata:     used to provide QP ID, SQ and RQ size back to user.
+ */
+
+struct ib_qp *siw_create_qp(struct ib_pd *pd,
+                           struct ib_qp_init_attr *attrs,
+                           struct ib_udata *udata)
+{
+       struct siw_qp *qp = NULL;
+       struct siw_base_qp *siw_base_qp = NULL;
+       struct ib_device *base_dev = pd->device;
+       struct siw_device *sdev = to_siw_dev(base_dev);
+       struct siw_ucontext *uctx =
+               rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                         base_ucontext);
+       struct siw_cq *scq = NULL, *rcq = NULL;
+       unsigned long flags;
+       int num_sqe, num_rqe, rv = 0;
+
+       siw_dbg(base_dev, "create new QP\n");
+
+       if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
+               siw_dbg(base_dev, "too many QP's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (attrs->qp_type != IB_QPT_RC) {
+               siw_dbg(base_dev, "only RC QP's supported\n");
+               rv = -EINVAL;
+               goto err_out;
+       }
+       if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+           (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+           (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
+           (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+               siw_dbg(base_dev, "QP size error\n");
+               rv = -EINVAL;
+               goto err_out;
+       }
+       if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
+               siw_dbg(base_dev, "max inline send: %d > %d\n",
+                       attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
+               rv = -EINVAL;
+               goto err_out;
+       }
+       /*
+        * NOTE: zero element SGLs are allowed in SQ and RQ WQEs,
+        * but a QP must be able to hold at least one WQE (SQ + RQ)
+        */
+       if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+               siw_dbg(base_dev, "QP must have send or receive queue\n");
+               rv = -EINVAL;
+               goto err_out;
+       }
+       scq = to_siw_cq(attrs->send_cq);
+       rcq = to_siw_cq(attrs->recv_cq);
+
+       if (!scq || (!rcq && !attrs->srq)) {
+               siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
+               rv = -EINVAL;
+               goto err_out;
+       }
+       siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
+       if (!siw_base_qp) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+       if (!qp) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       siw_base_qp->qp = qp;
+       qp->ib_qp = &siw_base_qp->base_qp;
+
+       init_rwsem(&qp->state_lock);
+       spin_lock_init(&qp->sq_lock);
+       spin_lock_init(&qp->rq_lock);
+       spin_lock_init(&qp->orq_lock);
+
+       qp->kernel_verbs = !udata;
+       qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
+       qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
+
+       rv = siw_qp_add(sdev, qp);
+       if (rv)
+               goto err_out;
+
+       /* All queue indices are derived from modulo operations
+        * on free running 'get' (consumer) and 'put' (producer)
+        * unsigned counters. Keeping queue sizes at a power of two
+        * avoids having to handle counter wrap around.
+        */
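+       /* Worked example (illustrative values only): with a queue size
+        * of 8, put counters 0xfffffffe, 0xffffffff and 0x0 map to
+        * slots 6, 7 and 0, so the free running u32 counter may wrap
+        * without disturbing slot ordering, since 2^32 is a multiple
+        * of any power of two queue size.
+        */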
+       num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
+       num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);
+
+       if (qp->kernel_verbs)
+               qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
+       else
+               qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
+
+       if (qp->sendq == NULL) {
+               siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
+               rv = -ENOMEM;
+               goto err_out_xa;
+       }
+       if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
+               if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+                       qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
+               else {
+                       rv = -EINVAL;
+                       goto err_out_xa;
+               }
+       }
+       qp->pd = pd;
+       qp->scq = scq;
+       qp->rcq = rcq;
+
+       if (attrs->srq) {
+               /*
+                * SRQ support.
+                * Verbs 6.3.7: ignore RQ size, if SRQ present
+                * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+                */
+               qp->srq = to_siw_srq(attrs->srq);
+               qp->attrs.rq_size = 0;
+               siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n",
+                       qp->qp_num, qp->srq);
+       } else if (num_rqe) {
+               if (qp->kernel_verbs)
+                       qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
+               else
+                       qp->recvq =
+                               vmalloc_user(num_rqe * sizeof(struct siw_rqe));
+
+               if (qp->recvq == NULL) {
+                       siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
+                       rv = -ENOMEM;
+                       goto err_out_xa;
+               }
+               qp->attrs.rq_size = num_rqe;
+       }
+       qp->attrs.sq_size = num_sqe;
+       qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+       qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+
+       /* Make those two tunables fixed for now. */
+       qp->tx_ctx.gso_seg_limit = 1;
+       qp->tx_ctx.zcopy_tx = zcopy_tx;
+
+       qp->attrs.state = SIW_QP_STATE_IDLE;
+
+       if (udata) {
+               struct siw_uresp_create_qp uresp = {};
+
+               uresp.num_sqe = num_sqe;
+               uresp.num_rqe = num_rqe;
+               uresp.qp_id = qp_id(qp);
+
+               if (qp->sendq) {
+                       qp->xa_sq_index =
+                               siw_create_uobj(uctx, qp->sendq,
+                                       num_sqe * sizeof(struct siw_sqe));
+               }
+               if (qp->recvq) {
+                       qp->xa_rq_index =
+                                siw_create_uobj(uctx, qp->recvq,
+                                       num_rqe * sizeof(struct siw_rqe));
+               }
+               if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
+                   qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
+                       rv = -ENOMEM;
+                       goto err_out_xa;
+               }
+               uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
+               uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
+
+               if (udata->outlen < sizeof(uresp)) {
+                       rv = -EINVAL;
+                       goto err_out_xa;
+               }
+               rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+               if (rv)
+                       goto err_out_xa;
+       }
+       qp->tx_cpu = siw_get_tx_cpu(sdev);
+       if (qp->tx_cpu < 0) {
+               rv = -EINVAL;
+               goto err_out_xa;
+       }
+       INIT_LIST_HEAD(&qp->devq);
+       spin_lock_irqsave(&sdev->lock, flags);
+       list_add_tail(&qp->devq, &sdev->qp_list);
+       spin_unlock_irqrestore(&sdev->lock, flags);
+
+       return qp->ib_qp;
+
+err_out_xa:
+       xa_erase(&sdev->qp_xa, qp_id(qp));
+err_out:
+       kfree(siw_base_qp);
+
+       if (qp) {
+               if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+                       kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+               if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+                       kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+               vfree(qp->sendq);
+               vfree(qp->recvq);
+               kfree(qp);
+       }
+       atomic_dec(&sdev->num_qp);
+
+       return ERR_PTR(rv);
+}
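+
+/*
+ * Sketch of a typical kernel caller (generic verbs usage, not siw code;
+ * all values are illustrative): the IB core dispatches ib_create_qp()
+ * to this function for a siw device:
+ *
+ *   struct ib_qp_init_attr init = {
+ *           .qp_type = IB_QPT_RC, .send_cq = scq, .recv_cq = rcq,
+ *           .sq_sig_type = IB_SIGNAL_REQ_WR,
+ *           .cap = { .max_send_wr = 64, .max_recv_wr = 64,
+ *                    .max_send_sge = 2, .max_recv_sge = 2 },
+ *   };
+ *   struct ib_qp *qp = ib_create_qp(pd, &init);
+ */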
+
+/*
+ * Minimum siw_query_qp() verb interface.
+ *
+ * @qp_attr_mask is not used but all available information is provided
+ */
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+                int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+       struct siw_qp *qp;
+       struct siw_device *sdev;
+
+       if (base_qp && qp_attr && qp_init_attr) {
+               qp = to_siw_qp(base_qp);
+               sdev = to_siw_dev(base_qp->device);
+       } else {
+               return -EINVAL;
+       }
+       qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+       qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+       qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
+       qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+       qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
+       qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+       qp_attr->max_rd_atomic = qp->attrs.irq_size;
+       qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+       qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+                                  IB_ACCESS_REMOTE_WRITE |
+                                  IB_ACCESS_REMOTE_READ;
+
+       qp_init_attr->qp_type = base_qp->qp_type;
+       qp_init_attr->send_cq = base_qp->send_cq;
+       qp_init_attr->recv_cq = base_qp->recv_cq;
+       qp_init_attr->srq = base_qp->srq;
+
+       qp_init_attr->cap = qp_attr->cap;
+
+       return 0;
+}
+
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+                       int attr_mask, struct ib_udata *udata)
+{
+       struct siw_qp_attrs new_attrs;
+       enum siw_qp_attr_mask siw_attr_mask = 0;
+       struct siw_qp *qp = to_siw_qp(base_qp);
+       int rv = 0;
+
+       if (!attr_mask)
+               return 0;
+
+       memset(&new_attrs, 0, sizeof(new_attrs));
+
+       if (attr_mask & IB_QP_ACCESS_FLAGS) {
+               siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
+
+               if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+                       new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+               if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+                       new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+               if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
+                       new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+       }
+       if (attr_mask & IB_QP_STATE) {
+               siw_dbg_qp(qp, "desired IB QP state: %s\n",
+                          ib_qp_state_to_string[attr->qp_state]);
+
+               new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+
+               if (new_attrs.state > SIW_QP_STATE_RTS)
+                       qp->tx_ctx.tx_suspend = 1;
+
+               siw_attr_mask |= SIW_QP_ATTR_STATE;
+       }
+       if (!siw_attr_mask)
+               goto out;
+
+       down_write(&qp->state_lock);
+
+       rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+
+       up_write(&qp->state_lock);
+out:
+       return rv;
+}
+
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
+{
+       struct siw_qp *qp = to_siw_qp(base_qp);
+       struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
+       struct siw_ucontext *uctx =
+               rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                         base_ucontext);
+       struct siw_qp_attrs qp_attrs;
+
+       siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep);
+
+       /*
+        * Mark QP as in process of destruction to prevent from
+        * any async callbacks to RDMA core
+        */
+       qp->attrs.flags |= SIW_QP_IN_DESTROY;
+       qp->rx_stream.rx_suspend = 1;
+
+       if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+               kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+       if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+               kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+       down_write(&qp->state_lock);
+
+       qp_attrs.state = SIW_QP_STATE_ERROR;
+       siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+       if (qp->cep) {
+               siw_cep_put(qp->cep);
+               qp->cep = NULL;
+       }
+       up_write(&qp->state_lock);
+
+       kfree(qp->tx_ctx.mpa_crc_hd);
+       kfree(qp->rx_stream.mpa_crc_hd);
+
+       qp->scq = qp->rcq = NULL;
+
+       siw_qp_put(qp);
+       kfree(siw_base_qp);
+
+       return 0;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare SGL of inlined data for sending. For userland callers, the
+ * function checks that the given buffer addresses and lengths are
+ * within process context bounds.
+ * Data from all provided SGEs is copied together into the WQE,
+ * referenced by a single SGE.
+ */
+static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
+                              struct siw_sqe *sqe)
+{
+       struct ib_sge *core_sge = core_wr->sg_list;
+       void *kbuf = &sqe->sge[1];
+       int num_sge = core_wr->num_sge, bytes = 0;
+
+       sqe->sge[0].laddr = (u64)kbuf;
+       sqe->sge[0].lkey = 0;
+
+       while (num_sge--) {
+               if (!core_sge->length) {
+                       core_sge++;
+                       continue;
+               }
+               bytes += core_sge->length;
+               if (bytes > SIW_MAX_INLINE) {
+                       bytes = -EINVAL;
+                       break;
+               }
+               memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
+                      core_sge->length);
+
+               kbuf += core_sge->length;
+               core_sge++;
+       }
+       sqe->sge[0].length = bytes > 0 ? bytes : 0;
+       sqe->num_sge = bytes > 0 ? 1 : 0;
+
+       return bytes;
+}
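+
+/*
+ * Example (illustrative numbers): two inline SGEs of 16 and 48 bytes are
+ * copied back to back into the space starting at &sqe->sge[1]; the SQE
+ * then carries a single SGE of length 64 and lkey 0 that points at this
+ * in-WQE buffer.
+ */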
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to an SQ.
+ *
+ * @base_qp:   Base QP contained in siw QP
+ * @wr:                Null terminated list of user WR's
+ * @bad_wr:    Points to failing WR in case of synchronous failure.
+ */
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+                 const struct ib_send_wr **bad_wr)
+{
+       struct siw_qp *qp = to_siw_qp(base_qp);
+       struct siw_wqe *wqe = tx_wqe(qp);
+
+       unsigned long flags;
+       int rv = 0;
+
+       /*
+        * Try to acquire QP state lock. Must be non-blocking
+        * to accommodate kernel clients' needs.
+        */
+       if (!down_read_trylock(&qp->state_lock)) {
+               *bad_wr = wr;
+               siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
+               return -ENOTCONN;
+       }
+       if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+               up_read(&qp->state_lock);
+               *bad_wr = wr;
+               siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
+               return -ENOTCONN;
+       }
+       if (wr && !qp->kernel_verbs) {
+               siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+               up_read(&qp->state_lock);
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+       spin_lock_irqsave(&qp->sq_lock, flags);
+
+       while (wr) {
+               u32 idx = qp->sq_put % qp->attrs.sq_size;
+               struct siw_sqe *sqe = &qp->sendq[idx];
+
+               if (sqe->flags) {
+                       siw_dbg_qp(qp, "sq full\n");
+                       rv = -ENOMEM;
+                       break;
+               }
+               if (wr->num_sge > qp->attrs.sq_max_sges) {
+                       siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+                       rv = -EINVAL;
+                       break;
+               }
+               sqe->id = wr->wr_id;
+
+               if ((wr->send_flags & IB_SEND_SIGNALED) ||
+                   (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
+                       sqe->flags |= SIW_WQE_SIGNALLED;
+
+               if (wr->send_flags & IB_SEND_FENCE)
+                       sqe->flags |= SIW_WQE_READ_FENCE;
+
+               switch (wr->opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_INV:
+                       if (wr->send_flags & IB_SEND_SOLICITED)
+                               sqe->flags |= SIW_WQE_SOLICITED;
+
+                       if (!(wr->send_flags & IB_SEND_INLINE)) {
+                               siw_copy_sgl(wr->sg_list, sqe->sge,
+                                            wr->num_sge);
+                               sqe->num_sge = wr->num_sge;
+                       } else {
+                               rv = siw_copy_inline_sgl(wr, sqe);
+                               if (rv <= 0) {
+                                       rv = -EINVAL;
+                                       break;
+                               }
+                               sqe->flags |= SIW_WQE_INLINE;
+                               sqe->num_sge = 1;
+                       }
+                       if (wr->opcode == IB_WR_SEND)
+                               sqe->opcode = SIW_OP_SEND;
+                       else {
+                               sqe->opcode = SIW_OP_SEND_REMOTE_INV;
+                               sqe->rkey = wr->ex.invalidate_rkey;
+                       }
+                       break;
+
+               case IB_WR_RDMA_READ_WITH_INV:
+               case IB_WR_RDMA_READ:
+                       /*
+                        * iWarp restricts RREAD sink to SGL containing
+                        * 1 SGE only. We could relax to SGL with multiple
+                        * elements referring to the SAME ltag or even sending
+                        * a private per-rreq tag referring to a checked
+                        * local sgl with MULTIPLE ltag's.
+                        */
+                       if (unlikely(wr->num_sge != 1)) {
+                               rv = -EINVAL;
+                               break;
+                       }
+                       siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
+                       /*
+                        * NOTE: zero length RREAD is allowed!
+                        */
+                       sqe->raddr = rdma_wr(wr)->remote_addr;
+                       sqe->rkey = rdma_wr(wr)->rkey;
+                       sqe->num_sge = 1;
+
+                       if (wr->opcode == IB_WR_RDMA_READ)
+                               sqe->opcode = SIW_OP_READ;
+                       else
+                               sqe->opcode = SIW_OP_READ_LOCAL_INV;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (!(wr->send_flags & IB_SEND_INLINE)) {
+                               siw_copy_sgl(wr->sg_list, &sqe->sge[0],
+                                            wr->num_sge);
+                               sqe->num_sge = wr->num_sge;
+                       } else {
+                               rv = siw_copy_inline_sgl(wr, sqe);
+                               if (unlikely(rv < 0)) {
+                                       rv = -EINVAL;
+                                       break;
+                               }
+                               sqe->flags |= SIW_WQE_INLINE;
+                               sqe->num_sge = 1;
+                       }
+                       sqe->raddr = rdma_wr(wr)->remote_addr;
+                       sqe->rkey = rdma_wr(wr)->rkey;
+                       sqe->opcode = SIW_OP_WRITE;
+                       break;
+
+               case IB_WR_REG_MR:
+                       sqe->base_mr = (uint64_t)reg_wr(wr)->mr;
+                       sqe->rkey = reg_wr(wr)->key;
+                       sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
+                       sqe->opcode = SIW_OP_REG_MR;
+                       break;
+
+               case IB_WR_LOCAL_INV:
+                       sqe->rkey = wr->ex.invalidate_rkey;
+                       sqe->opcode = SIW_OP_INVAL_STAG;
+                       break;
+
+               default:
+                       siw_dbg_qp(qp, "ib wr type %d unsupported\n",
+                                  wr->opcode);
+                       rv = -EINVAL;
+                       break;
+               }
+               siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
+                          sqe->opcode, sqe->flags, (void *)sqe->id);
+
+               if (unlikely(rv < 0))
+                       break;
+
+               /* make SQE only valid after completely written */
+               smp_wmb();
+               sqe->flags |= SIW_WQE_VALID;
+
+               qp->sq_put++;
+               wr = wr->next;
+       }
+
+       /*
+        * Send directly if SQ processing is not in progress.
+        * Eventual immediate errors (rv < 0) do not affect the involved
+        * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
+        * processing, if new work is already pending. But rv must be passed
+        * to caller.
+        */
+       if (wqe->wr_status != SIW_WR_IDLE) {
+               spin_unlock_irqrestore(&qp->sq_lock, flags);
+               goto skip_direct_sending;
+       }
+       rv = siw_activate_tx(qp);
+       spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+       if (rv <= 0)
+               goto skip_direct_sending;
+
+       if (qp->kernel_verbs) {
+               rv = siw_sq_start(qp);
+       } else {
+               qp->tx_ctx.in_syscall = 1;
+
+               if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
+                       siw_qp_cm_drop(qp, 0);
+
+               qp->tx_ctx.in_syscall = 0;
+       }
+skip_direct_sending:
+
+       up_read(&qp->state_lock);
+
+       if (rv >= 0)
+               return 0;
+       /*
+        * Immediate error
+        */
+       siw_dbg_qp(qp, "error %d\n", rv);
+
+       *bad_wr = wr;
+       return rv;
+}
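+
+/*
+ * Minimal kernel client sketch (assumed generic verbs usage, not taken
+ * from an in-tree ULP): a single signalled SEND posted through the IB
+ * core ends up in siw_post_send() for a siw QP:
+ *
+ *   struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
+ *   struct ib_send_wr wr = { .opcode = IB_WR_SEND, .num_sge = 1,
+ *                            .sg_list = &sge, .wr_id = cookie,
+ *                            .send_flags = IB_SEND_SIGNALED };
+ *   const struct ib_send_wr *bad_wr;
+ *   int ret = ib_post_send(qp, &wr, &bad_wr);
+ */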
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to an RQ.
+ *
+ * @base_qp:   Base QP contained in siw QP
+ * @wr:                Null terminated list of user WR's
+ * @bad_wr:    Points to failing WR in case of synchronous failure.
+ */
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+                    const struct ib_recv_wr **bad_wr)
+{
+       struct siw_qp *qp = to_siw_qp(base_qp);
+       unsigned long flags;
+       int rv = 0;
+
+       if (qp->srq) {
+               *bad_wr = wr;
+               return -EOPNOTSUPP; /* what else from errno.h? */
+       }
+       /*
+        * Try to acquire QP state lock. Must be non-blocking
+        * to accommodate kernel clients' needs.
+        */
+       if (!down_read_trylock(&qp->state_lock)) {
+               *bad_wr = wr;
+               return -ENOTCONN;
+       }
+       if (!qp->kernel_verbs) {
+               siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
+               up_read(&qp->state_lock);
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+       if (qp->attrs.state > SIW_QP_STATE_RTS) {
+               up_read(&qp->state_lock);
+               *bad_wr = wr;
+               return -EINVAL;
+       }
+       /*
+        * Serialize potentially multiple producers.
+        * Not needed for single threaded consumer side.
+        */
+       spin_lock_irqsave(&qp->rq_lock, flags);
+
+       while (wr) {
+               u32 idx = qp->rq_put % qp->attrs.rq_size;
+               struct siw_rqe *rqe = &qp->recvq[idx];
+
+               if (rqe->flags) {
+                       siw_dbg_qp(qp, "RQ full\n");
+                       rv = -ENOMEM;
+                       break;
+               }
+               if (wr->num_sge > qp->attrs.rq_max_sges) {
+                       siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+                       rv = -EINVAL;
+                       break;
+               }
+               rqe->id = wr->wr_id;
+               rqe->num_sge = wr->num_sge;
+               siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+               /* make sure RQE is completely written before valid */
+               smp_wmb();
+
+               rqe->flags = SIW_WQE_VALID;
+
+               qp->rq_put++;
+               wr = wr->next;
+       }
+       spin_unlock_irqrestore(&qp->rq_lock, flags);
+
+       up_read(&qp->state_lock);
+
+       if (rv < 0) {
+               siw_dbg_qp(qp, "error %d\n", rv);
+               *bad_wr = wr;
+       }
+       return rv > 0 ? 0 : rv;
+}
+
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
+{
+       struct siw_cq *cq = to_siw_cq(base_cq);
+       struct siw_device *sdev = to_siw_dev(base_cq->device);
+       struct siw_ucontext *ctx =
+               rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                         base_ucontext);
+
+       siw_dbg_cq(cq, "free CQ resources\n");
+
+       siw_cq_flush(cq);
+
+       if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+               kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+
+       atomic_dec(&sdev->num_cq);
+
+       vfree(cq->queue);
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Populate CQ of requested size
+ *
+ * @base_cq: CQ as allocated by RDMA midlayer
+ * @attr: Initial CQ attributes
+ * @udata: relates to user context
+ */
+
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(base_cq->device);
+       struct siw_cq *cq = to_siw_cq(base_cq);
+       int rv, size = attr->cqe;
+
+       if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
+               siw_dbg(base_cq->device, "too many CQ's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (size < 1 || size > sdev->attrs.max_cqe) {
+               siw_dbg(base_cq->device, "CQ size error: %d\n", size);
+               rv = -EINVAL;
+               goto err_out;
+       }
+       size = roundup_pow_of_two(size);
+       cq->base_cq.cqe = size;
+       cq->num_cqe = size;
+       cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
+
+       if (!udata) {
+               cq->kernel_verbs = 1;
+               cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
+                                   sizeof(struct siw_cq_ctrl));
+       } else {
+               cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
+                                        sizeof(struct siw_cq_ctrl));
+       }
+       if (cq->queue == NULL) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       get_random_bytes(&cq->id, 4);
+       siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
+
+       spin_lock_init(&cq->lock);
+
+       cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify;
+
+       if (udata) {
+               struct siw_uresp_create_cq uresp = {};
+               struct siw_ucontext *ctx =
+                       rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                                 base_ucontext);
+
+               cq->xa_cq_index =
+                       siw_create_uobj(ctx, cq->queue,
+                                       size * sizeof(struct siw_cqe) +
+                                               sizeof(struct siw_cq_ctrl));
+               if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
+                       rv = -ENOMEM;
+                       goto err_out;
+               }
+               uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
+               uresp.cq_id = cq->id;
+               uresp.num_cqe = size;
+
+               if (udata->outlen < sizeof(uresp)) {
+                       rv = -EINVAL;
+                       goto err_out;
+               }
+               rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+               if (rv)
+                       goto err_out;
+       }
+       return 0;
+
+err_out:
+       siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
+
+       if (cq && cq->queue) {
+               struct siw_ucontext *ctx =
+                       rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                                 base_ucontext);
+               if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+                       kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+               vfree(cq->queue);
+       }
+       atomic_dec(&sdev->num_cq);
+
+       return rv;
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap CQ entries if available and copy work completion status into
+ * array of WC's provided by caller. Returns number of reaped CQE's.
+ *
+ * @base_cq:   Base CQ contained in siw CQ.
+ * @num_cqe:   Maximum number of CQE's to reap.
+ * @wc:                Array of work completions to be filled by siw.
+ */
+int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
+{
+       struct siw_cq *cq = to_siw_cq(base_cq);
+       int i;
+
+       for (i = 0; i < num_cqe; i++) {
+               if (!siw_reap_cqe(cq, wc))
+                       break;
+               wc++;
+       }
+       return i;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ.
+ * Defined flags:
+ * o IB_CQ_SOLICITED lets siw trigger a notification
+ *   event if a WQE with notification flag set enters the CQ
+ * o IB_CQ_NEXT_COMP lets siw trigger a notification
+ *   event if a WQE enters the CQ.
+ * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
+ *   number of not reaped CQE's regardless of its notification
+ *   type and current or new CQ notification settings.
+ *
+ * @base_cq:   Base CQ contained in siw CQ.
+ * @flags:     Requested notification flags.
+ */
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
+{
+       struct siw_cq *cq = to_siw_cq(base_cq);
+
+       siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
+
+       if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+               /* CQ event for next solicited completion */
+               smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED);
+       else
+               /* CQ event for any signalled completion */
+               smp_store_mb(*cq->notify, SIW_NOTIFY_ALL);
+
+       if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+               return cq->cq_put - cq->cq_get;
+
+       return 0;
+}
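+
+/*
+ * Typical consumer pattern (sketch, not siw specific): to close the race
+ * between reaping and re-arming, a ULP polls until the CQ is empty,
+ * re-arms with IB_CQ_REPORT_MISSED_EVENTS and polls again if the return
+ * value signals CQEs left in the queue:
+ *
+ *   again:
+ *   while (ib_poll_cq(cq, 1, &wc) > 0)
+ *           process(&wc);
+ *   if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ *                            IB_CQ_REPORT_MISSED_EVENTS) > 0)
+ *           goto again;
+ */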
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region.
+ *
+ * @base_mr: Base MR contained in siw MR.
+ * @udata: points to user context, unused.
+ */
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
+{
+       struct siw_mr *mr = to_siw_mr(base_mr);
+       struct siw_device *sdev = to_siw_dev(base_mr->device);
+
+       siw_dbg_mem(mr->mem, "deregister MR\n");
+
+       atomic_dec(&sdev->num_mr);
+
+       siw_mr_drop_mem(mr);
+       kfree_rcu(mr, rcu);
+
+       return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * @pd:                Protection Domain
+ * @start:     starting address of MR (virtual address)
+ * @len:       len of MR
+ * @rnic_va:   not used by siw
+ * @rights:    MR access rights
+ * @udata:     user buffer to communicate STag and Key.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
+                             u64 rnic_va, int rights, struct ib_udata *udata)
+{
+       struct siw_mr *mr = NULL;
+       struct siw_umem *umem = NULL;
+       struct siw_ureq_reg_mr ureq;
+       struct siw_device *sdev = to_siw_dev(pd->device);
+
+       unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
+       int rv;
+
+       siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n",
+                  (unsigned long long)start, (unsigned long long)rnic_va,
+                  (unsigned long long)len);
+
+       if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+               siw_dbg_pd(pd, "too many mr's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (!len) {
+               rv = -EINVAL;
+               goto err_out;
+       }
+       if (mem_limit != RLIM_INFINITY) {
+               unsigned long num_pages =
+                       (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
+               mem_limit >>= PAGE_SHIFT;
+
+               if (num_pages > mem_limit - current->mm->locked_vm) {
+                       siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
+                                  num_pages, mem_limit,
+                                  current->mm->locked_vm);
+                       rv = -ENOMEM;
+                       goto err_out;
+               }
+       }
+       umem = siw_umem_get(start, len, ib_access_writable(rights));
+       if (IS_ERR(umem)) {
+               rv = PTR_ERR(umem);
+               siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
+               umem = NULL;
+               goto err_out;
+       }
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
+       if (rv)
+               goto err_out;
+
+       if (udata) {
+               struct siw_uresp_reg_mr uresp = {};
+               struct siw_mem *mem = mr->mem;
+
+               if (udata->inlen < sizeof(ureq)) {
+                       rv = -EINVAL;
+                       goto err_out;
+               }
+               rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+               if (rv)
+                       goto err_out;
+
+               mr->base_mr.lkey |= ureq.stag_key;
+               mr->base_mr.rkey |= ureq.stag_key;
+               mem->stag |= ureq.stag_key;
+               uresp.stag = mem->stag;
+
+               if (udata->outlen < sizeof(uresp)) {
+                       rv = -EINVAL;
+                       goto err_out;
+               }
+               rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+               if (rv)
+                       goto err_out;
+       }
+       mr->mem->stag_valid = 1;
+
+       return &mr->base_mr;
+
+err_out:
+       atomic_dec(&sdev->num_mr);
+       if (mr) {
+               if (mr->mem)
+                       siw_mr_drop_mem(mr);
+               kfree_rcu(mr, rcu);
+       } else {
+               if (umem)
+                       siw_umem_release(umem, false);
+       }
+       return ERR_PTR(rv);
+}
+
+struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                          u32 max_sge, struct ib_udata *udata)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mr *mr = NULL;
+       struct siw_pbl *pbl = NULL;
+       int rv;
+
+       if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+               siw_dbg_pd(pd, "too many mr's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (mr_type != IB_MR_TYPE_MEM_REG) {
+               siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
+               rv = -EOPNOTSUPP;
+               goto err_out;
+       }
+       if (max_sge > SIW_MAX_SGE_PBL) {
+               siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       pbl = siw_pbl_alloc(max_sge);
+       if (IS_ERR(pbl)) {
+               rv = PTR_ERR(pbl);
+               siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
+               pbl = NULL;
+               goto err_out;
+       }
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
+       if (rv)
+               goto err_out;
+
+       mr->mem->is_pbl = 1;
+
+       siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+       return &mr->base_mr;
+
+err_out:
+       atomic_dec(&sdev->num_mr);
+
+       if (!mr) {
+               kfree(pbl);
+       } else {
+               if (mr->mem)
+                       siw_mr_drop_mem(mr);
+               kfree_rcu(mr, rcu);
+       }
+       siw_dbg_pd(pd, "failed: %d\n", rv);
+
+       return ERR_PTR(rv);
+}
+
+/* Just used to count number of pages being mapped */
+static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+       return 0;
+}
+
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+                 unsigned int *sg_off)
+{
+       struct scatterlist *slp;
+       struct siw_mr *mr = to_siw_mr(base_mr);
+       struct siw_mem *mem = mr->mem;
+       struct siw_pbl *pbl = mem->pbl;
+       struct siw_pble *pble;
+       u64 pbl_size;
+       int i, rv;
+
+       if (!pbl) {
+               siw_dbg_mem(mem, "no PBL allocated\n");
+               return -EINVAL;
+       }
+       pble = pbl->pbe;
+
+       if (pbl->max_buf < num_sle) {
+               siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
+                           num_sle, mem->pbl->max_buf);
+               return -ENOMEM;
+       }
+       for_each_sg(sl, slp, num_sle, i) {
+               if (sg_dma_len(slp) == 0) {
+                       siw_dbg_mem(mem, "empty SGE\n");
+                       return -EINVAL;
+               }
+               if (i == 0) {
+                       pble->addr = sg_dma_address(slp);
+                       pble->size = sg_dma_len(slp);
+                       pble->pbl_off = 0;
+                       pbl_size = pble->size;
+                       pbl->num_buf = 1;
+               } else {
+                       /* Merge PBL entries if adjacent */
+                       if (pble->addr + pble->size == sg_dma_address(slp)) {
+                               pble->size += sg_dma_len(slp);
+                       } else {
+                               pble++;
+                               pbl->num_buf++;
+                               pble->addr = sg_dma_address(slp);
+                               pble->size = sg_dma_len(slp);
+                               pble->pbl_off = pbl_size;
+                       }
+                       pbl_size += sg_dma_len(slp);
+               }
+               siw_dbg_mem(mem,
+                       "sge[%d], size %llu, addr 0x%016llx, total %llu\n",
+                       i, pble->size, pble->addr, pbl_size);
+       }
+       rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
+       if (rv > 0) {
+               mem->len = base_mr->length;
+               mem->va = base_mr->iova;
+               siw_dbg_mem(mem,
+                       "%llu bytes, start 0x%016llx, %u SLE to %u entries\n",
+                       mem->len, mem->va, num_sle, pbl->num_buf);
+       }
+       return rv;
+}
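+
+/*
+ * Sketch of the fast registration flow that drives this function (assumed
+ * generic ULP code, not siw specific): an MR from ib_alloc_mr() is mapped
+ * onto a DMA mapped scatterlist and registered with a REG_MR work
+ * request, which siw_post_send() turns into SIW_OP_REG_MR:
+ *
+ *   n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
+ *   struct ib_reg_wr reg = { .wr.opcode = IB_WR_REG_MR, .mr = mr,
+ *                            .key = mr->rkey,
+ *                            .access = IB_ACCESS_REMOTE_WRITE };
+ *   ret = ib_post_send(qp, &reg.wr, &bad_wr);
+ */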
+
+/*
+ * siw_get_dma_mr()
+ *
+ * Create an (empty) DMA memory region with no umem attached.
+ */
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+       struct siw_device *sdev = to_siw_dev(pd->device);
+       struct siw_mr *mr = NULL;
+       int rv;
+
+       if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+               siw_dbg_pd(pd, "too many mr's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
+       if (rv)
+               goto err_out;
+
+       mr->mem->stag_valid = 1;
+
+       siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+       return &mr->base_mr;
+
+err_out:
+       if (rv)
+               kfree(mr);
+
+       atomic_dec(&sdev->num_mr);
+
+       return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within protection domain given by @pd.
+ *
+ * @base_srq:  Base SRQ contained in siw SRQ.
+ * @init_attrs:        SRQ init attributes.
+ * @udata:     points to user context
+ */
+int siw_create_srq(struct ib_srq *base_srq,
+                  struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
+{
+       struct siw_srq *srq = to_siw_srq(base_srq);
+       struct ib_srq_attr *attrs = &init_attrs->attr;
+       struct siw_device *sdev = to_siw_dev(base_srq->device);
+       struct siw_ucontext *ctx =
+               rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                         base_ucontext);
+       int rv;
+
+       if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
+               siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
+           attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
+               rv = -EINVAL;
+               goto err_out;
+       }
+       srq->max_sge = attrs->max_sge;
+       srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
+       srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
+       srq->limit = attrs->srq_limit;
+       if (srq->limit)
+               srq->armed = 1;
+
+       srq->kernel_verbs = !udata;
+
+       if (udata)
+               srq->recvq =
+                       vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
+       else
+               srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
+
+       if (srq->recvq == NULL) {
+               rv = -ENOMEM;
+               goto err_out;
+       }
+       if (udata) {
+               struct siw_uresp_create_srq uresp = {};
+
+               srq->xa_srq_index = siw_create_uobj(
+                       ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
+
+               if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
+                       rv = -ENOMEM;
+                       goto err_out;
+               }
+               uresp.srq_key = srq->xa_srq_index;
+               uresp.num_rqe = srq->num_rqe;
+
+               if (udata->outlen < sizeof(uresp)) {
+                       rv = -EINVAL;
+                       goto err_out;
+               }
+               rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+               if (rv)
+                       goto err_out;
+       }
+       spin_lock_init(&srq->lock);
+
+       siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq);
+
+       return 0;
+
+err_out:
+       if (srq->recvq) {
+               if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+                       kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+               vfree(srq->recvq);
+       }
+       atomic_dec(&sdev->num_srq);
+
+       return rv;
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. The caller may resize SRQ and/or set/reset notification
+ * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
+ *
+ * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ */
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
+                  enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+       struct siw_srq *srq = to_siw_srq(base_srq);
+       unsigned long flags;
+       int rv = 0;
+
+       spin_lock_irqsave(&srq->lock, flags);
+
+       if (attr_mask & IB_SRQ_MAX_WR) {
+               /* resize request not yet supported */
+               rv = -EOPNOTSUPP;
+               goto out;
+       }
+       if (attr_mask & IB_SRQ_LIMIT) {
+               if (attrs->srq_limit) {
+                       if (unlikely(attrs->srq_limit > srq->num_rqe)) {
+                               rv = -EINVAL;
+                               goto out;
+                       }
+                       srq->armed = 1;
+               } else {
+                       srq->armed = 0;
+               }
+               srq->limit = attrs->srq_limit;
+       }
+out:
+       spin_unlock_irqrestore(&srq->lock, flags);
+
+       return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes.
+ */
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
+{
+       struct siw_srq *srq = to_siw_srq(base_srq);
+       unsigned long flags;
+
+       spin_lock_irqsave(&srq->lock, flags);
+
+       attrs->max_wr = srq->num_rqe;
+       attrs->max_sge = srq->max_sge;
+       attrs->srq_limit = srq->limit;
+
+       spin_unlock_irqrestore(&srq->lock, flags);
+
+       return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ.
+ * It is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the RDMA core environment to keep track
+ * of QP references.
+ */
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
+{
+       struct siw_srq *srq = to_siw_srq(base_srq);
+       struct siw_device *sdev = to_siw_dev(base_srq->device);
+       struct siw_ucontext *ctx =
+               rdma_udata_to_drv_context(udata, struct siw_ucontext,
+                                         base_ucontext);
+
+       if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+               kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+
+       vfree(srq->recvq);
+       atomic_dec(&sdev->num_srq);
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ.
+ * NOTE: The function does not check or lock the SRQ state
+ *       during the post operation. The code simply trusts the
+ *       RDMA core environment.
+ *
+ * @base_srq:  Base SRQ contained in siw SRQ
+ * @wr:                List of R-WR's
+ * @bad_wr:    Updated to failing WR if posting fails.
+ */
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+                     const struct ib_recv_wr **bad_wr)
+{
+       struct siw_srq *srq = to_siw_srq(base_srq);
+       unsigned long flags;
+       int rv = 0;
+
+       if (unlikely(!srq->kernel_verbs)) {
+               siw_dbg_pd(base_srq->pd,
+                          "[SRQ 0x%p]: no kernel post_recv for mapped srq\n",
+                          srq);
+               rv = -EINVAL;
+               goto out;
+       }
+       /*
+        * Serialize potentially multiple producers.
+        * Also needed to serialize potentially multiple
+        * consumers.
+        */
+       spin_lock_irqsave(&srq->lock, flags);
+
+       while (wr) {
+               u32 idx = srq->rq_put % srq->num_rqe;
+               struct siw_rqe *rqe = &srq->recvq[idx];
+
+               if (rqe->flags) {
+                       siw_dbg_pd(base_srq->pd, "SRQ full\n");
+                       rv = -ENOMEM;
+                       break;
+               }
+               if (unlikely(wr->num_sge > srq->max_sge)) {
+                       siw_dbg_pd(base_srq->pd,
+                                  "[SRQ 0x%p]: too many sge's: %d\n", srq,
+                                  wr->num_sge);
+                       rv = -EINVAL;
+                       break;
+               }
+               rqe->id = wr->wr_id;
+               rqe->num_sge = wr->num_sge;
+               siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+               /* Make sure S-RQE is completely written before valid */
+               smp_wmb();
+
+               rqe->flags = SIW_WQE_VALID;
+
+               srq->rq_put++;
+               wr = wr->next;
+       }
+       spin_unlock_irqrestore(&srq->lock, flags);
+out:
+       if (unlikely(rv < 0)) {
+               siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv);
+               *bad_wr = wr;
+       }
+       return rv;
+}
+
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
+{
+       struct ib_event event;
+       struct ib_qp *base_qp = qp->ib_qp;
+
+       /*
+        * Do not report asynchronous errors on QP which gets
+        * destroyed via verbs interface (siw_destroy_qp())
+        */
+       if (qp->attrs.flags & SIW_QP_IN_DESTROY)
+               return;
+
+       event.event = etype;
+       event.device = base_qp->device;
+       event.element.qp = base_qp;
+
+       if (base_qp->event_handler) {
+               siw_dbg_qp(qp, "reporting event %d\n", etype);
+               base_qp->event_handler(&event, base_qp->qp_context);
+       }
+}
+
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
+{
+       struct ib_event event;
+       struct ib_cq *base_cq = &cq->base_cq;
+
+       event.event = etype;
+       event.device = base_cq->device;
+       event.element.cq = base_cq;
+
+       if (base_cq->event_handler) {
+               siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
+               base_cq->event_handler(&event, base_cq->cq_context);
+       }
+}
+
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
+{
+       struct ib_event event;
+       struct ib_srq *base_srq = &srq->base_srq;
+
+       event.event = etype;
+       event.device = base_srq->device;
+       event.element.srq = base_srq;
+
+       if (base_srq->event_handler) {
+               siw_dbg_pd(srq->base_srq.pd,
+                          "reporting SRQ event %d\n", etype);
+               base_srq->event_handler(&event, base_srq->srq_context);
+       }
+}
+
+void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
+{
+       struct ib_event event;
+
+       event.event = etype;
+       event.device = &sdev->base_dev;
+       event.element.port_num = port;
+
+       siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
+
+       ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
new file mode 100644 (file)
index 0000000..1910869
--- /dev/null
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy SGL from RDMA core representation to local
+ * representation.
+ */
+static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
+                               int num_sge)
+{
+       while (num_sge--) {
+               siw_sge->laddr = sge->addr;
+               siw_sge->length = sge->length;
+               siw_sge->lkey = sge->lkey;
+
+               siw_sge++;
+               sge++;
+       }
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
+int siw_query_port(struct ib_device *base_dev, u8 port,
+                  struct ib_port_attr *attr);
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+                          struct ib_port_immutable *port_immutable);
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+                    struct ib_udata *udata);
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+                 struct ib_udata *udata);
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+                 union ib_gid *gid);
+int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+struct ib_qp *siw_create_qp(struct ib_pd *base_pd,
+                           struct ib_qp_init_attr *attr,
+                           struct ib_udata *udata);
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+                int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+                       int attr_mask, struct ib_udata *udata);
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+                 const struct ib_send_wr **bad_wr);
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+                    const struct ib_recv_wr **bad_wr);
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
+int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
+struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
+                             u64 rnic_va, int rights, struct ib_udata *udata);
+struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
+                          u32 max_sge, struct ib_udata *udata);
+struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+                 unsigned int *sg_off);
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
+int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
+                  struct ib_udata *udata);
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
+                  enum ib_srq_attr_mask mask, struct ib_udata *udata);
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+                     const struct ib_recv_wr **bad_wr);
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
+void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type);
+
+#endif
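The declarations above are siw's verbs entry points; they are plugged into the core through an ib_device_ops table in siw_main.c, which is not part of this hunk. A rough sketch of that wiring, assuming the conventional struct ib_device_ops member names (several of them appear in the include/rdma/ib_verbs.h changes further down) and trimmed to a subset of the ops:

/* Illustrative sketch only; the real table lives in siw_main.c. */
static const struct ib_device_ops siw_ops_sketch = {
	.owner = THIS_MODULE,
	.alloc_pd = siw_alloc_pd,
	.dealloc_pd = siw_dealloc_pd,
	.create_cq = siw_create_cq,
	.destroy_cq = siw_destroy_cq,
	.poll_cq = siw_poll_cq,
	.req_notify_cq = siw_req_notify_cq,
	.create_qp = siw_create_qp,
	.modify_qp = siw_verbs_modify_qp,
	.query_qp = siw_query_qp,
	.destroy_qp = siw_destroy_qp,
	.post_send = siw_post_send,
	.post_recv = siw_post_receive,
	.query_device = siw_query_device,
	.query_port = siw_query_port,
	.get_port_immutable = siw_get_port_immutable,
	.reg_user_mr = siw_reg_user_mr,
	.dereg_mr = siw_dereg_mr,
	.mmap = siw_mmap,
};

/* during device setup, before ib_register_device(): */
ib_set_device_ops(&sdev->base_dev, &siw_ops_sketch);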
index 4760ce465d894f18046b43ee5c89212bfd32e4fa..7af68604af773867a37a820ba71ecb66f3a9c41a 100644 (file)
@@ -7,7 +7,7 @@ config INFINIBAND_IPOIB
          transports IP packets over InfiniBand so you can use your IB
          device as a fancy NIC.
 
-         See Documentation/infiniband/ipoib.txt for more information
+         See Documentation/infiniband/ipoib.rst for more information
 
 config INFINIBAND_IPOIB_CM
        bool "IP-over-InfiniBand Connected Mode support"
index aa9dcfc36cd35b81b9ad961f13c5e7303467ad5e..c59e00a0881f19e6efae40145332743de8407ca7 100644 (file)
@@ -1153,7 +1153,6 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
                ret = -ENOMEM;
                goto err_tx;
        }
-       memset(p->tx_ring, 0, ipoib_sendq_size * sizeof(*p->tx_ring));
 
        p->qp = ipoib_cm_create_tx_qp(p->dev, p);
        memalloc_noio_restore(noio_flag);
index 83429925dfc69cc2c278fb59d8b7f16d72a02bb1..63e4f9d15fd9146d4c7f5be7e7a8e44bb0cdf2f0 100644 (file)
@@ -138,7 +138,6 @@ static void ipoib_get_strings(struct net_device __always_unused *dev,
                        p += ETH_GSTRING_LEN;
                }
                break;
-       case ETH_SS_TEST:
        default:
                break;
        }
@@ -149,7 +148,6 @@ static int ipoib_get_sset_count(struct net_device __always_unused *dev,
        switch (sset) {
        case ETH_SS_STATS:
                return IPOIB_GLOBAL_STATS_LEN;
-       case ETH_SS_TEST:
        default:
                break;
        }
@@ -222,6 +220,7 @@ static const struct ethtool_ops ipoib_ethtool_ops = {
        .get_strings            = ipoib_get_strings,
        .get_ethtool_stats      = ipoib_get_ethtool_stats,
        .get_sset_count         = ipoib_get_sset_count,
+       .get_link               = ethtool_op_get_link,
 };
 
 void ipoib_set_ethtool_ops(struct net_device *dev)
index 04ea7db08e87edf6a8302070e661abd0fe35f790..ac0583ff280d1b5fd3efecec56a9ce9c92a43a11 100644 (file)
@@ -1893,12 +1893,6 @@ static void ipoib_child_init(struct net_device *ndev)
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
        struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
 
-       dev_hold(priv->parent);
-
-       down_write(&ppriv->vlan_rwsem);
-       list_add_tail(&priv->list, &ppriv->child_intfs);
-       up_write(&ppriv->vlan_rwsem);
-
        priv->max_ib_mtu = ppriv->max_ib_mtu;
        set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
        memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
@@ -1941,6 +1935,17 @@ static int ipoib_ndo_init(struct net_device *ndev)
        if (rc) {
                pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
                        priv->ca->name, priv->dev->name, priv->port, rc);
+               return rc;
+       }
+
+       if (priv->parent) {
+               struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+
+               dev_hold(priv->parent);
+
+               down_write(&ppriv->vlan_rwsem);
+               list_add_tail(&priv->list, &ppriv->child_intfs);
+               up_write(&ppriv->vlan_rwsem);
        }
 
        return 0;
@@ -1958,6 +1963,14 @@ static void ipoib_ndo_uninit(struct net_device *dev)
         */
        WARN_ON(!list_empty(&priv->child_intfs));
 
+       if (priv->parent) {
+               struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
+
+               down_write(&ppriv->vlan_rwsem);
+               list_del(&priv->list);
+               up_write(&ppriv->vlan_rwsem);
+       }
+
        ipoib_neigh_hash_uninit(dev);
 
        ipoib_ib_dev_cleanup(dev);
@@ -1969,15 +1982,8 @@ static void ipoib_ndo_uninit(struct net_device *dev)
                priv->wq = NULL;
        }
 
-       if (priv->parent) {
-               struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
-
-               down_write(&ppriv->vlan_rwsem);
-               list_del(&priv->list);
-               up_write(&ppriv->vlan_rwsem);
-
+       if (priv->parent)
                dev_put(priv->parent);
-       }
 }
 
 static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
index ba09068f620015c51b6c8f00127aa75491449b60..b69304d28f0662a5547bb43605dd9d9a842cccc8 100644 (file)
@@ -260,11 +260,8 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
                priv->qp = NULL;
        }
 
-       if (ib_destroy_cq(priv->send_cq))
-               ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
-
-       if (ib_destroy_cq(priv->recv_cq))
-               ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+       ib_destroy_cq(priv->send_cq);
+       ib_destroy_cq(priv->recv_cq);
 }
 
 void ipoib_event(struct ib_event_handler *handler,
index 9c185a8dabd304e2cff9b139e53c9e54c220bbd7..c7a3d75fb30806c2f6fe6fa5d38bcfb8a4d3e1db 100644 (file)
@@ -205,7 +205,8 @@ iser_initialize_task_headers(struct iscsi_task *task,
                goto out;
        }
 
-       tx_desc->wr_idx = 0;
+       tx_desc->inv_wr.next = NULL;
+       tx_desc->reg_wr.wr.next = NULL;
        tx_desc->mapped = true;
        tx_desc->dma_addr = dma_addr;
        tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
@@ -406,13 +407,10 @@ static u8
 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
 {
        struct iscsi_iser_task *iser_task = task->dd_data;
+       enum iser_data_dir dir = iser_task->dir[ISER_DIR_IN] ?
+                                       ISER_DIR_IN : ISER_DIR_OUT;
 
-       if (iser_task->dir[ISER_DIR_IN])
-               return iser_check_task_pi_status(iser_task, ISER_DIR_IN,
-                                                sector);
-       else
-               return iser_check_task_pi_status(iser_task, ISER_DIR_OUT,
-                                                sector);
+       return iser_check_task_pi_status(iser_task, dir, sector);
 }
 
 /**
index 36d525110fd2b28cad5ef05594665cdb14bccd3b..39bf213444cbb509ccd3c2214c526cb6c8e3ac9a 100644 (file)
@@ -225,14 +225,6 @@ enum iser_desc_type {
        ISCSI_TX_DATAOUT
 };
 
-/* Maximum number of work requests per task:
- * Data memory region local invalidate + fast registration
- * Protection memory region local invalidate + fast registration
- * Signature memory region local invalidate + fast registration
- * PDU send
- */
-#define ISER_MAX_WRS 7
-
 /**
  * struct iser_tx_desc - iSER TX descriptor
  *
@@ -245,11 +237,9 @@ enum iser_desc_type {
  *                 unsolicited data-out or control
  * @num_sge:       number sges used on this TX task
  * @mapped:        Is the task header mapped
- * @wr_idx:        Current WR index
- * @wrs:           Array of WRs per task
- * @data_reg:      Data buffer registration details
- * @prot_reg:      Protection buffer registration details
- * @sig_attrs:     Signature attributes
+ * @reg_wr:        registration WR
+ * @send_wr:       send WR
+ * @inv_wr:        invalidate WR
  */
 struct iser_tx_desc {
        struct iser_ctrl             iser_header;
@@ -260,15 +250,9 @@ struct iser_tx_desc {
        int                          num_sge;
        struct ib_cqe                cqe;
        bool                         mapped;
-       u8                           wr_idx;
-       union iser_wr {
-               struct ib_send_wr               send;
-               struct ib_reg_wr                fast_reg;
-               struct ib_sig_handover_wr       sig;
-       } wrs[ISER_MAX_WRS];
-       struct iser_mem_reg          data_reg;
-       struct iser_mem_reg          prot_reg;
-       struct ib_sig_attrs          sig_attrs;
+       struct ib_reg_wr             reg_wr;
+       struct ib_send_wr            send_wr;
+       struct ib_send_wr            inv_wr;
 };
 
 #define ISER_RX_PAD_SIZE       (256 - (ISER_RX_PAYLOAD_SIZE + \
@@ -388,6 +372,7 @@ struct iser_device {
  *
  * @mr:         memory region
  * @fmr_pool:   pool of fmrs
+ * @sig_mr:     signature memory region
  * @page_vec:   fast reg page list used by fmr pool
  * @mr_valid:   is mr valid indicator
  */
@@ -396,36 +381,22 @@ struct iser_reg_resources {
                struct ib_mr             *mr;
                struct ib_fmr_pool       *fmr_pool;
        };
+       struct ib_mr                     *sig_mr;
        struct iser_page_vec             *page_vec;
        u8                                mr_valid:1;
 };
 
-/**
- * struct iser_pi_context - Protection information context
- *
- * @rsc:             protection buffer registration resources
- * @sig_mr:          signature enable memory region
- * @sig_mr_valid:    is sig_mr valid indicator
- * @sig_protected:   is region protected indicator
- */
-struct iser_pi_context {
-       struct iser_reg_resources       rsc;
-       struct ib_mr                   *sig_mr;
-       u8                              sig_mr_valid:1;
-       u8                              sig_protected:1;
-};
-
 /**
  * struct iser_fr_desc - Fast registration descriptor
  *
  * @list:           entry in connection fastreg pool
  * @rsc:            data buffer registration resources
- * @pi_ctx:         protection information context
+ * @sig_protected:  is region protected indicator
  */
 struct iser_fr_desc {
        struct list_head                  list;
        struct iser_reg_resources         rsc;
-       struct iser_pi_context           *pi_ctx;
+       bool                              sig_protected;
        struct list_head                  all_list;
 };
 
@@ -674,21 +645,6 @@ void
 iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
                      struct iser_fr_desc *desc);
 
-static inline struct ib_send_wr *
-iser_tx_next_wr(struct iser_tx_desc *tx_desc)
-{
-       struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx].send;
-       struct ib_send_wr *last_wr;
-
-       if (tx_desc->wr_idx) {
-               last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1].send;
-               last_wr->next = cur_wr;
-       }
-       tx_desc->wr_idx++;
-
-       return cur_wr;
-}
-
 static inline struct iser_conn *
 to_iser_conn(struct ib_conn *ib_conn)
 {
index 96af06cfe0afd9386235007a762eff92197b42a4..5cbb4b3a05660adfe228a06cd811eb3d2e9f0c48 100644 (file)
@@ -592,15 +592,14 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
 static inline int
 iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
 {
-       if (likely(rkey == desc->rsc.mr->rkey)) {
-               desc->rsc.mr_valid = 0;
-       } else if (likely(desc->pi_ctx && rkey == desc->pi_ctx->sig_mr->rkey)) {
-               desc->pi_ctx->sig_mr_valid = 0;
-       } else {
+       if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) ||
+                    (desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) {
                iser_err("Bogus remote invalidation for rkey %#x\n", rkey);
                return -EINVAL;
        }
 
+       desc->rsc.mr_valid = 0;
+
        return 0;
 }
 
@@ -750,6 +749,9 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
        iser_task->prot[ISER_DIR_IN].data_len  = 0;
        iser_task->prot[ISER_DIR_OUT].data_len = 0;
 
+       iser_task->prot[ISER_DIR_IN].dma_nents = 0;
+       iser_task->prot[ISER_DIR_OUT].dma_nents = 0;
+
        memset(&iser_task->rdma_reg[ISER_DIR_IN], 0,
               sizeof(struct iser_mem_reg));
        memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0,
index 2ba70729d7b054071ed07e04057c7b9c3a6274e5..2cc89a9b9e9bb48ee8e924bc1381d50b60ec6cc9 100644 (file)
@@ -302,8 +302,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
 }
 
 static void
-iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
-                   struct ib_sig_domain *domain)
+iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_domain *domain)
 {
        domain->sig_type = IB_SIG_TYPE_T10_DIF;
        domain->sig.dif.pi_interval = scsi_prot_interval(sc);
@@ -326,21 +325,21 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
        case SCSI_PROT_WRITE_INSERT:
        case SCSI_PROT_READ_STRIP:
                sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
-               iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+               iser_set_dif_domain(sc, &sig_attrs->wire);
                sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
                break;
        case SCSI_PROT_READ_INSERT:
        case SCSI_PROT_WRITE_STRIP:
                sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
-               iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+               iser_set_dif_domain(sc, &sig_attrs->mem);
                sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
                                                IB_T10DIF_CSUM : IB_T10DIF_CRC;
                break;
        case SCSI_PROT_READ_PASS:
        case SCSI_PROT_WRITE_PASS:
-               iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+               iser_set_dif_domain(sc, &sig_attrs->wire);
                sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
-               iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+               iser_set_dif_domain(sc, &sig_attrs->mem);
                sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
                                                IB_T10DIF_CSUM : IB_T10DIF_CRC;
                break;
@@ -366,27 +365,29 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
 static inline void
 iser_inv_rkey(struct ib_send_wr *inv_wr,
              struct ib_mr *mr,
-             struct ib_cqe *cqe)
+             struct ib_cqe *cqe,
+             struct ib_send_wr *next_wr)
 {
        inv_wr->opcode = IB_WR_LOCAL_INV;
        inv_wr->wr_cqe = cqe;
        inv_wr->ex.invalidate_rkey = mr->rkey;
        inv_wr->send_flags = 0;
        inv_wr->num_sge = 0;
+       inv_wr->next = next_wr;
 }
 
 static int
 iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
-               struct iser_pi_context *pi_ctx,
-               struct iser_mem_reg *data_reg,
-               struct iser_mem_reg *prot_reg,
+               struct iser_data_buf *mem,
+               struct iser_data_buf *sig_mem,
+               struct iser_reg_resources *rsc,
                struct iser_mem_reg *sig_reg)
 {
        struct iser_tx_desc *tx_desc = &iser_task->desc;
-       struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
        struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
-       struct ib_sig_handover_wr *wr;
-       struct ib_mr *mr = pi_ctx->sig_mr;
+       struct ib_mr *mr = rsc->sig_mr;
+       struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
+       struct ib_reg_wr *wr = &tx_desc->reg_wr;
        int ret;
 
        memset(sig_attrs, 0, sizeof(*sig_attrs));
@@ -396,33 +397,36 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 
        iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
-       if (pi_ctx->sig_mr_valid)
-               iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+       if (rsc->mr_valid)
+               iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr);
 
        ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
-       wr = container_of(iser_tx_next_wr(tx_desc), struct ib_sig_handover_wr,
-                         wr);
-       wr->wr.opcode = IB_WR_REG_SIG_MR;
+       ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL,
+                             sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K);
+       if (unlikely(ret)) {
+               iser_err("failed to map PI sg (%d)\n",
+                        mem->dma_nents + sig_mem->dma_nents);
+               goto err;
+       }
+
+       memset(wr, 0, sizeof(*wr));
+       wr->wr.next = &tx_desc->send_wr;
+       wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
        wr->wr.wr_cqe = cqe;
-       wr->wr.sg_list = &data_reg->sge;
-       wr->wr.num_sge = 1;
+       wr->wr.num_sge = 0;
        wr->wr.send_flags = 0;
-       wr->sig_attrs = sig_attrs;
-       wr->sig_mr = mr;
-       if (scsi_prot_sg_count(iser_task->sc))
-               wr->prot = &prot_reg->sge;
-       else
-               wr->prot = NULL;
-       wr->access_flags = IB_ACCESS_LOCAL_WRITE |
-                          IB_ACCESS_REMOTE_READ |
-                          IB_ACCESS_REMOTE_WRITE;
-       pi_ctx->sig_mr_valid = 1;
+       wr->mr = mr;
+       wr->key = mr->rkey;
+       wr->access = IB_ACCESS_LOCAL_WRITE |
+                    IB_ACCESS_REMOTE_READ |
+                    IB_ACCESS_REMOTE_WRITE;
+       rsc->mr_valid = 1;
 
        sig_reg->sge.lkey = mr->lkey;
        sig_reg->rkey = mr->rkey;
-       sig_reg->sge.addr = 0;
-       sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
+       sig_reg->sge.addr = mr->iova;
+       sig_reg->sge.length = mr->length;
 
        iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n",
                 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
@@ -439,11 +443,11 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
        struct iser_tx_desc *tx_desc = &iser_task->desc;
        struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
        struct ib_mr *mr = rsc->mr;
-       struct ib_reg_wr *wr;
+       struct ib_reg_wr *wr = &tx_desc->reg_wr;
        int n;
 
        if (rsc->mr_valid)
-               iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+               iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr);
 
        ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
@@ -454,7 +458,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                return n < 0 ? n : -EINVAL;
        }
 
-       wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr);
+       wr->wr.next = &tx_desc->send_wr;
        wr->wr.opcode = IB_WR_REG_MR;
        wr->wr.wr_cqe = cqe;
        wr->wr.send_flags = 0;
@@ -478,21 +482,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
        return 0;
 }
 
-static int
-iser_reg_prot_sg(struct iscsi_iser_task *task,
-                struct iser_data_buf *mem,
-                struct iser_fr_desc *desc,
-                bool use_dma_key,
-                struct iser_mem_reg *reg)
-{
-       struct iser_device *device = task->iser_conn->ib_conn.device;
-
-       if (use_dma_key)
-               return iser_reg_dma(device, mem, reg);
-
-       return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg);
-}
-
 static int
 iser_reg_data_sg(struct iscsi_iser_task *task,
                 struct iser_data_buf *mem,
@@ -516,7 +505,6 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
        struct iser_device *device = ib_conn->device;
        struct iser_data_buf *mem = &task->data[dir];
        struct iser_mem_reg *reg = &task->rdma_reg[dir];
-       struct iser_mem_reg *data_reg;
        struct iser_fr_desc *desc = NULL;
        bool use_dma_key;
        int err;
@@ -529,32 +517,17 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
                reg->mem_h = desc;
        }
 
-       if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL)
-               data_reg = reg;
-       else
-               data_reg = &task->desc.data_reg;
-
-       err = iser_reg_data_sg(task, mem, desc, use_dma_key, data_reg);
-       if (unlikely(err))
-               goto err_reg;
-
-       if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
-               struct iser_mem_reg *prot_reg = &task->desc.prot_reg;
-
-               if (scsi_prot_sg_count(task->sc)) {
-                       mem = &task->prot[dir];
-                       err = iser_reg_prot_sg(task, mem, desc,
-                                              use_dma_key, prot_reg);
-                       if (unlikely(err))
-                               goto err_reg;
-               }
-
-               err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg,
-                                     prot_reg, reg);
+       if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) {
+               err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg);
+               if (unlikely(err))
+                       goto err_reg;
+       } else {
+               err = iser_reg_sig_mr(task, mem, &task->prot[dir],
+                                     &desc->rsc, reg);
                if (unlikely(err))
                        goto err_reg;
 
-               desc->pi_ctx->sig_protected = 1;
+               desc->sig_protected = 1;
        }
 
        return 0;
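The else branch above replaces the old split data/protection registration with a single integrity MR that covers both scatterlists. A condensed sketch of the new flow, with error handling and the surrounding iser structures elided; pd, max_data_sg, max_prot_sg and the scatterlist variables are illustrative placeholders:

struct ib_mr *sig_mr;
struct ib_reg_wr reg_wr = {};
int ret;

/* setup time: one MR that will cover data + protection SGLs */
sig_mr = ib_alloc_mr_integrity(pd, max_data_sg, max_prot_sg);

/* per-I/O: map both scatterlists behind a single rkey */
ret = ib_map_mr_sg_pi(sig_mr, data_sg, data_nents, NULL,
		      prot_sg, prot_nents, NULL, SZ_4K);

/* post one registration WR instead of the old IB_WR_REG_SIG_MR */
reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
reg_wr.mr = sig_mr;
reg_wr.key = sig_mr->rkey;
reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
		IB_ACCESS_REMOTE_WRITE;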
index 4ff3d98fa6a4e2b6b127d973d6b0edf9063cd474..a6548de0e218618395c75d36dc4a4f63be2b9740 100644 (file)
@@ -233,116 +233,63 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn)
        kfree(desc);
 }
 
-static int
-iser_alloc_reg_res(struct iser_device *device,
-                  struct ib_pd *pd,
-                  struct iser_reg_resources *res,
-                  unsigned int size)
+static struct iser_fr_desc *
+iser_create_fastreg_desc(struct iser_device *device,
+                        struct ib_pd *pd,
+                        bool pi_enable,
+                        unsigned int size)
 {
+       struct iser_fr_desc *desc;
        struct ib_device *ib_dev = device->ib_device;
        enum ib_mr_type mr_type;
        int ret;
 
+       desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+       if (!desc)
+               return ERR_PTR(-ENOMEM);
+
        if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
                mr_type = IB_MR_TYPE_SG_GAPS;
        else
                mr_type = IB_MR_TYPE_MEM_REG;
 
-       res->mr = ib_alloc_mr(pd, mr_type, size);
-       if (IS_ERR(res->mr)) {
-               ret = PTR_ERR(res->mr);
+       desc->rsc.mr = ib_alloc_mr(pd, mr_type, size);
+       if (IS_ERR(desc->rsc.mr)) {
+               ret = PTR_ERR(desc->rsc.mr);
                iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
-               return ret;
+               goto err_alloc_mr;
        }
-       res->mr_valid = 0;
-
-       return 0;
-}
 
-static void
-iser_free_reg_res(struct iser_reg_resources *rsc)
-{
-       ib_dereg_mr(rsc->mr);
-}
-
-static int
-iser_alloc_pi_ctx(struct iser_device *device,
-                 struct ib_pd *pd,
-                 struct iser_fr_desc *desc,
-                 unsigned int size)
-{
-       struct iser_pi_context *pi_ctx = NULL;
-       int ret;
-
-       desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
-       if (!desc->pi_ctx)
-               return -ENOMEM;
-
-       pi_ctx = desc->pi_ctx;
-
-       ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size);
-       if (ret) {
-               iser_err("failed to allocate reg_resources\n");
-               goto alloc_reg_res_err;
-       }
-
-       pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
-       if (IS_ERR(pi_ctx->sig_mr)) {
-               ret = PTR_ERR(pi_ctx->sig_mr);
-               goto sig_mr_failure;
+       if (pi_enable) {
+               desc->rsc.sig_mr = ib_alloc_mr_integrity(pd, size, size);
+               if (IS_ERR(desc->rsc.sig_mr)) {
+                       ret = PTR_ERR(desc->rsc.sig_mr);
+                       iser_err("Failed to allocate sig_mr err=%d\n", ret);
+                       goto err_alloc_mr_integrity;
+               }
        }
-       pi_ctx->sig_mr_valid = 0;
-       desc->pi_ctx->sig_protected = 0;
-
-       return 0;
+       desc->rsc.mr_valid = 0;
 
-sig_mr_failure:
-       iser_free_reg_res(&pi_ctx->rsc);
-alloc_reg_res_err:
-       kfree(desc->pi_ctx);
+       return desc;
 
-       return ret;
-}
+err_alloc_mr_integrity:
+       ib_dereg_mr(desc->rsc.mr);
+err_alloc_mr:
+       kfree(desc);
 
-static void
-iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
-{
-       iser_free_reg_res(&pi_ctx->rsc);
-       ib_dereg_mr(pi_ctx->sig_mr);
-       kfree(pi_ctx);
+       return ERR_PTR(ret);
 }
 
-static struct iser_fr_desc *
-iser_create_fastreg_desc(struct iser_device *device,
-                        struct ib_pd *pd,
-                        bool pi_enable,
-                        unsigned int size)
+static void iser_destroy_fastreg_desc(struct iser_fr_desc *desc)
 {
-       struct iser_fr_desc *desc;
-       int ret;
-
-       desc = kzalloc(sizeof(*desc), GFP_KERNEL);
-       if (!desc)
-               return ERR_PTR(-ENOMEM);
-
-       ret = iser_alloc_reg_res(device, pd, &desc->rsc, size);
-       if (ret)
-               goto reg_res_alloc_failure;
+       struct iser_reg_resources *res = &desc->rsc;
 
-       if (pi_enable) {
-               ret = iser_alloc_pi_ctx(device, pd, desc, size);
-               if (ret)
-                       goto pi_ctx_alloc_failure;
+       ib_dereg_mr(res->mr);
+       if (res->sig_mr) {
+               ib_dereg_mr(res->sig_mr);
+               res->sig_mr = NULL;
        }
-
-       return desc;
-
-pi_ctx_alloc_failure:
-       iser_free_reg_res(&desc->rsc);
-reg_res_alloc_failure:
        kfree(desc);
-
-       return ERR_PTR(ret);
 }
 
 /**
@@ -399,10 +346,7 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 
        list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) {
                list_del(&desc->all_list);
-               iser_free_reg_res(&desc->rsc);
-               if (desc->pi_ctx)
-                       iser_free_pi_ctx(desc->pi_ctx);
-               kfree(desc);
+               iser_destroy_fastreg_desc(desc);
                ++i;
        }
 
@@ -455,7 +399,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
        init_attr.qp_type       = IB_QPT_RC;
        if (ib_conn->pi_support) {
                init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
-               init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+               init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
                iser_conn->max_cmds =
                        ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
        } else {
@@ -707,6 +651,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
        struct ib_device_attr *attr = &device->ib_device->attrs;
        unsigned short sg_tablesize, sup_sg_tablesize;
        unsigned short reserved_mr_pages;
+       u32 max_num_sg;
 
        /*
         * FRs without SG_GAPS or FMRs can only map up to a (device) page per
@@ -720,12 +665,17 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
        else
                reserved_mr_pages = 1;
 
+       if (iser_conn->ib_conn.pi_support)
+               max_num_sg = attr->max_pi_fast_reg_page_list_len;
+       else
+               max_num_sg = attr->max_fast_reg_page_list_len;
+
        sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
        if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)
                sup_sg_tablesize =
                        min_t(
                         uint, ISCSI_ISER_MAX_SG_TABLESIZE,
-                        attr->max_fast_reg_page_list_len - reserved_mr_pages);
+                        max_num_sg - reserved_mr_pages);
        else
                sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE;
 
@@ -762,7 +712,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
        /* connection T10-PI support */
        if (iser_pi_enable) {
                if (!(device->ib_device->attrs.device_cap_flags &
-                     IB_DEVICE_SIGNATURE_HANDOVER)) {
+                     IB_DEVICE_INTEGRITY_HANDOVER)) {
                        iser_warn("T10-PI requested but not supported on %s, "
                                  "continue without T10-PI\n",
                                  dev_name(&ib_conn->device->ib_device->dev));
@@ -1087,7 +1037,8 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
                   bool signal)
 {
-       struct ib_send_wr *wr = iser_tx_next_wr(tx_desc);
+       struct ib_send_wr *wr = &tx_desc->send_wr;
+       struct ib_send_wr *first_wr;
        int ib_ret;
 
        ib_dma_sync_single_for_device(ib_conn->device->ib_device,
@@ -1101,7 +1052,14 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
        wr->opcode = IB_WR_SEND;
        wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
 
-       ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, NULL);
+       if (tx_desc->inv_wr.next)
+               first_wr = &tx_desc->inv_wr;
+       else if (tx_desc->reg_wr.wr.next)
+               first_wr = &tx_desc->reg_wr.wr;
+       else
+               first_wr = wr;
+
+       ib_ret = ib_post_send(ib_conn->qp, first_wr, NULL);
        if (ib_ret)
                iser_err("ib_post_send failed, ret:%d opcode:%d\n",
                         ib_ret, wr->opcode);
@@ -1118,9 +1076,9 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
        struct ib_mr_status mr_status;
        int ret;
 
-       if (desc && desc->pi_ctx->sig_protected) {
-               desc->pi_ctx->sig_protected = 0;
-               ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
+       if (desc && desc->sig_protected) {
+               desc->sig_protected = 0;
+               ret = ib_check_mr_status(desc->rsc.sig_mr,
                                         IB_MR_CHECK_SIG_STATUS, &mr_status);
                if (ret) {
                        pr_err("ib_check_mr_status failed, ret %d\n", ret);
index 0205cf142b2fb55f6305193cd5ebbf6913853785..a1a035270cabf0b0dac14542b5f879514c7ce2e1 100644 (file)
@@ -133,7 +133,7 @@ isert_create_qp(struct isert_conn *isert_conn,
        attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        attr.qp_type = IB_QPT_RC;
        if (device->pi_capable)
-               attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+               attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 
        ret = rdma_create_qp(cma_id, device->pd, &attr);
        if (ret) {
@@ -309,7 +309,7 @@ isert_create_device_ib_res(struct isert_device *device)
 
        /* Check signature cap */
        device->pi_capable = ib_dev->attrs.device_cap_flags &
-                            IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
+                            IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
 
        return 0;
 
@@ -1669,7 +1669,7 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 
        isert_dbg("Cmd %p\n", isert_cmd);
 
-       ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+       ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr);
        isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
        if (ret) {
@@ -1715,7 +1715,7 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
        iscsit_stop_dataout_timer(cmd);
 
        if (isert_prot_cmd(isert_conn, se_cmd))
-               ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+               ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr);
        isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
        cmd->write_data_done = 0;
 
@@ -2059,8 +2059,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
 }
 
 static inline void
-isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
-                    struct ib_sig_domain *domain)
+isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_domain *domain)
 {
        domain->sig_type = IB_SIG_TYPE_T10_DIF;
        domain->sig.dif.bg_type = IB_T10DIF_CRC;
@@ -2088,17 +2087,17 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
        case TARGET_PROT_DIN_INSERT:
        case TARGET_PROT_DOUT_STRIP:
                sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
-               isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+               isert_set_dif_domain(se_cmd, &sig_attrs->wire);
                break;
        case TARGET_PROT_DOUT_INSERT:
        case TARGET_PROT_DIN_STRIP:
                sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
-               isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+               isert_set_dif_domain(se_cmd, &sig_attrs->mem);
                break;
        case TARGET_PROT_DIN_PASS:
        case TARGET_PROT_DOUT_PASS:
-               isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
-               isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+               isert_set_dif_domain(se_cmd, &sig_attrs->wire);
+               isert_set_dif_domain(se_cmd, &sig_attrs->mem);
                break;
        default:
                isert_err("Unsupported PI operation %d\n", se_cmd->prot_op);
index d5cbad2c61e490a4096669e5a27373089378b9f0..c7bd96edce80ea3a65c1f79cbfcc2b226ccb96fd 100644 (file)
@@ -3466,13 +3466,14 @@ static const match_table_t srp_opt_tokens = {
  * @net:          [in]  Network namespace.
  * @sa:                   [out] Address family, IP address and port number.
  * @addr_port_str: [in]  IP address and port number.
+ * @has_port:     [out] Whether or not @addr_port_str includes a port number.
  *
  * Parse the following address formats:
  * - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5.
  * - IPv6: \[<ipv6_address>\]:<port>, e.g. [1::2:3%4]:5.
  */
 static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
-                       const char *addr_port_str)
+                       const char *addr_port_str, bool *has_port)
 {
        char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL);
        char *port_str;
@@ -3481,9 +3482,12 @@ static int srp_parse_in(struct net *net, struct sockaddr_storage *sa,
        if (!addr)
                return -ENOMEM;
        port_str = strrchr(addr, ':');
-       if (!port_str)
-               return -EINVAL;
-       *port_str++ = '\0';
+       if (port_str && strchr(port_str, ']'))
+               port_str = NULL;
+       if (port_str)
+               *port_str++ = '\0';
+       if (has_port)
+               *has_port = port_str != NULL;
        ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa);
        if (ret && addr[0]) {
                addr_end = addr + strlen(addr) - 1;
@@ -3505,6 +3509,7 @@ static int srp_parse_options(struct net *net, const char *buf,
        char *p;
        substring_t args[MAX_OPT_ARGS];
        unsigned long long ull;
+       bool has_port;
        int opt_mask = 0;
        int token;
        int ret = -EINVAL;
@@ -3603,7 +3608,8 @@ static int srp_parse_options(struct net *net, const char *buf,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       ret = srp_parse_in(net, &target->rdma_cm.src.ss, p);
+                       ret = srp_parse_in(net, &target->rdma_cm.src.ss, p,
+                                          NULL);
                        if (ret < 0) {
                                pr_warn("bad source parameter '%s'\n", p);
                                kfree(p);
@@ -3619,7 +3625,10 @@ static int srp_parse_options(struct net *net, const char *buf,
                                ret = -ENOMEM;
                                goto out;
                        }
-                       ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p);
+                       ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p,
+                                          &has_port);
+                       if (!has_port)
+                               ret = -EINVAL;
                        if (ret < 0) {
                                pr_warn("bad dest parameter '%s'\n", p);
                                kfree(p);
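Since an IPv6 literal contains ':' characters of its own, the new @has_port output is derived from whether the last ':' falls outside the bracketed address. A few illustrative inputs (hypothetical values, for clarity only):

/*
 * "1.2.3.4:5"     -> AF_INET,  has_port = true
 * "1.2.3.4"       -> AF_INET,  has_port = false
 * "[1::2:3%4]:5"  -> AF_INET6, has_port = true
 * "[1::2:3%4]"    -> AF_INET6, has_port = false (the last ':' is inside
 *                                                the brackets)
 */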
index 676619c1454a9622d8106f7fbaa3e9b116d91d46..a249db528d543dcad37e9b5162cc4b196a2f41e1 100644 (file)
@@ -482,7 +482,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
        ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
                              queue->queue_size,
                              IB_MR_TYPE_MEM_REG,
-                             nvme_rdma_get_max_fr_pages(ibdev));
+                             nvme_rdma_get_max_fr_pages(ibdev), 0);
        if (ret) {
                dev_err(queue->ctrl->ctrl.device,
                        "failed to initialize MR pool sized %d for QID %d\n",
index aa9bdd47a648590293fffee5e43371764ee2bc0b..d3a0fbfff2bb0931dddd8fd0d65ab544bc8f76b0 100644 (file)
@@ -82,6 +82,7 @@ struct dim_stats {
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
  * @work: Work to perform on action required
+ * @priv: A pointer back to the structure that owns this dim instance
  * @profile_ix: Current moderation profile
  * @mode: CQ period count mode
  * @tune_state: Algorithm tuning state (see below)
@@ -95,6 +96,7 @@ struct dim {
        struct dim_sample start_sample;
        struct dim_sample measuring_sample;
        struct work_struct work;
+       void *priv;
        u8 profile_ix;
        u8 mode;
        u8 tune_state;
@@ -363,4 +365,25 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
+/* RDMA DIM */
+
+/*
+ * RDMA DIM profile:
+ * profile size must be of RDMA_DIM_PARAMS_NUM_PROFILES.
+ */
+#define RDMA_DIM_PARAMS_NUM_PROFILES 9
+#define RDMA_DIM_START_PROFILE 0
+
+/**
+ * rdma_dim - Runs the adaptive moderation.
+ * @dim: The moderation struct.
+ * @completions: The number of completions collected in this round.
+ *
+ * Each call to rdma_dim takes the latest number of completions that
+ * have been collected and counts them as a new event.
+ * Once enough events have been collected, the algorithm decides on a new
+ * moderation level.
+ */
+void rdma_dim(struct dim *dim, u64 completions);
+
 #endif /* DIM_H */
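rdma_dim() is meant to be fed from the CQ polling path with whatever was just reaped. A minimal caller sketch, assuming a hypothetical poller; the real hook-up lives in the core CQ completion code, which is not part of this hunk:

static void my_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wcs[16];
	int completed;

	completed = ib_poll_cq(cq, ARRAY_SIZE(wcs), wcs);
	/* ... process wcs[0 .. completed - 1] ... */

	if (completed > 0 && cq->dim)
		rdma_dim(cq->dim, completed);
}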
index 515624c66ce10c892e8090dc149cf267b8111e1d..b3d5752657d9893fbc83bdaf577d872ed9bde5a4 100644 (file)
@@ -1391,7 +1391,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
        u8         reserved_at_6c8[0x28];
        u8         sf_base_id[0x10];
 
-       u8         reserved_at_700[0x100];
+       u8         reserved_at_700[0x80];
+       u8         vhca_tunnel_commands[0x40];
+       u8         reserved_at_7c0[0x40];
 };
 
 enum mlx5_flow_destination_type {
@@ -9695,7 +9697,7 @@ struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
        u8         opcode[0x10];
        u8         uid[0x10];
 
-       u8         reserved_at_20[0x10];
+       u8         vhca_tunnel_id[0x10];
        u8         obj_type[0x10];
 
        u8         obj_id[0x20];
index 127d224443e36003671be24e40c062b006795095..ae63b1ae90048cb84e35df4c509f2e6dfcd34abc 100644 (file)
@@ -37,7 +37,8 @@
 #include <linux/mlx5/driver.h>
 
 #define MLX5_INVALID_LKEY      0x100
-#define MLX5_SIG_WQE_SIZE      (MLX5_SEND_WQE_BB * 5)
+/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */
+#define MLX5_SIG_WQE_SIZE      (MLX5_SEND_WQE_BB * 8)
 #define MLX5_DIF_SIZE          8
 #define MLX5_STRIDE_BLOCK_OP   0x400
 #define MLX5_CPY_GRD_MASK      0xc0
@@ -70,6 +71,7 @@ enum mlx5_qp_optpar {
        MLX5_QP_OPTPAR_CQN_RCV                  = 1 << 19,
        MLX5_QP_OPTPAR_DC_HS                    = 1 << 20,
        MLX5_QP_OPTPAR_DC_KEY                   = 1 << 21,
+       MLX5_QP_OPTPAR_COUNTER_SET_ID           = 1 << 25,
 };
 
 enum mlx5_qp_state {
index 040d853077c6015fc7ac8ebb40938c1b56ccc5c2..1052d0d62be7d2b5d623717d218b3085054f5614 100644 (file)
@@ -46,7 +46,6 @@ struct ib_umem {
        struct mm_struct       *owning_mm;
        size_t                  length;
        unsigned long           address;
-       int                     page_shift;
        u32 writable : 1;
        u32 is_odp : 1;
        struct work_struct      work;
@@ -58,24 +57,14 @@ struct ib_umem {
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
-       return umem->address & (BIT(umem->page_shift) - 1);
-}
-
-/* Returns the first page of an ODP umem. */
-static inline unsigned long ib_umem_start(struct ib_umem *umem)
-{
-       return umem->address - ib_umem_offset(umem);
-}
-
-/* Returns the address of the page after the last one of an ODP umem. */
-static inline unsigned long ib_umem_end(struct ib_umem *umem)
-{
-       return ALIGN(umem->address + umem->length, BIT(umem->page_shift));
+       return umem->address & ~PAGE_MASK;
 }
 
 static inline size_t ib_umem_num_pages(struct ib_umem *umem)
 {
-       return (ib_umem_end(umem) - ib_umem_start(umem)) >> umem->page_shift;
+       return (ALIGN(umem->address + umem->length, PAGE_SIZE) -
+               ALIGN_DOWN(umem->address, PAGE_SIZE)) >>
+              PAGE_SHIFT;
 }
 
 #ifdef CONFIG_INFINIBAND_USER_MEM
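With the per-umem page_shift removed, ib_umem_offset() and ib_umem_num_pages() are plain PAGE_SIZE arithmetic. A worked example, assuming 4 KiB pages:

/*
 * umem->address = 0x1ff8, umem->length = 0x10
 * (16 bytes straddling a page boundary)
 *
 * ib_umem_offset()    = 0x1ff8 & ~PAGE_MASK            = 0xff8
 * ib_umem_num_pages() = (ALIGN(0x2008, 4096)
 *                        - ALIGN_DOWN(0x1ff8, 4096)) >> 12
 *                     = (0x3000 - 0x1000) >> 12        = 2 pages
 */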
index eeec4e53c4489c890ade19d41652b662bdbe54cb..479db5c98ff60f38fc9148fe3fd8f1632a4c4847 100644 (file)
@@ -76,6 +76,7 @@ struct ib_umem_odp {
 
        struct completion       notifier_completion;
        int                     dying;
+       unsigned int            page_shift;
        struct work_struct      work;
 };
 
@@ -84,6 +85,25 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
        return container_of(umem, struct ib_umem_odp, umem);
 }
 
+/* Returns the first page of an ODP umem. */
+static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
+{
+       return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift);
+}
+
+/* Returns the address of the page after the last one of an ODP umem. */
+static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
+{
+       return ALIGN(umem_odp->umem.address + umem_odp->umem.length,
+                    1UL << umem_odp->page_shift);
+}
+
+static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
+{
+       return (ib_umem_end(umem_odp) - ib_umem_start(umem_odp)) >>
+              umem_odp->page_shift;
+}
+
 /*
  * The lower 2 bits of the DMA address signal the R/W permissions for
  * the entry. To upgrade the permissions, provide the appropriate
index 54873085f2dab21257680f63c2e8db1e83416b8c..c5f8a9f1706374fece3b342a3b995d9e31ca18d5 100644 (file)
 #include <linux/cgroup_rdma.h>
 #include <linux/irqflags.h>
 #include <linux/preempt.h>
+#include <linux/dim.h>
 #include <uapi/rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
 #include <rdma/restrack.h>
+#include <rdma/signature.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
 
@@ -132,17 +135,6 @@ struct ib_gid_attr {
        u8                      port_num;
 };
 
-enum rdma_node_type {
-       /* IB values map to NodeInfo:NodeType. */
-       RDMA_NODE_IB_CA         = 1,
-       RDMA_NODE_IB_SWITCH,
-       RDMA_NODE_IB_ROUTER,
-       RDMA_NODE_RNIC,
-       RDMA_NODE_USNIC,
-       RDMA_NODE_USNIC_UDP,
-       RDMA_NODE_UNSPECIFIED,
-};
-
 enum {
        /* set the local administered indication */
        IB_SA_WELL_KNOWN_GUID   = BIT_ULL(57) | 2,
@@ -164,7 +156,7 @@ enum rdma_protocol_type {
 };
 
 __attribute_const__ enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type);
+rdma_node_get_transport(unsigned int node_type);
 
 enum rdma_network_type {
        RDMA_NETWORK_IB,
@@ -263,7 +255,7 @@ enum ib_device_cap_flags {
         */
        IB_DEVICE_CROSS_CHANNEL                 = (1 << 27),
        IB_DEVICE_MANAGED_FLOW_STEERING         = (1 << 29),
-       IB_DEVICE_SIGNATURE_HANDOVER            = (1 << 30),
+       IB_DEVICE_INTEGRITY_HANDOVER            = (1 << 30),
        IB_DEVICE_ON_DEMAND_PAGING              = (1ULL << 31),
        IB_DEVICE_SG_GAPS_REG                   = (1ULL << 32),
        IB_DEVICE_VIRTUAL_FUNCTION              = (1ULL << 33),
@@ -275,17 +267,6 @@ enum ib_device_cap_flags {
        IB_DEVICE_ALLOW_USER_UNREG              = (1ULL << 37),
 };
 
-enum ib_signature_prot_cap {
-       IB_PROT_T10DIF_TYPE_1 = 1,
-       IB_PROT_T10DIF_TYPE_2 = 1 << 1,
-       IB_PROT_T10DIF_TYPE_3 = 1 << 2,
-};
-
-enum ib_signature_guard_cap {
-       IB_GUARD_T10DIF_CRC     = 1,
-       IB_GUARD_T10DIF_CSUM    = 1 << 1,
-};
-
 enum ib_atomic_cap {
        IB_ATOMIC_NONE,
        IB_ATOMIC_HCA,
@@ -327,8 +308,8 @@ struct ib_rss_caps {
 };
 
 enum ib_tm_cap_flags {
-       /*  Support tag matching on RC transport */
-       IB_TM_CAP_RC                = 1 << 0,
+       /*  Support tag matching with rendezvous offload for RC transport */
+       IB_TM_CAP_RNDV_RC = 1 << 0,
 };
 
 struct ib_tm_caps {
@@ -411,6 +392,7 @@ struct ib_device_attr {
        int                     max_srq_wr;
        int                     max_srq_sge;
        unsigned int            max_fast_reg_page_list_len;
+       unsigned int            max_pi_fast_reg_page_list_len;
        u16                     max_pkeys;
        u8                      local_ca_ack_delay;
        int                     sig_prot_cap;
@@ -796,118 +778,26 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
  * enum ib_mr_type - memory region type
  * @IB_MR_TYPE_MEM_REG:       memory region that is used for
  *                            normal registration
- * @IB_MR_TYPE_SIGNATURE:     memory region that is used for
- *                            signature operations (data-integrity
- *                            capable regions)
  * @IB_MR_TYPE_SG_GAPS:       memory region that is capable to
  *                            register any arbitrary sg lists (without
  *                            the normal mr constraints - see
  *                            ib_map_mr_sg)
+ * @IB_MR_TYPE_DM:            memory region that is used for device
+ *                            memory registration
+ * @IB_MR_TYPE_USER:          memory region that is used for the user-space
+ *                            application
+ * @IB_MR_TYPE_DMA:           memory region that is used for DMA operations
+ *                            without address translations (VA=PA)
+ * @IB_MR_TYPE_INTEGRITY:     memory region that is used for
+ *                            data integrity operations
  */
 enum ib_mr_type {
        IB_MR_TYPE_MEM_REG,
-       IB_MR_TYPE_SIGNATURE,
        IB_MR_TYPE_SG_GAPS,
-};
-
-/**
- * Signature types
- * IB_SIG_TYPE_NONE: Unprotected.
- * IB_SIG_TYPE_T10_DIF: Type T10-DIF
- */
-enum ib_signature_type {
-       IB_SIG_TYPE_NONE,
-       IB_SIG_TYPE_T10_DIF,
-};
-
-/**
- * Signature T10-DIF block-guard types
- * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
- * IB_T10DIF_CSUM: Corresponds to IP checksum rules.
- */
-enum ib_t10_dif_bg_type {
-       IB_T10DIF_CRC,
-       IB_T10DIF_CSUM
-};
-
-/**
- * struct ib_t10_dif_domain - Parameters specific for T10-DIF
- *     domain.
- * @bg_type: T10-DIF block guard type (CRC|CSUM)
- * @pi_interval: protection information interval.
- * @bg: seed of guard computation.
- * @app_tag: application tag of guard block
- * @ref_tag: initial guard block reference tag.
- * @ref_remap: Indicate wethear the reftag increments each block
- * @app_escape: Indicate to skip block check if apptag=0xffff
- * @ref_escape: Indicate to skip block check if reftag=0xffffffff
- * @apptag_check_mask: check bitmask of application tag.
- */
-struct ib_t10_dif_domain {
-       enum ib_t10_dif_bg_type bg_type;
-       u16                     pi_interval;
-       u16                     bg;
-       u16                     app_tag;
-       u32                     ref_tag;
-       bool                    ref_remap;
-       bool                    app_escape;
-       bool                    ref_escape;
-       u16                     apptag_check_mask;
-};
-
-/**
- * struct ib_sig_domain - Parameters for signature domain
- * @sig_type: specific signauture type
- * @sig: union of all signature domain attributes that may
- *     be used to set domain layout.
- */
-struct ib_sig_domain {
-       enum ib_signature_type sig_type;
-       union {
-               struct ib_t10_dif_domain dif;
-       } sig;
-};
-
-/**
- * struct ib_sig_attrs - Parameters for signature handover operation
- * @check_mask: bitmask for signature byte check (8 bytes)
- * @mem: memory domain layout desciptor.
- * @wire: wire domain layout desciptor.
- */
-struct ib_sig_attrs {
-       u8                      check_mask;
-       struct ib_sig_domain    mem;
-       struct ib_sig_domain    wire;
-};
-
-enum ib_sig_err_type {
-       IB_SIG_BAD_GUARD,
-       IB_SIG_BAD_REFTAG,
-       IB_SIG_BAD_APPTAG,
-};
-
-/**
- * Signature check masks (8 bytes in total) according to the T10-PI standard:
- *  -------- -------- ------------
- * | GUARD  | APPTAG |   REFTAG   |
- * |  2B    |  2B    |    4B      |
- *  -------- -------- ------------
- */
-enum {
-       IB_SIG_CHECK_GUARD      = 0xc0,
-       IB_SIG_CHECK_APPTAG     = 0x30,
-       IB_SIG_CHECK_REFTAG     = 0x0f,
-};
-
-/**
- * struct ib_sig_err - signature error descriptor
- */
-struct ib_sig_err {
-       enum ib_sig_err_type    err_type;
-       u32                     expected;
-       u32                     actual;
-       u64                     sig_err_offset;
-       u32                     key;
+       IB_MR_TYPE_DM,
+       IB_MR_TYPE_USER,
+       IB_MR_TYPE_DMA,
+       IB_MR_TYPE_INTEGRITY,
 };
 
 enum ib_mr_status_check {
@@ -1164,7 +1054,7 @@ enum ib_qp_create_flags {
        IB_QP_CREATE_MANAGED_SEND               = 1 << 3,
        IB_QP_CREATE_MANAGED_RECV               = 1 << 4,
        IB_QP_CREATE_NETIF_QP                   = 1 << 5,
-       IB_QP_CREATE_SIGNATURE_EN               = 1 << 6,
+       IB_QP_CREATE_INTEGRITY_EN               = 1 << 6,
        /* FREE                                 = 1 << 7, */
        IB_QP_CREATE_SCATTER_FCS                = 1 << 8,
        IB_QP_CREATE_CVLAN_STRIPPING            = 1 << 9,
@@ -1343,7 +1233,7 @@ enum ib_wr_opcode {
 
        /* These are kernel only and can not be issued by userspace */
        IB_WR_REG_MR = 0x20,
-       IB_WR_REG_SIG_MR,
+       IB_WR_REG_MR_INTEGRITY,
 
        /* reserve values for low level drivers' internal use.
         * These values will not be used at all in the ib core layer.
@@ -1453,20 +1343,6 @@ static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr)
        return container_of(wr, struct ib_reg_wr, wr);
 }
 
-struct ib_sig_handover_wr {
-       struct ib_send_wr       wr;
-       struct ib_sig_attrs    *sig_attrs;
-       struct ib_mr           *sig_mr;
-       int                     access_flags;
-       struct ib_sge          *prot;
-};
-
-static inline const struct ib_sig_handover_wr *
-sig_handover_wr(const struct ib_send_wr *wr)
-{
-       return container_of(wr, struct ib_sig_handover_wr, wr);
-}
-
 struct ib_recv_wr {
        struct ib_recv_wr      *next;
        union {
@@ -1634,6 +1510,7 @@ struct ib_cq {
                struct work_struct      work;
        };
        struct workqueue_struct *comp_wq;
+       struct dim *dim;
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
@@ -1818,10 +1695,14 @@ struct ib_qp {
        struct ib_qp_security  *qp_sec;
        u8                      port;
 
+       bool                    integrity_en;
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry     res;
+
+       /* The counter the qp is bind to */
+       struct rdma_counter    *counter;
 };
 
 struct ib_dm {
@@ -1840,6 +1721,7 @@ struct ib_mr {
        u64                iova;
        u64                length;
        unsigned int       page_size;
+       enum ib_mr_type    type;
        bool               need_inval;
        union {
                struct ib_uobject       *uobject;       /* user */
@@ -1847,7 +1729,7 @@ struct ib_mr {
        };
 
        struct ib_dm      *dm;
-
+       struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
@@ -2243,6 +2125,8 @@ struct ib_port_data {
        spinlock_t netdev_lock;
        struct net_device __rcu *netdev;
        struct hlist_node ndev_hash_link;
+       struct rdma_port_counter port_counter;
+       struct rdma_hw_stats *hw_stats;
 };
 
 /* rdma netdev type - specifies protocol type */
@@ -2329,6 +2213,11 @@ struct iw_cm_conn_param;
  * need to define the supported operations, otherwise they will be set to null.
  */
 struct ib_device_ops {
+       struct module *owner;
+       enum rdma_driver_id driver_id;
+       u32 uverbs_abi_ver;
+       unsigned int uverbs_no_driver_id_binding:1;
+
        int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
                         const struct ib_send_wr **bad_send_wr);
        int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
@@ -2454,11 +2343,10 @@ struct ib_device_ops {
        int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                        int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
        int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
-       struct ib_cq *(*create_cq)(struct ib_device *device,
-                                  const struct ib_cq_init_attr *attr,
-                                  struct ib_udata *udata);
+       int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
+                        struct ib_udata *udata);
        int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
-       int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
+       void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
        int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
        struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
        struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
@@ -2470,6 +2358,9 @@ struct ib_device_ops {
        int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
        struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
                                  u32 max_num_sg, struct ib_udata *udata);
+       struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd,
+                                           u32 max_num_data_sg,
+                                           u32 max_num_meta_sg);
        int (*advise_mr)(struct ib_pd *pd,
                         enum ib_uverbs_advise_mr_advice advice, u32 flags,
                         struct ib_sge *sg_list, u32 num_sge,
@@ -2516,7 +2407,7 @@ struct ib_device_ops {
        struct ib_wq *(*create_wq)(struct ib_pd *pd,
                                   struct ib_wq_init_attr *init_attr,
                                   struct ib_udata *udata);
-       int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
+       void (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
        int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr,
                         u32 wq_attr_mask, struct ib_udata *udata);
        struct ib_rwq_ind_table *(*create_rwq_ind_table)(
@@ -2538,6 +2429,11 @@ struct ib_device_ops {
        int (*read_counters)(struct ib_counters *counters,
                             struct ib_counters_read_attr *counters_read_attr,
                             struct uverbs_attr_bundle *attrs);
+       int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg,
+                           int data_sg_nents, unsigned int *data_sg_offset,
+                           struct scatterlist *meta_sg, int meta_sg_nents,
+                           unsigned int *meta_sg_offset);
+
        /**
         * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the
         *   driver initialized data.  The struct is kfree()'ed by the sysfs
@@ -2595,8 +2491,34 @@ struct ib_device_ops {
                         u8 pdata_len);
        int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
        int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
+       /**
+        * counter_bind_qp - Bind a QP to a counter.
+        * @counter - The counter to be bound. If counter->id is zero then
+        *   the driver needs to allocate a new counter and set counter->id
+        */
+       int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp);
+       /**
+        * counter_unbind_qp - Unbind the qp from the dynamically-allocated
+        *   counter and bind it onto the default one
+        */
+       int (*counter_unbind_qp)(struct ib_qp *qp);
+       /**
+        * counter_dealloc - De-allocate the hw counter
+        */
+       int (*counter_dealloc)(struct rdma_counter *counter);
+       /**
+        * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
+        * the driver initialized data.
+        */
+       struct rdma_hw_stats *(*counter_alloc_stats)(
+               struct rdma_counter *counter);
+       /**
+        * counter_update_stats - Query the stats value of this counter
+        */
+       int (*counter_update_stats)(struct rdma_counter *counter);
 
        DECLARE_RDMA_OBJ_SIZE(ib_ah);
+       DECLARE_RDMA_OBJ_SIZE(ib_cq);
        DECLARE_RDMA_OBJ_SIZE(ib_pd);
        DECLARE_RDMA_OBJ_SIZE(ib_srq);
        DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
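With .owner, .driver_id and .uverbs_abi_ver moved into ib_device_ops, and the CQ now sized through DECLARE_RDMA_OBJ_SIZE(ib_cq), a converted provider lets the core allocate the ib_cq and implements an int-returning .create_cq plus a void .destroy_cq. A rough sketch of what that looks like on the driver side (the mydrv_* names and the ABI number are made up; error handling is minimal):

#include <linux/module.h>
#include <rdma/ib_verbs.h>

struct mydrv_cq {
	struct ib_cq ibcq;		/* core-allocated base object */
	/* driver-private ring state would follow */
};

static int mydrv_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
			   struct ib_udata *udata)
{
	struct mydrv_cq *cq = container_of(ibcq, struct mydrv_cq, ibcq);

	if (attr->flags)
		return -EOPNOTSUPP;
	/* allocate CQ rings, arm hardware, fill the udata response here */
	(void)cq;
	return 0;
}

static void mydrv_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	/* tear down rings only; the core frees the ib_cq container */
}

static const struct ib_device_ops mydrv_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_UNKNOWN,	/* placeholder id */
	.uverbs_abi_ver = 1,			/* made-up ABI revision */

	.create_cq = mydrv_create_cq,
	.destroy_cq = mydrv_destroy_cq,

	INIT_RDMA_OBJ_SIZE(ib_cq, mydrv_cq, ibcq),
};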
@@ -2636,7 +2558,6 @@ struct ib_device {
 
        int                           num_comp_vectors;
 
-       struct module               *owner;
        union {
                struct device           dev;
                struct ib_core_device   coredev;
@@ -2648,7 +2569,6 @@ struct ib_device {
         */
        const struct attribute_group    *groups[3];
 
-       int                          uverbs_abi_ver;
        u64                          uverbs_cmd_mask;
        u64                          uverbs_ex_cmd_mask;
 
@@ -2658,6 +2578,8 @@ struct ib_device {
        u16                          is_switch:1;
        /* Indicates kernel verbs support, should not be used in drivers */
        u16                          kverbs_provider:1;
+       /* CQ adaptive moderation (RDMA DIM) */
+       u16                          use_cq_dim:1;
        u8                           node_type;
        u8                           phys_port_cnt;
        struct ib_device_attr        attrs;
@@ -2672,7 +2594,6 @@ struct ib_device {
        struct rdma_restrack_root *res;
 
        const struct uapi_definition   *driver_def;
-       enum rdma_driver_id             driver_id;
 
        /*
         * Positive refcount indicates that the device is currently
@@ -2694,11 +2615,15 @@ struct ib_device {
        u32 iw_driver_flags;
 };
 
+struct ib_client_nl_info;
 struct ib_client {
        const char *name;
        void (*add)   (struct ib_device *);
        void (*remove)(struct ib_device *, void *client_data);
        void (*rename)(struct ib_device *dev, void *client_data);
+       int (*get_nl_info)(struct ib_device *ibdev, void *client_data,
+                          struct ib_client_nl_info *res);
+       int (*get_global_nl_info)(struct ib_client_nl_info *res);
 
        /* Returns the net_dev belonging to this ib_client and matching the
         * given parameters.
@@ -3859,9 +3784,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata);
  *
  * NOTE: for user cq use ib_destroy_cq_user with valid udata!
  */
-static inline int ib_destroy_cq(struct ib_cq *cq)
+static inline void ib_destroy_cq(struct ib_cq *cq)
 {
-       return ib_destroy_cq_user(cq, NULL);
+       ib_destroy_cq_user(cq, NULL);
 }
 
 /**
@@ -4148,6 +4073,10 @@ static inline struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
        return ib_alloc_mr_user(pd, mr_type, max_num_sg, NULL);
 }
 
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+                                   u32 max_num_data_sg,
+                                   u32 max_num_meta_sg);
+
 /**
  * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
  *   R_Key and L_Key.
@@ -4332,6 +4261,10 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
 
 int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                 unsigned int *sg_offset, unsigned int page_size);
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+                   int data_sg_nents, unsigned int *data_sg_offset,
+                   struct scatterlist *meta_sg, int meta_sg_nents,
+                   unsigned int *meta_sg_offset, unsigned int page_size);
 
 static inline int
 ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
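Given the ib_alloc_mr_integrity() and ib_map_mr_sg_pi() entry points added above, a ULP that wants a single MR covering both data and protection-information buffers might use them roughly as follows (a hedged sketch; ulp_map_pi_mr is hypothetical, and the registration work request a real consumer must post afterwards is omitted):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* register a data + metadata scatterlist pair into one integrity MR */
static int ulp_map_pi_mr(struct ib_pd *pd,
			 struct scatterlist *data_sg, int data_nents,
			 struct scatterlist *meta_sg, int meta_nents,
			 struct ib_mr **out_mr)
{
	struct ib_mr *mr;
	int n;

	mr = ib_alloc_mr_integrity(pd, data_nents, meta_nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* NULL sg offsets: start at the beginning of both scatterlists */
	n = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
			    meta_sg, meta_nents, NULL, PAGE_SIZE);
	if (n < data_nents + meta_nents) {
		ib_dereg_mr(mr);
		return n < 0 ? n : -EINVAL;
	}

	*out_mr = mr;
	return 0;
}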
index 83763ef82354501788f262276763e4ef4c491fbc..e77123bcb43bc1fcda7d1c6567c65a428ca4697f 100644 (file)
@@ -11,7 +11,7 @@ struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list);
 void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr);
 
 int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
-               enum ib_mr_type type, u32 max_num_sg);
+               enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg);
 void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list);
 
 #endif /* _RDMA_MR_POOL_H */
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
new file mode 100644 (file)
index 0000000..eb99856
--- /dev/null
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_COUNTER_H_
+#define _RDMA_COUNTER_H_
+
+#include <linux/mutex.h>
+#include <linux/pid_namespace.h>
+
+#include <rdma/restrack.h>
+#include <rdma/rdma_netlink.h>
+
+struct ib_device;
+struct ib_qp;
+
+struct auto_mode_param {
+       int qp_type;
+};
+
+struct rdma_counter_mode {
+       enum rdma_nl_counter_mode mode;
+       enum rdma_nl_counter_mask mask;
+       struct auto_mode_param param;
+};
+
+struct rdma_port_counter {
+       struct rdma_counter_mode mode;
+       struct rdma_hw_stats *hstats;
+       unsigned int num_counters;
+       struct mutex lock;
+};
+
+struct rdma_counter {
+       struct rdma_restrack_entry      res;
+       struct ib_device                *device;
+       uint32_t                        id;
+       struct kref                     kref;
+       struct rdma_counter_mode        mode;
+       struct mutex                    lock;
+       struct rdma_hw_stats            *stats;
+       u8                              port;
+};
+
+void rdma_counter_init(struct ib_device *dev);
+void rdma_counter_release(struct ib_device *dev);
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+                              bool on, enum rdma_nl_counter_mask mask);
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
+
+int rdma_counter_query_stats(struct rdma_counter *counter);
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+                         u32 qp_num, u32 counter_id);
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+                               u32 qp_num, u32 *counter_id);
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+                           u32 qp_num, u32 counter_id);
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+                         enum rdma_nl_counter_mode *mode,
+                         enum rdma_nl_counter_mask *mask);
+
+#endif /* _RDMA_COUNTER_H_ */
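The driver half of this interface is the set of counter_* ops added to ib_device_ops earlier in this diff. A rough sketch of how a provider might back them (the mydrv_* names, the counter list, and the placeholder hardware id are invented; unbind and dealloc handling are omitted):

#include <linux/kernel.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_counter.h>

static const char * const mydrv_counter_names[] = {
	"rx_packets", "tx_packets",	/* hypothetical hw counters */
};

/* allocate the statistics block the core dumps via RDMA_NLDEV_CMD_STAT_GET */
static struct rdma_hw_stats *mydrv_counter_alloc_stats(struct rdma_counter *counter)
{
	return rdma_alloc_hw_stats_struct(mydrv_counter_names,
					  ARRAY_SIZE(mydrv_counter_names),
					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
}

static int mydrv_counter_update_stats(struct rdma_counter *counter)
{
	/* read the hardware counter set identified by counter->id and
	 * refresh counter->stats->value[] here
	 */
	return 0;
}

static int mydrv_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp)
{
	if (!counter->id) {
		/* first QP on this counter: allocate a hw counter set
		 * and publish its id
		 */
		counter->id = 1;	/* placeholder for a real hw id */
	}
	/* attach qp->qp_num to the hw counter set */
	return 0;
}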
index 10732ab31ba2f9c466eedc5d1408edb8a6ecbace..6631624e4d7c542dd196387d82be263a7e61be4f 100644 (file)
@@ -6,6 +6,12 @@
 #include <linux/netlink.h>
 #include <uapi/rdma/rdma_netlink.h>
 
+enum {
+       RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
+       RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+       RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE = 32,
+};
+
 struct rdma_nl_cbs {
        int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh,
                    struct netlink_ext_ack *extack);
@@ -110,4 +116,6 @@ void rdma_link_register(struct rdma_link_ops *ops);
 void rdma_link_unregister(struct rdma_link_ops *ops);
 
 #define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
+#define MODULE_ALIAS_RDMA_CLIENT(type) MODULE_ALIAS("rdma-client-" type)
+
 #endif /* _RDMA_NETLINK_H */
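MODULE_ALIAS_RDMA_CLIENT() mirrors the existing MODULE_ALIAS_RDMA_LINK(): a module that provides a given chardev client declares the alias, presumably so the core's netlink chardev lookup can request_module() it on demand. For example (hedged; the name has to match what the module actually registers):

/* in the module providing the "umad" chardev client */
MODULE_ALIAS_RDMA_CLIENT("umad");
/* expands to MODULE_ALIAS("rdma-client-umad"), which lets the core
 * auto-load this module when user space asks for that chardev type
 */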
index b9cd06db1a71e65b325ab0acea399419654c2cd8..525848e227dc5356227ea5ce96c2da36693db4b0 100644 (file)
@@ -2,7 +2,7 @@
 #define DEF_RDMA_VT_H
 
 /*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -202,7 +202,6 @@ struct rvt_pd {
 struct rvt_ah {
        struct ib_ah ibah;
        struct rdma_ah_attr attr;
-       atomic_t refcount;
        u8 vl;
        u8 log_pmtu;
 };
@@ -555,7 +554,7 @@ static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
 
 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
 void rvt_dealloc_device(struct rvt_dev_info *rdi);
-int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id);
+int rvt_register_device(struct rvt_dev_info *rvd);
 void rvt_unregister_device(struct rvt_dev_info *rvd);
 int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
index 75dc65c0bfb83d81b25fea05905373fa8d5d9e08..04c519ef6d715b30adf603b2445180a9874325e2 100644 (file)
  */
 #define RVT_CQ_NONE      (IB_CQ_NEXT_COMP + 1)
 
+/*
+ * Define a read macro that applies an smp_load_acquire() memory barrier
+ * when reading an index of a circular buffer that is mmapped to user space.
+ */
+#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val)
+
+/*
+ * Define a write macro that uses an smp_store_release() memory barrier
+ * when writing an index of a circular buffer that is mmapped to user space.
+ */
+#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x)
+#include <rdma/rvt-abi.h>
+
 /*
  * This structure is used to contain the head pointer, tail pointer,
  * and completion queue entries as a single memory allocation so
  * it can be mmap'ed into user space.
  */
-struct rvt_cq_wc {
+struct rvt_k_cq_wc {
        u32 head;               /* index of next entry to fill */
        u32 tail;               /* index of next ib_poll_cq() entry */
-       union {
-               /* these are actually size ibcq.cqe + 1 */
-               struct ib_uverbs_wc uqueue[0];
-               struct ib_wc kqueue[0];
-       };
+       struct ib_wc kqueue[];
 };
 
 /*
@@ -84,10 +93,12 @@ struct rvt_cq {
        spinlock_t lock; /* protect changes in this struct */
        u8 notify;
        u8 triggered;
+       u8 cq_full;
        int comp_vector_cpu;
        struct rvt_dev_info *rdi;
        struct rvt_cq_wc *queue;
        struct rvt_mmap_info *ip;
+       struct rvt_k_cq_wc *kqueue;
 };
 
 static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
@@ -95,6 +106,6 @@ static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
        return container_of(ibcq, struct rvt_cq, ibcq);
 }
 
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
 
 #endif          /* DEF_RDMAVT_INCCQH */
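The two macros pair an smp_store_release() on the producer index with an smp_load_acquire() on the consumer index, so a completion written into the shared, mmapped ring is visible to user space before the updated head is. A simplified sketch of the kernel-side producer path (loosely modelled on what rvt_cq_enter() has to do; the full-ring check and locking are omitted, and sketch_cq_push is not a real function):

#include <rdma/rdmavt_cq.h>

static void sketch_cq_push(struct rvt_cq_wc *u_wc, const struct ib_uverbs_wc *entry,
			   u32 num_entries)
{
	u32 head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
	u32 next = (head + 1 >= num_entries) ? 0 : head + 1;

	u_wc->uqueue[head] = *entry;
	/* order the entry write before the index update seen by user space */
	RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
}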
index 68e38c20afc043c31662916b9b95ba433a720077..0eeea520a85315d12cf838b52f4fbc3d6502221e 100644 (file)
@@ -2,7 +2,7 @@
 #define DEF_RDMAVT_INCQP_H
 
 /*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -52,6 +52,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdmavt_cq.h>
+#include <rdma/rvt-abi.h>
 /*
  * Atomic bit definitions for r_aflags.
  */
 #define RVT_SEND_RESERVE_USED           IB_SEND_RESERVED_START
 #define RVT_SEND_COMPLETION_ONLY       (IB_SEND_RESERVED_START << 1)
 
+/**
+ * rvt_ud_wr - IB UD work plus AH cache
+ * @wr: valid IB work request
+ * @attr: pointer to an allocated AH attribute
+ *
+ * Special case the UD WR so we can keep track of the AH attributes.
+ *
+ * NOTE: This data structure is strictly ordered wr then attr, i.e. the attr
+ * MUST come after wr.  The ib_ud_wr is sized and copied in rvt_post_one_wr.
+ * The copy assumes that wr is first.
+ */
+struct rvt_ud_wr {
+       struct ib_ud_wr wr;
+       struct rdma_ah_attr *attr;
+};
+
 /*
  * Send work request queue entry.
  * The size of the sg_list is determined when the QP is created and stored
 struct rvt_swqe {
        union {
                struct ib_send_wr wr;   /* don't use wr.sg_list */
-               struct ib_ud_wr ud_wr;
+               struct rvt_ud_wr ud_wr;
                struct ib_reg_wr reg_wr;
                struct ib_rdma_wr rdma_wr;
                struct ib_atomic_wr atomic_wr;
@@ -177,33 +194,84 @@ struct rvt_swqe {
        struct rvt_sge sg_list[0];
 };
 
-/*
- * Receive work request queue entry.
- * The size of the sg_list is determined when the QP (or SRQ) is created
- * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+/**
+ * struct rvt_krwq - kernel receive work request queue
+ * @p_lock: lock to protect the producer of the kernel buffer
+ * @head: index of next entry to fill
+ * @c_lock: lock to protect the consumer of the kernel buffer
+ * @tail: index of next entry to pull
+ * @count: approximate count of total receive entries posted
+ * @curr_wq: pointer to the current receive work request entries
+ * @wq: receive work request queue entries
+ *
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries for kernel-mode users.
  */
-struct rvt_rwqe {
-       u64 wr_id;
-       u8 num_sge;
-       struct ib_sge sg_list[0];
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and receive work queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- * Note that the wq array elements are variable size so you can't
- * just index into the array to get the N'th element;
- * use get_rwqe_ptr() instead.
- */
-struct rvt_rwq {
+struct rvt_krwq {
+       spinlock_t p_lock;      /* protect producer */
        u32 head;               /* new work requests posted to the head */
+
+       /* protect consumer */
+       spinlock_t c_lock ____cacheline_aligned_in_smp;
        u32 tail;               /* receives pull requests from here. */
-       struct rvt_rwqe wq[0];
+       u32 count;              /* approx count of receive entries posted */
+       struct rvt_rwqe *curr_wq;
+       struct rvt_rwqe wq[];
 };
 
+/**
+ * rvt_get_swqe_ah - Return the pointer to the struct rvt_ah
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rvt_ah *rvt_get_swqe_ah(struct rvt_swqe *swqe)
+{
+       return ibah_to_rvtah(swqe->ud_wr.wr.ah);
+}
+
+/**
+ * rvt_get_swqe_ah_attr - Return the cached ah attribute information
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rdma_ah_attr *rvt_get_swqe_ah_attr(struct rvt_swqe *swqe)
+{
+       return swqe->ud_wr.attr;
+}
+
+/**
+ * rvt_get_swqe_remote_qpn - Access the remote QPN value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qpn(struct rvt_swqe *swqe)
+{
+       return swqe->ud_wr.wr.remote_qpn;
+}
+
+/**
+ * rvt_get_swqe_remote_qkey - Access the remote qkey value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qkey(struct rvt_swqe *swqe)
+{
+       return swqe->ud_wr.wr.remote_qkey;
+}
+
+/**
+ * rvt_get_swqe_pkey_index - Access the pkey index
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u16 rvt_get_swqe_pkey_index(struct rvt_swqe *swqe)
+{
+       return swqe->ud_wr.wr.pkey_index;
+}
+
 struct rvt_rq {
        struct rvt_rwq *wq;
+       struct rvt_krwq *kwq;
        u32 size;               /* size of RWQE array */
        u8 max_sge;
        /* protect changes in this struct */
@@ -472,7 +540,7 @@ static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp,
 static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
 {
        return (struct rvt_rwqe *)
-               ((char *)rq->wq->wq +
+               ((char *)rq->kwq->curr_wq +
                 (sizeof(struct rvt_rwqe) +
                  rq->max_sge * sizeof(struct ib_sge)) * n);
 }
@@ -565,42 +633,6 @@ static inline void rvt_qp_wqe_unreserve(
 
 extern const enum ib_wc_opcode ib_rvt_wc_opcode[];
 
-/**
- * rvt_qp_swqe_complete() - insert send completion
- * @qp - the qp
- * @wqe - the send wqe
- * @status - completion status
- *
- * Insert a send completion into the completion
- * queue if the qp indicates it should be done.
- *
- * See IBTA 10.7.3.1 for info on completion
- * control.
- */
-static inline void rvt_qp_swqe_complete(
-       struct rvt_qp *qp,
-       struct rvt_swqe *wqe,
-       enum ib_wc_opcode opcode,
-       enum ib_wc_status status)
-{
-       if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED))
-               return;
-       if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-            status != IB_WC_SUCCESS) {
-               struct ib_wc wc;
-
-               memset(&wc, 0, sizeof(wc));
-               wc.wr_id = wqe->wr.wr_id;
-               wc.status = status;
-               wc.opcode = opcode;
-               wc.qp = &qp->ibqp;
-               wc.byte_len = wqe->length;
-               rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-                            status != IB_WC_SUCCESS);
-       }
-}
-
 /*
  * Compare the lower 24 bits of the msn values.
  * Returns an integer <, ==, or > than zero.
@@ -734,7 +766,119 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
 {
        rvt_put_swqe(wqe);
        if (qp->allowed_ops == IB_OPCODE_UD)
-               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+               rdma_destroy_ah_attr(wqe->ud_wr.attr);
+}
+
+/**
+ * rvt_qp_swqe_incr - increment ring index
+ * @qp: the qp
+ * @val: the starting value
+ *
+ * Return: the new value wrapping as appropriate
+ */
+static inline u32
+rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val)
+{
+       if (++val >= qp->s_size)
+               val = 0;
+       return val;
+}
+
+int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
+
+/**
+ * rvt_recv_cq - add a new entry to the completion queue
+ *               on behalf of the receive queue
+ * @qp: the QP whose receive queue generated the completion
+ * @wc: work completion entry to add
+ * @solicited: true if @wc is solicited
+ *
+ * This is a wrapper around rvt_cq_enter() called on behalf of the
+ * receive queue. If rvt_cq_enter() returns false, the CQ is full
+ * and the QP is put into the error state.
+ */
+static inline void rvt_recv_cq(struct rvt_qp *qp, struct ib_wc *wc,
+                              bool solicited)
+{
+       struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.recv_cq);
+
+       if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+               rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
+/**
+ * rvt_send_cq - add a new entry to the completion queue
+ *               on behalf of the send queue
+ * @qp: the QP whose send queue generated the completion
+ * @wc: work completion entry to add
+ * @solicited: true if @wc is solicited
+ *
+ * This is a wrapper around rvt_cq_enter() called on behalf of the
+ * send queue. If rvt_cq_enter() returns false, the CQ is full
+ * and the QP is put into the error state.
+ */
+static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc,
+                              bool solicited)
+{
+       struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.send_cq);
+
+       if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+               rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
+/**
+ * rvt_qp_complete_swqe - insert send completion
+ * @qp - the qp
+ * @wqe - the send wqe
+ * @opcode - wc operation (driver dependent)
+ * @status - completion status
+ *
+ * Update the s_last information, and then insert a send
+ * completion into the completion
+ * queue if the qp indicates it should be done.
+ *
+ * See IBTA 10.7.3.1 for info on completion
+ * control.
+ *
+ * Return: new last
+ */
+static inline u32
+rvt_qp_complete_swqe(struct rvt_qp *qp,
+                    struct rvt_swqe *wqe,
+                    enum ib_wc_opcode opcode,
+                    enum ib_wc_status status)
+{
+       bool need_completion;
+       u64 wr_id;
+       u32 byte_len, last;
+       int flags = wqe->wr.send_flags;
+
+       rvt_put_qp_swqe(qp, wqe);
+
+       need_completion =
+               !(flags & RVT_SEND_RESERVE_USED) &&
+               (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+               (flags & IB_SEND_SIGNALED) ||
+               status != IB_WC_SUCCESS);
+       if (need_completion) {
+               wr_id = wqe->wr.wr_id;
+               byte_len = wqe->length;
+               /* above fields required before writing s_last */
+       }
+       last = rvt_qp_swqe_incr(qp, qp->s_last);
+       /* see rvt_qp_is_avail() */
+       smp_store_release(&qp->s_last, last);
+       if (need_completion) {
+               struct ib_wc w = {
+                       .wr_id = wr_id,
+                       .status = status,
+                       .opcode = opcode,
+                       .qp = &qp->ibqp,
+                       .byte_len = byte_len,
+               };
+               rvt_send_cq(qp, &w, status != IB_WC_SUCCESS);
+       }
+       return last;
 }
 
 extern const int  ib_rvt_state_ops[];
@@ -742,7 +886,6 @@ extern const int  ib_rvt_state_ops[];
 struct rvt_dev_info;
 int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only);
 void rvt_comm_est(struct rvt_qp *qp);
-int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
 void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
 unsigned long rvt_rnr_tbl_to_usec(u32 index);
 enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
@@ -784,6 +927,53 @@ struct rvt_qp_iter {
        int n;
 };
 
+/**
+ * ib_cq_tail - Return tail index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get tail
+ * of cq buffer.
+ */
+static inline u32 ib_cq_tail(struct ib_cq *send_cq)
+{
+       struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+       return ibcq_to_rvtcq(send_cq)->ip ?
+              RDMA_READ_UAPI_ATOMIC(cq->queue->tail) :
+              ibcq_to_rvtcq(send_cq)->kqueue->tail;
+}
+
+/**
+ * ib_cq_head - Return head index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get head
+ * of cq buffer.
+ */
+static inline u32 ib_cq_head(struct ib_cq *send_cq)
+{
+       struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+       return ibcq_to_rvtcq(send_cq)->ip ?
+              RDMA_READ_UAPI_ATOMIC(cq->queue->head) :
+              ibcq_to_rvtcq(send_cq)->kqueue->head;
+}
+
+/**
+ * rvt_free_rq - free memory allocated for the rvt_rq struct
+ * @rq: receive queue data structure
+ *
+ * This function should only be called if rvt_mmap_info()
+ * has not succeeded.
+ */
+static inline void rvt_free_rq(struct rvt_rq *rq)
+{
+       kvfree(rq->kwq);
+       rq->kwq = NULL;
+       vfree(rq->wq);
+       rq->wq = NULL;
+}
+
 struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
                                     u64 v,
                                     void (*cb)(struct rvt_qp *qp, u64 v));
index ecf3c7702a4ff96fe1a2c426fc0f0d581025db08..b0fc6b26bdf531f8a42bf3b398d1d32b6bb6d2e2 100644 (file)
@@ -14,6 +14,9 @@
 #include <uapi/rdma/rdma_netlink.h>
 #include <linux/xarray.h>
 
+struct ib_device;
+struct sk_buff;
+
 /**
  * enum rdma_restrack_type - HW objects to track
  */
@@ -42,14 +45,16 @@ enum rdma_restrack_type {
         * @RDMA_RESTRACK_CTX: Verbs contexts (CTX)
         */
        RDMA_RESTRACK_CTX,
+       /**
+        * @RDMA_RESTRACK_COUNTER: Statistic Counter
+        */
+       RDMA_RESTRACK_COUNTER,
        /**
        * @RDMA_RESTRACK_MAX: Last entry, used for array declarations
         */
        RDMA_RESTRACK_MAX
 };
 
-struct ib_device;
-
 /**
  * struct rdma_restrack_entry - metadata per-entry
  */
index 494f79ca3e621c92ac1aae076ffba7ba249c0d87..6ad9dc836c107c0bce8978de738f097d4c8c8b9d 100644 (file)
@@ -39,15 +39,6 @@ struct rdma_rw_ctx {
                        struct ib_send_wr       inv_wr;
                        struct ib_mr            *mr;
                } *reg;
-
-               struct {
-                       struct rdma_rw_reg_ctx  data;
-                       struct rdma_rw_reg_ctx  prot;
-                       struct ib_send_wr       sig_inv_wr;
-                       struct ib_mr            *sig_mr;
-                       struct ib_sge           sig_sge;
-                       struct ib_sig_handover_wr sig_wr;
-               } *sig;
        };
 };
 
diff --git a/include/rdma/signature.h b/include/rdma/signature.h
new file mode 100644 (file)
index 0000000..f24cc2a
--- /dev/null
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_SIGNATURE_H_
+#define _RDMA_SIGNATURE_H_
+
+enum ib_signature_prot_cap {
+       IB_PROT_T10DIF_TYPE_1 = 1,
+       IB_PROT_T10DIF_TYPE_2 = 1 << 1,
+       IB_PROT_T10DIF_TYPE_3 = 1 << 2,
+};
+
+enum ib_signature_guard_cap {
+       IB_GUARD_T10DIF_CRC     = 1,
+       IB_GUARD_T10DIF_CSUM    = 1 << 1,
+};
+
+/**
+ * enum ib_signature_type - Signature types
+ * @IB_SIG_TYPE_NONE: Unprotected.
+ * @IB_SIG_TYPE_T10_DIF: Type T10-DIF
+ */
+enum ib_signature_type {
+       IB_SIG_TYPE_NONE,
+       IB_SIG_TYPE_T10_DIF,
+};
+
+/**
+ * enum ib_t10_dif_bg_type - Signature T10-DIF block-guard types
+ * @IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
+ * @IB_T10DIF_CSUM: Corresponds to IP checksum rules.
+ */
+enum ib_t10_dif_bg_type {
+       IB_T10DIF_CRC,
+       IB_T10DIF_CSUM,
+};
+
+/**
+ * struct ib_t10_dif_domain - Parameters specific for T10-DIF
+ *     domain.
+ * @bg_type: T10-DIF block guard type (CRC|CSUM)
+ * @pi_interval: protection information interval.
+ * @bg: seed of guard computation.
+ * @app_tag: application tag of guard block
+ * @ref_tag: initial guard block reference tag.
+ * @ref_remap: Indicate whether the reftag increments with each block
+ * @app_escape: Indicate to skip block check if apptag=0xffff
+ * @ref_escape: Indicate to skip block check if reftag=0xffffffff
+ * @apptag_check_mask: check bitmask of application tag.
+ */
+struct ib_t10_dif_domain {
+       enum ib_t10_dif_bg_type bg_type;
+       u16                     pi_interval;
+       u16                     bg;
+       u16                     app_tag;
+       u32                     ref_tag;
+       bool                    ref_remap;
+       bool                    app_escape;
+       bool                    ref_escape;
+       u16                     apptag_check_mask;
+};
+
+/**
+ * struct ib_sig_domain - Parameters for signature domain
+ * @sig_type: specific signature type
+ * @sig: union of all signature domain attributes that may
+ *     be used to set domain layout.
+ */
+struct ib_sig_domain {
+       enum ib_signature_type sig_type;
+       union {
+               struct ib_t10_dif_domain dif;
+       } sig;
+};
+
+/**
+ * struct ib_sig_attrs - Parameters for signature handover operation
+ * @check_mask: bitmask for signature byte check (8 bytes)
+ * @mem: memory domain layout descriptor.
+ * @wire: wire domain layout descriptor.
+ * @meta_length: metadata length
+ */
+struct ib_sig_attrs {
+       u8                      check_mask;
+       struct ib_sig_domain    mem;
+       struct ib_sig_domain    wire;
+       int                     meta_length;
+};
+
+enum ib_sig_err_type {
+       IB_SIG_BAD_GUARD,
+       IB_SIG_BAD_REFTAG,
+       IB_SIG_BAD_APPTAG,
+};
+
+/*
+ * Signature check masks (8 bytes in total) according to the T10-PI standard:
+ *  -------- -------- ------------
+ * | GUARD  | APPTAG |   REFTAG   |
+ * |  2B    |  2B    |    4B      |
+ *  -------- -------- ------------
+ */
+enum {
+       IB_SIG_CHECK_GUARD = 0xc0,
+       IB_SIG_CHECK_APPTAG = 0x30,
+       IB_SIG_CHECK_REFTAG = 0x0f,
+};
+
+/*
+ * struct ib_sig_err - signature error descriptor
+ */
+struct ib_sig_err {
+       enum ib_sig_err_type    err_type;
+       u32                     expected;
+       u32                     actual;
+       u64                     sig_err_offset;
+       u32                     key;
+};
+
+#endif /* _RDMA_SIGNATURE_H_ */
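To make the domain split concrete: a consumer that keeps no protection information in local memory but wants the device to generate and check T10-DIF Type-1-style tuples on the wire might fill ib_sig_attrs roughly like this (a sketch under those assumptions; example_set_t10_sig_attrs is not a real kernel function):

#include <linux/string.h>
#include <linux/types.h>
#include <rdma/signature.h>

static void example_set_t10_sig_attrs(struct ib_sig_attrs *attrs, u32 first_lba)
{
	memset(attrs, 0, sizeof(*attrs));

	/* memory side carries plain data only */
	attrs->mem.sig_type = IB_SIG_TYPE_NONE;

	/* wire side: 8-byte DIF tuple every 512 bytes, CRC guard */
	attrs->wire.sig_type = IB_SIG_TYPE_T10_DIF;
	attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
	attrs->wire.sig.dif.pi_interval = 512;
	attrs->wire.sig.dif.ref_tag = first_lba;	/* Type 1: reftag starts at the LBA */
	attrs->wire.sig.dif.ref_remap = true;		/* and increments per block */
	attrs->wire.sig.dif.app_escape = true;		/* skip checks when apptag == 0xffff */

	/* verify guard and reftag, ignore the application tag */
	attrs->check_mask = IB_SIG_CHECK_GUARD | IB_SIG_CHECK_REFTAG;
}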
diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h
deleted file mode 100644 (file)
index e2709bb..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Intel Corporation.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IB_USER_CM_H
-#define IB_USER_CM_H
-
-#include <linux/types.h>
-#include <rdma/ib_user_sa.h>
-
-#define IB_USER_CM_ABI_VERSION 5
-
-enum {
-       IB_USER_CM_CMD_CREATE_ID,
-       IB_USER_CM_CMD_DESTROY_ID,
-       IB_USER_CM_CMD_ATTR_ID,
-
-       IB_USER_CM_CMD_LISTEN,
-       IB_USER_CM_CMD_NOTIFY,
-
-       IB_USER_CM_CMD_SEND_REQ,
-       IB_USER_CM_CMD_SEND_REP,
-       IB_USER_CM_CMD_SEND_RTU,
-       IB_USER_CM_CMD_SEND_DREQ,
-       IB_USER_CM_CMD_SEND_DREP,
-       IB_USER_CM_CMD_SEND_REJ,
-       IB_USER_CM_CMD_SEND_MRA,
-       IB_USER_CM_CMD_SEND_LAP,
-       IB_USER_CM_CMD_SEND_APR,
-       IB_USER_CM_CMD_SEND_SIDR_REQ,
-       IB_USER_CM_CMD_SEND_SIDR_REP,
-
-       IB_USER_CM_CMD_EVENT,
-       IB_USER_CM_CMD_INIT_QP_ATTR,
-};
-/*
- * command ABI structures.
- */
-struct ib_ucm_cmd_hdr {
-       __u32 cmd;
-       __u16 in;
-       __u16 out;
-};
-
-struct ib_ucm_create_id {
-       __aligned_u64 uid;
-       __aligned_u64 response;
-};
-
-struct ib_ucm_create_id_resp {
-       __u32 id;
-};
-
-struct ib_ucm_destroy_id {
-       __aligned_u64 response;
-       __u32 id;
-       __u32 reserved;
-};
-
-struct ib_ucm_destroy_id_resp {
-       __u32 events_reported;
-};
-
-struct ib_ucm_attr_id {
-       __aligned_u64 response;
-       __u32 id;
-       __u32 reserved;
-};
-
-struct ib_ucm_attr_id_resp {
-       __be64 service_id;
-       __be64 service_mask;
-       __be32 local_id;
-       __be32 remote_id;
-};
-
-struct ib_ucm_init_qp_attr {
-       __aligned_u64 response;
-       __u32 id;
-       __u32 qp_state;
-};
-
-struct ib_ucm_listen {
-       __be64 service_id;
-       __be64 service_mask;
-       __u32 id;
-       __u32 reserved;
-};
-
-struct ib_ucm_notify {
-       __u32 id;
-       __u32 event;
-};
-
-struct ib_ucm_private_data {
-       __aligned_u64 data;
-       __u32 id;
-       __u8  len;
-       __u8  reserved[3];
-};
-
-struct ib_ucm_req {
-       __u32 id;
-       __u32 qpn;
-       __u32 qp_type;
-       __u32 psn;
-       __be64 sid;
-       __aligned_u64 data;
-       __aligned_u64 primary_path;
-       __aligned_u64 alternate_path;
-       __u8  len;
-       __u8  peer_to_peer;
-       __u8  responder_resources;
-       __u8  initiator_depth;
-       __u8  remote_cm_response_timeout;
-       __u8  flow_control;
-       __u8  local_cm_response_timeout;
-       __u8  retry_count;
-       __u8  rnr_retry_count;
-       __u8  max_cm_retries;
-       __u8  srq;
-       __u8  reserved[5];
-};
-
-struct ib_ucm_rep {
-       __aligned_u64 uid;
-       __aligned_u64 data;
-       __u32 id;
-       __u32 qpn;
-       __u32 psn;
-       __u8  len;
-       __u8  responder_resources;
-       __u8  initiator_depth;
-       __u8  target_ack_delay;
-       __u8  failover_accepted;
-       __u8  flow_control;
-       __u8  rnr_retry_count;
-       __u8  srq;
-       __u8  reserved[4];
-};
-
-struct ib_ucm_info {
-       __u32 id;
-       __u32 status;
-       __aligned_u64 info;
-       __aligned_u64 data;
-       __u8  info_len;
-       __u8  data_len;
-       __u8  reserved[6];
-};
-
-struct ib_ucm_mra {
-       __aligned_u64 data;
-       __u32 id;
-       __u8  len;
-       __u8  timeout;
-       __u8  reserved[2];
-};
-
-struct ib_ucm_lap {
-       __aligned_u64 path;
-       __aligned_u64 data;
-       __u32 id;
-       __u8  len;
-       __u8  reserved[3];
-};
-
-struct ib_ucm_sidr_req {
-       __u32 id;
-       __u32 timeout;
-       __be64 sid;
-       __aligned_u64 data;
-       __aligned_u64 path;
-       __u16 reserved_pkey;
-       __u8  len;
-       __u8  max_cm_retries;
-       __u8  reserved[4];
-};
-
-struct ib_ucm_sidr_rep {
-       __u32 id;
-       __u32 qpn;
-       __u32 qkey;
-       __u32 status;
-       __aligned_u64 info;
-       __aligned_u64 data;
-       __u8  info_len;
-       __u8  data_len;
-       __u8  reserved[6];
-};
-/*
- * event notification ABI structures.
- */
-struct ib_ucm_event_get {
-       __aligned_u64 response;
-       __aligned_u64 data;
-       __aligned_u64 info;
-       __u8  data_len;
-       __u8  info_len;
-       __u8  reserved[6];
-};
-
-struct ib_ucm_req_event_resp {
-       struct ib_user_path_rec primary_path;
-       struct ib_user_path_rec alternate_path;
-       __be64                 remote_ca_guid;
-       __u32                  remote_qkey;
-       __u32                  remote_qpn;
-       __u32                  qp_type;
-       __u32                  starting_psn;
-       __u8  responder_resources;
-       __u8  initiator_depth;
-       __u8  local_cm_response_timeout;
-       __u8  flow_control;
-       __u8  remote_cm_response_timeout;
-       __u8  retry_count;
-       __u8  rnr_retry_count;
-       __u8  srq;
-       __u8  port;
-       __u8  reserved[7];
-};
-
-struct ib_ucm_rep_event_resp {
-       __be64 remote_ca_guid;
-       __u32 remote_qkey;
-       __u32 remote_qpn;
-       __u32 starting_psn;
-       __u8  responder_resources;
-       __u8  initiator_depth;
-       __u8  target_ack_delay;
-       __u8  failover_accepted;
-       __u8  flow_control;
-       __u8  rnr_retry_count;
-       __u8  srq;
-       __u8  reserved[5];
-};
-
-struct ib_ucm_rej_event_resp {
-       __u32 reason;
-       /* ari in ib_ucm_event_get info field. */
-};
-
-struct ib_ucm_mra_event_resp {
-       __u8  timeout;
-       __u8  reserved[3];
-};
-
-struct ib_ucm_lap_event_resp {
-       struct ib_user_path_rec path;
-};
-
-struct ib_ucm_apr_event_resp {
-       __u32 status;
-       /* apr info in ib_ucm_event_get info field. */
-};
-
-struct ib_ucm_sidr_req_event_resp {
-       __u16 pkey;
-       __u8  port;
-       __u8  reserved;
-};
-
-struct ib_ucm_sidr_rep_event_resp {
-       __u32 status;
-       __u32 qkey;
-       __u32 qpn;
-       /* info in ib_ucm_event_get info field. */
-};
-
-#define IB_UCM_PRES_DATA      0x01
-#define IB_UCM_PRES_INFO      0x02
-#define IB_UCM_PRES_PRIMARY   0x04
-#define IB_UCM_PRES_ALTERNATE 0x08
-
-struct ib_ucm_event_resp {
-       __aligned_u64 uid;
-       __u32 id;
-       __u32 event;
-       __u32 present;
-       __u32 reserved;
-       union {
-               struct ib_ucm_req_event_resp req_resp;
-               struct ib_ucm_rep_event_resp rep_resp;
-               struct ib_ucm_rej_event_resp rej_resp;
-               struct ib_ucm_mra_event_resp mra_resp;
-               struct ib_ucm_lap_event_resp lap_resp;
-               struct ib_ucm_apr_event_resp apr_resp;
-
-               struct ib_ucm_sidr_req_event_resp sidr_req_resp;
-               struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
-
-               __u32                             send_status;
-       } u;
-};
-
-#endif /* IB_USER_CM_H */
index d404c951954cdc566cacda181898852bfb8abefa..d0da070cf0ab76867ddb0a2995cc6b321b264c0d 100644 (file)
@@ -51,6 +51,7 @@ enum mlx5_ib_devx_methods {
        MLX5_IB_METHOD_DEVX_OTHER  = (1U << UVERBS_ID_NS_SHIFT),
        MLX5_IB_METHOD_DEVX_QUERY_UAR,
        MLX5_IB_METHOD_DEVX_QUERY_EQN,
+       MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT,
 };
 
 enum  mlx5_ib_devx_other_attrs {
@@ -93,6 +94,14 @@ enum mlx5_ib_devx_obj_query_async_attrs {
        MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN,
 };
 
+enum mlx5_ib_devx_subscribe_event_attrs {
+       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE,
+       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST,
+       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM,
+       MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE,
+};
+
 enum  mlx5_ib_devx_query_eqn_attrs {
        MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT),
        MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN,
@@ -127,16 +136,26 @@ enum mlx5_ib_devx_async_cmd_fd_alloc_attrs {
        MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_devx_async_event_fd_alloc_attrs {
+       MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+       MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS,
+};
+
 enum mlx5_ib_devx_async_cmd_fd_methods {
        MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_devx_async_event_fd_methods {
+       MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_objects {
        MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT),
        MLX5_IB_OBJECT_DEVX_OBJ,
        MLX5_IB_OBJECT_DEVX_UMEM,
        MLX5_IB_OBJECT_FLOW_MATCHER,
        MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
+       MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
index a8f34c2374586bebb6390ff5660004f399ccf822..7e9900b0e7461f961bdf7a6aeaa4e389ed121a13 100644 (file)
@@ -63,5 +63,14 @@ enum mlx5_ib_uapi_dm_type {
        MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM,
 };
 
+enum mlx5_ib_uapi_devx_create_event_channel_flags {
+       MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA = 1 << 0,
+};
+
+struct mlx5_ib_uapi_devx_async_event_hdr {
+       __aligned_u64   cookie;
+       __u8            out_data[];
+};
+
 #endif
 
index 41db51367efafb3d2abb5103e59ddc04b3d61e99..8e277783fa9610d79629e73c69df658959914c3b 100644 (file)
@@ -147,6 +147,18 @@ enum {
        IWPM_NLA_HELLO_MAX
 };
 
+/* For RDMA_NLDEV_ATTR_DEV_NODE_TYPE */
+enum {
+       /* IB values map to NodeInfo:NodeType. */
+       RDMA_NODE_IB_CA = 1,
+       RDMA_NODE_IB_SWITCH,
+       RDMA_NODE_IB_ROUTER,
+       RDMA_NODE_RNIC,
+       RDMA_NODE_USNIC,
+       RDMA_NODE_USNIC_UDP,
+       RDMA_NODE_UNSPECIFIED,
+};
+
 /*
  * Local service operations:
  *   RESOLVE - The client requests the local service to resolve a path.
@@ -267,11 +279,15 @@ enum rdma_nldev_command {
 
        RDMA_NLDEV_CMD_RES_PD_GET, /* can dump */
 
-       RDMA_NLDEV_NUM_OPS
-};
+       RDMA_NLDEV_CMD_GET_CHARDEV,
 
-enum {
-       RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+       RDMA_NLDEV_CMD_STAT_SET,
+
+       RDMA_NLDEV_CMD_STAT_GET, /* can dump */
+
+       RDMA_NLDEV_CMD_STAT_DEL,
+
+       RDMA_NLDEV_NUM_OPS
 };
 
 enum rdma_nldev_print_type {
@@ -478,10 +494,72 @@ enum rdma_nldev_attr {
         * File descriptor handle of the net namespace object
         */
        RDMA_NLDEV_NET_NS_FD,                   /* u32 */
+       /*
+        * Information about a chardev.
+        * CHARDEV_TYPE is the name of the chardev ABI (e.g. uverbs, umad, etc.)
+        * CHARDEV_ABI signals the ABI revision (historical)
+        * CHARDEV_NAME is the kernel name for the /dev/ file (no directory)
+        * CHARDEV is the 64 bit dev_t for the inode
+        */
+       RDMA_NLDEV_ATTR_CHARDEV_TYPE,           /* string */
+       RDMA_NLDEV_ATTR_CHARDEV_NAME,           /* string */
+       RDMA_NLDEV_ATTR_CHARDEV_ABI,            /* u64 */
+       RDMA_NLDEV_ATTR_CHARDEV,                /* u64 */
+       RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,       /* u64 */
+       /*
+        * Counter-specific attributes.
+        */
+       RDMA_NLDEV_ATTR_STAT_MODE,              /* u32 */
+       RDMA_NLDEV_ATTR_STAT_RES,               /* u32 */
+       RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK,    /* u32 */
+       RDMA_NLDEV_ATTR_STAT_COUNTER,           /* nested table */
+       RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,     /* nested table */
+       RDMA_NLDEV_ATTR_STAT_COUNTER_ID,        /* u32 */
+       RDMA_NLDEV_ATTR_STAT_HWCOUNTERS,        /* nested table */
+       RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY,   /* nested table */
+       RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,      /* string */
+       RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,     /* u64 */
+
+       /*
+        * CQ adaptive moderation (DIM)
+        */
+       RDMA_NLDEV_ATTR_DEV_DIM,                /* u8 */
 
        /*
         * Always the end
         */
        RDMA_NLDEV_ATTR_MAX
 };
+
+/*
+ * Supported counter bind modes. All modes are mutually exclusive.
+ */
+enum rdma_nl_counter_mode {
+       RDMA_COUNTER_MODE_NONE,
+
+       /*
+        * A QP is bound to a counter automatically during initialization
+        * based on the auto mode (e.g., QP type, ...)
+        */
+       RDMA_COUNTER_MODE_AUTO,
+
+       /*
+        * Which QP is bound to which counter is explicitly specified
+        * by the user
+        */
+       RDMA_COUNTER_MODE_MANUAL,
+
+       /*
+        * Always the end
+        */
+       RDMA_COUNTER_MODE_MAX,
+};
+
+/*
+ * Supported criteria in counter auto mode.
+ * Currently only "qp type" is supported
+ */
+enum rdma_nl_counter_mask {
+       RDMA_COUNTER_MASK_QP_TYPE = 1,
+};
 #endif /* _UAPI_RDMA_NETLINK_H */
index 26213f49f5c8ea4a06251b67cab3f9cb312ab386..64c14cb0022fa0e906c2b321b6819b2cf1bb8bc2 100644 (file)
@@ -103,6 +103,7 @@ enum rdma_driver_id {
        RDMA_DRIVER_HFI1,
        RDMA_DRIVER_QIB,
        RDMA_DRIVER_EFA,
+       RDMA_DRIVER_SIW,
 };
 
 #endif
diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h
new file mode 100644 (file)
index 0000000..7328293
--- /dev/null
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+#ifndef RVT_ABI_USER_H
+#define RVT_ABI_USER_H
+
+#include <linux/types.h>
+#include <rdma/ib_user_verbs.h>
+#ifndef RDMA_ATOMIC_UAPI
+#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name
+#endif
+
+struct rvt_wqe_sge {
+       __aligned_u64 addr;
+       __u32 length;
+       __u32 lkey;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct rvt_cq_wc {
+       /* index of next entry to fill */
+       RDMA_ATOMIC_UAPI(__u32, head);
+       /* index of next ib_poll_cq() entry */
+       RDMA_ATOMIC_UAPI(__u32, tail);
+
+       /* these are actually size ibcq.cqe + 1 */
+       struct ib_uverbs_wc uqueue[];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct rvt_rwqe {
+       __u64 wr_id;
+       __u8 num_sge;
+       __u8 padding[7];
+       struct rvt_wqe_sge sg_list[];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr()
+ * for kernel space.
+ */
+struct rvt_rwq {
+       /* new work requests posted to the head */
+       RDMA_ATOMIC_UAPI(__u32, head);
+       /* receives pull requests from here. */
+       RDMA_ATOMIC_UAPI(__u32, tail);
+       struct rvt_rwqe wq[];
+};
+#endif /* RVT_ABI_USER_H */
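Because RDMA_ATOMIC_UAPI() is only defined here when the includer has not already defined it, a user-space provider can substitute a genuinely atomic type and then pair C11 acquire loads with the kernel's smp_store_release() writes. One plausible way to do that (purely illustrative; the override and cq_user_head are assumptions, not part of any shipped library):

/* user space, before pulling in the ABI header */
#include <stdatomic.h>
#define RDMA_ATOMIC_UAPI(_type, _name) _Atomic(_type) _name
#include <rdma/rvt-abi.h>

static __u32 cq_user_head(const struct rvt_cq_wc *wc)
{
	/* pairs with the kernel's smp_store_release() on head */
	return atomic_load_explicit(&wc->head, memory_order_acquire);
}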
diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h
new file mode 100644 (file)
index 0000000..3dd8071
--- /dev/null
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_USER_H
+#define _SIW_USER_H
+
+#include <linux/types.h>
+
+#define SIW_NODE_DESC_COMMON "Software iWARP stack"
+#define SIW_ABI_VERSION 1
+#define SIW_MAX_SGE 6
+#define SIW_UOBJ_MAX_KEY 0x08FFFF
+#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1)
+
+struct siw_uresp_create_cq {
+       __u32 cq_id;
+       __u32 num_cqe;
+       __aligned_u64 cq_key;
+};
+
+struct siw_uresp_create_qp {
+       __u32 qp_id;
+       __u32 num_sqe;
+       __u32 num_rqe;
+       __u32 pad;
+       __aligned_u64 sq_key;
+       __aligned_u64 rq_key;
+};
+
+struct siw_ureq_reg_mr {
+       __u8 stag_key;
+       __u8 reserved[3];
+       __u32 pad;
+};
+
+struct siw_uresp_reg_mr {
+       __u32 stag;
+       __u32 pad;
+};
+
+struct siw_uresp_create_srq {
+       __u32 num_rqe;
+       __u32 pad;
+       __aligned_u64 srq_key;
+};
+
+struct siw_uresp_alloc_ctx {
+       __u32 dev_id;
+       __u32 pad;
+};
+
+enum siw_opcode {
+       SIW_OP_WRITE,
+       SIW_OP_READ,
+       SIW_OP_READ_LOCAL_INV,
+       SIW_OP_SEND,
+       SIW_OP_SEND_WITH_IMM,
+       SIW_OP_SEND_REMOTE_INV,
+
+       /* Unsupported */
+       SIW_OP_FETCH_AND_ADD,
+       SIW_OP_COMP_AND_SWAP,
+
+       SIW_OP_RECEIVE,
+       /* provider internal SQE */
+       SIW_OP_READ_RESPONSE,
+       /*
+        * below opcodes valid for
+        * in-kernel clients only
+        */
+       SIW_OP_INVAL_STAG,
+       SIW_OP_REG_MR,
+       SIW_NUM_OPCODES
+};
+
+/* Keep it the same as ibv_sge to allow for memcpy */
+struct siw_sge {
+       __aligned_u64 laddr;
+       __u32 length;
+       __u32 lkey;
+};
+
+/*
+ * Inline data are kept within the work request itself occupying
+ * the space of sge[1] .. sge[n]. Therefore, inline data cannot be
+ * supported if SIW_MAX_SGE is below 2 elements.
+ */
+#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1))
+
+#if SIW_MAX_SGE < 2
+#error "SIW_MAX_SGE must be at least 2"
+#endif
+
+enum siw_wqe_flags {
+       SIW_WQE_VALID = 1,
+       SIW_WQE_INLINE = (1 << 1),
+       SIW_WQE_SIGNALLED = (1 << 2),
+       SIW_WQE_SOLICITED = (1 << 3),
+       SIW_WQE_READ_FENCE = (1 << 4),
+       SIW_WQE_REM_INVAL = (1 << 5),
+       SIW_WQE_COMPLETED = (1 << 6)
+};
+
+/* Send Queue Element */
+struct siw_sqe {
+       __aligned_u64 id;
+       __u16 flags;
+       __u8 num_sge;
+       /* Contains enum siw_opcode values */
+       __u8 opcode;
+       __u32 rkey;
+       union {
+               __aligned_u64 raddr;
+               __aligned_u64 base_mr;
+       };
+       union {
+               struct siw_sge sge[SIW_MAX_SGE];
+               __aligned_u64 access;
+       };
+};
+
+/* Receive Queue Element */
+struct siw_rqe {
+       __aligned_u64 id;
+       __u16 flags;
+       __u8 num_sge;
+       /*
+        * only used by kernel driver,
+        * ignored if set by user
+        */
+       __u8 opcode;
+       __u32 unused;
+       struct siw_sge sge[SIW_MAX_SGE];
+};
+
+enum siw_notify_flags {
+       SIW_NOTIFY_NOT = (0),
+       SIW_NOTIFY_SOLICITED = (1 << 0),
+       SIW_NOTIFY_NEXT_COMPLETION = (1 << 1),
+       SIW_NOTIFY_MISSED_EVENTS = (1 << 2),
+       SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION |
+                        SIW_NOTIFY_MISSED_EVENTS
+};
+
+enum siw_wc_status {
+       SIW_WC_SUCCESS,
+       SIW_WC_LOC_LEN_ERR,
+       SIW_WC_LOC_PROT_ERR,
+       SIW_WC_LOC_QP_OP_ERR,
+       SIW_WC_WR_FLUSH_ERR,
+       SIW_WC_BAD_RESP_ERR,
+       SIW_WC_LOC_ACCESS_ERR,
+       SIW_WC_REM_ACCESS_ERR,
+       SIW_WC_REM_INV_REQ_ERR,
+       SIW_WC_GENERAL_ERR,
+       SIW_NUM_WC_STATUS
+};
+
+struct siw_cqe {
+       __aligned_u64 id;
+       __u8 flags;
+       __u8 opcode;
+       __u16 status;
+       __u32 bytes;
+       union {
+               __aligned_u64 imm_data;
+               __u32 inval_stag;
+       };
+       /* QP number or QP pointer */
+       union {
+               struct ib_qp *base_qp;
+               __aligned_u64 qp_id;
+       };
+};
+
+/*
+ * Shared structure between user and kernel
+ * to control CQ arming.
+ */
+struct siw_cq_ctrl {
+       __aligned_u64 notify;
+};
+#endif
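Since struct siw_sge is 16 bytes, SIW_MAX_INLINE above works out to 16 * (SIW_MAX_SGE - 1) = 80 bytes of inline payload per send queue element. A compile-time sanity check of that reading (illustrative only, not part of the header):

#include <rdma/siw-abi.h>

/* assumes the layout shown above */
_Static_assert(sizeof(struct siw_sge) == 16, "siw_sge is expected to be 16 bytes");
_Static_assert(SIW_MAX_INLINE == 16 * (SIW_MAX_SGE - 1),
	       "inline data reuses sge[1]..sge[SIW_MAX_SGE - 1]");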
index 160afe288df067068811ebb146575bd03e66f3fa..1d6858a108cb8f0a4c859f354c23ae3244bfbd8a 100644 (file)
@@ -2,8 +2,6 @@
 # DIM Dynamic Interrupt Moderation library
 #
 
-obj-$(CONFIG_DIMLIB) = net_dim.o
+obj-$(CONFIG_DIMLIB) += dim.o
 
-net_dim-y = \
-       dim.o           \
-       net_dim.o
+dim-y := dim.o net_dim.o rdma_dim.o
diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c
new file mode 100644 (file)
index 0000000..f7e26c7
--- /dev/null
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <linux/dim.h>
+
+static int rdma_dim_step(struct dim *dim)
+{
+       if (dim->tune_state == DIM_GOING_RIGHT) {
+               if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1))
+                       return DIM_ON_EDGE;
+               dim->profile_ix++;
+               dim->steps_right++;
+       }
+       if (dim->tune_state == DIM_GOING_LEFT) {
+               if (dim->profile_ix == 0)
+                       return DIM_ON_EDGE;
+               dim->profile_ix--;
+               dim->steps_left++;
+       }
+
+       return DIM_STEPPED;
+}
+
+static int rdma_dim_stats_compare(struct dim_stats *curr,
+                                 struct dim_stats *prev)
+{
+       /* first stat */
+       if (!prev->cpms)
+               return DIM_STATS_SAME;
+
+       if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms))
+               return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER :
+                                               DIM_STATS_WORSE;
+
+       if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio))
+               return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER :
+                                               DIM_STATS_WORSE;
+
+       return DIM_STATS_SAME;
+}
+
+static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim)
+{
+       int prev_ix = dim->profile_ix;
+       u8 state = dim->tune_state;
+       int stats_res;
+       int step_res;
+
+       if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) {
+               stats_res = rdma_dim_stats_compare(curr_stats,
+                                                  &dim->prev_stats);
+
+               switch (stats_res) {
+               case DIM_STATS_SAME:
+                       if (curr_stats->cpe_ratio <= 50 * prev_ix)
+                               dim->profile_ix = 0;
+                       break;
+               case DIM_STATS_WORSE:
+                       dim_turn(dim);
+                       /* fall through */
+               case DIM_STATS_BETTER:
+                       step_res = rdma_dim_step(dim);
+                       if (step_res == DIM_ON_EDGE)
+                               dim_turn(dim);
+                       break;
+               }
+       }
+
+       dim->prev_stats = *curr_stats;
+
+       return dim->profile_ix != prev_ix;
+}
+
+void rdma_dim(struct dim *dim, u64 completions)
+{
+       struct dim_sample *curr_sample = &dim->measuring_sample;
+       struct dim_stats curr_stats;
+       u32 nevents;
+
+       dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0,
+                                    curr_sample->comp_ctr + completions,
+                                    &dim->measuring_sample);
+
+       switch (dim->state) {
+       case DIM_MEASURE_IN_PROGRESS:
+               nevents = curr_sample->event_ctr - dim->start_sample.event_ctr;
+               if (nevents < DIM_NEVENTS)
+                       break;
+               dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats);
+               if (rdma_dim_decision(&curr_stats, dim)) {
+                       dim->state = DIM_APPLY_NEW_PROFILE;
+                       schedule_work(&dim->work);
+                       break;
+               }
+               /* fall through */
+       case DIM_START_MEASURE:
+               dim->state = DIM_MEASURE_IN_PROGRESS;
+               dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0,
+                                            curr_sample->comp_ctr,
+                                            &dim->start_sample);
+               break;
+       case DIM_APPLY_NEW_PROFILE:
+               break;
+       }
+}
+EXPORT_SYMBOL(rdma_dim);
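On the consumer side, the CQ completion path is expected to feed the number of completions it just reaped into rdma_dim(), and the work item scheduled above applies whatever moderation profile profile_ix now selects. A rough sketch of that wiring (the sketch_* names are invented; programming the actual moderation values is driver-specific and left as a comment):

#include <linux/dim.h>
#include <linux/workqueue.h>

struct sketch_cq {
	struct dim dim;
	/* device handle, EQ/CQ identifiers, ... */
};

static void sketch_dim_workfn(struct work_struct *work)
{
	struct dim *dim = container_of(work, struct dim, work);

	/* program the EQ/CQ with the usec/comps values selected by
	 * dim->profile_ix, then restart the measurement window
	 */
	dim->state = DIM_START_MEASURE;
}

static void sketch_cq_init(struct sketch_cq *cq)
{
	INIT_WORK(&cq->dim.work, sketch_dim_workfn);
}

static void sketch_cq_poll_done(struct sketch_cq *cq, int completions)
{
	if (completions > 0)
		rdma_dim(&cq->dim, completions);
}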
index 8891822eba4f7fcd180ed54f34b83d587de367b3..c36d89cd14a10ec02b9c0708a468b244880dc9f0 100644 (file)
@@ -607,11 +607,11 @@ send_hdrs_dma_out:
 qp_out:
        rdma_destroy_qp(ic->i_cm_id);
 recv_cq_out:
-       if (!ib_destroy_cq(ic->i_recv_cq))
-               ic->i_recv_cq = NULL;
+       ib_destroy_cq(ic->i_recv_cq);
+       ic->i_recv_cq = NULL;
 send_cq_out:
-       if (!ib_destroy_cq(ic->i_send_cq))
-               ic->i_send_cq = NULL;
+       ib_destroy_cq(ic->i_send_cq);
+       ic->i_send_cq = NULL;
 rds_ibdev_out:
        rds_ib_remove_conn(rds_ibdev, conn);
 out: