staging/rdma/hfi1: fix CQ completion order issue
author     Mike Marciniszyn <mike.marciniszyn@intel.com>
           Sun, 14 Feb 2016 20:45:53 +0000 (12:45 -0800)
committer  Doug Ledford <dledford@redhat.com>
           Fri, 11 Mar 2016 01:38:14 +0000 (20:38 -0500)
The current implementation of the sdma_wait variable has a
timing hole that can cause a completion queue entry for a pio
send to be returned ahead of the completion queue entry of an
older sdma packet.

The sdma_wait variable used to be decremented before the packet
complete routine was called.  The hole is between that decrement
and the verbs completion: in that window, a send engine using
pio could return an out-of-order completion.
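
In rough pseudo-C, distilled from the lines removed in the sdma.c
hunks below (a sketch of the ordering, not a verbatim excerpt):

	if (wait)
		drained = iowait_sdma_dec(wait); /* count can reach 0 here */
	/*
	 * window: a pio send issued here sees no pending sdma work,
	 * so its completion queue entry can be posted first
	 */
	(*txp->complete)(txp, status, drained);  /* older sdma CQE posted */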

This patch closes the hole by adding an API option to specify
an sdma_drained callback.  The atomic decrement is positioned
after the complete callback, which avoids the window as long as
the pio path does not execute while there is a non-zero sdma
count.
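
A minimal sketch of the new ordering (complete_tx() in the sdma.c
hunk below is the authoritative version):

	sdma_txclean(sde->dd, tx);
	if (complete)
		(*complete)(tx, res);       /* completion is posted first */
	if (wait && iowait_sdma_dec(wait))  /* ...then the count drops    */
		iowait_drain_wakeup(wait);  /* wakes waiters and calls
					     * sdma_drained, if set      */

The pio half of the invariant is assumed to be a guard of the form
!iowait_sdma_pending(&priv->s_iowait) when the pio send path is
chosen (the priv->s_iowait name is an assumption here, not part of
this patch), so pio never executes while the sdma count is non-zero.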

Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
drivers/staging/rdma/hfi1/iowait.h
drivers/staging/rdma/hfi1/qp.c
drivers/staging/rdma/hfi1/sdma.c
drivers/staging/rdma/hfi1/sdma.h
drivers/staging/rdma/hfi1/sdma_txreq.h
drivers/staging/rdma/hfi1/user_sdma.c
drivers/staging/rdma/hfi1/verbs.c

diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
index b5eb1e0a5aa2591c5e0d45b60f6717e3df5fa1a6..2cb3f04227528636f01be4d9b84fe442d30d0ecd 100644
@@ -69,7 +69,8 @@ struct sdma_engine;
  * @list: used to add/insert into QP/PQ wait lists
  * @tx_head: overflow list of sdma_txreq's
  * @sleep: no space callback
- * @wakeup: space callback
+ * @wakeup: space callback wakeup
+ * @sdma_drained: sdma count drained
  * @iowork: workqueue overhead
  * @wait_dma: wait for sdma_busy == 0
  * @wait_pio: wait for pio_busy == 0
@@ -104,6 +105,7 @@ struct iowait {
                struct sdma_txreq *tx,
                unsigned seq);
        void (*wakeup)(struct iowait *wait, int reason);
+       void (*sdma_drained)(struct iowait *wait);
        struct work_struct iowork;
        wait_queue_head_t wait_dma;
        wait_queue_head_t wait_pio;
@@ -122,7 +124,7 @@ struct iowait {
  * @tx_limit: limit for overflow queuing
  * @func: restart function for workqueue
  * @sleep: sleep function for no space
- * @wakeup: wakeup function for no space
+ * @resume: wakeup function for no space
  *
  * This function initializes the iowait
  * structure embedded in the QP or PQ.
@@ -138,7 +140,8 @@ static inline void iowait_init(
                struct iowait *wait,
                struct sdma_txreq *tx,
                unsigned seq),
-       void (*wakeup)(struct iowait *wait, int reason))
+       void (*wakeup)(struct iowait *wait, int reason),
+       void (*sdma_drained)(struct iowait *wait))
 {
        wait->count = 0;
        INIT_LIST_HEAD(&wait->list);
@@ -151,6 +154,7 @@ static inline void iowait_init(
        wait->tx_limit = tx_limit;
        wait->sleep = sleep;
        wait->wakeup = wakeup;
+       wait->sdma_drained = sdma_drained;
 }
 
 /**
@@ -273,6 +277,8 @@ static inline void iowait_drain_wakeup(struct iowait *wait)
 {
        wake_up(&wait->wait_dma);
        wake_up(&wait->wait_pio);
+       if (wait->sdma_drained)
+               wait->sdma_drained(wait);
 }
 
 /**
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index 2d157054576a327c7caa366d4f4209d2bc61d409..77e91f280b211568ca33d36edcfb1cec8b1af001 100644
@@ -73,6 +73,7 @@ static int iowait_sleep(
        struct sdma_txreq *stx,
        unsigned seq);
 static void iowait_wakeup(struct iowait *wait, int reason);
+static void iowait_sdma_drained(struct iowait *wait);
 static void qp_pio_drain(struct rvt_qp *qp);
 
 static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
@@ -509,6 +510,22 @@ static void iowait_wakeup(struct iowait *wait, int reason)
        hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
 }
 
+static void iowait_sdma_drained(struct iowait *wait)
+{
+       struct rvt_qp *qp = iowait_to_qp(wait);
+
+       /*
+        * This happens when the send engine notes
+        * a QP in the error state and cannot
+        * do the flush work until that QP's
+        * sdma work has finished.
+        */
+       if (qp->s_flags & RVT_S_WAIT_DMA) {
+               qp->s_flags &= ~RVT_S_WAIT_DMA;
+               hfi1_schedule_send(qp);
+       }
+}
+
 /**
  *
  * qp_to_sdma_engine - map a qp to a send engine
@@ -773,7 +790,8 @@ void notify_qp_reset(struct rvt_qp *qp)
                1,
                _hfi1_do_send,
                iowait_sleep,
-               iowait_wakeup);
+               iowait_wakeup,
+               iowait_sdma_drained);
        priv->r_adefered = 0;
        clear_ahg(qp);
 }
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
index ff38fa3b7ca5d3854abd4066abd75b7db5509475..e79f931d06cea3f93806591f76601951399c7674 100644
@@ -361,6 +361,28 @@ static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
        write_sde_csr(sde, SD(DESC_CNT), reg);
 }
 
+static inline void complete_tx(struct sdma_engine *sde,
+                              struct sdma_txreq *tx,
+                              int res)
+{
+       /* protect against complete modifying */
+       struct iowait *wait = tx->wait;
+       callback_t complete = tx->complete;
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       trace_hfi1_sdma_out_sn(sde, tx->sn);
+       if (WARN_ON_ONCE(sde->head_sn != tx->sn))
+               dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                          sde->head_sn, tx->sn);
+       sde->head_sn++;
+#endif
+       sdma_txclean(sde->dd, tx);
+       if (complete)
+               (*complete)(tx, res);
+       if (wait && iowait_sdma_dec(wait))
+               iowait_drain_wakeup(wait);
+}
+
 /*
  * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
  *
@@ -395,27 +417,8 @@ static void sdma_flush(struct sdma_engine *sde)
        }
        spin_unlock_irqrestore(&sde->flushlist_lock, flags);
        /* flush from flush list */
-       list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
-               int drained = 0;
-               /* protect against complete modifying */
-               struct iowait *wait = txp->wait;
-
-               list_del_init(&txp->list);
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-               trace_hfi1_sdma_out_sn(sde, txp->sn);
-               if (WARN_ON_ONCE(sde->head_sn != txp->sn))
-                       dd_dev_err(sde->dd, "expected %llu got %llu\n",
-                               sde->head_sn, txp->sn);
-               sde->head_sn++;
-#endif
-               sdma_txclean(sde->dd, txp);
-               if (wait)
-                       drained = iowait_sdma_dec(wait);
-               if (txp->complete)
-                       (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
-               if (wait && drained)
-                       iowait_drain_wakeup(wait);
-       }
+       list_for_each_entry_safe(txp, txp_next, &flushlist, list)
+               complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
 }
 
 /*
@@ -577,31 +580,10 @@ static void sdma_flush_descq(struct sdma_engine *sde)
                head = ++sde->descq_head & sde->sdma_mask;
                /* if now past this txp's descs, do the callback */
                if (txp && txp->next_descq_idx == head) {
-                       int drained = 0;
-                       /* protect against complete modifying */
-                       struct iowait *wait = txp->wait;
-
                        /* remove from list */
                        sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       if (wait)
-                               drained = iowait_sdma_dec(wait);
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-                       trace_hfi1_sdma_out_sn(sde, txp->sn);
-                       if (WARN_ON_ONCE(sde->head_sn != txp->sn))
-                               dd_dev_err(sde->dd, "expected %llu got %llu\n",
-                                       sde->head_sn, txp->sn);
-                       sde->head_sn++;
-#endif
-                       sdma_txclean(sde->dd, txp);
+                       complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
                        trace_hfi1_sdma_progress(sde, head, tail, txp);
-                       if (txp->complete)
-                               (*txp->complete)(
-                                       txp,
-                                       SDMA_TXREQ_S_ABORTED,
-                                       drained);
-                       if (wait && drained)
-                               iowait_drain_wakeup(wait);
-                       /* see if there is another txp */
                        txp = get_txhead(sde);
                }
                progress++;
@@ -1470,7 +1452,7 @@ static void sdma_make_progress(struct sdma_engine *sde, u64 status)
 {
        struct sdma_txreq *txp = NULL;
        int progress = 0;
-       u16 hwhead, swhead, swtail;
+       u16 hwhead, swhead;
        int idle_check_done = 0;
 
        hwhead = sdma_gethead(sde);
@@ -1491,29 +1473,9 @@ retry:
 
                /* if now past this txp's descs, do the callback */
                if (txp && txp->next_descq_idx == swhead) {
-                       int drained = 0;
-                       /* protect against complete modifying */
-                       struct iowait *wait = txp->wait;
-
                        /* remove from list */
                        sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
-                       if (wait)
-                               drained = iowait_sdma_dec(wait);
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-                       trace_hfi1_sdma_out_sn(sde, txp->sn);
-                       if (WARN_ON_ONCE(sde->head_sn != txp->sn))
-                               dd_dev_err(sde->dd, "expected %llu got %llu\n",
-                                       sde->head_sn, txp->sn);
-                       sde->head_sn++;
-#endif
-                       sdma_txclean(sde->dd, txp);
-                       if (txp->complete)
-                               (*txp->complete)(
-                                       txp,
-                                       SDMA_TXREQ_S_OK,
-                                       drained);
-                       if (wait && drained)
-                               iowait_drain_wakeup(wait);
+                       complete_tx(sde, txp, SDMA_TXREQ_S_OK);
                        /* see if there is another txp */
                        txp = get_txhead(sde);
                }
@@ -1531,6 +1493,8 @@ retry:
         * of sdma_make_progress(..) which is ensured by idle_check_done flag
         */
        if ((status & sde->idle_mask) && !idle_check_done) {
+               u16 swtail;
+
                swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
                if (swtail != hwhead) {
                        hwhead = (u16)read_sde_csr(sde, SD(HEAD));
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
index 76ed2157c514ad8f294a05c6aea43a1df7847dbc..f24b5a17322b3300cbf5599bf491a5ce3bca2838 100644
@@ -555,7 +555,7 @@ static inline int sdma_txinit_ahg(
        u8 num_ahg,
        u32 *ahg,
        u8 ahg_hlen,
-       void (*cb)(struct sdma_txreq *, int, int))
+       void (*cb)(struct sdma_txreq *, int))
 {
        if (tlen == 0)
                return -ENODATA;
@@ -618,7 +618,7 @@ static inline int sdma_txinit(
        struct sdma_txreq *tx,
        u16 flags,
        u16 tlen,
-       void (*cb)(struct sdma_txreq *, int, int))
+       void (*cb)(struct sdma_txreq *, int))
 {
        return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
 }
diff --git a/drivers/staging/rdma/hfi1/sdma_txreq.h b/drivers/staging/rdma/hfi1/sdma_txreq.h
index 2effb35b9b91058e9badcf562072a52a25617928..bf7d777d756ec44efd9bbb9e89e30ed2f4982531 100644
@@ -93,7 +93,7 @@ struct sdma_desc {
 #define SDMA_TXREQ_F_USE_AHG      0x0004
 
 struct sdma_txreq;
-typedef void (*callback_t)(struct sdma_txreq *, int, int);
+typedef void (*callback_t)(struct sdma_txreq *, int);
 
 struct iowait;
 struct sdma_txreq {
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index ac903099843eb4eeb7d4499be022879f1edd194a..dfa9ef209793b2e8d87b2541d3339968b28b1ae1 100644
@@ -273,7 +273,7 @@ struct user_sdma_txreq {
 
 static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
 static int num_user_pages(const struct iovec *);
-static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int);
 static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
 static void user_sdma_free_request(struct user_sdma_request *, bool);
 static int pin_vector_pages(struct user_sdma_request *,
@@ -388,7 +388,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
        init_waitqueue_head(&pq->wait);
 
        iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
-                   activate_packet_queue);
+                   activate_packet_queue, NULL);
        pq->reqidx = 0;
        snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
                 fd->subctxt);
@@ -1341,8 +1341,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
  * tx request have been processed by the DMA engine. Called in
  * interrupt context.
  */
-static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
-                              int drain)
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 {
        struct user_sdma_txreq *tx =
                container_of(txreq, struct user_sdma_txreq, txreq);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index d900374abe70459d107243ea0ef980e8f36325aa..31419666cc693312273eb7e9d9740ec6f257f1a9 100644
@@ -130,8 +130,7 @@ MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
 static void verbs_sdma_complete(
        struct sdma_txreq *cookie,
-       int status,
-       int drained);
+       int status);
 
 static int pio_wait(struct rvt_qp *qp,
                    struct send_context *sc,
@@ -523,8 +522,7 @@ void update_sge(struct rvt_sge_state *ss, u32 length)
 /* New API */
 static void verbs_sdma_complete(
        struct sdma_txreq *cookie,
-       int status,
-       int drained)
+       int status)
 {
        struct verbs_txreq *tx =
                container_of(cookie, struct verbs_txreq, txreq);
@@ -539,18 +537,6 @@ static void verbs_sdma_complete(
                hdr = &tx->phdr.hdr;
                hfi1_rc_send_complete(qp, hdr);
        }
-       if (drained) {
-               /*
-                * This happens when the send engine notes
-                * a QP in the error state and cannot
-                * do the flush work until that QP's
-                * sdma work has finished.
-                */
-               if (qp->s_flags & RVT_S_WAIT_DMA) {
-                       qp->s_flags &= ~RVT_S_WAIT_DMA;
-                       hfi1_schedule_send(qp);
-               }
-       }
        spin_unlock(&qp->s_lock);
 
        hfi1_put_txreq(tx);