NTB: Improve performance with write combining
[linux-2.6-block.git] / drivers / ntb / ntb_transport.c
index 8d9b59f7fa07ae167f847047e1f39bd47dfd7406..7a765d3230d86f5e45442c0e851721a5a7dfa406 100644 (file)
@@ -58,6 +58,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/uaccess.h>
 #include "linux/ntb.h"
 #include "linux/ntb_transport.h"
 
@@ -346,6 +347,7 @@ int ntb_transport_register_client_dev(char *device_name)
 {
        struct ntb_transport_client_dev *client_dev;
        struct ntb_transport_ctx *nt;
+       int node;
        int rc, i = 0;
 
        if (list_empty(&ntb_transport_list))
@@ -354,8 +356,10 @@ int ntb_transport_register_client_dev(char *device_name)
        list_for_each_entry(nt, &ntb_transport_list, entry) {
                struct device *dev;
 
-               client_dev = kzalloc(sizeof(*client_dev),
-                                    GFP_KERNEL);
+               node = dev_to_node(&nt->ndev->dev);
+
+               client_dev = kzalloc_node(sizeof(*client_dev),
+                                         GFP_KERNEL, node);
                if (!client_dev) {
                        rc = -ENOMEM;
                        goto err;
@@ -648,18 +652,37 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
        return 0;
 }
 
+static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp) /* reset a queue's link state and ring/statistics counters */
+{
+       qp->link_is_up = false; /* mark the link down before the counters are cleared */
+
+       qp->tx_index = 0; /* restart the TX ring from the first entry */
+       qp->rx_index = 0; /* restart the RX ring from the first entry */
+       qp->rx_bytes = 0; /* zero every RX statistic tracked on this queue */
+       qp->rx_pkts = 0;
+       qp->rx_ring_empty = 0;
+       qp->rx_err_no_buf = 0;
+       qp->rx_err_oflow = 0;
+       qp->rx_err_ver = 0;
+       qp->rx_memcpy = 0;
+       qp->rx_async = 0;
+       qp->tx_bytes = 0; /* zero every TX statistic tracked on this queue */
+       qp->tx_pkts = 0;
+       qp->tx_ring_full = 0;
+       qp->tx_err_no_buf = 0;
+       qp->tx_memcpy = 0;
+       qp->tx_async = 0;
+}
+
 static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
 {
        struct ntb_transport_ctx *nt = qp->transport;
        struct pci_dev *pdev = nt->ndev->pdev;
 
-       if (qp->link_is_up) {
-               cancel_delayed_work_sync(&qp->link_work);
-               return;
-       }
-
        dev_info(&pdev->dev, "qp %d: Link Cleanup\n", qp->qp_num);
-       qp->link_is_up = false;
+
+       cancel_delayed_work_sync(&qp->link_work);
+       ntb_qp_link_down_reset(qp);
 
        if (qp->event_handler)
                qp->event_handler(qp->cb_data, qp->link_is_up);
@@ -831,7 +854,7 @@ static void ntb_qp_link_work(struct work_struct *work)
 
        /* query remote spad for qp ready bits */
        ntb_peer_spad_read(nt->ndev, QP_LINKS);
-       dev_dbg(&pdev->dev, "Remote QP link status = %x\n", val);
+       dev_dbg_ratelimited(&pdev->dev, "Remote QP link status = %x\n", val);
 
        /* See if the remote side is up */
        if (val & BIT(qp->qp_num)) {
@@ -866,9 +889,9 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
        qp->qp_num = qp_num;
        qp->transport = nt;
        qp->ndev = nt->ndev;
-       qp->link_is_up = false;
        qp->client_ready = false;
        qp->event_handler = NULL;
+       ntb_qp_link_down_reset(qp);
 
        if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
                num_qps_mw = qp_count / mw_count + 1;
@@ -934,6 +957,7 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
        struct ntb_transport_mw *mw;
        unsigned int mw_count, qp_count;
        u64 qp_bitmap;
+       int node;
        int rc, i;
 
        if (ntb_db_is_unsafe(ndev))
@@ -943,7 +967,9 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
                dev_dbg(&ndev->dev,
                        "scratchpad is unsafe, proceed anyway...\n");
 
-       nt = kzalloc(sizeof(*nt), GFP_KERNEL);
+       node = dev_to_node(&ndev->dev);
+
+       nt = kzalloc_node(sizeof(*nt), GFP_KERNEL, node);
        if (!nt)
                return -ENOMEM;
 
@@ -953,7 +979,8 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 
        nt->mw_count = mw_count;
 
-       nt->mw_vec = kcalloc(mw_count, sizeof(*nt->mw_vec), GFP_KERNEL);
+       nt->mw_vec = kzalloc_node(mw_count * sizeof(*nt->mw_vec),
+                                 GFP_KERNEL, node);
        if (!nt->mw_vec) {
                rc = -ENOMEM;
                goto err;
@@ -967,7 +994,7 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
                if (rc)
                        goto err1;
 
-               mw->vbase = ioremap(mw->phys_addr, mw->phys_size);
+               mw->vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
                if (!mw->vbase) {
                        rc = -ENOMEM;
                        goto err1;
@@ -993,7 +1020,8 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
        nt->qp_bitmap = qp_bitmap;
        nt->qp_bitmap_free = qp_bitmap;
 
-       nt->qp_vec = kcalloc(qp_count, sizeof(*nt->qp_vec), GFP_KERNEL);
+       nt->qp_vec = kzalloc_node(qp_count * sizeof(*nt->qp_vec),
+                                 GFP_KERNEL, node);
        if (!nt->qp_vec) {
                rc = -ENOMEM;
                goto err2;
@@ -1212,8 +1240,7 @@ static int ntb_process_rxc(struct ntb_transport_qp *qp)
                dev_dbg(&qp->ndev->pdev->dev, "link down flag set\n");
                ntb_qp_link_down(qp);
                hdr->flags = 0;
-               iowrite32(qp->rx_index, &qp->rx_info->entry);
-               return 0;
+               return -EAGAIN;
        }
 
        if (hdr->ver != (u32)qp->rx_pkts) {
@@ -1349,7 +1376,15 @@ static void ntb_tx_copy_callback(void *data)
 
 static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
 {
+#ifdef ARCH_HAS_NOCACHE_UACCESS
+       /*
+        * Using non-temporal mov to improve performance on non-cached
+        * writes, even though we aren't actually copying from user space.
+        */
+       __copy_from_user_inatomic_nocache(offset, entry->buf, entry->len);
+#else
        memcpy_toio(offset, entry->buf, entry->len);
+#endif
 
        /* Ensure that the data is fully copied out before setting the flags */
        wmb();
@@ -1469,7 +1504,6 @@ static void ntb_send_link_down(struct ntb_transport_qp *qp)
        if (!qp->link_is_up)
                return;
 
-       qp->link_is_up = false;
        dev_info(&pdev->dev, "qp %d: Send Link Down\n", qp->qp_num);
 
        for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
@@ -1491,6 +1525,13 @@ static void ntb_send_link_down(struct ntb_transport_qp *qp)
        if (rc)
                dev_err(&pdev->dev, "ntb: QP%d unable to send linkdown msg\n",
                        qp->qp_num);
+
+       ntb_qp_link_down_reset(qp);
+}
+
+static bool ntb_dma_filter_fn(struct dma_chan *chan, void *node) /* dma_request_channel filter: accept only channels on the requested node */
+{
+       return dev_to_node(&chan->dev->device) == (int)(unsigned long)node; /* node is dev_to_node(&ndev->dev) cast through unsigned long by the caller */
 }
 
 /**
@@ -1518,12 +1559,16 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
        struct ntb_transport_qp *qp;
        u64 qp_bit;
        unsigned int free_queue;
+       dma_cap_mask_t dma_mask;
+       int node;
        int i;
 
        ndev = dev_ntb(client_dev->parent);
        pdev = ndev->pdev;
        nt = ndev->ctx;
 
+       node = dev_to_node(&ndev->dev);
+
        free_queue = ffs(nt->qp_bitmap);
        if (!free_queue)
                goto err;
@@ -1541,15 +1586,16 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
        qp->tx_handler = handlers->tx_handler;
        qp->event_handler = handlers->event_handler;
 
-       dmaengine_get();
-       qp->dma_chan = dma_find_channel(DMA_MEMCPY);
-       if (!qp->dma_chan) {
-               dmaengine_put();
+       dma_cap_zero(dma_mask);
+       dma_cap_set(DMA_MEMCPY, dma_mask);
+
+       qp->dma_chan = dma_request_channel(dma_mask, ntb_dma_filter_fn,
+                                          (void *)(unsigned long)node);
+       if (!qp->dma_chan)
                dev_info(&pdev->dev, "Unable to allocate DMA channel, using CPU instead\n");
-       }
 
        for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-               entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+               entry = kzalloc_node(sizeof(*entry), GFP_ATOMIC, node);
                if (!entry)
                        goto err1;
 
@@ -1559,7 +1605,7 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
        }
 
        for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-               entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+               entry = kzalloc_node(sizeof(*entry), GFP_ATOMIC, node);
                if (!entry)
                        goto err2;
 
@@ -1582,7 +1628,7 @@ err1:
        while ((entry = ntb_list_rm(&qp->ntb_rx_free_q_lock, &qp->rx_free_q)))
                kfree(entry);
        if (qp->dma_chan)
-               dmaengine_put();
+               dma_release_channel(qp->dma_chan);
        nt->qp_bitmap_free |= qp_bit;
 err:
        return NULL;
@@ -1619,7 +1665,7 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
                 */
                dma_sync_wait(chan, qp->last_cookie);
                dmaengine_terminate_all(chan);
-               dmaengine_put();
+               dma_release_channel(chan);
        }
 
        qp_bit = BIT_ULL(qp->qp_num);