svcrdma: Avoid DMA mapping small RPC Replies
author		Chuck Lever <chuck.lever@oracle.com>
		Tue, 3 Mar 2020 18:28:14 +0000 (13:28 -0500)
committer	Chuck Lever <chuck.lever@oracle.com>
		Mon, 16 Mar 2020 16:04:33 +0000 (12:04 -0400)
On some platforms, DMA mapping part of a page is more costly than
copying bytes. Not involving the I/O MMU also helps the RPC/RDMA
transport scale better for tiny I/Os across more RDMA devices,
because interaction with the I/O MMU is eliminated for each of
these small I/Os. And since the buffer is never mapped, no explicit
unmapping is needed, so the NIC no longer has to do a costly
internal TLB shootdown for buffers that are just a handful of bytes.
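
To make the trade-off concrete, here is a minimal stand-alone sketch
of the decision this patch introduces. It is a sketch only, not the
kernel code: reply_wants_pullup() is a hypothetical helper, and 4096
mirrors the in-tree value of RPCRDMA_V1_DEF_INLINE_SIZE. When the
transport header plus the RPC message fit under the threshold, the
Reply is copied ("pulled up") into the already-mapped Send buffer
instead of DMA-mapping each xdr_buf element:

    #include <stdbool.h>

    /* Mirrors the enum added below: half the v1 default inline size. */
    enum {
            RPCRDMA_V1_DEF_INLINE_SIZE = 4096,
            RPCRDMA_PULLUP_THRESH = RPCRDMA_V1_DEF_INLINE_SIZE >> 1, /* 2048 */
    };

    /* true:  copy the Reply into the pre-mapped Send buffer
     *        (no per-I/O IOMMU work, no unmap, no NIC TLB shootdown);
     * false: DMA-map the xdr_buf elements as before.
     */
    static bool reply_wants_pullup(unsigned int hdr_len, unsigned int msg_len)
    {
            return hdr_len + msg_len < RPCRDMA_PULLUP_THRESH;
    }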

Since pull-up is now a more frequent operation, I've introduced a
trace point in the pull-up path. It can be used for debugging or by
user-space tools that count pull-up frequency.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
include/linux/sunrpc/svc_rdma.h
include/trace/events/rpcrdma.h
net/sunrpc/xprtrdma/svc_rdma_sendto.c

include/linux/sunrpc/svc_rdma.h
index a3fa5b4fa2e42d8d208da3d25fd0d299926af1f7..78fe2ac6dc6c5d7b3c477b42cecf04920622425c 100644
@@ -52,6 +52,7 @@
 
 /* Default and maximum inline threshold sizes */
 enum {
+       RPCRDMA_PULLUP_THRESH = RPCRDMA_V1_DEF_INLINE_SIZE >> 1,
        RPCRDMA_DEF_INLINE_THRESH = 4096,
        RPCRDMA_MAX_INLINE_THRESH = 65536
 };
include/trace/events/rpcrdma.h
index 74b68547eefb5f0642eede7dea37bf086829d3e9..9238d233f8cf2c7f7d6f417e2c6a46571211017a 100644
@@ -1639,6 +1639,24 @@ TRACE_EVENT(svcrdma_dma_map_rwctx,
        )
 );
 
+TRACE_EVENT(svcrdma_send_pullup,
+       TP_PROTO(
+               unsigned int len
+       ),
+
+       TP_ARGS(len),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, len)
+       ),
+
+       TP_fast_assign(
+               __entry->len = len;
+       ),
+
+       TP_printk("len=%u", __entry->len)
+);
+
 TRACE_EVENT(svcrdma_send_failed,
        TP_PROTO(
                const struct svc_rqst *rqst,
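
Once applied, the new svcrdma_send_pullup event is part of the
rpcrdma trace system, so it can be enabled from user space through
tracefs, e.g. by writing '1' to
/sys/kernel/debug/tracing/events/rpcrdma/svcrdma_send_pullup/enable
and reading trace_pipe; each record prints as "len=<bytes>" per the
TP_printk above.
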
net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 7b9853214769239c573d8c95c642433cd1aee34e..90cba3058f04cf40f69985ad01ee631982331f5c 100644
@@ -541,6 +541,7 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
 /**
  * svc_rdma_pull_up_needed - Determine whether to use pull-up
  * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR
  * @rctxt: Write and Reply chunks provided by client
  * @xdr: xdr_buf containing RPC message to transmit
  *
@@ -549,11 +550,20 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
  *     %false otherwise
  */
 static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+                                   struct svc_rdma_send_ctxt *sctxt,
                                    const struct svc_rdma_recv_ctxt *rctxt,
                                    struct xdr_buf *xdr)
 {
        int elements;
 
+       /* For small messages, copying bytes is cheaper than DMA mapping.
+        */
+       if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+               return true;
+
+       /* Check whether the xdr_buf has more elements than can
+        * fit in a single RDMA Send.
+        */
        /* xdr->head */
        elements = 1;
 
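A note on the new early return: sc_hdrbuf.len is the length of the
already-encoded RPC/RDMA transport header and xdr->len is the length
of the RPC Reply message, so the comparison covers the entire payload
of the eventual Send.
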
@@ -636,6 +646,7 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
                memcpy(dst, tailbase, taillen);
 
        sctxt->sc_sges[0].length += xdr->len;
+       trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
        return 0;
 }
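
Because sc_sges[0] also covers the transport header that precedes the
copied message in the Send buffer, the len reported by
svcrdma_send_pullup is the full Send payload: transport header plus
the pulled-up RPC message.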
 
@@ -675,7 +686,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
        /* For pull-up, svc_rdma_send() will sync the transport header.
         * No additional DMA mapping is necessary.
         */
-       if (svc_rdma_pull_up_needed(rdma, rctxt, xdr))
+       if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
                return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
 
        ++sctxt->sc_cur_sge_no;