virtio_net: Defer skb allocation in receive path Date: Wed, 13 Jan 2010 12:53:38...
author Shirley Ma <mashirle@us.ibm.com>
Fri, 29 Jan 2010 03:20:04 +0000 (03:20 +0000)
committer David S. Miller <davem@davemloft.net>
Tue, 2 Feb 2010 23:55:42 +0000 (15:55 -0800)
virtio_net receives packets from its pre-allocated vring buffers, then
delivers these packets to upper layer protocols as skbs. So it is not
necessary to pre-allocate an skb for each mergeable buffer and then free the
extra skbs when buffers are merged into a large packet. This patch defers
skb allocation in the receive path for both big packets and mergeable buffers
to reduce skb pre-allocations and skb frees. It frees unused buffers by calling
detach_unused_buf in vring, so the recv skb queue is no longer needed.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/virtio_net.c

index 6b92e383c653b2a724caa8308292dd307cda7332..9d8984a3741c3d0d8ef562d82e2f62ecd369b316 100644 (file)
@@ -56,8 +56,7 @@ struct virtnet_info
        /* Host will merge rx buffers for big packets (shake it! shake it!) */
        bool mergeable_rx_bufs;
 
-       /* Receive & send queues. */
-       struct sk_buff_head recv;
+       /* Send queue. */
        struct sk_buff_head send;
 
        /* Work struct for refilling if we run low on memory. */
@@ -75,34 +74,44 @@ struct skb_vnet_hdr {
        unsigned int num_sg;
 };
 
+struct padded_vnet_hdr {
+       struct virtio_net_hdr hdr;
+       /*
+        * virtio_net_hdr should be in a separate sg buffer because of a
+        * QEMU bug, and the data sg buffer shares the same page as this header sg.
+        * This padding makes the next sg 16-byte aligned after virtio_net_hdr.
+        */
+       char padding[6];
+};
+
 static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 {
        return (struct skb_vnet_hdr *)skb->cb;
 }
 
-static void give_a_page(struct virtnet_info *vi, struct page *page)
-{
-       page->private = (unsigned long)vi->pages;
-       vi->pages = page;
-}
-
-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
+/*
+ * private is used to chain pages for big packets, put the whole
+ * most recent used list in the beginning for reuse
+ */
+static void give_pages(struct virtnet_info *vi, struct page *page)
 {
-       unsigned int i;
+       struct page *end;
 
-       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-               give_a_page(vi, skb_shinfo(skb)->frags[i].page);
-       skb_shinfo(skb)->nr_frags = 0;
-       skb->data_len = 0;
+       /* Find end of list, sew whole thing into vi->pages. */
+       for (end = page; end->private; end = (struct page *)end->private);
+       end->private = (unsigned long)vi->pages;
+       vi->pages = page;
 }
 
 static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 {
        struct page *p = vi->pages;
 
-       if (p)
+       if (p) {
                vi->pages = (struct page *)p->private;
-       else
+               /* clear private here, it is used to chain pages */
+               p->private = 0;
+       } else
                p = alloc_page(gfp_mask);
        return p;
 }
@@ -118,99 +127,142 @@ static void skb_xmit_done(struct virtqueue *svq)
        netif_wake_queue(vi->dev);
 }
 
-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
-                       unsigned len)
+static void set_skb_frag(struct sk_buff *skb, struct page *page,
+                        unsigned int offset, unsigned int *len)
 {
-       struct virtnet_info *vi = netdev_priv(dev);
-       struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
-       int err;
-       int i;
-
-       if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
-               pr_debug("%s: short packet %i\n", dev->name, len);
-               dev->stats.rx_length_errors++;
-               goto drop;
-       }
+       int i = skb_shinfo(skb)->nr_frags;
+       skb_frag_t *f;
+
+       f = &skb_shinfo(skb)->frags[i];
+       f->size = min((unsigned)PAGE_SIZE - offset, *len);
+       f->page_offset = offset;
+       f->page = page;
+
+       skb->data_len += f->size;
+       skb->len += f->size;
+       skb_shinfo(skb)->nr_frags++;
+       *len -= f->size;
+}
 
-       if (vi->mergeable_rx_bufs) {
-               unsigned int copy;
-               char *p = page_address(skb_shinfo(skb)->frags[0].page);
+static struct sk_buff *page_to_skb(struct virtnet_info *vi,
+                                  struct page *page, unsigned int len)
+{
+       struct sk_buff *skb;
+       struct skb_vnet_hdr *hdr;
+       unsigned int copy, hdr_len, offset;
+       char *p;
 
-               if (len > PAGE_SIZE)
-                       len = PAGE_SIZE;
-               len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       p = page_address(page);
 
-               memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
-               p += sizeof(hdr->mhdr);
+       /* copy small packet so we can reuse these pages for small data */
+       skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
+       if (unlikely(!skb))
+               return NULL;
 
-               copy = len;
-               if (copy > skb_tailroom(skb))
-                       copy = skb_tailroom(skb);
+       hdr = skb_vnet_hdr(skb);
 
-               memcpy(skb_put(skb, copy), p, copy);
+       if (vi->mergeable_rx_bufs) {
+               hdr_len = sizeof hdr->mhdr;
+               offset = hdr_len;
+       } else {
+               hdr_len = sizeof hdr->hdr;
+               offset = sizeof(struct padded_vnet_hdr);
+       }
 
-               len -= copy;
+       memcpy(hdr, p, hdr_len);
 
-               if (!len) {
-                       give_a_page(vi, skb_shinfo(skb)->frags[0].page);
-                       skb_shinfo(skb)->nr_frags--;
-               } else {
-                       skb_shinfo(skb)->frags[0].page_offset +=
-                               sizeof(hdr->mhdr) + copy;
-                       skb_shinfo(skb)->frags[0].size = len;
-                       skb->data_len += len;
-                       skb->len += len;
-               }
+       len -= hdr_len;
+       p += offset;
 
-               while (--hdr->mhdr.num_buffers) {
-                       struct sk_buff *nskb;
+       copy = len;
+       if (copy > skb_tailroom(skb))
+               copy = skb_tailroom(skb);
+       memcpy(skb_put(skb, copy), p, copy);
 
-                       i = skb_shinfo(skb)->nr_frags;
-                       if (i >= MAX_SKB_FRAGS) {
-                               pr_debug("%s: packet too long %d\n", dev->name,
-                                        len);
-                               dev->stats.rx_length_errors++;
-                               goto drop;
-                       }
+       len -= copy;
+       offset += copy;
 
-                       nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
-                       if (!nskb) {
-                               pr_debug("%s: rx error: %d buffers missing\n",
-                                        dev->name, hdr->mhdr.num_buffers);
-                               dev->stats.rx_length_errors++;
-                               goto drop;
-                       }
+       while (len) {
+               set_skb_frag(skb, page, offset, &len);
+               page = (struct page *)page->private;
+               offset = 0;
+       }
 
-                       __skb_unlink(nskb, &vi->recv);
-                       vi->num--;
+       if (page)
+               give_pages(vi, page);
 
-                       skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
-                       skb_shinfo(nskb)->nr_frags = 0;
-                       kfree_skb(nskb);
+       return skb;
+}
 
-                       if (len > PAGE_SIZE)
-                               len = PAGE_SIZE;
+static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
+{
+       struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
+       struct page *page;
+       int num_buf, i, len;
+
+       num_buf = hdr->mhdr.num_buffers;
+       while (--num_buf) {
+               i = skb_shinfo(skb)->nr_frags;
+               if (i >= MAX_SKB_FRAGS) {
+                       pr_debug("%s: packet too long\n", skb->dev->name);
+                       skb->dev->stats.rx_length_errors++;
+                       return -EINVAL;
+               }
 
-                       skb_shinfo(skb)->frags[i].size = len;
-                       skb_shinfo(skb)->nr_frags++;
-                       skb->data_len += len;
-                       skb->len += len;
+               page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+               if (!page) {
+                       pr_debug("%s: rx error: %d buffers missing\n",
+                                skb->dev->name, hdr->mhdr.num_buffers);
+                       skb->dev->stats.rx_length_errors++;
+                       return -EINVAL;
                }
-       } else {
-               len -= sizeof(hdr->hdr);
+               if (len > PAGE_SIZE)
+                       len = PAGE_SIZE;
+
+               set_skb_frag(skb, page, 0, &len);
+
+               --vi->num;
+       }
+       return 0;
+}
+
+static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
+{
+       struct virtnet_info *vi = netdev_priv(dev);
+       struct sk_buff *skb;
+       struct page *page;
+       struct skb_vnet_hdr *hdr;
 
-               if (len <= MAX_PACKET_LEN)
-                       trim_pages(vi, skb);
+       if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
+               pr_debug("%s: short packet %i\n", dev->name, len);
+               dev->stats.rx_length_errors++;
+               if (vi->mergeable_rx_bufs || vi->big_packets)
+                       give_pages(vi, buf);
+               else
+                       dev_kfree_skb(buf);
+               return;
+       }
 
-               err = pskb_trim(skb, len);
-               if (err) {
-                       pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
-                                len, err);
+       if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+               skb = buf;
+               len -= sizeof(struct virtio_net_hdr);
+               skb_trim(skb, len);
+       } else {
+               page = buf;
+               skb = page_to_skb(vi, page, len);
+               if (unlikely(!skb)) {
                        dev->stats.rx_dropped++;
-                       goto drop;
+                       give_pages(vi, page);
+                       return;
                }
+               if (vi->mergeable_rx_bufs)
+                       if (receive_mergeable(vi, skb)) {
+                               dev_kfree_skb(skb);
+                               return;
+                       }
        }
 
+       hdr = skb_vnet_hdr(skb);
        skb->truesize += skb->data_len;
        dev->stats.rx_bytes += skb->len;
        dev->stats.rx_packets++;
@@ -267,110 +319,119 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 
 frame_err:
        dev->stats.rx_frame_errors++;
-drop:
        dev_kfree_skb(skb);
 }
 
-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 {
        struct sk_buff *skb;
-       struct scatterlist sg[2+MAX_SKB_FRAGS];
-       int num, err, i;
-       bool oom = false;
-
-       sg_init_table(sg, 2+MAX_SKB_FRAGS);
-       do {
-               struct skb_vnet_hdr *hdr;
+       struct skb_vnet_hdr *hdr;
+       struct scatterlist sg[2];
+       int err;
 
-               skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
-               if (unlikely(!skb)) {
-                       oom = true;
-                       break;
-               }
+       skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
+       if (unlikely(!skb))
+               return -ENOMEM;
 
-               skb_put(skb, MAX_PACKET_LEN);
+       skb_put(skb, MAX_PACKET_LEN);
 
-               hdr = skb_vnet_hdr(skb);
-               sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
+       hdr = skb_vnet_hdr(skb);
+       sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
 
-               if (vi->big_packets) {
-                       for (i = 0; i < MAX_SKB_FRAGS; i++) {
-                               skb_frag_t *f = &skb_shinfo(skb)->frags[i];
-                               f->page = get_a_page(vi, gfp);
-                               if (!f->page)
-                                       break;
+       skb_to_sgvec(skb, sg + 1, 0, skb->len);
 
-                               f->page_offset = 0;
-                               f->size = PAGE_SIZE;
+       err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+       if (err < 0)
+               dev_kfree_skb(skb);
 
-                               skb->data_len += PAGE_SIZE;
-                               skb->len += PAGE_SIZE;
+       return err;
+}
 
-                               skb_shinfo(skb)->nr_frags++;
-                       }
+static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
+{
+       struct scatterlist sg[MAX_SKB_FRAGS + 2];
+       struct page *first, *list = NULL;
+       char *p;
+       int i, err, offset;
+
+       /* page in sg[MAX_SKB_FRAGS + 1] is list tail */
+       for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
+               first = get_a_page(vi, gfp);
+               if (!first) {
+                       if (list)
+                               give_pages(vi, list);
+                       return -ENOMEM;
                }
+               sg_set_buf(&sg[i], page_address(first), PAGE_SIZE);
 
-               num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
-               skb_queue_head(&vi->recv, skb);
+               /* chain new page in list head to match sg */
+               first->private = (unsigned long)list;
+               list = first;
+       }
 
-               err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
-               if (err < 0) {
-                       skb_unlink(skb, &vi->recv);
-                       trim_pages(vi, skb);
-                       kfree_skb(skb);
-                       break;
-               }
-               vi->num++;
-       } while (err >= num);
-       if (unlikely(vi->num > vi->max))
-               vi->max = vi->num;
-       vi->rvq->vq_ops->kick(vi->rvq);
-       return !oom;
+       first = get_a_page(vi, gfp);
+       if (!first) {
+               give_pages(vi, list);
+               return -ENOMEM;
+       }
+       p = page_address(first);
+
+       /* sg[0], sg[1] share the same page */
+       /* a separate sg[0] for virtio_net_hdr only, due to a QEMU bug */
+       sg_set_buf(&sg[0], p, sizeof(struct virtio_net_hdr));
+
+       /* sg[1] for data packet, from offset */
+       offset = sizeof(struct padded_vnet_hdr);
+       sg_set_buf(&sg[1], p + offset, PAGE_SIZE - offset);
+
+       /* chain first in list head */
+       first->private = (unsigned long)list;
+       err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2,
+                                      first);
+       if (err < 0)
+               give_pages(vi, first);
+
+       return err;
 }
 
-/* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
 {
-       struct sk_buff *skb;
-       struct scatterlist sg[1];
+       struct page *page;
+       struct scatterlist sg;
        int err;
-       bool oom = false;
-
-       if (!vi->mergeable_rx_bufs)
-               return try_fill_recv_maxbufs(vi, gfp);
 
-       do {
-               skb_frag_t *f;
+       page = get_a_page(vi, gfp);
+       if (!page)
+               return -ENOMEM;
 
-               skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
-               if (unlikely(!skb)) {
-                       oom = true;
-                       break;
-               }
+       sg_init_one(&sg, page_address(page), PAGE_SIZE);
 
-               f = &skb_shinfo(skb)->frags[0];
-               f->page = get_a_page(vi, gfp);
-               if (!f->page) {
-                       oom = true;
-                       kfree_skb(skb);
-                       break;
-               }
+       err = vi->rvq->vq_ops->add_buf(vi->rvq, &sg, 0, 1, page);
+       if (err < 0)
+               give_pages(vi, page);
 
-               f->page_offset = 0;
-               f->size = PAGE_SIZE;
+       return err;
+}
 
-               skb_shinfo(skb)->nr_frags++;
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
+{
+       int err;
+       bool oom = false;
 
-               sg_init_one(sg, page_address(f->page), PAGE_SIZE);
-               skb_queue_head(&vi->recv, skb);
+       do {
+               if (vi->mergeable_rx_bufs)
+                       err = add_recvbuf_mergeable(vi, gfp);
+               else if (vi->big_packets)
+                       err = add_recvbuf_big(vi, gfp);
+               else
+                       err = add_recvbuf_small(vi, gfp);
 
-               err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
                if (err < 0) {
-                       skb_unlink(skb, &vi->recv);
-                       kfree_skb(skb);
+                       oom = true;
                        break;
                }
-               vi->num++;
+               ++vi->num;
        } while (err > 0);
        if (unlikely(vi->num > vi->max))
                vi->max = vi->num;
@@ -407,15 +468,14 @@ static void refill_work(struct work_struct *work)
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
        struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
-       struct sk_buff *skb = NULL;
+       void *buf;
        unsigned int len, received = 0;
 
 again:
        while (received < budget &&
-              (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
-               __skb_unlink(skb, &vi->recv);
-               receive_skb(vi->dev, skb, len);
-               vi->num--;
+              (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+               receive_buf(vi->dev, buf, len);
+               --vi->num;
                received++;
        }
 
@@ -495,9 +555,9 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 
        /* Encode metadata header at front. */
        if (vi->mergeable_rx_bufs)
-               sg_set_buf(sg, &hdr->mhdr, sizeof(hdr->mhdr));
+               sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
        else
-               sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
+               sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
 
        hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
        return vi->svq->vq_ops->add_buf(vi->svq, sg, hdr->num_sg, 0, skb);
@@ -917,8 +977,7 @@ static int virtnet_probe(struct virtio_device *vdev)
                        dev->features |= NETIF_F_HW_VLAN_FILTER;
        }
 
-       /* Initialize our empty receive and send queues. */
-       skb_queue_head_init(&vi->recv);
+       /* Initialize our empty send queue. */
        skb_queue_head_init(&vi->send);
 
        err = register_netdev(dev);
@@ -953,25 +1012,35 @@ free:
        return err;
 }
 
+static void free_unused_bufs(struct virtnet_info *vi)
+{
+       void *buf;
+       while (1) {
+               buf = vi->rvq->vq_ops->detach_unused_buf(vi->rvq);
+               if (!buf)
+                       break;
+               if (vi->mergeable_rx_bufs || vi->big_packets)
+                       give_pages(vi, buf);
+               else
+                       dev_kfree_skb(buf);
+               --vi->num;
+       }
+       BUG_ON(vi->num != 0);
+}
+
 static void __devexit virtnet_remove(struct virtio_device *vdev)
 {
        struct virtnet_info *vi = vdev->priv;
-       struct sk_buff *skb;
 
        /* Stop all the virtqueues. */
        vdev->config->reset(vdev);
 
-       /* Free our skbs in send and recv queues, if any. */
-       while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
-               kfree_skb(skb);
-               vi->num--;
-       }
+       /* Free our skbs in send queue, if any. */
        __skb_queue_purge(&vi->send);
 
-       BUG_ON(vi->num != 0);
-
        unregister_netdev(vi->dev);
        cancel_delayed_work_sync(&vi->refill);
+       free_unused_bufs(vi);
 
        vdev->config->del_vqs(vi->vdev);