Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ac0c143f97b4d1e840572a2e1aa2be37a239b2a1..9b6a4a875c5531133fad2080ed84af9148d27972 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -195,6 +195,9 @@ struct virtnet_info {
        /* # of XDP queue pairs currently used by the driver */
        u16 xdp_queue_pairs;
 
+       /* xdp_queue_pairs may be 0 while an XDP program is loaded (no
+        * spare queues), so track whether XDP is enabled explicitly.
+        */
+       bool xdp_enabled;
+
        /* I like... big packets and I cannot lie! */
        bool big_packets;
 
@@ -376,21 +379,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                                   struct receive_queue *rq,
                                   struct page *page, unsigned int offset,
                                   unsigned int len, unsigned int truesize,
-                                  bool hdr_valid, unsigned int metasize)
+                                  bool hdr_valid, unsigned int metasize,
+                                  unsigned int headroom)
 {
        struct sk_buff *skb;
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        unsigned int copy, hdr_len, hdr_padded_len;
-       char *p;
+       struct page *page_to_free = NULL;
+       int tailroom, shinfo_size;
+       char *p, *hdr_p, *buf;
 
        p = page_address(page) + offset;
-
-       /* copy small packet so we can reuse these pages for small data */
-       skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
-       if (unlikely(!skb))
-               return NULL;
-
-       hdr = skb_vnet_hdr(skb);
+       hdr_p = p;
 
        hdr_len = vi->hdr_len;
        if (vi->mergeable_rx_bufs)
@@ -398,14 +398,44 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
        else
                hdr_padded_len = sizeof(struct padded_vnet_hdr);
 
-       /* hdr_valid means no XDP, so we can copy the vnet header */
-       if (hdr_valid)
-               memcpy(hdr, p, hdr_len);
+       /* If headroom is nonzero, the data starts at an offset into the
+        * allocated buffer; otherwise the data begins right at the start
+        * of the allocation.
+        */
+       if (headroom) {
+               /* Buffers with headroom use PAGE_SIZE as alloc size,
+                * see add_recvbuf_mergeable() + get_mergeable_buf_len()
+                */
+               truesize = PAGE_SIZE;
+               tailroom = truesize - len - offset;
+               buf = page_address(page);
+       } else {
+               tailroom = truesize - len;
+               buf = p;
+       }
 
        len -= hdr_len;
        offset += hdr_padded_len;
        p += hdr_padded_len;
 
+       shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+       /* If the packet is larger than GOOD_COPY_LEN and the buffer has
+        * enough tailroom to hold the skb_shared_info, build the skb
+        * around the existing buffer instead of copying the data.
+        * build_skb() cannot realign the frame, so NET_IP_ALIGN must be 0.
+        */
+       if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
+               skb = build_skb(buf, truesize);
+               if (unlikely(!skb))
+                       return NULL;
+
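+               /* everything before p (headroom plus the padded vnet
+                * header) becomes skb headroom
+                */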
+               skb_reserve(skb, p - buf);
+               skb_put(skb, len);
+               goto ok;
+       }
+
+       /* Otherwise copy the packet so we can reuse these pages for small data */
+       skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
+       if (unlikely(!skb))
+               return NULL;
+
        /* Copy the whole frame if it fits skb->head, otherwise
         * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
         */
@@ -415,11 +445,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                copy = ETH_HLEN + metasize;
        skb_put_data(skb, p, copy);
 
-       if (metasize) {
-               __skb_pull(skb, metasize);
-               skb_metadata_set(skb, metasize);
-       }
-
        len -= copy;
        offset += copy;
 
@@ -427,8 +452,8 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                if (len)
                        skb_add_rx_frag(skb, 0, page, offset, len, truesize);
                else
-                       put_page(page);
-               return skb;
+                       page_to_free = page;
+               goto ok;
        }
 
        /*
@@ -455,6 +480,20 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
        if (page)
                give_pages(rq, page);
 
+ok:
+       /* hdr_valid means no XDP, so we can copy the vnet header */
+       if (hdr_valid) {
+               hdr = skb_vnet_hdr(skb);
+               memcpy(hdr, hdr_p, hdr_len);
+       }
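+       /* hdr_p points into the original page, so release it only after
+        * the header has been copied out above
+        */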
+       if (page_to_free)
+               put_page(page_to_free);
+
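+       /* XDP metadata, if any, precedes the packet data: pull it off
+        * the head and record its size
+        */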
+       if (metasize) {
+               __skb_pull(skb, metasize);
+               skb_metadata_set(skb, metasize);
+       }
+
        return skb;
 }
 
@@ -485,12 +524,41 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
        return 0;
 }
 
-static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi)
-{
-       unsigned int qp;
-
-       qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-       return &vi->sq[qp];
+/* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for XDP tx
+ * on the current cpu, so it does not need to be locked.
+ *
+ * A macro is used here instead of inline functions because we have to deal
+ * with three issues at once: 1. the choice of sq, 2. deciding whether to
+ * lock/unlock the txq, and 3. keeping sparse happy. It is difficult for two
+ * inline functions to solve all three at the same time.
+ */
+#define virtnet_xdp_get_sq(vi) ({                                       \
+       struct netdev_queue *txq;                                       \
+       typeof(vi) v = (vi);                                            \
+       unsigned int qp;                                                \
+                                                                       \
+       if (v->curr_queue_pairs > nr_cpu_ids) {                         \
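+               /* each CPU owns a dedicated XDP tx queue: no lock needed */ \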
+               qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
+               qp += smp_processor_id();                               \
+               txq = netdev_get_tx_queue(v->dev, qp);                  \
+               __netif_tx_acquire(txq);                                \
+       } else {                                                        \
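+               /* tx queues are shared with the stack: take the txq lock */ \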
+               qp = smp_processor_id() % v->curr_queue_pairs;          \
+               txq = netdev_get_tx_queue(v->dev, qp);                  \
+               __netif_tx_lock(txq, raw_smp_processor_id());           \
+       }                                                               \
+       v->sq + qp;                                                     \
+})
+
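+/* Pairs with virtnet_xdp_get_sq(): releases or unlocks the txq chosen there. */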
+#define virtnet_xdp_put_sq(vi, q) {                                     \
+       struct netdev_queue *txq;                                       \
+       typeof(vi) v = (vi);                                            \
+                                                                       \
+       txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
+       if (v->curr_queue_pairs > nr_cpu_ids)                           \
+               __netif_tx_release(txq);                                \
+       else                                                            \
+               __netif_tx_unlock(txq);                                 \
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev,
@@ -503,10 +571,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
        unsigned int len;
        int packets = 0;
        int bytes = 0;
-       int drops = 0;
+       int nxmit = 0;
        int kicks = 0;
-       int ret, err;
        void *ptr;
+       int ret;
        int i;
 
        /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
@@ -516,11 +584,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
        if (!xdp_prog)
                return -ENXIO;
 
-       sq = virtnet_xdp_sq(vi);
+       sq = virtnet_xdp_get_sq(vi);
 
        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
                ret = -EINVAL;
-               drops = n;
                goto out;
        }
 
@@ -543,13 +610,11 @@ static int virtnet_xdp_xmit(struct net_device *dev,
        for (i = 0; i < n; i++) {
                struct xdp_frame *xdpf = frames[i];
 
-               err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
-               if (err) {
-                       xdp_return_frame_rx_napi(xdpf);
-                       drops++;
-               }
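+               /* On failure stop sending; ownership of the unsent
+                * frames stays with the caller, which frees them.
+                */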
+               if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
+                       break;
+               nxmit++;
        }
-       ret = n - drops;
+       ret = nxmit;
 
        if (flags & XDP_XMIT_FLUSH) {
                if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
@@ -560,16 +625,17 @@ out:
        sq->stats.bytes += bytes;
        sq->stats.packets += packets;
        sq->stats.xdp_tx += n;
-       sq->stats.xdp_tx_drops += drops;
+       sq->stats.xdp_tx_drops += n - nxmit;
        sq->stats.kicks += kicks;
        u64_stats_update_end(&sq->stats.syncp);
 
+       virtnet_xdp_put_sq(vi, sq);
        return ret;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 {
-       return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
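+       /* xdp_queue_pairs can be 0 while XDP is active, so key off xdp_enabled */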
+       return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
 }
 
 /* We copy the packet for XDP in the following cases:
@@ -713,7 +779,9 @@ static struct sk_buff *receive_small(struct net_device *dev,
                        if (unlikely(!xdpf))
                                goto err_xdp;
                        err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
-                       if (unlikely(err < 0)) {
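+                       /* virtnet_xdp_xmit() returns the number of frames
+                        * sent; 0 means ours was not consumed and must be
+                        * freed here
+                        */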
+                       if (unlikely(!err)) {
+                               xdp_return_frame_rx_napi(xdpf);
+                       } else if (unlikely(err < 0)) {
                                trace_xdp_exception(vi->dev, xdp_prog, act);
                                goto err_xdp;
                        }
@@ -776,7 +844,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
 {
        struct page *page = buf;
        struct sk_buff *skb =
-               page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
+               page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
 
        stats->bytes += len - vi->hdr_len;
        if (unlikely(!skb))
@@ -890,7 +958,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                                put_page(page);
                                head_skb = page_to_skb(vi, rq, xdp_page, offset,
                                                       len, PAGE_SIZE, false,
-                                                      metasize);
+                                                      metasize, headroom);
                                return head_skb;
                        }
                        break;
@@ -900,7 +968,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
                        if (unlikely(!xdpf))
                                goto err_xdp;
                        err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
-                       if (unlikely(err < 0)) {
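+                       /* err == 0: the frame was not sent, free it here */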
+                       if (unlikely(!err)) {
+                               xdp_return_frame_rx_napi(xdpf);
+                       } else if (unlikely(err < 0)) {
                                trace_xdp_exception(vi->dev, xdp_prog, act);
                                if (unlikely(xdp_page != page))
                                        put_page(xdp_page);
@@ -946,7 +1016,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
        }
 
        head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
-                              metasize);
+                              metasize, headroom);
        curr_skb = head_skb;
 
        if (unlikely(!curr_skb))
@@ -1462,12 +1532,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
                xdp_do_flush();
 
        if (xdp_xmit & VIRTIO_XDP_TX) {
-               sq = virtnet_xdp_sq(vi);
+               sq = virtnet_xdp_get_sq(vi);
                if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
                        u64_stats_update_begin(&sq->stats.syncp);
                        sq->stats.kicks++;
                        u64_stats_update_end(&sq->stats.syncp);
                }
+               virtnet_xdp_put_sq(vi, sq);
        }
 
        return received;
@@ -1985,7 +2056,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
                }
                virtqueue_set_affinity(vi->rq[i].vq, mask);
                virtqueue_set_affinity(vi->sq[i].vq, mask);
-               __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
+               __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
                cpumask_clear(mask);
        }
 
@@ -2108,25 +2179,21 @@ static int virtnet_set_channels(struct net_device *dev,
 static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 {
        struct virtnet_info *vi = netdev_priv(dev);
-       char *p = (char *)data;
        unsigned int i, j;
+       u8 *p = data;
 
        switch (stringset) {
        case ETH_SS_STATS:
                for (i = 0; i < vi->curr_queue_pairs; i++) {
-                       for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
-                               snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
-                                        i, virtnet_rq_stats_desc[j].desc);
-                               p += ETH_GSTRING_LEN;
-                       }
+                       for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
+                               ethtool_sprintf(&p, "rx_queue_%u_%s", i,
+                                               virtnet_rq_stats_desc[j].desc);
                }
 
                for (i = 0; i < vi->curr_queue_pairs; i++) {
-                       for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
-                               snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s",
-                                        i, virtnet_sq_stats_desc[j].desc);
-                               p += ETH_GSTRING_LEN;
-                       }
+                       for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
+                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
+                                               virtnet_sq_stats_desc[j].desc);
                }
                break;
        }
@@ -2422,10 +2489,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 
        /* XDP requires extra queues for XDP_TX; fall back to sharing if unavailable */
        if (curr_qp + xdp_qp > vi->max_queue_pairs) {
-               NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
-               netdev_warn(dev, "request %i queues but max is %i\n",
+               netdev_warn(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
                            curr_qp + xdp_qp, vi->max_queue_pairs);
-               return -ENOMEM;
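+               /* Fall back to sharing the tx queues: XDP_TX and
+                * XDP_REDIRECT will take the txq lock, see
+                * virtnet_xdp_get_sq().
+                */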
+               xdp_qp = 0;
        }
 
        old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
@@ -2459,11 +2525,14 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
        vi->xdp_queue_pairs = xdp_qp;
 
        if (prog) {
+               vi->xdp_enabled = true;
                for (i = 0; i < vi->max_queue_pairs; i++) {
                        rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
                        if (i == 0 && !old_prog)
                                virtnet_clear_guest_offloads(vi);
                }
+       } else {
+               vi->xdp_enabled = false;
        }
 
        for (i = 0; i < vi->max_queue_pairs; i++) {
@@ -2531,7 +2600,7 @@ static int virtnet_set_features(struct net_device *dev,
        int err;
 
        if ((dev->features ^ features) & NETIF_F_LRO) {
-               if (vi->xdp_queue_pairs)
+               if (vi->xdp_enabled)
                        return -EBUSY;
 
                if (features & NETIF_F_LRO)
@@ -2981,7 +3050,8 @@ static int virtnet_probe(struct virtio_device *vdev)
                return -ENOMEM;
 
        /* Set up network device as normal. */
-       dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
+       dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
+                          IFF_TX_SKB_NO_LINEAR;
        dev->netdev_ops = &virtnet_netdev;
        dev->features = NETIF_F_HIGHDMA;