1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  drivers/net/veth.c
4  *
5  *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
6  *
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
9  *
10  */
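/* Typical usage from userspace, shown only as an illustrative example
 * (assumes the iproute2 tools, not part of this driver):
 *
 *   ip link add veth0 type veth peer name veth1
 *   ip link set veth0 up && ip link set veth1 up
 *
 * Frames transmitted on one end of the pair are received on the other.
 */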
11
12 #include <linux/netdevice.h>
13 #include <linux/slab.h>
14 #include <linux/ethtool.h>
15 #include <linux/etherdevice.h>
16 #include <linux/u64_stats_sync.h>
17
18 #include <net/rtnetlink.h>
19 #include <net/dst.h>
20 #include <net/xfrm.h>
21 #include <net/xdp.h>
22 #include <linux/veth.h>
23 #include <linux/module.h>
24 #include <linux/bpf.h>
25 #include <linux/filter.h>
26 #include <linux/ptr_ring.h>
27 #include <linux/bpf_trace.h>
28 #include <linux/net_tstamp.h>
29 #include <linux/skbuff_ref.h>
30 #include <net/page_pool/helpers.h>
31
32 #define DRV_NAME        "veth"
33 #define DRV_VERSION     "1.0"
34
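/* VETH_RING_SIZE is the capacity of each per-queue ptr_ring (and of the
 * matching page_pool). VETH_XDP_HEADROOM accounts for the headroom an XDP
 * program may consume via bpf_xdp_adjust_head() plus NET_IP_ALIGN, and bounds
 * the maximum MTU allowed while a program is attached (see veth_xdp_set()).
 * VETH_XDP_TX_BULK_SIZE and VETH_XDP_BATCH bound, respectively, how many
 * XDP_TX frames are queued before a flush and how many frames are converted
 * to skbs per bulk allocation.
 */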
35 #define VETH_XDP_FLAG           BIT(0)
36 #define VETH_RING_SIZE          256
37 #define VETH_XDP_HEADROOM       (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
38
39 #define VETH_XDP_TX_BULK_SIZE   16
40 #define VETH_XDP_BATCH          16
41
42 struct veth_stats {
43         u64     rx_drops;
44         /* xdp */
45         u64     xdp_packets;
46         u64     xdp_bytes;
47         u64     xdp_redirect;
48         u64     xdp_drops;
49         u64     xdp_tx;
50         u64     xdp_tx_err;
51         u64     peer_tq_xdp_xmit;
52         u64     peer_tq_xdp_xmit_err;
53 };
54
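/* Per-queue counters are paired with a u64_stats_sync so that readers such as
 * veth_get_ethtool_stats() and veth_stats_rx() can fetch a consistent 64-bit
 * snapshot even on 32-bit hosts, without taking a lock on the hot path.
 */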
55 struct veth_rq_stats {
56         struct veth_stats       vs;
57         struct u64_stats_sync   syncp;
58 };
59
60 struct veth_rq {
61         struct napi_struct      xdp_napi;
62         struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
63         struct net_device       *dev;
64         struct bpf_prog __rcu   *xdp_prog;
65         struct xdp_mem_info     xdp_mem;
66         struct veth_rq_stats    stats;
67         bool                    rx_notify_masked;
68         struct ptr_ring         xdp_ring;
69         struct xdp_rxq_info     xdp_rxq;
70         struct page_pool        *page_pool;
71 };
72
73 struct veth_priv {
74         struct net_device __rcu *peer;
75         atomic64_t              dropped;
76         struct bpf_prog         *_xdp_prog;
77         struct veth_rq          *rq;
78         unsigned int            requested_headroom;
79 };
80
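/* Small per-NAPI-poll buffer for XDP_TX: frames are collected here and pushed
 * to the peer's xdp_ring in one batch under a single producer lock, see
 * veth_xdp_flush_bq().
 */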
81 struct veth_xdp_tx_bq {
82         struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
83         unsigned int count;
84 };
85
86 /*
87  * ethtool interface
88  */
89
90 struct veth_q_stat_desc {
91         char    desc[ETH_GSTRING_LEN];
92         size_t  offset;
93 };
94
95 #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)
96
97 static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
98         { "xdp_packets",        VETH_RQ_STAT(xdp_packets) },
99         { "xdp_bytes",          VETH_RQ_STAT(xdp_bytes) },
100         { "drops",              VETH_RQ_STAT(rx_drops) },
101         { "xdp_redirect",       VETH_RQ_STAT(xdp_redirect) },
102         { "xdp_drops",          VETH_RQ_STAT(xdp_drops) },
103         { "xdp_tx",             VETH_RQ_STAT(xdp_tx) },
104         { "xdp_tx_errors",      VETH_RQ_STAT(xdp_tx_err) },
105 };
106
107 #define VETH_RQ_STATS_LEN       ARRAY_SIZE(veth_rq_stats_desc)
108
109 static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
110         { "xdp_xmit",           VETH_RQ_STAT(peer_tq_xdp_xmit) },
111         { "xdp_xmit_errors",    VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
112 };
113
114 #define VETH_TQ_STATS_LEN       ARRAY_SIZE(veth_tq_stats_desc)
115
116 static struct {
117         const char string[ETH_GSTRING_LEN];
118 } ethtool_stats_keys[] = {
119         { "peer_ifindex" },
120 };
121
122 struct veth_xdp_buff {
123         struct xdp_buff xdp;
124         struct sk_buff *skb;
125 };
126
127 static int veth_get_link_ksettings(struct net_device *dev,
128                                    struct ethtool_link_ksettings *cmd)
129 {
130         cmd->base.speed         = SPEED_10000;
131         cmd->base.duplex        = DUPLEX_FULL;
132         cmd->base.port          = PORT_TP;
133         cmd->base.autoneg       = AUTONEG_DISABLE;
134         return 0;
135 }
136
137 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
138 {
139         strscpy(info->driver, DRV_NAME, sizeof(info->driver));
140         strscpy(info->version, DRV_VERSION, sizeof(info->version));
141 }
142
143 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
144 {
145         u8 *p = buf;
146         int i, j;
147
148         switch (stringset) {
149         case ETH_SS_STATS:
150                 memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
151                 p += sizeof(ethtool_stats_keys);
152                 for (i = 0; i < dev->real_num_rx_queues; i++)
153                         for (j = 0; j < VETH_RQ_STATS_LEN; j++)
154                                 ethtool_sprintf(&p, "rx_queue_%u_%.18s",
155                                                 i, veth_rq_stats_desc[j].desc);
156
157                 for (i = 0; i < dev->real_num_tx_queues; i++)
158                         for (j = 0; j < VETH_TQ_STATS_LEN; j++)
159                                 ethtool_sprintf(&p, "tx_queue_%u_%.18s",
160                                                 i, veth_tq_stats_desc[j].desc);
161
162                 page_pool_ethtool_stats_get_strings(p);
163                 break;
164         }
165 }
166
167 static int veth_get_sset_count(struct net_device *dev, int sset)
168 {
169         switch (sset) {
170         case ETH_SS_STATS:
171                 return ARRAY_SIZE(ethtool_stats_keys) +
172                        VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
173                        VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
174                        page_pool_ethtool_stats_get_count();
175         default:
176                 return -EOPNOTSUPP;
177         }
178 }
179
180 static void veth_get_page_pool_stats(struct net_device *dev, u64 *data)
181 {
182 #ifdef CONFIG_PAGE_POOL_STATS
183         struct veth_priv *priv = netdev_priv(dev);
184         struct page_pool_stats pp_stats = {};
185         int i;
186
187         for (i = 0; i < dev->real_num_rx_queues; i++) {
188                 if (!priv->rq[i].page_pool)
189                         continue;
190                 page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
191         }
192         page_pool_ethtool_stats_get(data, &pp_stats);
193 #endif /* CONFIG_PAGE_POOL_STATS */
194 }
195
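/* The data[] layout mirrors veth_get_strings()/veth_get_sset_count():
 * data[0] is the peer ifindex, followed by VETH_RQ_STATS_LEN counters for
 * each local rx queue, then VETH_TQ_STATS_LEN counters for each local tx
 * queue (accumulated from the peer's rx queues), and finally the page_pool
 * statistics.
 */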
196 static void veth_get_ethtool_stats(struct net_device *dev,
197                 struct ethtool_stats *stats, u64 *data)
198 {
199         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
200         struct net_device *peer = rtnl_dereference(priv->peer);
201         int i, j, idx, pp_idx;
202
203         data[0] = peer ? peer->ifindex : 0;
204         idx = 1;
205         for (i = 0; i < dev->real_num_rx_queues; i++) {
206                 const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
207                 const void *stats_base = (void *)&rq_stats->vs;
208                 unsigned int start;
209                 size_t offset;
210
211                 do {
212                         start = u64_stats_fetch_begin(&rq_stats->syncp);
213                         for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
214                                 offset = veth_rq_stats_desc[j].offset;
215                                 data[idx + j] = *(u64 *)(stats_base + offset);
216                         }
217                 } while (u64_stats_fetch_retry(&rq_stats->syncp, start));
218                 idx += VETH_RQ_STATS_LEN;
219         }
220         pp_idx = idx;
221
222         if (!peer)
223                 goto page_pool_stats;
224
225         rcv_priv = netdev_priv(peer);
226         for (i = 0; i < peer->real_num_rx_queues; i++) {
227                 const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
228                 const void *base = (void *)&rq_stats->vs;
229                 unsigned int start, tx_idx = idx;
230                 size_t offset;
231
232                 tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
233                 do {
234                         start = u64_stats_fetch_begin(&rq_stats->syncp);
235                         for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
236                                 offset = veth_tq_stats_desc[j].offset;
237                                 data[tx_idx + j] += *(u64 *)(base + offset);
238                         }
239                 } while (u64_stats_fetch_retry(&rq_stats->syncp, start));
240         }
241         pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;
242
243 page_pool_stats:
244         veth_get_page_pool_stats(dev, &data[pp_idx]);
245 }
246
247 static void veth_get_channels(struct net_device *dev,
248                               struct ethtool_channels *channels)
249 {
250         channels->tx_count = dev->real_num_tx_queues;
251         channels->rx_count = dev->real_num_rx_queues;
252         channels->max_tx = dev->num_tx_queues;
253         channels->max_rx = dev->num_rx_queues;
254 }
255
256 static int veth_set_channels(struct net_device *dev,
257                              struct ethtool_channels *ch);
258
259 static const struct ethtool_ops veth_ethtool_ops = {
260         .get_drvinfo            = veth_get_drvinfo,
261         .get_link               = ethtool_op_get_link,
262         .get_strings            = veth_get_strings,
263         .get_sset_count         = veth_get_sset_count,
264         .get_ethtool_stats      = veth_get_ethtool_stats,
265         .get_link_ksettings     = veth_get_link_ksettings,
266         .get_ts_info            = ethtool_op_get_ts_info,
267         .get_channels           = veth_get_channels,
268         .set_channels           = veth_set_channels,
269 };
270
271 /* general routines */
272
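/* Both sk_buff and xdp_frame pointers travel through the same xdp_ring.
 * xdp_frame pointers are tagged with VETH_XDP_FLAG in their low bit so the
 * consumer in veth_xdp_rcv() can tell the two apart; the helpers below add,
 * test and strip that tag.
 */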
273 static bool veth_is_xdp_frame(void *ptr)
274 {
275         return (unsigned long)ptr & VETH_XDP_FLAG;
276 }
277
278 static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
279 {
280         return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
281 }
282
283 static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
284 {
285         return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
286 }
287
288 static void veth_ptr_free(void *ptr)
289 {
290         if (veth_is_xdp_frame(ptr))
291                 xdp_return_frame(veth_ptr_to_xdp(ptr));
292         else
293                 kfree_skb(ptr);
294 }
295
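/* Kick the consumer-side NAPI after producing into the xdp_ring.
 * rx_notify_masked suppresses redundant scheduling while NAPI is already
 * running; the barrier here pairs with the smp_store_mb() in veth_poll().
 */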
296 static void __veth_xdp_flush(struct veth_rq *rq)
297 {
298         /* Write ptr_ring before reading rx_notify_masked */
299         smp_mb();
300         if (!READ_ONCE(rq->rx_notify_masked) &&
301             napi_schedule_prep(&rq->xdp_napi)) {
302                 WRITE_ONCE(rq->rx_notify_masked, true);
303                 __napi_schedule(&rq->xdp_napi);
304         }
305 }
306
307 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
308 {
309         if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
310                 dev_kfree_skb_any(skb);
311                 return NET_RX_DROP;
312         }
313
314         return NET_RX_SUCCESS;
315 }
316
317 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
318                             struct veth_rq *rq, bool xdp)
319 {
320         return __dev_forward_skb(dev, skb) ?: xdp ?
321                 veth_xdp_rx(rq, skb) :
322                 __netif_rx(skb);
323 }
324
325 /* Return true if the specified skb has a chance of GRO aggregation.
326  * Don't strive for accuracy, but try to avoid GRO overhead in the most
327  * common scenarios.
328  * When XDP is enabled, all traffic is considered eligible, as the xmit
329  * device has TSO off.
330  * When TSO is enabled on the xmit device, we are likely interested only
331  * in UDP aggregation; explicitly check for that if the skb is suspected
332  * to belong to locally generated UDP traffic (the sock_wfree destructor
333  * is used by UDP, ICMP and XDP sockets).
334  */
335 static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
336                                          const struct net_device *rcv,
337                                          const struct sk_buff *skb)
338 {
339         return !(dev->features & NETIF_F_ALL_TSO) ||
340                 (skb->destructor == sock_wfree &&
341                  rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
342 }
343
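/* Transmit path: the skb is forwarded straight to the peer device. When the
 * peer has NAPI active (XDP attached or GRO enabled) and the skb may benefit
 * from aggregation, it is queued on the peer's xdp_ring and the peer NAPI is
 * kicked; otherwise it is delivered immediately via __netif_rx().
 */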
344 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
345 {
346         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
347         struct veth_rq *rq = NULL;
348         int ret = NETDEV_TX_OK;
349         struct net_device *rcv;
350         int length = skb->len;
351         bool use_napi = false;
352         int rxq;
353
354         rcu_read_lock();
355         rcv = rcu_dereference(priv->peer);
356         if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
357                 kfree_skb(skb);
358                 goto drop;
359         }
360
361         rcv_priv = netdev_priv(rcv);
362         rxq = skb_get_queue_mapping(skb);
363         if (rxq < rcv->real_num_rx_queues) {
364                 rq = &rcv_priv->rq[rxq];
365
366                 /* The napi pointer is available when an XDP program is
367                  * attached or when GRO is enabled.
368                  * Don't bother with napi/GRO if the skb can't be aggregated.
369                  */
370                 use_napi = rcu_access_pointer(rq->napi) &&
371                            veth_skb_is_eligible_for_gro(dev, rcv, skb);
372         }
373
374         skb_tx_timestamp(skb);
375         if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
376                 if (!use_napi)
377                         dev_sw_netstats_tx_add(dev, 1, length);
378                 else
379                         __veth_xdp_flush(rq);
380         } else {
381 drop:
382                 atomic64_inc(&priv->dropped);
383                 ret = NET_XMIT_DROP;
384         }
385
386         rcu_read_unlock();
387
388         return ret;
389 }
390
391 static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
392 {
393         struct veth_priv *priv = netdev_priv(dev);
394         int i;
395
396         result->peer_tq_xdp_xmit_err = 0;
397         result->xdp_packets = 0;
398         result->xdp_tx_err = 0;
399         result->xdp_bytes = 0;
400         result->rx_drops = 0;
401         for (i = 0; i < dev->num_rx_queues; i++) {
402                 u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
403                 struct veth_rq_stats *stats = &priv->rq[i].stats;
404                 unsigned int start;
405
406                 do {
407                         start = u64_stats_fetch_begin(&stats->syncp);
408                         peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
409                         xdp_tx_err = stats->vs.xdp_tx_err;
410                         packets = stats->vs.xdp_packets;
411                         bytes = stats->vs.xdp_bytes;
412                         drops = stats->vs.rx_drops;
413                 } while (u64_stats_fetch_retry(&stats->syncp, start));
414                 result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
415                 result->xdp_tx_err += xdp_tx_err;
416                 result->xdp_packets += packets;
417                 result->xdp_bytes += bytes;
418                 result->rx_drops += drops;
419         }
420 }
421
422 static void veth_get_stats64(struct net_device *dev,
423                              struct rtnl_link_stats64 *tot)
424 {
425         struct veth_priv *priv = netdev_priv(dev);
426         struct net_device *peer;
427         struct veth_stats rx;
428
429         tot->tx_dropped = atomic64_read(&priv->dropped);
430         dev_fetch_sw_netstats(tot, dev->tstats);
431
432         veth_stats_rx(&rx, dev);
433         tot->tx_dropped += rx.xdp_tx_err;
434         tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
435         tot->rx_bytes += rx.xdp_bytes;
436         tot->rx_packets += rx.xdp_packets;
437
438         rcu_read_lock();
439         peer = rcu_dereference(priv->peer);
440         if (peer) {
441                 struct rtnl_link_stats64 tot_peer = {};
442
443                 dev_fetch_sw_netstats(&tot_peer, peer->tstats);
444                 tot->rx_bytes += tot_peer.tx_bytes;
445                 tot->rx_packets += tot_peer.tx_packets;
446
447                 veth_stats_rx(&rx, peer);
448                 tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
449                 tot->rx_dropped += rx.xdp_tx_err;
450                 tot->tx_bytes += rx.xdp_bytes;
451                 tot->tx_packets += rx.xdp_packets;
452         }
453         rcu_read_unlock();
454 }
455
456 /* fake multicast ability */
457 static void veth_set_multicast_list(struct net_device *dev)
458 {
459 }
460
461 static int veth_select_rxq(struct net_device *dev)
462 {
463         return smp_processor_id() % dev->real_num_rx_queues;
464 }
465
466 static struct net_device *veth_peer_dev(struct net_device *dev)
467 {
468         struct veth_priv *priv = netdev_priv(dev);
469
470         /* Callers must be in an RCU read-side critical section. */
471         return rcu_dereference(priv->peer);
472 }
473
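/* Common helper for .ndo_xdp_xmit and internal XDP_TX flushes: frames are
 * pushed onto the peer's xdp_ring under the producer lock. ndo_xmit selects
 * whether the peer_tq_xdp_xmit counters are updated; the XDP_TX path accounts
 * its own stats in veth_xdp_flush_bq().
 */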
474 static int veth_xdp_xmit(struct net_device *dev, int n,
475                          struct xdp_frame **frames,
476                          u32 flags, bool ndo_xmit)
477 {
478         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
479         int i, ret = -ENXIO, nxmit = 0;
480         struct net_device *rcv;
481         unsigned int max_len;
482         struct veth_rq *rq;
483
484         if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
485                 return -EINVAL;
486
487         rcu_read_lock();
488         rcv = rcu_dereference(priv->peer);
489         if (unlikely(!rcv))
490                 goto out;
491
492         rcv_priv = netdev_priv(rcv);
493         rq = &rcv_priv->rq[veth_select_rxq(rcv)];
494         /* The napi pointer is set if NAPI is enabled, which ensures that
495          * xdp_ring is initialized on the receive side and the peer device is up.
496          */
497         if (!rcu_access_pointer(rq->napi))
498                 goto out;
499
500         max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
501
502         spin_lock(&rq->xdp_ring.producer_lock);
503         for (i = 0; i < n; i++) {
504                 struct xdp_frame *frame = frames[i];
505                 void *ptr = veth_xdp_to_ptr(frame);
506
507                 if (unlikely(xdp_get_frame_len(frame) > max_len ||
508                              __ptr_ring_produce(&rq->xdp_ring, ptr)))
509                         break;
510                 nxmit++;
511         }
512         spin_unlock(&rq->xdp_ring.producer_lock);
513
514         if (flags & XDP_XMIT_FLUSH)
515                 __veth_xdp_flush(rq);
516
517         ret = nxmit;
518         if (ndo_xmit) {
519                 u64_stats_update_begin(&rq->stats.syncp);
520                 rq->stats.vs.peer_tq_xdp_xmit += nxmit;
521                 rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
522                 u64_stats_update_end(&rq->stats.syncp);
523         }
524
525 out:
526         rcu_read_unlock();
527
528         return ret;
529 }
530
531 static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
532                              struct xdp_frame **frames, u32 flags)
533 {
534         int err;
535
536         err = veth_xdp_xmit(dev, n, frames, flags, true);
537         if (err < 0) {
538                 struct veth_priv *priv = netdev_priv(dev);
539
540                 atomic64_add(n, &priv->dropped);
541         }
542
543         return err;
544 }
545
546 static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
547 {
548         int sent, i, err = 0, drops;
549
550         sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
551         if (sent < 0) {
552                 err = sent;
553                 sent = 0;
554         }
555
556         for (i = sent; unlikely(i < bq->count); i++)
557                 xdp_return_frame(bq->q[i]);
558
559         drops = bq->count - sent;
560         trace_xdp_bulk_tx(rq->dev, sent, drops, err);
561
562         u64_stats_update_begin(&rq->stats.syncp);
563         rq->stats.vs.xdp_tx += sent;
564         rq->stats.vs.xdp_tx_err += drops;
565         u64_stats_update_end(&rq->stats.syncp);
566
567         bq->count = 0;
568 }
569
570 static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
571 {
572         struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
573         struct net_device *rcv;
574         struct veth_rq *rcv_rq;
575
576         rcu_read_lock();
577         veth_xdp_flush_bq(rq, bq);
578         rcv = rcu_dereference(priv->peer);
579         if (unlikely(!rcv))
580                 goto out;
581
582         rcv_priv = netdev_priv(rcv);
583         rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
584         /* the peer's xdp_ring is initialized only when an XDP program is attached there */
585         if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
586                 goto out;
587
588         __veth_xdp_flush(rcv_rq);
589 out:
590         rcu_read_unlock();
591 }
592
593 static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
594                        struct veth_xdp_tx_bq *bq)
595 {
596         struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
597
598         if (unlikely(!frame))
599                 return -EOVERFLOW;
600
601         if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
602                 veth_xdp_flush_bq(rq, bq);
603
604         bq->q[bq->count++] = frame;
605
606         return 0;
607 }
608
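/* Run the attached XDP program on a frame that arrived via ndo_xdp_xmit.
 * Returns the (possibly updated) frame on XDP_PASS so the caller can build an
 * skb from it, or NULL when the frame was transmitted, redirected or dropped.
 */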
609 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
610                                           struct xdp_frame *frame,
611                                           struct veth_xdp_tx_bq *bq,
612                                           struct veth_stats *stats)
613 {
614         struct xdp_frame orig_frame;
615         struct bpf_prog *xdp_prog;
616
617         rcu_read_lock();
618         xdp_prog = rcu_dereference(rq->xdp_prog);
619         if (likely(xdp_prog)) {
620                 struct veth_xdp_buff vxbuf;
621                 struct xdp_buff *xdp = &vxbuf.xdp;
622                 u32 act;
623
624                 xdp_convert_frame_to_buff(frame, xdp);
625                 xdp->rxq = &rq->xdp_rxq;
626                 vxbuf.skb = NULL;
627
628                 act = bpf_prog_run_xdp(xdp_prog, xdp);
629
630                 switch (act) {
631                 case XDP_PASS:
632                         if (xdp_update_frame_from_buff(xdp, frame))
633                                 goto err_xdp;
634                         break;
635                 case XDP_TX:
636                         orig_frame = *frame;
637                         xdp->rxq->mem = frame->mem;
638                         if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
639                                 trace_xdp_exception(rq->dev, xdp_prog, act);
640                                 frame = &orig_frame;
641                                 stats->rx_drops++;
642                                 goto err_xdp;
643                         }
644                         stats->xdp_tx++;
645                         rcu_read_unlock();
646                         goto xdp_xmit;
647                 case XDP_REDIRECT:
648                         orig_frame = *frame;
649                         xdp->rxq->mem = frame->mem;
650                         if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
651                                 frame = &orig_frame;
652                                 stats->rx_drops++;
653                                 goto err_xdp;
654                         }
655                         stats->xdp_redirect++;
656                         rcu_read_unlock();
657                         goto xdp_xmit;
658                 default:
659                         bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
660                         fallthrough;
661                 case XDP_ABORTED:
662                         trace_xdp_exception(rq->dev, xdp_prog, act);
663                         fallthrough;
664                 case XDP_DROP:
665                         stats->xdp_drops++;
666                         goto err_xdp;
667                 }
668         }
669         rcu_read_unlock();
670
671         return frame;
672 err_xdp:
673         rcu_read_unlock();
674         xdp_return_frame(frame);
675 xdp_xmit:
676         return NULL;
677 }
678
679 /* the frames array contains at most VETH_XDP_BATCH entries */
680 static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
681                                   int n_xdpf, struct veth_xdp_tx_bq *bq,
682                                   struct veth_stats *stats)
683 {
684         void *skbs[VETH_XDP_BATCH];
685         int i;
686
687         if (xdp_alloc_skb_bulk(skbs, n_xdpf,
688                                GFP_ATOMIC | __GFP_ZERO) < 0) {
689                 for (i = 0; i < n_xdpf; i++)
690                         xdp_return_frame(frames[i]);
691                 stats->rx_drops += n_xdpf;
692
693                 return;
694         }
695
696         for (i = 0; i < n_xdpf; i++) {
697                 struct sk_buff *skb = skbs[i];
698
699                 skb = __xdp_build_skb_from_frame(frames[i], skb,
700                                                  rq->dev);
701                 if (!skb) {
702                         xdp_return_frame(frames[i]);
703                         stats->rx_drops++;
704                         continue;
705                 }
706                 napi_gro_receive(&rq->xdp_napi, skb);
707         }
708 }
709
710 static void veth_xdp_get(struct xdp_buff *xdp)
711 {
712         struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
713         int i;
714
715         get_page(virt_to_page(xdp->data));
716         if (likely(!xdp_buff_has_frags(xdp)))
717                 return;
718
719         for (i = 0; i < sinfo->nr_frags; i++)
720                 __skb_frag_ref(&sinfo->frags[i]);
721 }
722
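/* Make an skb safe for an XDP program to access and modify: shared skbs,
 * locked heads, skbs carrying frags and skbs short on headroom are first
 * copied into page_pool-backed memory via skb_pp_cow_data(), then an xdp_buff
 * is built around the (possibly new) head.
 */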
723 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
724                                         struct xdp_buff *xdp,
725                                         struct sk_buff **pskb)
726 {
727         struct sk_buff *skb = *pskb;
728         u32 frame_sz;
729
730         if (skb_shared(skb) || skb_head_is_locked(skb) ||
731             skb_shinfo(skb)->nr_frags ||
732             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
733                 if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))
734                         goto drop;
735
736                 skb = *pskb;
737         }
738
739         /* SKB "head" area always has tailroom for skb_shared_info */
740         frame_sz = skb_end_pointer(skb) - skb->head;
741         frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
742         xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
743         xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
744                          skb_headlen(skb), true);
745
746         if (skb_is_nonlinear(skb)) {
747                 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
748                 xdp_buff_set_frags_flag(xdp);
749         } else {
750                 xdp_buff_clear_frags_flag(xdp);
751         }
752         *pskb = skb;
753
754         return 0;
755 drop:
756         consume_skb(skb);
757         *pskb = NULL;
758
759         return -ENOMEM;
760 }
761
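/* Run the attached XDP program on an skb queued by veth_xmit(). On XDP_PASS
 * the skb is fixed up to reflect any head/tail adjustments made by the
 * program and returned to the caller; on XDP_TX/XDP_REDIRECT the buffer is
 * handed to the XDP path and NULL is returned.
 */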
762 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
763                                         struct sk_buff *skb,
764                                         struct veth_xdp_tx_bq *bq,
765                                         struct veth_stats *stats)
766 {
767         void *orig_data, *orig_data_end;
768         struct bpf_prog *xdp_prog;
769         struct veth_xdp_buff vxbuf;
770         struct xdp_buff *xdp = &vxbuf.xdp;
771         u32 act, metalen;
772         int off;
773
774         skb_prepare_for_gro(skb);
775
776         rcu_read_lock();
777         xdp_prog = rcu_dereference(rq->xdp_prog);
778         if (unlikely(!xdp_prog)) {
779                 rcu_read_unlock();
780                 goto out;
781         }
782
783         __skb_push(skb, skb->data - skb_mac_header(skb));
784         if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
785                 goto drop;
786         vxbuf.skb = skb;
787
788         orig_data = xdp->data;
789         orig_data_end = xdp->data_end;
790
791         act = bpf_prog_run_xdp(xdp_prog, xdp);
792
793         switch (act) {
794         case XDP_PASS:
795                 break;
796         case XDP_TX:
797                 veth_xdp_get(xdp);
798                 consume_skb(skb);
799                 xdp->rxq->mem = rq->xdp_mem;
800                 if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
801                         trace_xdp_exception(rq->dev, xdp_prog, act);
802                         stats->rx_drops++;
803                         goto err_xdp;
804                 }
805                 stats->xdp_tx++;
806                 rcu_read_unlock();
807                 goto xdp_xmit;
808         case XDP_REDIRECT:
809                 veth_xdp_get(xdp);
810                 consume_skb(skb);
811                 xdp->rxq->mem = rq->xdp_mem;
812                 if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
813                         stats->rx_drops++;
814                         goto err_xdp;
815                 }
816                 stats->xdp_redirect++;
817                 rcu_read_unlock();
818                 goto xdp_xmit;
819         default:
820                 bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
821                 fallthrough;
822         case XDP_ABORTED:
823                 trace_xdp_exception(rq->dev, xdp_prog, act);
824                 fallthrough;
825         case XDP_DROP:
826                 stats->xdp_drops++;
827                 goto xdp_drop;
828         }
829         rcu_read_unlock();
830
831         /* check if bpf_xdp_adjust_head was used */
832         off = orig_data - xdp->data;
833         if (off > 0)
834                 __skb_push(skb, off);
835         else if (off < 0)
836                 __skb_pull(skb, -off);
837
838         skb_reset_mac_header(skb);
839
840         /* check if bpf_xdp_adjust_tail was used */
841         off = xdp->data_end - orig_data_end;
842         if (off != 0)
843                 __skb_put(skb, off); /* positive on grow, negative on shrink */
844
845         /* XDP frag metadata (e.g. nr_frags) is updated by eBPF helpers
846          * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
847          */
848         if (xdp_buff_has_frags(xdp))
849                 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
850         else
851                 skb->data_len = 0;
852
853         skb->protocol = eth_type_trans(skb, rq->dev);
854
855         metalen = xdp->data - xdp->data_meta;
856         if (metalen)
857                 skb_metadata_set(skb, metalen);
858 out:
859         return skb;
860 drop:
861         stats->rx_drops++;
862 xdp_drop:
863         rcu_read_unlock();
864         kfree_skb(skb);
865         return NULL;
866 err_xdp:
867         rcu_read_unlock();
868         xdp_return_buff(xdp);
869 xdp_xmit:
870         return NULL;
871 }
872
873 static int veth_xdp_rcv(struct veth_rq *rq, int budget,
874                         struct veth_xdp_tx_bq *bq,
875                         struct veth_stats *stats)
876 {
877         int i, done = 0, n_xdpf = 0;
878         void *xdpf[VETH_XDP_BATCH];
879
880         for (i = 0; i < budget; i++) {
881                 void *ptr = __ptr_ring_consume(&rq->xdp_ring);
882
883                 if (!ptr)
884                         break;
885
886                 if (veth_is_xdp_frame(ptr)) {
887                         /* ndo_xdp_xmit */
888                         struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
889
890                         stats->xdp_bytes += xdp_get_frame_len(frame);
891                         frame = veth_xdp_rcv_one(rq, frame, bq, stats);
892                         if (frame) {
893                                 /* XDP_PASS */
894                                 xdpf[n_xdpf++] = frame;
895                                 if (n_xdpf == VETH_XDP_BATCH) {
896                                         veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
897                                                               bq, stats);
898                                         n_xdpf = 0;
899                                 }
900                         }
901                 } else {
902                         /* ndo_start_xmit */
903                         struct sk_buff *skb = ptr;
904
905                         stats->xdp_bytes += skb->len;
906                         skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
907                         if (skb) {
908                                 if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
909                                         netif_receive_skb(skb);
910                                 else
911                                         napi_gro_receive(&rq->xdp_napi, skb);
912                         }
913                 }
914                 done++;
915         }
916
917         if (n_xdpf)
918                 veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
919
920         u64_stats_update_begin(&rq->stats.syncp);
921         rq->stats.vs.xdp_redirect += stats->xdp_redirect;
922         rq->stats.vs.xdp_bytes += stats->xdp_bytes;
923         rq->stats.vs.xdp_drops += stats->xdp_drops;
924         rq->stats.vs.rx_drops += stats->rx_drops;
925         rq->stats.vs.xdp_packets += done;
926         u64_stats_update_end(&rq->stats.syncp);
927
928         return done;
929 }
930
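/* NAPI poll handler: consume up to @budget entries from the xdp_ring, then
 * flush any pending XDP redirects and the XDP_TX bulk queue. If the ring
 * became non-empty after napi_complete_done(), reschedule to avoid losing a
 * wakeup from the producer side.
 */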
931 static int veth_poll(struct napi_struct *napi, int budget)
932 {
933         struct veth_rq *rq =
934                 container_of(napi, struct veth_rq, xdp_napi);
935         struct veth_stats stats = {};
936         struct veth_xdp_tx_bq bq;
937         int done;
938
939         bq.count = 0;
940
941         xdp_set_return_frame_no_direct();
942         done = veth_xdp_rcv(rq, budget, &bq, &stats);
943
944         if (stats.xdp_redirect > 0)
945                 xdp_do_flush();
946
947         if (done < budget && napi_complete_done(napi, done)) {
948                 /* Write rx_notify_masked before reading ptr_ring */
949                 smp_store_mb(rq->rx_notify_masked, false);
950                 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
951                         if (napi_schedule_prep(&rq->xdp_napi)) {
952                                 WRITE_ONCE(rq->rx_notify_masked, true);
953                                 __napi_schedule(&rq->xdp_napi);
954                         }
955                 }
956         }
957
958         if (stats.xdp_tx > 0)
959                 veth_xdp_flush(rq, &bq);
960         xdp_clear_return_frame_no_direct();
961
962         return done;
963 }
964
965 static int veth_create_page_pool(struct veth_rq *rq)
966 {
967         struct page_pool_params pp_params = {
968                 .order = 0,
969                 .pool_size = VETH_RING_SIZE,
970                 .nid = NUMA_NO_NODE,
971                 .dev = &rq->dev->dev,
972         };
973
974         rq->page_pool = page_pool_create(&pp_params);
975         if (IS_ERR(rq->page_pool)) {
976                 int err = PTR_ERR(rq->page_pool);
977
978                 rq->page_pool = NULL;
979                 return err;
980         }
981
982         return 0;
983 }
984
985 static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
986 {
987         struct veth_priv *priv = netdev_priv(dev);
988         int err, i;
989
990         for (i = start; i < end; i++) {
991                 err = veth_create_page_pool(&priv->rq[i]);
992                 if (err)
993                         goto err_page_pool;
994         }
995
996         for (i = start; i < end; i++) {
997                 struct veth_rq *rq = &priv->rq[i];
998
999                 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
1000                 if (err)
1001                         goto err_xdp_ring;
1002         }
1003
1004         for (i = start; i < end; i++) {
1005                 struct veth_rq *rq = &priv->rq[i];
1006
1007                 napi_enable(&rq->xdp_napi);
1008                 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
1009         }
1010
1011         return 0;
1012
1013 err_xdp_ring:
1014         for (i--; i >= start; i--)
1015                 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
1016         i = end;
1017 err_page_pool:
1018         for (i--; i >= start; i--) {
1019                 page_pool_destroy(priv->rq[i].page_pool);
1020                 priv->rq[i].page_pool = NULL;
1021         }
1022
1023         return err;
1024 }
1025
1026 static int __veth_napi_enable(struct net_device *dev)
1027 {
1028         return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
1029 }
1030
1031 static void veth_napi_del_range(struct net_device *dev, int start, int end)
1032 {
1033         struct veth_priv *priv = netdev_priv(dev);
1034         int i;
1035
1036         for (i = start; i < end; i++) {
1037                 struct veth_rq *rq = &priv->rq[i];
1038
1039                 rcu_assign_pointer(priv->rq[i].napi, NULL);
1040                 napi_disable(&rq->xdp_napi);
1041                 __netif_napi_del(&rq->xdp_napi);
1042         }
1043         synchronize_net();
1044
1045         for (i = start; i < end; i++) {
1046                 struct veth_rq *rq = &priv->rq[i];
1047
1048                 rq->rx_notify_masked = false;
1049                 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
1050         }
1051
1052         for (i = start; i < end; i++) {
1053                 page_pool_destroy(priv->rq[i].page_pool);
1054                 priv->rq[i].page_pool = NULL;
1055         }
1056 }
1057
1058 static void veth_napi_del(struct net_device *dev)
1059 {
1060         veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
1061 }
1062
1063 static bool veth_gro_requested(const struct net_device *dev)
1064 {
1065         return !!(dev->wanted_features & NETIF_F_GRO);
1066 }
1067
1068 static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
1069                                  bool napi_already_on)
1070 {
1071         struct veth_priv *priv = netdev_priv(dev);
1072         int err, i;
1073
1074         for (i = start; i < end; i++) {
1075                 struct veth_rq *rq = &priv->rq[i];
1076
1077                 if (!napi_already_on)
1078                         netif_napi_add(dev, &rq->xdp_napi, veth_poll);
1079                 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
1080                 if (err < 0)
1081                         goto err_rxq_reg;
1082
1083                 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
1084                                                  MEM_TYPE_PAGE_SHARED,
1085                                                  NULL);
1086                 if (err < 0)
1087                         goto err_reg_mem;
1088
1089                 /* Save original mem info as it can be overwritten */
1090                 rq->xdp_mem = rq->xdp_rxq.mem;
1091         }
1092         return 0;
1093
1094 err_reg_mem:
1095         xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
1096 err_rxq_reg:
1097         for (i--; i >= start; i--) {
1098                 struct veth_rq *rq = &priv->rq[i];
1099
1100                 xdp_rxq_info_unreg(&rq->xdp_rxq);
1101                 if (!napi_already_on)
1102                         netif_napi_del(&rq->xdp_napi);
1103         }
1104
1105         return err;
1106 }
1107
1108 static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
1109                                    bool delete_napi)
1110 {
1111         struct veth_priv *priv = netdev_priv(dev);
1112         int i;
1113
1114         for (i = start; i < end; i++) {
1115                 struct veth_rq *rq = &priv->rq[i];
1116
1117                 rq->xdp_rxq.mem = rq->xdp_mem;
1118                 xdp_rxq_info_unreg(&rq->xdp_rxq);
1119
1120                 if (delete_napi)
1121                         netif_napi_del(&rq->xdp_napi);
1122         }
1123 }
1124
1125 static int veth_enable_xdp(struct net_device *dev)
1126 {
1127         bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
1128         struct veth_priv *priv = netdev_priv(dev);
1129         int err, i;
1130
1131         if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
1132                 err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
1133                 if (err)
1134                         return err;
1135
1136                 if (!napi_already_on) {
1137                         err = __veth_napi_enable(dev);
1138                         if (err) {
1139                                 veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
1140                                 return err;
1141                         }
1142                 }
1143         }
1144
1145         for (i = 0; i < dev->real_num_rx_queues; i++) {
1146                 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
1147                 rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
1148         }
1149
1150         return 0;
1151 }
1152
1153 static void veth_disable_xdp(struct net_device *dev)
1154 {
1155         struct veth_priv *priv = netdev_priv(dev);
1156         int i;
1157
1158         for (i = 0; i < dev->real_num_rx_queues; i++)
1159                 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
1160
1161         if (!netif_running(dev) || !veth_gro_requested(dev))
1162                 veth_napi_del(dev);
1163
1164         veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
1165 }
1166
1167 static int veth_napi_enable_range(struct net_device *dev, int start, int end)
1168 {
1169         struct veth_priv *priv = netdev_priv(dev);
1170         int err, i;
1171
1172         for (i = start; i < end; i++) {
1173                 struct veth_rq *rq = &priv->rq[i];
1174
1175                 netif_napi_add(dev, &rq->xdp_napi, veth_poll);
1176         }
1177
1178         err = __veth_napi_enable_range(dev, start, end);
1179         if (err) {
1180                 for (i = start; i < end; i++) {
1181                         struct veth_rq *rq = &priv->rq[i];
1182
1183                         netif_napi_del(&rq->xdp_napi);
1184                 }
1185                 return err;
1186         }
1187         return err;
1188 }
1189
1190 static int veth_napi_enable(struct net_device *dev)
1191 {
1192         return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
1193 }
1194
1195 static void veth_disable_range_safe(struct net_device *dev, int start, int end)
1196 {
1197         struct veth_priv *priv = netdev_priv(dev);
1198
1199         if (start >= end)
1200                 return;
1201
1202         if (priv->_xdp_prog) {
1203                 veth_napi_del_range(dev, start, end);
1204                 veth_disable_xdp_range(dev, start, end, false);
1205         } else if (veth_gro_requested(dev)) {
1206                 veth_napi_del_range(dev, start, end);
1207         }
1208 }
1209
1210 static int veth_enable_range_safe(struct net_device *dev, int start, int end)
1211 {
1212         struct veth_priv *priv = netdev_priv(dev);
1213         int err;
1214
1215         if (start >= end)
1216                 return 0;
1217
1218         if (priv->_xdp_prog) {
1219                 /* These channels are freshly initialized; NAPI is not running
1220                  * on them yet, even when GRO is requested.
1221                  */
1222                 err = veth_enable_xdp_range(dev, start, end, false);
1223                 if (err)
1224                         return err;
1225
1226                 err = __veth_napi_enable_range(dev, start, end);
1227                 if (err) {
1228                         /* on error always delete the newly added napis */
1229                         veth_disable_xdp_range(dev, start, end, true);
1230                         return err;
1231                 }
1232         } else if (veth_gro_requested(dev)) {
1233                 return veth_napi_enable_range(dev, start, end);
1234         }
1235         return 0;
1236 }
1237
1238 static void veth_set_xdp_features(struct net_device *dev)
1239 {
1240         struct veth_priv *priv = netdev_priv(dev);
1241         struct net_device *peer;
1242
1243         peer = rtnl_dereference(priv->peer);
1244         if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
1245                 struct veth_priv *priv_peer = netdev_priv(peer);
1246                 xdp_features_t val = NETDEV_XDP_ACT_BASIC |
1247                                      NETDEV_XDP_ACT_REDIRECT |
1248                                      NETDEV_XDP_ACT_RX_SG;
1249
1250                 if (priv_peer->_xdp_prog || veth_gro_requested(peer))
1251                         val |= NETDEV_XDP_ACT_NDO_XMIT |
1252                                NETDEV_XDP_ACT_NDO_XMIT_SG;
1253                 xdp_set_features_flag(dev, val);
1254         } else {
1255                 xdp_clear_features_flag(dev);
1256         }
1257 }
1258
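/* The channel counts map 1:1 to the real rx/tx queue counts and can be
 * changed from userspace, e.g. (illustrative ethtool invocation):
 *
 *   ethtool -L veth0 rx 4 tx 4
 */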
1259 static int veth_set_channels(struct net_device *dev,
1260                              struct ethtool_channels *ch)
1261 {
1262         struct veth_priv *priv = netdev_priv(dev);
1263         unsigned int old_rx_count, new_rx_count;
1264         struct veth_priv *peer_priv;
1265         struct net_device *peer;
1266         int err;
1267
1268         /* sanity check. Upper bounds are already enforced by the caller */
1269         if (!ch->rx_count || !ch->tx_count)
1270                 return -EINVAL;
1271
1272         /* avoid breaking XDP, if it is enabled */
1273         peer = rtnl_dereference(priv->peer);
1274         peer_priv = peer ? netdev_priv(peer) : NULL;
1275         if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
1276                 return -EINVAL;
1277
1278         if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
1279                 return -EINVAL;
1280
1281         old_rx_count = dev->real_num_rx_queues;
1282         new_rx_count = ch->rx_count;
1283         if (netif_running(dev)) {
1284                 /* turn device off */
1285                 netif_carrier_off(dev);
1286                 if (peer)
1287                         netif_carrier_off(peer);
1288
1289                 /* try to allocate new resources, as needed */
1290                 err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
1291                 if (err)
1292                         goto out;
1293         }
1294
1295         err = netif_set_real_num_rx_queues(dev, ch->rx_count);
1296         if (err)
1297                 goto revert;
1298
1299         err = netif_set_real_num_tx_queues(dev, ch->tx_count);
1300         if (err) {
1301                 int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
1302
1303                 /* This error can only happen if rx and tx change in
1304                  * opposite directions (e.g. the tx count rises while the
1305                  * rx count decreases) and we can't do anything to fully
1306                  * restore the original status.
1307                  */
1308                 if (err2)
1309                         pr_warn("Can't restore rx queues config %d -> %d %d\n",
1310                                 new_rx_count, old_rx_count, err2);
1311                 else
1312                         goto revert;
1313         }
1314
1315 out:
1316         if (netif_running(dev)) {
1317                 /* note that we need to swap the arguments WRT the enable part
1318                  * to identify the range we have to disable
1319                  */
1320                 veth_disable_range_safe(dev, new_rx_count, old_rx_count);
1321                 netif_carrier_on(dev);
1322                 if (peer)
1323                         netif_carrier_on(peer);
1324         }
1325
1326         /* update XDP supported features */
1327         veth_set_xdp_features(dev);
1328         if (peer)
1329                 veth_set_xdp_features(peer);
1330
1331         return err;
1332
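        /* Failed to apply the new queue counts: swap old/new so that the
         * disable logic at the out: label tears down exactly the range that
         * was enabled above.
         */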
1333 revert:
1334         new_rx_count = old_rx_count;
1335         old_rx_count = ch->rx_count;
1336         goto out;
1337 }
1338
1339 static int veth_open(struct net_device *dev)
1340 {
1341         struct veth_priv *priv = netdev_priv(dev);
1342         struct net_device *peer = rtnl_dereference(priv->peer);
1343         int err;
1344
1345         if (!peer)
1346                 return -ENOTCONN;
1347
1348         if (priv->_xdp_prog) {
1349                 err = veth_enable_xdp(dev);
1350                 if (err)
1351                         return err;
1352         } else if (veth_gro_requested(dev)) {
1353                 err = veth_napi_enable(dev);
1354                 if (err)
1355                         return err;
1356         }
1357
1358         if (peer->flags & IFF_UP) {
1359                 netif_carrier_on(dev);
1360                 netif_carrier_on(peer);
1361         }
1362
1363         veth_set_xdp_features(dev);
1364
1365         return 0;
1366 }
1367
1368 static int veth_close(struct net_device *dev)
1369 {
1370         struct veth_priv *priv = netdev_priv(dev);
1371         struct net_device *peer = rtnl_dereference(priv->peer);
1372
1373         netif_carrier_off(dev);
1374         if (peer)
1375                 netif_carrier_off(peer);
1376
1377         if (priv->_xdp_prog)
1378                 veth_disable_xdp(dev);
1379         else if (veth_gro_requested(dev))
1380                 veth_napi_del(dev);
1381
1382         return 0;
1383 }
1384
1385 static int is_valid_veth_mtu(int mtu)
1386 {
1387         return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
1388 }
1389
1390 static int veth_alloc_queues(struct net_device *dev)
1391 {
1392         struct veth_priv *priv = netdev_priv(dev);
1393         int i;
1394
1395         priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
1396                             GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
1397         if (!priv->rq)
1398                 return -ENOMEM;
1399
1400         for (i = 0; i < dev->num_rx_queues; i++) {
1401                 priv->rq[i].dev = dev;
1402                 u64_stats_init(&priv->rq[i].stats.syncp);
1403         }
1404
1405         return 0;
1406 }
1407
1408 static void veth_free_queues(struct net_device *dev)
1409 {
1410         struct veth_priv *priv = netdev_priv(dev);
1411
1412         kvfree(priv->rq);
1413 }
1414
1415 static int veth_dev_init(struct net_device *dev)
1416 {
1417         netdev_lockdep_set_classes(dev);
1418         return veth_alloc_queues(dev);
1419 }
1420
1421 static void veth_dev_free(struct net_device *dev)
1422 {
1423         veth_free_queues(dev);
1424 }
1425
1426 #ifdef CONFIG_NET_POLL_CONTROLLER
1427 static void veth_poll_controller(struct net_device *dev)
1428 {
1429         /* veth only receives frames when its peer sends one.
1430          * Since it has nothing to do with disabling irqs, we are guaranteed
1431          * never to have pending data when we poll for it, so
1432          * there is nothing to do here.
1433          *
1434          * We need this though, so netpoll recognizes us as an interface that
1435          * supports polling, which enables bridge devices in virt setups to
1436          * still use netconsole.
1437          */
1438 }
1439 #endif  /* CONFIG_NET_POLL_CONTROLLER */
1440
1441 static int veth_get_iflink(const struct net_device *dev)
1442 {
1443         struct veth_priv *priv = netdev_priv(dev);
1444         struct net_device *peer;
1445         int iflink;
1446
1447         rcu_read_lock();
1448         peer = rcu_dereference(priv->peer);
1449         iflink = peer ? READ_ONCE(peer->ifindex) : 0;
1450         rcu_read_unlock();
1451
1452         return iflink;
1453 }
1454
1455 static netdev_features_t veth_fix_features(struct net_device *dev,
1456                                            netdev_features_t features)
1457 {
1458         struct veth_priv *priv = netdev_priv(dev);
1459         struct net_device *peer;
1460
1461         peer = rtnl_dereference(priv->peer);
1462         if (peer) {
1463                 struct veth_priv *peer_priv = netdev_priv(peer);
1464
1465                 if (peer_priv->_xdp_prog)
1466                         features &= ~NETIF_F_GSO_SOFTWARE;
1467         }
1468
1469         return features;
1470 }
1471
1472 static int veth_set_features(struct net_device *dev,
1473                              netdev_features_t features)
1474 {
1475         netdev_features_t changed = features ^ dev->features;
1476         struct veth_priv *priv = netdev_priv(dev);
1477         struct net_device *peer;
1478         int err;
1479
1480         if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
1481                 return 0;
1482
1483         peer = rtnl_dereference(priv->peer);
1484         if (features & NETIF_F_GRO) {
1485                 err = veth_napi_enable(dev);
1486                 if (err)
1487                         return err;
1488
1489                 if (peer)
1490                         xdp_features_set_redirect_target(peer, true);
1491         } else {
1492                 if (peer)
1493                         xdp_features_clear_redirect_target(peer);
1494                 veth_napi_del(dev);
1495         }
1496         return 0;
1497 }
1498
1499 static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
1500 {
1501         struct veth_priv *peer_priv, *priv = netdev_priv(dev);
1502         struct net_device *peer;
1503
1504         if (new_hr < 0)
1505                 new_hr = 0;
1506
1507         rcu_read_lock();
1508         peer = rcu_dereference(priv->peer);
1509         if (unlikely(!peer))
1510                 goto out;
1511
1512         peer_priv = netdev_priv(peer);
1513         priv->requested_headroom = new_hr;
1514         new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
1515         dev->needed_headroom = new_hr;
1516         peer->needed_headroom = new_hr;
1517
1518 out:
1519         rcu_read_unlock();
1520 }
1521
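/* Attach or detach an XDP program. Attaching requires a connected peer, a
 * peer MTU that fits the per-buffer budget (relaxed when the program supports
 * XDP fragments) and at least as many local rx queues as the peer has tx
 * queues; software GSO is turned off on the peer while a program is loaded.
 */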
1522 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1523                         struct netlink_ext_ack *extack)
1524 {
1525         struct veth_priv *priv = netdev_priv(dev);
1526         struct bpf_prog *old_prog;
1527         struct net_device *peer;
1528         unsigned int max_mtu;
1529         int err;
1530
1531         old_prog = priv->_xdp_prog;
1532         priv->_xdp_prog = prog;
1533         peer = rtnl_dereference(priv->peer);
1534
1535         if (prog) {
1536                 if (!peer) {
1537                         NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
1538                         err = -ENOTCONN;
1539                         goto err;
1540                 }
1541
1542                 max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
1543                           peer->hard_header_len;
1544                 /* Allow increasing the max_mtu if the program supports
1545                  * XDP fragments.
1546                  */
1547                 if (prog->aux->xdp_has_frags)
1548                         max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
1549
1550                 if (peer->mtu > max_mtu) {
1551                         NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
1552                         err = -ERANGE;
1553                         goto err;
1554                 }
1555
1556                 if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
1557                         NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
1558                         err = -ENOSPC;
1559                         goto err;
1560                 }
1561
1562                 if (dev->flags & IFF_UP) {
1563                         err = veth_enable_xdp(dev);
1564                         if (err) {
1565                                 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
1566                                 goto err;
1567                         }
1568                 }
1569
1570                 if (!old_prog) {
1571                         peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
1572                         peer->max_mtu = max_mtu;
1573                 }
1574
1575                 xdp_features_set_redirect_target(peer, true);
1576         }
1577
1578         if (old_prog) {
1579                 if (!prog) {
1580                         if (peer && !veth_gro_requested(dev))
1581                                 xdp_features_clear_redirect_target(peer);
1582
1583                         if (dev->flags & IFF_UP)
1584                                 veth_disable_xdp(dev);
1585
1586                         if (peer) {
1587                                 peer->hw_features |= NETIF_F_GSO_SOFTWARE;
1588                                 peer->max_mtu = ETH_MAX_MTU;
1589                         }
1590                 }
1591                 bpf_prog_put(old_prog);
1592         }
1593
1594         if ((!!old_prog ^ !!prog) && peer)
1595                 netdev_update_features(peer);
1596
1597         return 0;
1598 err:
1599         priv->_xdp_prog = old_prog;
1600
1601         return err;
1602 }
1603
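/* ndo_bpf entry point; only XDP_SETUP_PROG is handled here. From user space
 * this is typically reached via iproute2, e.g. (object/section names are just
 * an example):
 *   ip link set dev veth0 xdp obj xdp_prog.o sec xdp
 */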
1604 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1605 {
1606         switch (xdp->command) {
1607         case XDP_SETUP_PROG:
1608                 return veth_xdp_set(dev, xdp->prog, xdp->extack);
1609         default:
1610                 return -EINVAL;
1611         }
1612 }
1613
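/* XDP RX metadata hooks (xdp_metadata_ops). veth only has metadata when the
 * buffer was built from an skb on the local xmit path; buffers that arrived
 * as xdp_frames have no backing skb, so these report -ENODATA.
 */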
1614 static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
1615 {
1616         struct veth_xdp_buff *_ctx = (void *)ctx;
1617
1618         if (!_ctx->skb)
1619                 return -ENODATA;
1620
1621         *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
1622         return 0;
1623 }
1624
1625 static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
1626                             enum xdp_rss_hash_type *rss_type)
1627 {
1628         struct veth_xdp_buff *_ctx = (void *)ctx;
1629         struct sk_buff *skb = _ctx->skb;
1630
1631         if (!skb)
1632                 return -ENODATA;
1633
1634         *hash = skb_get_hash(skb);
1635         *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;
1636
1637         return 0;
1638 }
1639
1640 static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
1641                                 u16 *vlan_tci)
1642 {
1643         const struct veth_xdp_buff *_ctx = (void *)ctx;
1644         const struct sk_buff *skb = _ctx->skb;
1645         int err;
1646
1647         if (!skb)
1648                 return -ENODATA;
1649
1650         err = __vlan_hwaccel_get_tag(skb, vlan_tci);
1651         if (err)
1652                 return err;
1653
1654         *vlan_proto = skb->vlan_proto;
1655         return err;
1656 }
1657
1658 static const struct net_device_ops veth_netdev_ops = {
1659         .ndo_init            = veth_dev_init,
1660         .ndo_open            = veth_open,
1661         .ndo_stop            = veth_close,
1662         .ndo_start_xmit      = veth_xmit,
1663         .ndo_get_stats64     = veth_get_stats64,
1664         .ndo_set_rx_mode     = veth_set_multicast_list,
1665         .ndo_set_mac_address = eth_mac_addr,
1666 #ifdef CONFIG_NET_POLL_CONTROLLER
1667         .ndo_poll_controller    = veth_poll_controller,
1668 #endif
1669         .ndo_get_iflink         = veth_get_iflink,
1670         .ndo_fix_features       = veth_fix_features,
1671         .ndo_set_features       = veth_set_features,
1672         .ndo_features_check     = passthru_features_check,
1673         .ndo_set_rx_headroom    = veth_set_rx_headroom,
1674         .ndo_bpf                = veth_xdp,
1675         .ndo_xdp_xmit           = veth_ndo_xdp_xmit,
1676         .ndo_get_peer_dev       = veth_peer_dev,
1677 };
1678
1679 static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
1680         .xmo_rx_timestamp               = veth_xdp_rx_timestamp,
1681         .xmo_rx_hash                    = veth_xdp_rx_hash,
1682         .xmo_rx_vlan_tag                = veth_xdp_rx_vlan_tag,
1683 };
1684
1685 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
1686                        NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
1687                        NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
1688                        NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
1689                        NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)
1690
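/* Common setup for both devices of a pair. IFF_NO_QUEUE is set because xmit
 * hands packets directly to the peer, so no qdisc queue is needed by default;
 * IFF_PHONY_HEADROOM makes headroom requests go through
 * veth_set_rx_headroom() above.
 */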
1691 static void veth_setup(struct net_device *dev)
1692 {
1693         ether_setup(dev);
1694
1695         dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1696         dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1697         dev->priv_flags |= IFF_NO_QUEUE;
1698         dev->priv_flags |= IFF_PHONY_HEADROOM;
1699
1700         dev->netdev_ops = &veth_netdev_ops;
1701         dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
1702         dev->ethtool_ops = &veth_ethtool_ops;
1703         dev->features |= NETIF_F_LLTX;
1704         dev->features |= VETH_FEATURES;
1705         dev->vlan_features = dev->features &
1706                              ~(NETIF_F_HW_VLAN_CTAG_TX |
1707                                NETIF_F_HW_VLAN_STAG_TX |
1708                                NETIF_F_HW_VLAN_CTAG_RX |
1709                                NETIF_F_HW_VLAN_STAG_RX);
1710         dev->needs_free_netdev = true;
1711         dev->priv_destructor = veth_dev_free;
1712         dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
1713         dev->max_mtu = ETH_MAX_MTU;
1714
1715         dev->hw_features = VETH_FEATURES;
1716         dev->hw_enc_features = VETH_FEATURES;
1717         dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
1718         netif_set_tso_max_size(dev, GSO_MAX_SIZE);
1719 }
1720
1721 /*
1722  * netlink interface
1723  */
1724
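/* Typical user-space usage (iproute2):
 *   ip link add veth0 type veth peer name veth1    -> veth_newlink()
 *   ip link del veth0                              -> veth_dellink(), removes both ends
 */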
1725 static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
1726                          struct netlink_ext_ack *extack)
1727 {
1728         if (tb[IFLA_ADDRESS]) {
1729                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1730                         return -EINVAL;
1731                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1732                         return -EADDRNOTAVAIL;
1733         }
1734         if (tb[IFLA_MTU]) {
1735                 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
1736                         return -EINVAL;
1737         }
1738         return 0;
1739 }
1740
1741 static struct rtnl_link_ops veth_link_ops;
1742
1743 static void veth_disable_gro(struct net_device *dev)
1744 {
1745         dev->features &= ~NETIF_F_GRO;
1746         dev->wanted_features &= ~NETIF_F_GRO;
1747         netdev_update_features(dev);
1748 }
1749
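/* Unless the user passed IFLA_NUM_TX_QUEUES/IFLA_NUM_RX_QUEUES, a new pair
 * starts with a single real rx/tx queue, even though num_*_queues is sized
 * from the CPU count (see veth_get_num_queues() below).
 */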
1750 static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
1751 {
1752         int err;
1753
1754         if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
1755                 err = netif_set_real_num_tx_queues(dev, 1);
1756                 if (err)
1757                         return err;
1758         }
1759         if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
1760                 err = netif_set_real_num_rx_queues(dev, 1);
1761                 if (err)
1762                         return err;
1763         }
1764         return 0;
1765 }
1766
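/* Create a veth pair: parse the optional VETH_INFO_PEER attributes, create
 * and register the peer first (possibly in a different netns), then register
 * dev itself and cross-link the two priv->peer pointers. GRO is left disabled
 * on both ends by default.
 */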
1767 static int veth_newlink(struct net *src_net, struct net_device *dev,
1768                         struct nlattr *tb[], struct nlattr *data[],
1769                         struct netlink_ext_ack *extack)
1770 {
1771         int err;
1772         struct net_device *peer;
1773         struct veth_priv *priv;
1774         char ifname[IFNAMSIZ];
1775         struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
1776         unsigned char name_assign_type;
1777         struct ifinfomsg *ifmp;
1778         struct net *net;
1779
1780         /*
1781          * create and register peer first
1782          */
1783         if (data != NULL && data[VETH_INFO_PEER] != NULL) {
1784                 struct nlattr *nla_peer;
1785
1786                 nla_peer = data[VETH_INFO_PEER];
1787                 ifmp = nla_data(nla_peer);
1788                 err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
1789                 if (err < 0)
1790                         return err;
1791
1792                 err = veth_validate(peer_tb, NULL, extack);
1793                 if (err < 0)
1794                         return err;
1795
1796                 tbp = peer_tb;
1797         } else {
1798                 ifmp = NULL;
1799                 tbp = tb;
1800         }
1801
1802         if (ifmp && tbp[IFLA_IFNAME]) {
1803                 nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
1804                 name_assign_type = NET_NAME_USER;
1805         } else {
1806                 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
1807                 name_assign_type = NET_NAME_ENUM;
1808         }
1809
1810         net = rtnl_link_get_net(src_net, tbp);
1811         if (IS_ERR(net))
1812                 return PTR_ERR(net);
1813
1814         peer = rtnl_create_link(net, ifname, name_assign_type,
1815                                 &veth_link_ops, tbp, extack);
1816         if (IS_ERR(peer)) {
1817                 put_net(net);
1818                 return PTR_ERR(peer);
1819         }
1820
1821         if (!ifmp || !tbp[IFLA_ADDRESS])
1822                 eth_hw_addr_random(peer);
1823
1824         if (ifmp && (dev->ifindex != 0))
1825                 peer->ifindex = ifmp->ifi_index;
1826
1827         netif_inherit_tso_max(peer, dev);
1828
1829         err = register_netdevice(peer);
1830         put_net(net);
1831         net = NULL;
1832         if (err < 0)
1833                 goto err_register_peer;
1834
1835         /* keep GRO disabled by default to be consistent with the established
1836          * veth behavior
1837          */
1838         veth_disable_gro(peer);
1839         netif_carrier_off(peer);
1840
1841         err = rtnl_configure_link(peer, ifmp, 0, NULL);
1842         if (err < 0)
1843                 goto err_configure_peer;
1844
1845         /*
1846          * register dev last
1847          *
1848          * Note that, since we've registered a new device, the dev's name
1849          * should be re-allocated.
1850          */
1851
1852         if (tb[IFLA_ADDRESS] == NULL)
1853                 eth_hw_addr_random(dev);
1854
1855         if (tb[IFLA_IFNAME])
1856                 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
1857         else
1858                 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
1859
1860         err = register_netdevice(dev);
1861         if (err < 0)
1862                 goto err_register_dev;
1863
1864         netif_carrier_off(dev);
1865
1866         /*
1867          * tie the devices together
1868          */
1869
1870         priv = netdev_priv(dev);
1871         rcu_assign_pointer(priv->peer, peer);
1872         err = veth_init_queues(dev, tb);
1873         if (err)
1874                 goto err_queues;
1875
1876         priv = netdev_priv(peer);
1877         rcu_assign_pointer(priv->peer, dev);
1878         err = veth_init_queues(peer, tb);
1879         if (err)
1880                 goto err_queues;
1881
1882         veth_disable_gro(dev);
1883         /* update XDP supported features */
1884         veth_set_xdp_features(dev);
1885         veth_set_xdp_features(peer);
1886
1887         return 0;
1888
1889 err_queues:
1890         unregister_netdevice(dev);
1891 err_register_dev:
1892         /* nothing to do */
1893 err_configure_peer:
1894         unregister_netdevice(peer);
1895         return err;
1896
1897 err_register_peer:
1898         free_netdev(peer);
1899         return err;
1900 }
1901
1902 static void veth_dellink(struct net_device *dev, struct list_head *head)
1903 {
1904         struct veth_priv *priv;
1905         struct net_device *peer;
1906
1907         priv = netdev_priv(dev);
1908         peer = rtnl_dereference(priv->peer);
1909
1910         /* Note: dellink() is called from default_device_exit_batch(),
1911          * before a synchronize_rcu() point. The devices are guaranteed
1912          * not to be freed before one RCU grace period.
1913          */
1914         RCU_INIT_POINTER(priv->peer, NULL);
1915         unregister_netdevice_queue(dev, head);
1916
1917         if (peer) {
1918                 priv = netdev_priv(peer);
1919                 RCU_INIT_POINTER(priv->peer, NULL);
1920                 unregister_netdevice_queue(peer, head);
1921         }
1922 }
1923
1924 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
1925         [VETH_INFO_PEER]        = { .len = sizeof(struct ifinfomsg) },
1926 };
1927
1928 static struct net *veth_get_link_net(const struct net_device *dev)
1929 {
1930         struct veth_priv *priv = netdev_priv(dev);
1931         struct net_device *peer = rtnl_dereference(priv->peer);
1932
1933         return peer ? dev_net(peer) : dev_net(dev);
1934 }
1935
1936 static unsigned int veth_get_num_queues(void)
1937 {
1938         /* enforce the same queue limit as rtnl_create_link */
1939         int queues = num_possible_cpus();
1940
1941         if (queues > 4096)
1942                 queues = 4096;
1943         return queues;
1944 }
1945
1946 static struct rtnl_link_ops veth_link_ops = {
1947         .kind           = DRV_NAME,
1948         .priv_size      = sizeof(struct veth_priv),
1949         .setup          = veth_setup,
1950         .validate       = veth_validate,
1951         .newlink        = veth_newlink,
1952         .dellink        = veth_dellink,
1953         .policy         = veth_policy,
1954         .maxtype        = VETH_INFO_MAX,
1955         .get_link_net   = veth_get_link_net,
1956         .get_num_tx_queues      = veth_get_num_queues,
1957         .get_num_rx_queues      = veth_get_num_queues,
1958 };
1959
1960 /*
1961  * init/fini
1962  */
1963
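/* Registering the link ops makes the "veth" kind available to rtnetlink;
 * MODULE_ALIAS_RTNL_LINK below lets the module be auto-loaded the first time
 * "ip link add ... type veth" is used.
 */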
1964 static __init int veth_init(void)
1965 {
1966         return rtnl_link_register(&veth_link_ops);
1967 }
1968
1969 static __exit void veth_exit(void)
1970 {
1971         rtnl_link_unregister(&veth_link_ops);
1972 }
1973
1974 module_init(veth_init);
1975 module_exit(veth_exit);
1976
1977 MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
1978 MODULE_LICENSE("GPL v2");
1979 MODULE_ALIAS_RTNL_LINK(DRV_NAME);