veth: introduce more specialized counters in veth_stats
// SPDX-License-Identifier: GPL-2.0-only
/*
 * drivers/net/veth.c
 *
 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME "veth"
#define DRV_VERSION "1.0"

#define VETH_XDP_FLAG BIT(0)
#define VETH_RING_SIZE 256
#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)

#define VETH_XDP_TX_BULK_SIZE 16

struct veth_stats {
	u64 rx_drops;
	/* xdp */
	u64 xdp_packets;
	u64 xdp_bytes;
	u64 xdp_redirect;
	u64 xdp_drops;
	u64 xdp_tx;
};

struct veth_rq_stats {
	struct veth_stats vs;
	struct u64_stats_sync syncp;
};
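
/* Writers of the per-queue counters are serialized by NAPI; the
 * u64_stats_sync seqcount exists so that readers on 32-bit
 * architectures can fetch the 64-bit values without tearing, using the
 * u64_stats_fetch_begin_irq()/u64_stats_fetch_retry_irq() retry loop
 * seen in veth_get_ethtool_stats() and veth_stats_rx() below.
 */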

struct veth_rq {
	struct napi_struct xdp_napi;
	struct net_device *dev;
	struct bpf_prog __rcu *xdp_prog;
	struct xdp_mem_info xdp_mem;
	struct veth_rq_stats stats;
	bool rx_notify_masked;
	struct ptr_ring xdp_ring;
	struct xdp_rxq_info xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu *peer;
	atomic64_t dropped;
	struct bpf_prog *_xdp_prog;
	struct veth_rq *rq;
	unsigned int requested_headroom;
};

struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};
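
/* The bulk queue lives on the stack of veth_poll() and batches XDP_TX
 * frames so they can be pushed to the peer's ptr_ring in a single
 * veth_xdp_xmit() call, presumably to amortize the producer-lock cost
 * over up to VETH_XDP_TX_BULK_SIZE frames.
 */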

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

#define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets", VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",   VETH_RQ_STAT(xdp_bytes) },
	{ "xdp_drops",   VETH_RQ_STAT(xdp_drops) },
};

#define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed = SPEED_10000;
	cmd->base.duplex = DUPLEX_FULL;
	cmd->base.port = PORT_TP;
	cmd->base.autoneg = AUTONEG_DISABLE;
	return 0;
}
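
/* veth has no physical link, so the speed/duplex/port values reported
 * above are nominal placeholders for tools that query ethtool link
 * settings; they do not reflect an actual transfer-rate limit.
 */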

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	char *p = (char *)buf;
	int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN,
					 "rx_queue_%u_%.11s",
					 i, veth_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int i, j, idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)&rq_stats->vs;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo = veth_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_strings = veth_get_strings,
	.get_sset_count = veth_get_sset_count,
	.get_ethtool_stats = veth_get_ethtool_stats,
	.get_link_ksettings = veth_get_link_ksettings,
	.get_ts_info = ethtool_op_get_ts_info,
};

/* general routines */

static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}
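
/* One ptr_ring carries both sk_buff and xdp_frame pointers. Both types
 * are at least word-aligned, so bit 0 is free to act as a tag
 * (VETH_XDP_FLAG): set for xdp_frames, clear for skbs. Producers encode
 * with veth_xdp_to_ptr(); consumers test with veth_is_xdp_frame() and
 * strip the tag with veth_ptr_to_xdp().
 */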

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}
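
/* The smp_mb() above pairs with the smp_store_mb() in veth_poll():
 * either the producer observes rx_notify_masked == false and schedules
 * NAPI, or the poller observes the newly produced ring entry after
 * clearing the mask, so a wakeup cannot be lost in between.
 */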

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}
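
/* Reading the ?: chain above: __dev_forward_skb() returns non-zero
 * (NET_RX_DROP) and frees the skb when the peer cannot accept it;
 * otherwise the skb is queued to the peer's XDP ring when a program is
 * attached, or handed to netif_rx() for the normal receive path.
 */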

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		if (!rcv_xdp)
			dev_lstats_add(dev, length);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
	struct veth_priv *priv = netdev_priv(dev);

	dev_lstats_read(dev, packets, bytes);
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->xdp_packets = 0;
	result->xdp_bytes = 0;
	result->xdp_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		u64 packets, bytes, drops;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->vs.xdp_packets;
			bytes = stats->vs.xdp_bytes;
			drops = stats->vs.xdp_drops;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->xdp_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_stats rx;
	u64 packets, bytes;

	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
	tot->tx_bytes = bytes;
	tot->tx_packets = packets;

	veth_stats_rx(&rx, dev);
	tot->rx_dropped = rx.xdp_drops;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		veth_stats_tx(peer, &packets, &bytes);
		tot->rx_bytes += bytes;
		tot->rx_packets += packets;

		veth_stats_rx(&rx, peer);
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}
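
/* When buflen is 0 the buffer size is derived the same way the
 * allocation sites compute it: aligned headroom plus data, plus space
 * for the skb_shared_info tail; callers passing an explicit buflen
 * (e.g. PAGE_SIZE in veth_xdp_rcv_skb()) are expected to have laid the
 * page out accordingly.
 */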

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}
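
/* Transmit-side queue selection is a plain modulo over the current CPU
 * id, so with fewer rx queues than CPUs several CPUs can land on the
 * same queue; this is why veth_xdp_xmit() must take the ring's producer
 * lock rather than assuming exclusive access.
 */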

static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	int i, ret, drops = n;
	unsigned int max_len;
	struct veth_rq *rq;

	rcu_read_lock();
	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto drop;
	}

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		ret = -ENXIO;
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog)) {
		ret = -ENXIO;
		goto drop;
	}

	drops = 0;
	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	if (likely(!drops)) {
		rcu_read_unlock();
		return n;
	}

	ret = n - drops;
drop:
	rcu_read_unlock();
	atomic64_add(drops, &priv->dropped);

	return ret;
}

static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0;

	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
	if (sent < 0) {
		err = sent;
		sent = 0;
		for (i = 0; i < bq->count; i++)
			xdp_return_frame(bq->q[i]);
	}
	trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);

	bq->count = 0;
}

static void veth_xdp_flush(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	veth_xdp_flush_bq(dev, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog guarantees that xdp_ring is initialized on the
	 * receive side.
	 */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(dev, bq);

	bq->q[bq->count++] = frame;

	return 0;
}

static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_tx++;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_redirect++;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			stats->xdp_drops++;
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		stats->rx_drops++;
		goto err;
	}

	xdp_release_frame(frame);
	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
					struct sk_buff *skb,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_tx++;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_redirect++;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		stats->xdp_drops++;
		goto xdp_drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	stats->rx_drops++;
xdp_drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget,
			struct veth_xdp_tx_bq *bq,
			struct veth_stats *stats)
{
	int i, done = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			stats->xdp_bytes += frame->len;
			skb = veth_xdp_rcv_one(rq, frame, bq, stats);
		} else {
			skb = ptr;
			stats->xdp_bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
		}

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);

		done++;
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
	rq->stats.vs.xdp_drops += stats->xdp_drops + stats->rx_drops;
	rq->stats.vs.xdp_packets += done;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}

static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	struct veth_stats stats = {};
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &bq, &stats);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (stats.xdp_tx > 0)
		veth_xdp_flush(rq->dev, &bq);
	if (stats.xdp_redirect > 0)
		xdp_do_flush();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}
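
/* Ring allocation and NAPI activation run as two separate passes so a
 * mid-loop ptr_ring_init() failure can be unwound before any NAPI
 * instance has been enabled and could start consuming the rings.
 */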

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it, so there is
	 * nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}
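
/* Software GSO is masked out while the peer runs an XDP program, since
 * GSO skbs could not be handled by the linear XDP buffer; the mask is
 * re-evaluated via netdev_update_features() whenever a program is
 * attached or detached in veth_xdp_set().
 */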

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}
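
/* Headroom requests are reconciled symmetrically: each side records its
 * own request and both devices advertise the maximum of the two, so
 * whichever peer transmits has enough room for the other side's
 * encapsulation needs.
 */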

static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init            = veth_dev_init,
	.ndo_open            = veth_open,
	.ndo_stop            = veth_close,
	.ndo_start_xmit      = veth_xmit,
	.ndo_get_stats64     = veth_get_stats64,
	.ndo_set_rx_mode     = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = veth_poll_controller,
#endif
	.ndo_get_iflink      = veth_get_iflink,
	.ndo_fix_features    = veth_fix_features,
	.ndo_features_check  = passthru_features_check,
	.ndo_set_rx_headroom = veth_set_rx_headroom,
	.ndo_bpf             = veth_xdp,
	.ndo_xdp_xmit        = veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note: dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind         = DRV_NAME,
	.priv_size    = sizeof(struct veth_priv),
	.setup        = veth_setup,
	.validate     = veth_validate,
	.newlink      = veth_newlink,
	.dellink      = veth_dellink,
	.policy       = veth_policy,
	.maxtype      = VETH_INFO_MAX,
	.get_link_net = veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);