// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Resume
  If dev_has_header(dev) == false we are unable to restore the ll header,
  because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */
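
/*
 * Illustration (not part of the kernel build): a minimal, hedged user-space
 * sketch of the SOCK_RAW vs. SOCK_DGRAM distinction described above. With
 * SOCK_RAW the link-layer header is part of the buffer returned by
 * recvfrom(); with SOCK_DGRAM it is stripped and only described by the
 * returned sockaddr_ll. Buffer size and the absence of error handling are
 * assumptions made for brevity.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// SOCK_RAW: buf starts at the Ethernet header (dst, src, proto).
 *	// SOCK_DGRAM: the same recvfrom() starts at the network header.
 */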

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

#ifdef CONFIG_NETFILTER_EGRESS
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
{
	struct sk_buff *next, *head = NULL, *tail;
	int rc;

	rcu_read_lock();
	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb_mark_not_on_list(skb);

		if (!nf_hook_egress(skb, &rc, skb->dev))
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;

		tail = skb;
	}
	rcu_read_unlock();

	return head;
}
#endif

static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
{
	if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
		return dev_queue_xmit(skb);

#ifdef CONFIG_NETFILTER_EGRESS
	if (nf_hook_egress_active()) {
		skb = nf_hook_direct_egress(skb);
		if (!skb)
			return NET_XMIT_DROP;
	}
#endif
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

#ifdef CONFIG_XPS
	skb->sender_cpu = cpu + 1;
#endif
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}

/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(const struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
	    ktime_to_timespec64_cond(skb_tstamp(skb), ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec64 ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
		return 0;

	h.raw = frame;
	/*
	 * versions 1 through 3 overflow the timestamps in y2106, since they
	 * all store the seconds in a 32-bit unsigned integer.
	 * If we create a version 4, that should have a 64-bit timestamp,
	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
	 * nanoseconds.
	 */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits, div;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (err)
		return DEFAULT_PRB_RETIRE_TOV;

	/* If the link speed is so slow you don't really
	 * need to worry about perf anyways
	 */
	if (ecmd.base.speed < SPEED_1000 ||
	    ecmd.base.speed == SPEED_UNKNOWN)
		return DEFAULT_PRB_RETIRE_TOV;

	div = ecmd.base.speed / 1000;
	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	if (div)
		return mbits + 1;
	return mbits;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
	rwlock_init(&p1->blk_fill_in_prog_lock);

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
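
/*
 * Worked example of the derivation in prb_calc_retire_blk_tmo() (numbers
 * chosen for illustration, assuming a 1 MiB block): at 1Gbps, div = 1 and
 * mbits = (1048576 * 8) / (1024 * 1024) = 8, so the tmo is 8 + 1 = 9 ms,
 * i.e. the ~8 ms fill time above plus a little slack. At 10Gbps, div = 10,
 * 8 / 10 rounds down to 0, and the tmo becomes 0 + 1 = 1 ms.
 */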
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		/* Waiting for skb_copy_bits to finish... */
		write_lock(&pkc->blk_fill_in_prog_lock);
		write_unlock(&pkc->blk_fill_in_prog_lock);
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen,user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close.So we open this
				 * block and restart the timer.
				 * opening a block thaws the queue,restarts timer
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (atomic_read(&po->tp_drops))
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec64 ts;
		ktime_get_real_ts64(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec64 ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	ktime_get_real_ts64(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
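
/*
 * For context, a minimal, hedged user-space sketch of the other side of
 * this handshake (not part of the kernel build; ring setup via mmap() and
 * all error handling are omitted, and walk_packets() is a hypothetical
 * helper that consumes the frames in a block):
 *
 *	struct tpacket_block_desc *pbd = ...;	// next block in the mapped ring
 *	if (pbd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		walk_packets(pbd);				// consume the block
 *		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand it back
 *	}
 *
 * Handing the block back as TP_STATUS_KERNEL is what lets a frozen queue
 * thaw on the next prb_dispatch_next_block()/prb_open_block() cycle.
 */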
eea49cc9 907static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 908 struct packet_sock *po)
909{
910 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 911 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 912}
913
914#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
915
916/*
917 * If the next block is free then we will dispatch it
918 * and return a good offset.
919 * Else, we will freeze the queue.
920 * So, caller must check the return value.
921 */
bc59ba39 922static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 923 struct packet_sock *po)
924{
bc59ba39 925 struct tpacket_block_desc *pbd;
f6fb8f10 926
927 smp_rmb();
928
929 /* 1. Get current block num */
930 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
931
932 /* 2. If this block is currently in_use then freeze the queue */
933 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
934 prb_freeze_queue(pkc, po);
935 return NULL;
936 }
937
938 /*
939 * 3.
940 * open this block and return the offset where the first packet
941 * needs to get stored.
942 */
943 prb_open_block(pkc, pbd);
944 return (void *)pkc->nxt_offset;
945}
946
bc59ba39 947static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 948 struct packet_sock *po, unsigned int status)
949{
bc59ba39 950 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 951
952 /* retire/close the current block */
953 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
954 /*
955 * Plug the case where copy_bits() is in progress on
956 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
957 * have space to copy the pkt in the current block and
958 * called prb_retire_current_block()
959 *
960 * We don't need to worry about the TMO case because
961 * the timer-handler already handled this case.
962 */
963 if (!(status & TP_STATUS_BLK_TMO)) {
632ca50f
JO
964 /* Waiting for skb_copy_bits to finish... */
965 write_lock(&pkc->blk_fill_in_prog_lock);
966 write_unlock(&pkc->blk_fill_in_prog_lock);
f6fb8f10 967 }
968 prb_close_block(pkc, pbd, po, status);
969 return;
970 }
f6fb8f10 971}
972
878cd3ba 973static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 974{
975 return TP_STATUS_USER & BLOCK_STATUS(pbd);
976}
977
eea49cc9 978static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 979{
980 return pkc->reset_pending_on_curr_blk;
981}
982
eea49cc9 983static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
88fd1cb8 984 __releases(&pkc->blk_fill_in_prog_lock)
f6fb8f10 985{
bc59ba39 986 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
632ca50f
JO
987
988 read_unlock(&pkc->blk_fill_in_prog_lock);
f6fb8f10 989}
990
eea49cc9 991static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 992 struct tpacket3_hdr *ppd)
993{
3958afa1 994 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 995}
996
eea49cc9 997static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 998 struct tpacket3_hdr *ppd)
999{
1000 ppd->hv1.tp_rxhash = 0;
1001}
1002
eea49cc9 1003static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 1004 struct tpacket3_hdr *ppd)
1005{
df8a39de
JP
1006 if (skb_vlan_tag_present(pkc->skb)) {
1007 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
1008 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1009 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1010 } else {
9e67030a 1011 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1012 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1013 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1014 }
1015}
1016
bc59ba39 1017static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1018 struct tpacket3_hdr *ppd)
1019{
a0cdfcf3 1020 ppd->hv1.tp_padding = 0;
f6fb8f10 1021 prb_fill_vlan_info(pkc, ppd);
1022
1023 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1024 prb_fill_rxhash(pkc, ppd);
1025 else
1026 prb_clear_rxhash(pkc, ppd);
1027}
1028
eea49cc9 1029static void prb_fill_curr_block(char *curr,
bc59ba39 1030 struct tpacket_kbdq_core *pkc,
1031 struct tpacket_block_desc *pbd,
f6fb8f10 1032 unsigned int len)
88fd1cb8 1033 __acquires(&pkc->blk_fill_in_prog_lock)
f6fb8f10 1034{
1035 struct tpacket3_hdr *ppd;
1036
1037 ppd = (struct tpacket3_hdr *)curr;
1038 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1039 pkc->prev = curr;
1040 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1041 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1042 BLOCK_NUM_PKTS(pbd) += 1;
632ca50f 1043 read_lock(&pkc->blk_fill_in_prog_lock);
f6fb8f10 1044 prb_run_all_ft_ops(pkc, ppd);
1045}
1046
1047/* Assumes caller has the sk->rx_queue.lock */
1048static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1049 struct sk_buff *skb,
f6fb8f10 1050 unsigned int len
1051 )
1052{
bc59ba39 1053 struct tpacket_kbdq_core *pkc;
1054 struct tpacket_block_desc *pbd;
f6fb8f10 1055 char *curr, *end;
1056
e3192690 1057 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1058 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1059
1060 /* Queue is frozen when user space is lagging behind */
1061 if (prb_queue_frozen(pkc)) {
1062 /*
1063 * Check if that last block which caused the queue to freeze,
1064 * is still in_use by user-space.
1065 */
878cd3ba 1066 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1067 /* Can't record this packet */
1068 return NULL;
1069 } else {
1070 /*
1071 * Ok, the block was released by user-space.
1072 * Now let's open that block.
1073 * opening a block also thaws the queue.
1074 * Thawing is a side effect.
1075 */
1076 prb_open_block(pkc, pbd);
1077 }
1078 }
1079
1080 smp_mb();
1081 curr = pkc->nxt_offset;
1082 pkc->skb = skb;
e3192690 1083 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1084
1085 /* first try the current block */
1086 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1087 prb_fill_curr_block(curr, pkc, pbd, len);
1088 return (void *)curr;
1089 }
1090
1091 /* Ok, close the current block */
1092 prb_retire_current_block(pkc, po, 0);
1093
1094 /* Now, try to dispatch the next block */
1095 curr = (char *)prb_dispatch_next_block(pkc, po);
1096 if (curr) {
1097 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1098 prb_fill_curr_block(curr, pkc, pbd, len);
1099 return (void *)curr;
1100 }
1101
1102 /*
1103 * No free blocks are available.user_space hasn't caught up yet.
1104 * Queue was just frozen and now this packet will get dropped.
1105 */
1106 return NULL;
1107}
1108
eea49cc9 1109static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1110 struct sk_buff *skb,
1111 int status, unsigned int len)
1112{
1113 char *curr = NULL;
1114 switch (po->tp_version) {
1115 case TPACKET_V1:
1116 case TPACKET_V2:
1117 curr = packet_lookup_frame(po, &po->rx_ring,
1118 po->rx_ring.head, status);
1119 return curr;
1120 case TPACKET_V3:
46088059 1121 return __packet_lookup_frame_in_block(po, skb, len);
f6fb8f10 1122 default:
1123 WARN(1, "TPACKET version not supported\n");
1124 BUG();
99aa3473 1125 return NULL;
f6fb8f10 1126 }
1127}
1128
dcf70cef
ED
1129static void *prb_lookup_block(const struct packet_sock *po,
1130 const struct packet_ring_buffer *rb,
1131 unsigned int idx,
1132 int status)
f6fb8f10 1133{
bc59ba39 1134 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1135 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1136
1137 if (status != BLOCK_STATUS(pbd))
1138 return NULL;
1139 return pbd;
1140}
1141
eea49cc9 1142static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1143{
1144 unsigned int prev;
1145 if (rb->prb_bdqc.kactive_blk_num)
1146 prev = rb->prb_bdqc.kactive_blk_num-1;
1147 else
1148 prev = rb->prb_bdqc.knum_blocks-1;
1149 return prev;
1150}
1151
1152/* Assumes caller has held the rx_queue.lock */
eea49cc9 1153static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1154 struct packet_ring_buffer *rb,
1155 int status)
1156{
1157 unsigned int previous = prb_previous_blk_num(rb);
1158 return prb_lookup_block(po, rb, previous, status);
1159}
1160
eea49cc9 1161static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1162 struct packet_ring_buffer *rb,
1163 int status)
1164{
1165 if (po->tp_version <= TPACKET_V2)
1166 return packet_previous_frame(po, rb, status);
1167
1168 return __prb_previous_block(po, rb, status);
1169}
1170
eea49cc9 1171static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1172 struct packet_ring_buffer *rb)
1173{
1174 switch (po->tp_version) {
1175 case TPACKET_V1:
1176 case TPACKET_V2:
1177 return packet_increment_head(rb);
1178 case TPACKET_V3:
1179 default:
1180 WARN(1, "TPACKET version not supported.\n");
1181 BUG();
1182 return;
1183 }
1184}
1185
eea49cc9 1186static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1187 struct packet_ring_buffer *rb,
1188 int status)
1189{
1190 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1191 return packet_lookup_frame(po, rb, previous, status);
1192}
1193
eea49cc9 1194static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1195{
1196 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1197}
1198
b0138408
DB
1199static void packet_inc_pending(struct packet_ring_buffer *rb)
1200{
1201 this_cpu_inc(*rb->pending_refcnt);
1202}
1203
1204static void packet_dec_pending(struct packet_ring_buffer *rb)
1205{
1206 this_cpu_dec(*rb->pending_refcnt);
1207}
1208
1209static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1210{
1211 unsigned int refcnt = 0;
1212 int cpu;
1213
1214 /* We don't use pending refcount in rx_ring. */
1215 if (rb->pending_refcnt == NULL)
1216 return 0;
1217
1218 for_each_possible_cpu(cpu)
1219 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1220
1221 return refcnt;
1222}
1223
1224static int packet_alloc_pending(struct packet_sock *po)
1225{
1226 po->rx_ring.pending_refcnt = NULL;
1227
1228 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1229 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1230 return -ENOBUFS;
1231
1232 return 0;
1233}
1234
1235static void packet_free_pending(struct packet_sock *po)
1236{
1237 free_percpu(po->tx_ring.pending_refcnt);
1238}
1239
9954729b
WB
1240#define ROOM_POW_OFF 2
1241#define ROOM_NONE 0x0
1242#define ROOM_LOW 0x1
1243#define ROOM_NORMAL 0x2
1244
d4b5bd98 1245static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
77f65ebd 1246{
9954729b
WB
1247 int idx, len;
1248
d4b5bd98
ED
1249 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1250 idx = READ_ONCE(po->rx_ring.head);
9954729b
WB
1251 if (pow_off)
1252 idx += len >> pow_off;
1253 if (idx >= len)
1254 idx -= len;
1255 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1256}
1257
dcf70cef 1258static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
9954729b
WB
1259{
1260 int idx, len;
1261
dcf70cef
ED
1262 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1263 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
9954729b
WB
1264 if (pow_off)
1265 idx += len >> pow_off;
1266 if (idx >= len)
1267 idx -= len;
1268 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1269}
77f65ebd 1270
0338a145
ED
1271static int __packet_rcv_has_room(const struct packet_sock *po,
1272 const struct sk_buff *skb)
9954729b 1273{
0338a145 1274 const struct sock *sk = &po->sk;
9954729b
WB
1275 int ret = ROOM_NONE;
1276
1277 if (po->prot_hook.func != tpacket_rcv) {
0338a145
ED
1278 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1279 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1280 - (skb ? skb->truesize : 0);
1281
1282 if (avail > (rcvbuf >> ROOM_POW_OFF))
9954729b
WB
1283 return ROOM_NORMAL;
1284 else if (avail > 0)
1285 return ROOM_LOW;
1286 else
1287 return ROOM_NONE;
1288 }
77f65ebd 1289
9954729b
WB
1290 if (po->tp_version == TPACKET_V3) {
1291 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1292 ret = ROOM_NORMAL;
1293 else if (__tpacket_v3_has_room(po, 0))
1294 ret = ROOM_LOW;
1295 } else {
1296 if (__tpacket_has_room(po, ROOM_POW_OFF))
1297 ret = ROOM_NORMAL;
1298 else if (__tpacket_has_room(po, 0))
1299 ret = ROOM_LOW;
1300 }
2ccdbaa6
WB
1301
1302 return ret;
1303}
1304
1305static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1306{
791a3e9f
ED
1307 bool pressure;
1308 int ret;
2ccdbaa6 1309
54d7c01d 1310 ret = __packet_rcv_has_room(po, skb);
3a2bb84e
ED
1311 pressure = ret != ROOM_NORMAL;
1312
791a3e9f
ED
1313 if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
1314 packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);
77f65ebd 1315
9954729b 1316 return ret;
77f65ebd
WB
1317}
1318
9bb6cd65
ED
1319static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1320{
791a3e9f 1321 if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
9bb6cd65 1322 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
791a3e9f 1323 packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
9bb6cd65
ED
1324}
1325
1da177e4
LT
1326static void packet_sock_destruct(struct sock *sk)
1327{
ed85b565
RC
1328 skb_queue_purge(&sk->sk_error_queue);
1329
547b792c 1330 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1331 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1332
1333 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1334 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1335 return;
1336 }
1da177e4
LT
1337}
1338
3b3a5b0a
WB
1339static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1340{
b756ad92
ED
1341 u32 *history = po->rollover->history;
1342 u32 victim, rxhash;
3b3a5b0a
WB
1343 int i, count = 0;
1344
1345 rxhash = skb_get_hash(skb);
1346 for (i = 0; i < ROLLOVER_HLEN; i++)
b756ad92 1347 if (READ_ONCE(history[i]) == rxhash)
3b3a5b0a
WB
1348 count++;
1349
8032bf12 1350 victim = get_random_u32_below(ROLLOVER_HLEN);
b756ad92
ED
1351
1352 /* Avoid dirtying the cache line if possible */
1353 if (READ_ONCE(history[victim]) != rxhash)
1354 WRITE_ONCE(history[victim], rxhash);
1355
3b3a5b0a
WB
1356 return count > (ROLLOVER_HLEN >> 1);
1357}
1358
77f65ebd
WB
1359static unsigned int fanout_demux_hash(struct packet_fanout *f,
1360 struct sk_buff *skb,
1361 unsigned int num)
dc99f600 1362{
eb70db87 1363 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1364}
1365
77f65ebd
WB
1366static unsigned int fanout_demux_lb(struct packet_fanout *f,
1367 struct sk_buff *skb,
1368 unsigned int num)
dc99f600 1369{
468479e6 1370 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1371
468479e6 1372 return val % num;
77f65ebd
WB
1373}
1374
1375static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1376 struct sk_buff *skb,
1377 unsigned int num)
1378{
1379 return smp_processor_id() % num;
dc99f600
DM
1380}
1381
5df0ddfb
DB
1382static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1383 struct sk_buff *skb,
1384 unsigned int num)
1385{
8032bf12 1386 return get_random_u32_below(num);
5df0ddfb
DB
1387}
1388
77f65ebd
WB
1389static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1390 struct sk_buff *skb,
ad377cab 1391 unsigned int idx, bool try_self,
77f65ebd 1392 unsigned int num)
95ec3eb4 1393{
4633c9e0 1394 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1395 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1396
94f633ea 1397 po = pkt_sk(rcu_dereference(f->arr[idx]));
3b3a5b0a
WB
1398
1399 if (try_self) {
1400 room = packet_rcv_has_room(po, skb);
1401 if (room == ROOM_NORMAL ||
1402 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1403 return idx;
4633c9e0 1404 po_skip = po;
3b3a5b0a 1405 }
ad377cab 1406
0648ab70 1407 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1408 do {
94f633ea 1409 po_next = pkt_sk(rcu_dereference(f->arr[i]));
791a3e9f
ED
1410 if (po_next != po_skip &&
1411 !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
2ccdbaa6 1412 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1413 if (i != j)
0648ab70 1414 po->rollover->sock = i;
a9b63918
WB
1415 atomic_long_inc(&po->rollover->num);
1416 if (room == ROOM_LOW)
1417 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1418 return i;
1419 }
ad377cab 1420
77f65ebd
WB
1421 if (++i == num)
1422 i = 0;
1423 } while (i != j);
1424
a9b63918 1425 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1426 return idx;
1427}
1428
2d36097d
NH
1429static unsigned int fanout_demux_qm(struct packet_fanout *f,
1430 struct sk_buff *skb,
1431 unsigned int num)
1432{
1433 return skb_get_queue_mapping(skb) % num;
1434}
1435
47dceb8e
WB
1436static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1437 struct sk_buff *skb,
1438 unsigned int num)
1439{
1440 struct bpf_prog *prog;
1441 unsigned int ret = 0;
1442
1443 rcu_read_lock();
1444 prog = rcu_dereference(f->bpf_prog);
1445 if (prog)
ff936a04 1446 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1447 rcu_read_unlock();
1448
1449 return ret;
1450}
1451
77f65ebd
WB
1452static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1453{
1454 return f->flags & (flag >> 8);
95ec3eb4
DM
1455}
1456
95ec3eb4
DM
1457static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1458 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1459{
1460 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1461 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1462 struct net *net = read_pnet(&f->net);
dc99f600 1463 struct packet_sock *po;
77f65ebd 1464 unsigned int idx;
dc99f600 1465
19bcf9f2 1466 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1467 kfree_skb(skb);
1468 return 0;
1469 }
1470
3f34b24a 1471 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1472 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1473 if (!skb)
1474 return 0;
1475 }
95ec3eb4
DM
1476 switch (f->type) {
1477 case PACKET_FANOUT_HASH:
1478 default:
77f65ebd 1479 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1480 break;
1481 case PACKET_FANOUT_LB:
77f65ebd 1482 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1483 break;
1484 case PACKET_FANOUT_CPU:
77f65ebd
WB
1485 idx = fanout_demux_cpu(f, skb, num);
1486 break;
5df0ddfb
DB
1487 case PACKET_FANOUT_RND:
1488 idx = fanout_demux_rnd(f, skb, num);
1489 break;
2d36097d
NH
1490 case PACKET_FANOUT_QM:
1491 idx = fanout_demux_qm(f, skb, num);
1492 break;
77f65ebd 1493 case PACKET_FANOUT_ROLLOVER:
ad377cab 1494 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1495 break;
47dceb8e 1496 case PACKET_FANOUT_CBPF:
f2e52095 1497 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1498 idx = fanout_demux_bpf(f, skb, num);
1499 break;
dc99f600
DM
1500 }
1501
ad377cab
WB
1502 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1503 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1504
94f633ea 1505 po = pkt_sk(rcu_dereference(f->arr[idx]));
dc99f600
DM
1506 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1507}
1508
fff3321d
PE
1509DEFINE_MUTEX(fanout_mutex);
1510EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1511static LIST_HEAD(fanout_list);
4a69a864 1512static u16 fanout_next_id;
dc99f600
DM
1513
1514static void __fanout_link(struct sock *sk, struct packet_sock *po)
1515{
1516 struct packet_fanout *f = po->fanout;
1517
1518 spin_lock(&f->lock);
94f633ea 1519 rcu_assign_pointer(f->arr[f->num_members], sk);
dc99f600
DM
1520 smp_wmb();
1521 f->num_members++;
2bd624b4
AS
1522 if (f->num_members == 1)
1523 dev_add_pack(&f->prot_hook);
dc99f600
DM
1524 spin_unlock(&f->lock);
1525}
1526
1527static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1528{
1529 struct packet_fanout *f = po->fanout;
1530 int i;
1531
1532 spin_lock(&f->lock);
1533 for (i = 0; i < f->num_members; i++) {
94f633ea
ED
1534 if (rcu_dereference_protected(f->arr[i],
1535 lockdep_is_held(&f->lock)) == sk)
dc99f600
DM
1536 break;
1537 }
1538 BUG_ON(i >= f->num_members);
94f633ea
ED
1539 rcu_assign_pointer(f->arr[i],
1540 rcu_dereference_protected(f->arr[f->num_members - 1],
1541 lockdep_is_held(&f->lock)));
dc99f600 1542 f->num_members--;
2bd624b4
AS
1543 if (f->num_members == 0)
1544 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1545 spin_unlock(&f->lock);
1546}
1547
d4dd8aee 1548static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1549{
161642e2
ED
1550 if (sk->sk_family != PF_PACKET)
1551 return false;
c0de08d0 1552
161642e2 1553 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1554}
1555
47dceb8e
WB
1556static void fanout_init_data(struct packet_fanout *f)
1557{
1558 switch (f->type) {
1559 case PACKET_FANOUT_LB:
1560 atomic_set(&f->rr_cur, 0);
1561 break;
1562 case PACKET_FANOUT_CBPF:
f2e52095 1563 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1564 RCU_INIT_POINTER(f->bpf_prog, NULL);
1565 break;
1566 }
1567}
1568
1569static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1570{
1571 struct bpf_prog *old;
1572
1573 spin_lock(&f->lock);
1574 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1575 rcu_assign_pointer(f->bpf_prog, new);
1576 spin_unlock(&f->lock);
1577
1578 if (old) {
1579 synchronize_net();
1580 bpf_prog_destroy(old);
1581 }
1582}
1583
b1ea9ff6 1584static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
47dceb8e
WB
1585 unsigned int len)
1586{
1587 struct bpf_prog *new;
1588 struct sock_fprog fprog;
1589 int ret;
1590
1591 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1592 return -EPERM;
4d295e54
CH
1593
1594 ret = copy_bpf_fprog_from_user(&fprog, data, len);
1595 if (ret)
1596 return ret;
47dceb8e 1597
bab18991 1598 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1599 if (ret)
1600 return ret;
1601
1602 __fanout_set_data_bpf(po->fanout, new);
1603 return 0;
1604}
1605
a7b75c5a 1606static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
f2e52095
WB
1607 unsigned int len)
1608{
1609 struct bpf_prog *new;
1610 u32 fd;
1611
1612 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1613 return -EPERM;
1614 if (len != sizeof(fd))
1615 return -EINVAL;
a7b75c5a 1616 if (copy_from_sockptr(&fd, data, len))
f2e52095
WB
1617 return -EFAULT;
1618
113214be 1619 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1620 if (IS_ERR(new))
1621 return PTR_ERR(new);
f2e52095
WB
1622
1623 __fanout_set_data_bpf(po->fanout, new);
1624 return 0;
1625}
1626
a7b75c5a 1627static int fanout_set_data(struct packet_sock *po, sockptr_t data,
47dceb8e
WB
1628 unsigned int len)
1629{
1630 switch (po->fanout->type) {
1631 case PACKET_FANOUT_CBPF:
a7b75c5a 1632 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1633 case PACKET_FANOUT_EBPF:
1634 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1635 default:
1636 return -EINVAL;
07d53ae4 1637 }
47dceb8e
WB
1638}
1639
1640static void fanout_release_data(struct packet_fanout *f)
1641{
1642 switch (f->type) {
1643 case PACKET_FANOUT_CBPF:
f2e52095 1644 case PACKET_FANOUT_EBPF:
47dceb8e 1645 __fanout_set_data_bpf(f, NULL);
07d53ae4 1646 }
47dceb8e
WB
1647}
1648
4a69a864
MM
1649static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1650{
1651 struct packet_fanout *f;
1652
1653 list_for_each_entry(f, &fanout_list, list) {
1654 if (f->id == candidate_id &&
1655 read_pnet(&f->net) == sock_net(sk)) {
1656 return false;
1657 }
1658 }
1659 return true;
1660}
1661
1662static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1663{
1664 u16 id = fanout_next_id;
1665
1666 do {
1667 if (__fanout_id_is_free(sk, id)) {
1668 *new_id = id;
1669 fanout_next_id = id + 1;
1670 return true;
1671 }
1672
1673 id++;
1674 } while (id != fanout_next_id);
1675
1676 return false;
1677}
1678
9c661b0b 1679static int fanout_add(struct sock *sk, struct fanout_args *args)
dc99f600 1680{
d199fab6 1681 struct packet_rollover *rollover = NULL;
dc99f600 1682 struct packet_sock *po = pkt_sk(sk);
9c661b0b 1683 u16 type_flags = args->type_flags;
dc99f600 1684 struct packet_fanout *f, *match;
7736d33f 1685 u8 type = type_flags & 0xff;
77f65ebd 1686 u8 flags = type_flags >> 8;
9c661b0b 1687 u16 id = args->id;
dc99f600
DM
1688 int err;
1689
1690 switch (type) {
77f65ebd
WB
1691 case PACKET_FANOUT_ROLLOVER:
1692 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1693 return -EINVAL;
5af5a020 1694 break;
dc99f600
DM
1695 case PACKET_FANOUT_HASH:
1696 case PACKET_FANOUT_LB:
95ec3eb4 1697 case PACKET_FANOUT_CPU:
5df0ddfb 1698 case PACKET_FANOUT_RND:
2d36097d 1699 case PACKET_FANOUT_QM:
47dceb8e 1700 case PACKET_FANOUT_CBPF:
f2e52095 1701 case PACKET_FANOUT_EBPF:
dc99f600
DM
1702 break;
1703 default:
1704 return -EINVAL;
1705 }
1706
d199fab6
ED
1707 mutex_lock(&fanout_mutex);
1708
d199fab6 1709 err = -EALREADY;
dc99f600 1710 if (po->fanout)
d199fab6 1711 goto out;
dc99f600 1712
4633c9e0
WB
1713 if (type == PACKET_FANOUT_ROLLOVER ||
1714 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1715 err = -ENOMEM;
1716 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1717 if (!rollover)
1718 goto out;
1719 atomic_long_set(&rollover->num, 0);
1720 atomic_long_set(&rollover->num_huge, 0);
1721 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1722 }
1723
4a69a864
MM
1724 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1725 if (id != 0) {
1726 err = -EINVAL;
1727 goto out;
1728 }
1729 if (!fanout_find_new_id(sk, &id)) {
1730 err = -ENOMEM;
1731 goto out;
1732 }
1733 /* ephemeral flag for the first socket in the group: drop it */
1734 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1735 }
1736
dc99f600
DM
1737 match = NULL;
1738 list_for_each_entry(f, &fanout_list, list) {
1739 if (f->id == id &&
1740 read_pnet(&f->net) == sock_net(sk)) {
1741 match = f;
1742 break;
1743 }
1744 }
afe62c68 1745 err = -EINVAL;
9c661b0b
TL
1746 if (match) {
1747 if (match->flags != flags)
1748 goto out;
1749 if (args->max_num_members &&
1750 args->max_num_members != match->max_num_members)
1751 goto out;
1752 } else {
1753 if (args->max_num_members > PACKET_FANOUT_MAX)
1754 goto out;
1755 if (!args->max_num_members)
1756 /* legacy PACKET_FANOUT_MAX */
1757 args->max_num_members = 256;
afe62c68 1758 err = -ENOMEM;
9c661b0b
TL
1759 match = kvzalloc(struct_size(match, arr, args->max_num_members),
1760 GFP_KERNEL);
afe62c68
ED
1761 if (!match)
1762 goto out;
1763 write_pnet(&match->net, sock_net(sk));
1764 match->id = id;
1765 match->type = type;
77f65ebd 1766 match->flags = flags;
afe62c68
ED
1767 INIT_LIST_HEAD(&match->list);
1768 spin_lock_init(&match->lock);
fb5c2c17 1769 refcount_set(&match->sk_ref, 0);
47dceb8e 1770 fanout_init_data(match);
afe62c68
ED
1771 match->prot_hook.type = po->prot_hook.type;
1772 match->prot_hook.dev = po->prot_hook.dev;
1773 match->prot_hook.func = packet_rcv_fanout;
1774 match->prot_hook.af_packet_priv = match;
47934e06 1775 match->prot_hook.af_packet_net = read_pnet(&match->net);
c0de08d0 1776 match->prot_hook.id_match = match_fanout_group;
9c661b0b 1777 match->max_num_members = args->max_num_members;
58ba4263 1778 match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
afe62c68 1779 list_add(&match->list, &fanout_list);
dc99f600 1780 }
afe62c68 1781 err = -EINVAL;
008ba2a1
WB
1782
1783 spin_lock(&po->bind_lock);
61edf479 1784 if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
008ba2a1 1785 match->type == type &&
afe62c68
ED
1786 match->prot_hook.type == po->prot_hook.type &&
1787 match->prot_hook.dev == po->prot_hook.dev) {
1788 err = -ENOSPC;
9c661b0b 1789 if (refcount_read(&match->sk_ref) < match->max_num_members) {
afe62c68 1790 __dev_remove_pack(&po->prot_hook);
e42e70ad
ED
1791
1792 /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1793 WRITE_ONCE(po->fanout, match);
1794
57f015f5
MM
1795 po->rollover = rollover;
1796 rollover = NULL;
fb5c2c17 1797 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1798 __fanout_link(sk, po);
1799 err = 0;
dc99f600
DM
1800 }
1801 }
008ba2a1
WB
1802 spin_unlock(&po->bind_lock);
1803
1804 if (err && !refcount_read(&match->sk_ref)) {
1805 list_del(&match->list);
9c661b0b 1806 kvfree(match);
008ba2a1
WB
1807 }
1808
afe62c68 1809out:
57f015f5 1810 kfree(rollover);
d199fab6 1811 mutex_unlock(&fanout_mutex);
dc99f600
DM
1812 return err;
1813}
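/*
 * Illustrative userspace sketch (not part of af_packet.c): joining the
 * PACKET_FANOUT group that fanout_add() above creates or matches. The
 * group id (42), mode (PACKET_FANOUT_HASH) and socket count are
 * assumptions chosen for the example.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int join_fanout_group(int fds[], int nr_sockets)
{
	/* low 16 bits: group id; high 16 bits: mode plus optional flags
	 * such as PACKET_FANOUT_FLAG_ROLLOVER or PACKET_FANOUT_FLAG_UNIQUEID */
	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
	int i;

	for (i = 0; i < nr_sockets; i++) {
		fds[i] = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		if (fds[i] < 0)
			return -1;
		/* every socket passing the same id/mode lands in one group,
		 * subject to the flag and max_num_members checks in fanout_add() */
		if (setsockopt(fds[i], SOL_PACKET, PACKET_FANOUT,
			       &fanout_arg, sizeof(fanout_arg)) < 0)
			return -1;
	}
	return 0;
}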
1814
2bd624b4
AS
1815/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1816 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1817 * It is the responsibility of the caller to call fanout_release_data() and
1818 * free the returned packet_fanout (after synchronize_net())
1819 */
1820static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1821{
1822 struct packet_sock *po = pkt_sk(sk);
1823 struct packet_fanout *f;
1824
fff3321d 1825 mutex_lock(&fanout_mutex);
d199fab6
ED
1826 f = po->fanout;
1827 if (f) {
1828 po->fanout = NULL;
1829
fb5c2c17 1830 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1831 list_del(&f->list);
2bd624b4
AS
1832 else
1833 f = NULL;
dc99f600
DM
1834 }
1835 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1836
1837 return f;
dc99f600 1838}
1da177e4 1839
3c70c132
DB
1840static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1841 struct sk_buff *skb)
1842{
1843 /* Earlier code assumed this would be a VLAN pkt, double-check
1844 * this now that we have the actual packet in hand. We can only
1845 * do this check on Ethernet devices.
1846 */
1847 if (unlikely(dev->type != ARPHRD_ETHER))
1848 return false;
1849
1850 skb_reset_mac_header(skb);
1851 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1852}
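/*
 * Worked example for the helper above, with illustrative (assumed) numbers:
 * on a standard Ethernet device with dev->mtu == 1500 and
 * dev->hard_header_len == 14, an untagged frame may be at most 1514 bytes,
 * while an 802.1Q tagged frame is 1514 + VLAN_HLEN == 1518 bytes. The
 * transmit-path size checks therefore tolerate VLAN_HLEN extra bytes only
 * when this helper confirms the frame really starts with an ETH_P_8021Q
 * header.
 */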
1853
90ddc4f0 1854static const struct proto_ops packet_ops;
1da177e4 1855
90ddc4f0 1856static const struct proto_ops packet_ops_spkt;
1da177e4 1857
40d4e3df
ED
1858static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1859 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1860{
1861 struct sock *sk;
1862 struct sockaddr_pkt *spkt;
1863
1864 /*
1865 * When we registered the protocol we saved the socket in the data
1866 * field for just this event.
1867 */
1868
1869 sk = pt->af_packet_priv;
1ce4f28b 1870
1da177e4
LT
1871 /*
1872 * Yank back the headers [hope the device set this
1873 * right or kerboom...]
1874 *
1875 * Incoming packets have ll header pulled,
1876 * push it back.
1877 *
98e399f8 1878 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1879 * so that this procedure is a no-op.
1880 */
1881
1882 if (skb->pkt_type == PACKET_LOOPBACK)
1883 goto out;
1884
09ad9bc7 1885 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1886 goto out;
1887
40d4e3df
ED
1888 skb = skb_share_check(skb, GFP_ATOMIC);
1889 if (skb == NULL)
1da177e4
LT
1890 goto oom;
1891
1892 /* drop any routing info */
adf30907 1893 skb_dst_drop(skb);
1da177e4 1894
84531c24 1895 /* drop conntrack reference */
895b5c9f 1896 nf_reset_ct(skb);
84531c24 1897
ffbc6111 1898 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1899
98e399f8 1900 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1901
1902 /*
1903 * The SOCK_PACKET socket receives _all_ frames.
1904 */
1905
1906 spkt->spkt_family = dev->type;
8fc9d51e 1907 strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1da177e4
LT
1908 spkt->spkt_protocol = skb->protocol;
1909
1910 /*
1911 * Charge the memory to the socket. This is done specifically
 1912 * to prevent sockets from using up all the memory.
1913 */
1914
40d4e3df 1915 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1916 return 0;
1917
1918out:
1919 kfree_skb(skb);
1920oom:
1921 return 0;
1922}
1923
75c65772
MM
1924static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1925{
dfed913e
HL
1926 int depth;
1927
18bed891
YK
1928 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1929 sock->type == SOCK_RAW) {
75c65772
MM
1930 skb_reset_mac_header(skb);
1931 skb->protocol = dev_parse_header_protocol(skb);
1932 }
1933
dfed913e
HL
1934 /* Move network header to the right position for VLAN tagged packets */
1935 if (likely(skb->dev->type == ARPHRD_ETHER) &&
1936 eth_type_vlan(skb->protocol) &&
4063384e
ED
1937 vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1938 skb_set_network_header(skb, depth);
dfed913e 1939
75c65772
MM
1940 skb_probe_transport_header(skb);
1941}
1da177e4
LT
1942
1943/*
 1944 * Output a raw packet to the device layer. This bypasses all the other
 1945 * protocol layers, so you must supply it with a complete frame.
1946 */
1ce4f28b 1947
1b784140
YX
1948static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1949 size_t len)
1da177e4
LT
1950{
1951 struct sock *sk = sock->sk;
342dfc30 1952 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1953 struct sk_buff *skb = NULL;
1da177e4 1954 struct net_device *dev;
c14ac945 1955 struct sockcm_cookie sockc;
40d4e3df 1956 __be16 proto = 0;
1da177e4 1957 int err;
3bdc0eba 1958 int extra_len = 0;
1ce4f28b 1959
1da177e4 1960 /*
1ce4f28b 1961 * Get and verify the address.
1da177e4
LT
1962 */
1963
40d4e3df 1964 if (saddr) {
1da177e4 1965 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1966 return -EINVAL;
1967 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1968 proto = saddr->spkt_protocol;
1969 } else
1970 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1971
1972 /*
1ce4f28b 1973 * Find the device first to size check it
1da177e4
LT
1974 */
1975
de74e92a 1976 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1977retry:
654d1f8a
ED
1978 rcu_read_lock();
1979 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1980 err = -ENODEV;
1981 if (dev == NULL)
1982 goto out_unlock;
1ce4f28b 1983
d5e76b0a
DM
1984 err = -ENETDOWN;
1985 if (!(dev->flags & IFF_UP))
1986 goto out_unlock;
1987
1da177e4 1988 /*
40d4e3df
ED
1989 * You may not queue a frame bigger than the mtu. This is the lowest level
1990 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1991 */
1ce4f28b 1992
3bdc0eba
BG
1993 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1994 if (!netif_supports_nofcs(dev)) {
1995 err = -EPROTONOSUPPORT;
1996 goto out_unlock;
1997 }
1998 extra_len = 4; /* We're doing our own CRC */
1999 }
2000
1da177e4 2001 err = -EMSGSIZE;
3bdc0eba 2002 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
2003 goto out_unlock;
2004
1a35ca80
ED
2005 if (!skb) {
2006 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 2007 int tlen = dev->needed_tailroom;
1a35ca80
ED
2008 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
2009
2010 rcu_read_unlock();
4ce40912 2011 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
2012 if (skb == NULL)
2013 return -ENOBUFS;
2014 /* FIXME: Save some space for broken drivers that write a hard
2015 * header at transmission time by themselves. PPP is the notable
2016 * one here. This should really be fixed at the driver level.
2017 */
2018 skb_reserve(skb, reserved);
2019 skb_reset_network_header(skb);
2020
2021 /* Try to align data part correctly */
2022 if (hhlen) {
2023 skb->data -= hhlen;
2024 skb->tail -= hhlen;
2025 if (len < hhlen)
2026 skb_reset_network_header(skb);
2027 }
6ce8e9ce 2028 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
2029 if (err)
2030 goto out_free;
2031 goto retry;
1da177e4
LT
2032 }
2033
6a341729 2034 if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
9ed988cd
WB
2035 err = -EINVAL;
2036 goto out_unlock;
2037 }
3c70c132
DB
2038 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2039 !packet_extra_vlan_len_allowed(dev, skb)) {
2040 err = -EMSGSIZE;
2041 goto out_unlock;
57f89bfa 2042 }
1a35ca80 2043
657a0667 2044 sockcm_init(&sockc, sk);
c14ac945
SHY
2045 if (msg->msg_controllen) {
2046 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 2047 if (unlikely(err))
c14ac945 2048 goto out_unlock;
c14ac945
SHY
2049 }
2050
1da177e4
LT
2051 skb->protocol = proto;
2052 skb->dev = dev;
2053 skb->priority = sk->sk_priority;
2d37a186 2054 skb->mark = sk->sk_mark;
3d0ba8c0 2055 skb->tstamp = sockc.transmit_time;
bf84a010 2056
8f932f76 2057 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2058
3bdc0eba
BG
2059 if (unlikely(extra_len == 4))
2060 skb->no_fcs = 1;
2061
75c65772 2062 packet_parse_headers(skb, sock);
c1aad275 2063
1da177e4 2064 dev_queue_xmit(skb);
654d1f8a 2065 rcu_read_unlock();
40d4e3df 2066 return len;
1da177e4 2067
1da177e4 2068out_unlock:
654d1f8a 2069 rcu_read_unlock();
1a35ca80
ED
2070out_free:
2071 kfree_skb(skb);
1da177e4
LT
2072 return err;
2073}
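/*
 * Illustrative userspace sketch of the legacy SOCK_PACKET transmit path
 * implemented by packet_sendmsg_spkt() above: the caller supplies a
 * complete link-layer frame and names the device in a sockaddr_pkt.
 * The device name "eth0" and protocol are assumptions; the socket would
 * typically be created with socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_IP)).
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t spkt_send_frame(int fd, const void *frame, size_t len)
{
	struct sockaddr_pkt spkt;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0",
		sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(ETH_P_IP);

	/* no fragmentation is done for us: len must fit the device MTU */
	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&spkt, sizeof(spkt));
}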
1da177e4 2074
ff936a04
AS
2075static unsigned int run_filter(struct sk_buff *skb,
2076 const struct sock *sk,
2077 unsigned int res)
1da177e4
LT
2078{
2079 struct sk_filter *filter;
fda9ef5d 2080
80f8f102
ED
2081 rcu_read_lock();
2082 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2083 if (filter != NULL)
ff936a04 2084 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2085 rcu_read_unlock();
1da177e4 2086
dbcb5855 2087 return res;
1da177e4
LT
2088}
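/*
 * Illustrative userspace sketch of installing the classic BPF program that
 * run_filter() above executes per packet. The single-instruction filter
 * shown simply accepts every packet in full (returning 0 instead would make
 * run_filter() report a zero result and the callers drop the skb).
 */
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_accept_all_filter(int fd)
{
	struct sock_filter code[] = {
		/* return 0xffff: keep up to 0xffff bytes of the packet */
		BPF_STMT(BPF_RET | BPF_K, 0x0000ffff),
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}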
2089
16cc1400 2090static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
dfc39d40 2091 size_t *len, int vnet_hdr_sz)
16cc1400 2092{
dfc39d40 2093 struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
16cc1400 2094
dfc39d40 2095 if (*len < vnet_hdr_sz)
16cc1400 2096 return -EINVAL;
dfc39d40 2097 *len -= vnet_hdr_sz;
16cc1400 2098
dfc39d40 2099 if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2100 return -EINVAL;
2101
dfc39d40 2102 return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
16cc1400
WB
2103}
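/*
 * Illustrative userspace sketch for packet_rcv_vnet() above: once
 * PACKET_VNET_HDR is enabled on the socket, every read()/recvmsg() is
 * prefixed with a struct virtio_net_hdr describing offload state. The
 * buffer handling below is a minimal assumed layout, not the only
 * valid one.
 */
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/virtio_net.h>

static int enable_vnet_hdr(int fd)
{
	int on = 1;

	return setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
}

static ssize_t read_with_vnet_hdr(int fd, void *buf, size_t len,
				  struct virtio_net_hdr **hdr)
{
	ssize_t n = read(fd, buf, len);

	/* packet_rcv_vnet() placed the header at the start of the buffer;
	 * the frame itself follows immediately after it */
	*hdr = (n >= (ssize_t)sizeof(**hdr)) ? buf : NULL;
	return n;
}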
2104
1da177e4 2105/*
62ab0812
ED
 2106 * This function performs lazy skb cloning in the hope that most packets
 2107 * are discarded by BPF.
 2108 *
 2109 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 2110 * and skb->cb are mangled. It works because (and until) packets
 2111 * falling here are owned by the current CPU. Output packets are cloned
 2112 * by dev_queue_xmit_nit(), input packets are processed by net_bh
0e4161d0 2113 * sequentially, so that if we return the skb to its original state on exit,
62ab0812 2114 * we will not harm anyone.
1da177e4
LT
2115 */
2116
40d4e3df
ED
2117static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2118 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2119{
2120 struct sock *sk;
2121 struct sockaddr_ll *sll;
2122 struct packet_sock *po;
40d4e3df 2123 u8 *skb_head = skb->data;
1da177e4 2124 int skb_len = skb->len;
dbcb5855 2125 unsigned int snaplen, res;
da37845f 2126 bool is_drop_n_account = false;
1da177e4
LT
2127
2128 if (skb->pkt_type == PACKET_LOOPBACK)
2129 goto drop;
2130
2131 sk = pt->af_packet_priv;
2132 po = pkt_sk(sk);
2133
09ad9bc7 2134 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2135 goto drop;
2136
1da177e4
LT
2137 skb->dev = dev;
2138
d5496990 2139 if (dev_has_header(dev)) {
1da177e4 2140 /* The device has an explicit notion of ll header,
62ab0812
ED
2141 * exported to higher levels.
2142 *
2143 * Otherwise, the device hides details of its frame
2144 * structure, so that corresponding packet head is
2145 * never delivered to user.
1da177e4
LT
2146 */
2147 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2148 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2149 else if (skb->pkt_type == PACKET_OUTGOING) {
2150 /* Special case: outgoing packets have ll header at head */
bbe735e4 2151 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2152 }
2153 }
2154
2155 snaplen = skb->len;
2156
dbcb5855
DM
2157 res = run_filter(skb, sk, snaplen);
2158 if (!res)
fda9ef5d 2159 goto drop_n_restore;
dbcb5855
DM
2160 if (snaplen > res)
2161 snaplen = res;
1da177e4 2162
0fd7bac6 2163 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2164 goto drop_n_acct;
2165
2166 if (skb_shared(skb)) {
2167 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2168 if (nskb == NULL)
2169 goto drop_n_acct;
2170
2171 if (skb_head != skb->data) {
2172 skb->data = skb_head;
2173 skb->len = skb_len;
2174 }
abc4e4fa 2175 consume_skb(skb);
1da177e4
LT
2176 skb = nskb;
2177 }
2178
b4772ef8 2179 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2180
2181 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2182 sll->sll_hatype = dev->type;
1da177e4 2183 sll->sll_pkttype = skb->pkt_type;
ee5675ec 2184 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
80feaacb
PWJ
2185 sll->sll_ifindex = orig_dev->ifindex;
2186 else
2187 sll->sll_ifindex = dev->ifindex;
1da177e4 2188
b95cce35 2189 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2190
2472d761
EB
2191 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2192 * Use their space for storing the original skb length.
2193 */
2194 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2195
1da177e4
LT
2196 if (pskb_trim(skb, snaplen))
2197 goto drop_n_acct;
2198
2199 skb_set_owner_r(skb, sk);
2200 skb->dev = NULL;
adf30907 2201 skb_dst_drop(skb);
1da177e4 2202
84531c24 2203 /* drop conntrack reference */
895b5c9f 2204 nf_reset_ct(skb);
84531c24 2205
1da177e4 2206 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2207 po->stats.stats1.tp_packets++;
3bc3b96f 2208 sock_skb_set_dropcount(sk, skb);
27942a15 2209 skb_clear_delivery_time(skb);
1da177e4
LT
2210 __skb_queue_tail(&sk->sk_receive_queue, skb);
2211 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2212 sk->sk_data_ready(sk);
1da177e4
LT
2213 return 0;
2214
2215drop_n_acct:
da37845f 2216 is_drop_n_account = true;
8e8e2951 2217 atomic_inc(&po->tp_drops);
7091fbd8 2218 atomic_inc(&sk->sk_drops);
1da177e4
LT
2219
2220drop_n_restore:
2221 if (skb_head != skb->data && skb_shared(skb)) {
2222 skb->data = skb_head;
2223 skb->len = skb_len;
2224 }
2225drop:
da37845f
WJ
2226 if (!is_drop_n_account)
2227 consume_skb(skb);
2228 else
2229 kfree_skb(skb);
1da177e4
LT
2230 return 0;
2231}
2232
40d4e3df
ED
2233static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2234 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2235{
2236 struct sock *sk;
2237 struct packet_sock *po;
2238 struct sockaddr_ll *sll;
184f489e 2239 union tpacket_uhdr h;
40d4e3df 2240 u8 *skb_head = skb->data;
1da177e4 2241 int skb_len = skb->len;
dbcb5855 2242 unsigned int snaplen, res;
f6fb8f10 2243 unsigned long status = TP_STATUS_USER;
acf69c94
OC
2244 unsigned short macoff, hdrlen;
2245 unsigned int netoff;
1da177e4 2246 struct sk_buff *copy_skb = NULL;
d413fcb4 2247 struct timespec64 ts;
b9c32fb2 2248 __u32 ts_status;
da37845f 2249 bool is_drop_n_account = false;
61fad681 2250 unsigned int slot_id = 0;
dfc39d40 2251 int vnet_hdr_sz = 0;
1da177e4 2252
51846355
AW
2253 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2254 * We may add members to them until current aligned size without forcing
2255 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2256 */
2257 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2258 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2259
1da177e4
LT
2260 if (skb->pkt_type == PACKET_LOOPBACK)
2261 goto drop;
2262
2263 sk = pt->af_packet_priv;
2264 po = pkt_sk(sk);
2265
09ad9bc7 2266 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2267 goto drop;
2268
d5496990 2269 if (dev_has_header(dev)) {
1da177e4 2270 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2271 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2272 else if (skb->pkt_type == PACKET_OUTGOING) {
2273 /* Special case: outgoing packets have ll header at head */
bbe735e4 2274 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2275 }
2276 }
2277
2278 snaplen = skb->len;
2279
dbcb5855
DM
2280 res = run_filter(skb, sk, snaplen);
2281 if (!res)
fda9ef5d 2282 goto drop_n_restore;
68c2e5de 2283
2c51c627
ED
2284 /* If we are flooded, just give up */
2285 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2286 atomic_inc(&po->tp_drops);
2287 goto drop_n_restore;
2288 }
2289
68c2e5de
AD
2290 if (skb->ip_summed == CHECKSUM_PARTIAL)
2291 status |= TP_STATUS_CSUMNOTREADY;
682f048b 2292 else if (skb->pkt_type != PACKET_OUTGOING &&
b85f628a 2293 skb_csum_unnecessary(skb))
682f048b 2294 status |= TP_STATUS_CSUM_VALID;
8e08bb75
XL
2295 if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
2296 status |= TP_STATUS_GSO_TCP;
68c2e5de 2297
dbcb5855
DM
2298 if (snaplen > res)
2299 snaplen = res;
1da177e4
LT
2300
2301 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2302 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2303 po->tp_reserve;
1da177e4 2304 } else {
95c96174 2305 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2306 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2307 (maclen < 16 ? 16 : maclen)) +
58d19b19 2308 po->tp_reserve;
dfc39d40
JT
2309 vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2310 if (vnet_hdr_sz)
2311 netoff += vnet_hdr_sz;
1da177e4
LT
2312 macoff = netoff - maclen;
2313 }
acf69c94
OC
2314 if (netoff > USHRT_MAX) {
2315 atomic_inc(&po->tp_drops);
2316 goto drop_n_restore;
2317 }
f6fb8f10 2318 if (po->tp_version <= TPACKET_V2) {
2319 if (macoff + snaplen > po->rx_ring.frame_size) {
2320 if (po->copy_thresh &&
0fd7bac6 2321 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2322 if (skb_shared(skb)) {
2323 copy_skb = skb_clone(skb, GFP_ATOMIC);
2324 } else {
2325 copy_skb = skb_get(skb);
2326 skb_head = skb->data;
2327 }
c700525f
ED
2328 if (copy_skb) {
2329 memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2330 sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
f6fb8f10 2331 skb_set_owner_r(copy_skb, sk);
c700525f 2332 }
1da177e4 2333 }
f6fb8f10 2334 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2335 if ((int)snaplen < 0) {
f6fb8f10 2336 snaplen = 0;
dfc39d40 2337 vnet_hdr_sz = 0;
edbd58be 2338 }
1da177e4 2339 }
dc808110
ED
2340 } else if (unlikely(macoff + snaplen >
2341 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2342 u32 nval;
2343
2344 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2345 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2346 snaplen, nval, macoff);
2347 snaplen = nval;
2348 if (unlikely((int)snaplen < 0)) {
2349 snaplen = 0;
2350 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
dfc39d40 2351 vnet_hdr_sz = 0;
dc808110 2352 }
1da177e4 2353 }
1da177e4 2354 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2355 h.raw = packet_current_rx_frame(po, skb,
2356 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2357 if (!h.raw)
58d19b19 2358 goto drop_n_account;
46e4c421 2359
61fad681
WB
2360 if (po->tp_version <= TPACKET_V2) {
2361 slot_id = po->rx_ring.head;
2362 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2363 goto drop_n_account;
2364 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2365 }
2366
dfc39d40 2367 if (vnet_hdr_sz &&
46e4c421
WB
2368 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2369 sizeof(struct virtio_net_hdr),
88fd1cb8
JO
2370 vio_le(), true, 0)) {
2371 if (po->tp_version == TPACKET_V3)
2372 prb_clear_blk_fill_status(&po->rx_ring);
46e4c421 2373 goto drop_n_account;
88fd1cb8 2374 }
46e4c421 2375
f6fb8f10 2376 if (po->tp_version <= TPACKET_V2) {
2377 packet_increment_rx_head(po, &po->rx_ring);
2378 /*
2379 * LOSING will be reported till you read the stats,
2380 * because it's COR - Clear On Read.
 2381 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2382 * at packet level.
2383 */
8e8e2951 2384 if (atomic_read(&po->tp_drops))
f6fb8f10 2385 status |= TP_STATUS_LOSING;
2386 }
945d015e 2387
ee80fbf3 2388 po->stats.stats1.tp_packets++;
1da177e4
LT
2389 if (copy_skb) {
2390 status |= TP_STATUS_COPY;
27942a15 2391 skb_clear_delivery_time(copy_skb);
1da177e4
LT
2392 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2393 }
1da177e4
LT
2394 spin_unlock(&sk->sk_receive_queue.lock);
2395
bbd6ef87 2396 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2 2397
171c3b15
RS
2398 /* Always timestamp; prefer an existing software timestamp taken
2399 * closer to the time of capture.
2400 */
2401 ts_status = tpacket_get_timestamp(skb, &ts,
1051ce4a
ED
2402 READ_ONCE(po->tp_tstamp) |
2403 SOF_TIMESTAMPING_SOFTWARE);
171c3b15 2404 if (!ts_status)
d413fcb4 2405 ktime_get_real_ts64(&ts);
1da177e4 2406
b9c32fb2
DB
2407 status |= ts_status;
2408
bbd6ef87
PM
2409 switch (po->tp_version) {
2410 case TPACKET_V1:
2411 h.h1->tp_len = skb->len;
2412 h.h1->tp_snaplen = snaplen;
2413 h.h1->tp_mac = macoff;
2414 h.h1->tp_net = netoff;
4b457bdf
DB
2415 h.h1->tp_sec = ts.tv_sec;
2416 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2417 hdrlen = sizeof(*h.h1);
2418 break;
2419 case TPACKET_V2:
2420 h.h2->tp_len = skb->len;
2421 h.h2->tp_snaplen = snaplen;
2422 h.h2->tp_mac = macoff;
2423 h.h2->tp_net = netoff;
bbd6ef87
PM
2424 h.h2->tp_sec = ts.tv_sec;
2425 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2426 if (skb_vlan_tag_present(skb)) {
2427 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2428 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2429 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2430 } else {
2431 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2432 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2433 }
e4d26f4b 2434 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2435 hdrlen = sizeof(*h.h2);
2436 break;
f6fb8f10 2437 case TPACKET_V3:
2438 /* tp_nxt_offset,vlan are already populated above.
2439 * So DONT clear those fields here
2440 */
2441 h.h3->tp_status |= status;
2442 h.h3->tp_len = skb->len;
2443 h.h3->tp_snaplen = snaplen;
2444 h.h3->tp_mac = macoff;
2445 h.h3->tp_net = netoff;
f6fb8f10 2446 h.h3->tp_sec = ts.tv_sec;
2447 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2448 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2449 hdrlen = sizeof(*h.h3);
2450 break;
bbd6ef87
PM
2451 default:
2452 BUG();
2453 }
1da177e4 2454
bbd6ef87 2455 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2456 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2457 sll->sll_family = AF_PACKET;
2458 sll->sll_hatype = dev->type;
2459 sll->sll_protocol = skb->protocol;
2460 sll->sll_pkttype = skb->pkt_type;
ee5675ec 2461 if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
80feaacb
PWJ
2462 sll->sll_ifindex = orig_dev->ifindex;
2463 else
2464 sll->sll_ifindex = dev->ifindex;
1da177e4 2465
e16aa207 2466 smp_mb();
f0d4eb29 2467
f6dafa95 2468#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2469 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2470 u8 *start, *end;
2471
f0d4eb29
DB
2472 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2473 macoff + snaplen);
2474
2475 for (start = h.raw; start < end; start += PAGE_SIZE)
2476 flush_dcache_page(pgv_to_page(start));
1da177e4 2477 }
f0d4eb29 2478 smp_wmb();
f6dafa95 2479#endif
f0d4eb29 2480
da413eec 2481 if (po->tp_version <= TPACKET_V2) {
61fad681 2482 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2483 __packet_set_status(po, h.raw, status);
61fad681
WB
2484 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2485 spin_unlock(&sk->sk_receive_queue.lock);
da413eec 2486 sk->sk_data_ready(sk);
88fd1cb8 2487 } else if (po->tp_version == TPACKET_V3) {
f6fb8f10 2488 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2489 }
1da177e4
LT
2490
2491drop_n_restore:
2492 if (skb_head != skb->data && skb_shared(skb)) {
2493 skb->data = skb_head;
2494 skb->len = skb_len;
2495 }
2496drop:
da37845f
WJ
2497 if (!is_drop_n_account)
2498 consume_skb(skb);
2499 else
2500 kfree_skb(skb);
1da177e4
LT
2501 return 0;
2502
58d19b19 2503drop_n_account:
1da177e4 2504 spin_unlock(&sk->sk_receive_queue.lock);
8e8e2951
ED
2505 atomic_inc(&po->tp_drops);
2506 is_drop_n_account = true;
1da177e4 2507
676d2369 2508 sk->sk_data_ready(sk);
acb5d75b 2509 kfree_skb(copy_skb);
1da177e4
LT
2510 goto drop_n_restore;
2511}
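/*
 * Illustrative userspace sketch of the TPACKET_V2 receive ring that
 * tpacket_rcv() above fills. Block and frame sizes are assumptions for the
 * example; real code would also consult getsockopt(PACKET_HDRLEN).
 */
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd, void **ring, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	req->tp_block_size = 4096;	/* one page per block (assumed) */
	req->tp_frame_size = 2048;	/* two frames per block */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = req->tp_block_nr *
			     (req->tp_block_size / req->tp_frame_size);
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
		return -1;

	*ring = mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return *ring == MAP_FAILED ? -1 : 0;
}

/* Consume one slot: the kernel sets TP_STATUS_USER via __packet_set_status()
 * once tpacket_rcv() has copied a frame in, and userspace hands the slot
 * back by writing TP_STATUS_KERNEL. */
static void consume_rx_frame(void *ring, unsigned int idx,
			     const struct tpacket_req *req)
{
	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
		((char *)ring + (size_t)idx * req->tp_frame_size);

	if (!(hdr->tp_status & TP_STATUS_USER))
		return;			/* not ready; poll() the fd instead */

	/* packet data lives at (char *)hdr + hdr->tp_mac,
	 * hdr->tp_snaplen bytes long */

	__sync_synchronize();
	hdr->tp_status = TP_STATUS_KERNEL;
}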
2512
69e3c75f
JB
2513static void tpacket_destruct_skb(struct sk_buff *skb)
2514{
2515 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2516
69e3c75f 2517 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2518 void *ph;
b9c32fb2
DB
2519 __u32 ts;
2520
5cd8d46e 2521 ph = skb_zcopy_get_nouarg(skb);
b0138408 2522 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2523
2524 ts = __packet_set_timestamp(po, ph, skb);
2525 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
89ed5b51
NH
2526
2527 if (!packet_read_pending(&po->tx_ring))
2528 complete(&po->skb_completion);
69e3c75f
JB
2529 }
2530
2531 sock_wfree(skb);
2532}
2533
16cc1400
WB
2534static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2535{
16cc1400
WB
2536 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2537 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2538 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2539 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2540 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2541 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2542 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2543
2544 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2545 return -EINVAL;
2546
16cc1400
WB
2547 return 0;
2548}
2549
2550static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
dfc39d40 2551 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
16cc1400 2552{
dfc39d40
JT
2553 int ret;
2554
2555 if (*len < vnet_hdr_sz)
16cc1400 2556 return -EINVAL;
dfc39d40 2557 *len -= vnet_hdr_sz;
16cc1400 2558
cbbd26b8 2559 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2560 return -EFAULT;
2561
dfc39d40
JT
2562 ret = __packet_snd_vnet_parse(vnet_hdr, *len);
2563 if (ret)
2564 return ret;
2565
2566 /* move iter to point to the start of mac header */
2567 if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
2568 iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
2569
2570 return 0;
16cc1400
WB
2571}
2572
40d4e3df 2573static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2574 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2575 __be16 proto, unsigned char *addr, int hlen, int copylen,
2576 const struct sockcm_cookie *sockc)
69e3c75f 2577{
184f489e 2578 union tpacket_uhdr ph;
8d39b4a6 2579 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2580 struct socket *sock = po->sk.sk_socket;
2581 struct page *page;
69e3c75f
JB
2582 int err;
2583
2584 ph.raw = frame;
2585
2586 skb->protocol = proto;
2587 skb->dev = dev;
2588 skb->priority = po->sk.sk_priority;
2d37a186 2589 skb->mark = po->sk.sk_mark;
3d0ba8c0 2590 skb->tstamp = sockc->transmit_time;
8f932f76 2591 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2592 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2593
ae641949 2594 skb_reserve(skb, hlen);
69e3c75f 2595 skb_reset_network_header(skb);
c1aad275 2596
69e3c75f
JB
2597 to_write = tp_len;
2598
2599 if (sock->type == SOCK_DGRAM) {
2600 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2601 NULL, tp_len);
2602 if (unlikely(err < 0))
2603 return -EINVAL;
1d036d25 2604 } else if (copylen) {
9ed988cd
WB
2605 int hdrlen = min_t(int, copylen, tp_len);
2606
69e3c75f 2607 skb_push(skb, dev->hard_header_len);
1d036d25 2608 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2609 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2610 if (unlikely(err))
2611 return err;
9ed988cd
WB
2612 if (!dev_validate_header(dev, skb->data, hdrlen))
2613 return -EINVAL;
69e3c75f 2614
9ed988cd
WB
2615 data += hdrlen;
2616 to_write -= hdrlen;
69e3c75f
JB
2617 }
2618
69e3c75f
JB
2619 offset = offset_in_page(data);
2620 len_max = PAGE_SIZE - offset;
2621 len = ((to_write > len_max) ? len_max : to_write);
2622
2623 skb->data_len = to_write;
2624 skb->len += to_write;
2625 skb->truesize += to_write;
14afee4b 2626 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2627
2628 while (likely(to_write)) {
2629 nr_frags = skb_shinfo(skb)->nr_frags;
2630
2631 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
3948b059
ED
2632 pr_err("Packet exceed the number of skb frags(%u)\n",
2633 (unsigned int)MAX_SKB_FRAGS);
69e3c75f
JB
2634 return -EFAULT;
2635 }
2636
0af55bb5
CG
2637 page = pgv_to_page(data);
2638 data += len;
69e3c75f
JB
2639 flush_dcache_page(page);
2640 get_page(page);
0af55bb5 2641 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2642 to_write -= len;
2643 offset = 0;
2644 len_max = PAGE_SIZE;
2645 len = ((to_write > len_max) ? len_max : to_write);
2646 }
2647
75c65772 2648 packet_parse_headers(skb, sock);
efdfa2f7 2649
69e3c75f
JB
2650 return tp_len;
2651}
2652
8d39b4a6
WB
2653static int tpacket_parse_header(struct packet_sock *po, void *frame,
2654 int size_max, void **data)
2655{
2656 union tpacket_uhdr ph;
2657 int tp_len, off;
2658
2659 ph.raw = frame;
2660
2661 switch (po->tp_version) {
7f953ab2
SV
2662 case TPACKET_V3:
2663 if (ph.h3->tp_next_offset != 0) {
2664 pr_warn_once("variable sized slot not supported");
2665 return -EINVAL;
2666 }
2667 tp_len = ph.h3->tp_len;
2668 break;
8d39b4a6
WB
2669 case TPACKET_V2:
2670 tp_len = ph.h2->tp_len;
2671 break;
2672 default:
2673 tp_len = ph.h1->tp_len;
2674 break;
2675 }
2676 if (unlikely(tp_len > size_max)) {
2677 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2678 return -EMSGSIZE;
2679 }
2680
74383446 2681 if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
8d39b4a6
WB
2682 int off_min, off_max;
2683
2684 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2685 off_max = po->tx_ring.frame_size - tp_len;
2686 if (po->sk.sk_type == SOCK_DGRAM) {
2687 switch (po->tp_version) {
7f953ab2
SV
2688 case TPACKET_V3:
2689 off = ph.h3->tp_net;
2690 break;
8d39b4a6
WB
2691 case TPACKET_V2:
2692 off = ph.h2->tp_net;
2693 break;
2694 default:
2695 off = ph.h1->tp_net;
2696 break;
2697 }
2698 } else {
2699 switch (po->tp_version) {
7f953ab2
SV
2700 case TPACKET_V3:
2701 off = ph.h3->tp_mac;
2702 break;
8d39b4a6
WB
2703 case TPACKET_V2:
2704 off = ph.h2->tp_mac;
2705 break;
2706 default:
2707 off = ph.h1->tp_mac;
2708 break;
2709 }
2710 }
2711 if (unlikely((off < off_min) || (off_max < off)))
2712 return -EINVAL;
2713 } else {
2714 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2715 }
2716
2717 *data = frame + off;
2718 return tp_len;
2719}
2720
69e3c75f
JB
2721static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2722{
89ed5b51 2723 struct sk_buff *skb = NULL;
69e3c75f 2724 struct net_device *dev;
1d036d25 2725 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2726 struct sockcm_cookie sockc;
69e3c75f 2727 __be16 proto;
09effa67 2728 int err, reserve = 0;
40d4e3df 2729 void *ph;
342dfc30 2730 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2731 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
dfc39d40 2732 int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
486efdc8 2733 unsigned char *addr = NULL;
69e3c75f 2734 int tp_len, size_max;
8d39b4a6 2735 void *data;
69e3c75f 2736 int len_sum = 0;
9e67030a 2737 int status = TP_STATUS_AVAILABLE;
1d036d25 2738 int hlen, tlen, copylen = 0;
89ed5b51 2739 long timeo = 0;
69e3c75f 2740
69e3c75f
JB
2741 mutex_lock(&po->pg_vec_lock);
2742
32d3182c
ED
2743 /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
2744 * we need to confirm it under protection of pg_vec_lock.
2745 */
2746 if (unlikely(!po->tx_ring.pg_vec)) {
2747 err = -EBUSY;
2748 goto out;
2749 }
66e56cd4 2750 if (likely(saddr == NULL)) {
e40526cb 2751 dev = packet_cached_dev_get(po);
c7d2ef5d 2752 proto = READ_ONCE(po->num);
69e3c75f
JB
2753 } else {
2754 err = -EINVAL;
2755 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2756 goto out;
2757 if (msg->msg_namelen < (saddr->sll_halen
2758 + offsetof(struct sockaddr_ll,
2759 sll_addr)))
2760 goto out;
69e3c75f 2761 proto = saddr->sll_protocol;
827d9780 2762 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2763 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2764 if (dev && msg->msg_namelen < dev->addr_len +
2765 offsetof(struct sockaddr_ll, sll_addr))
2766 goto out_put;
2767 addr = saddr->sll_addr;
2768 }
69e3c75f
JB
2769 }
2770
69e3c75f
JB
2771 err = -ENXIO;
2772 if (unlikely(dev == NULL))
2773 goto out;
69e3c75f
JB
2774 err = -ENETDOWN;
2775 if (unlikely(!(dev->flags & IFF_UP)))
2776 goto out_put;
2777
657a0667 2778 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2779 if (msg->msg_controllen) {
2780 err = sock_cmsg_send(&po->sk, msg, &sockc);
2781 if (unlikely(err))
2782 goto out_put;
2783 }
2784
5cfb4c8d
DB
2785 if (po->sk.sk_socket->type == SOCK_RAW)
2786 reserve = dev->hard_header_len;
69e3c75f 2787 size_max = po->tx_ring.frame_size
b5dd884e 2788 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2789
dfc39d40 2790 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
5cfb4c8d 2791 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2792
89ed5b51
NH
2793 reinit_completion(&po->skb_completion);
2794
69e3c75f
JB
2795 do {
2796 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2797 TP_STATUS_SEND_REQUEST);
69e3c75f 2798 if (unlikely(ph == NULL)) {
89ed5b51
NH
2799 if (need_wait && skb) {
2800 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2801 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2802 if (timeo <= 0) {
2803 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2804 goto out_put;
2805 }
2806 }
2807 /* check for additional frames */
69e3c75f
JB
2808 continue;
2809 }
2810
8d39b4a6
WB
2811 skb = NULL;
2812 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2813 if (tp_len < 0)
2814 goto tpacket_error;
2815
69e3c75f 2816 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2817 hlen = LL_RESERVED_SPACE(dev);
2818 tlen = dev->needed_tailroom;
dfc39d40 2819 if (vnet_hdr_sz) {
1d036d25 2820 vnet_hdr = data;
dfc39d40
JT
2821 data += vnet_hdr_sz;
2822 tp_len -= vnet_hdr_sz;
1d036d25
WB
2823 if (tp_len < 0 ||
2824 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2825 tp_len = -EINVAL;
2826 goto tpacket_error;
2827 }
2828 copylen = __virtio16_to_cpu(vio_le(),
2829 vnet_hdr->hdr_len);
2830 }
9ed988cd 2831 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2832 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2833 hlen + tlen + sizeof(struct sockaddr_ll) +
2834 (copylen - dev->hard_header_len),
fbf33a28 2835 !need_wait, &err);
69e3c75f 2836
fbf33a28
KM
2837 if (unlikely(skb == NULL)) {
2838 /* we assume the socket was initially writeable ... */
2839 if (likely(len_sum > 0))
2840 err = len_sum;
69e3c75f 2841 goto out_status;
fbf33a28 2842 }
8d39b4a6 2843 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2844 addr, hlen, copylen, &sockc);
dbd46ab4 2845 if (likely(tp_len >= 0) &&
5cfb4c8d 2846 tp_len > dev->mtu + reserve &&
dfc39d40 2847 !vnet_hdr_sz &&
3c70c132
DB
2848 !packet_extra_vlan_len_allowed(dev, skb))
2849 tp_len = -EMSGSIZE;
69e3c75f
JB
2850
2851 if (unlikely(tp_len < 0)) {
8d39b4a6 2852tpacket_error:
164bddac 2853 if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
69e3c75f
JB
2854 __packet_set_status(po, ph,
2855 TP_STATUS_AVAILABLE);
2856 packet_increment_head(&po->tx_ring);
2857 kfree_skb(skb);
2858 continue;
2859 } else {
2860 status = TP_STATUS_WRONG_FORMAT;
2861 err = tp_len;
2862 goto out_status;
2863 }
2864 }
2865
dfc39d40 2866 if (vnet_hdr_sz) {
9d2f67e4
JT
2867 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2868 tp_len = -EINVAL;
2869 goto tpacket_error;
2870 }
2871 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2872 }
2873
69e3c75f
JB
2874 skb->destructor = tpacket_destruct_skb;
2875 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2876 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2877
2878 status = TP_STATUS_SEND_REQUEST;
105a201e 2879 err = packet_xmit(po, skb);
29e8e659
HL
2880 if (unlikely(err != 0)) {
2881 if (err > 0)
2882 err = net_xmit_errno(err);
eb70df13
JP
2883 if (err && __packet_get_status(po, ph) ==
2884 TP_STATUS_AVAILABLE) {
2885 /* skb was destructed already */
2886 skb = NULL;
2887 goto out_status;
2888 }
2889 /*
2890 * skb was dropped but not destructed yet;
2891 * let's treat it like congestion or err < 0
2892 */
2893 err = 0;
2894 }
69e3c75f
JB
2895 packet_increment_head(&po->tx_ring);
2896 len_sum += tp_len;
b0138408
DB
2897 } while (likely((ph != NULL) ||
2898 /* Note: packet_read_pending() might be slow if we have
 2899 * to call it, as it's a per-cpu variable, but in the fast path
2900 * we already short-circuit the loop with the first
2901 * condition, and luckily don't have to go that path
2902 * anyway.
2903 */
2904 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2905
2906 err = len_sum;
2907 goto out_put;
2908
69e3c75f
JB
2909out_status:
2910 __packet_set_status(po, ph, status);
2911 kfree_skb(skb);
2912out_put:
e40526cb 2913 dev_put(dev);
69e3c75f
JB
2914out:
2915 mutex_unlock(&po->pg_vec_lock);
2916 return err;
2917}
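/*
 * Illustrative userspace sketch of the transmit ring consumed by
 * tpacket_snd() above. Ring setup mirrors the receive example earlier
 * (PACKET_VERSION, PACKET_TX_RING, mmap); only the per-slot send step is
 * shown, and the default TPACKET_V2 data offset is assumed
 * (no PACKET_TX_HAS_OFF).
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_ring_send_one(int fd, void *slot, const void *frame, size_t len)
{
	struct tpacket2_hdr *hdr = slot;
	char *data = (char *)slot + TPACKET2_HDRLEN -
		     sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* slot still owned by the kernel */

	memcpy(data, frame, len);
	hdr->tp_len = len;

	/* hand the slot to tpacket_snd(); tpacket_destruct_skb() flips it
	 * back to TP_STATUS_AVAILABLE once the skb has been transmitted */
	__sync_synchronize();
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* kick the kernel; MSG_DONTWAIT maps to need_wait == false above */
	return send(fd, NULL, 0, MSG_DONTWAIT);
}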
69e3c75f 2918
eea49cc9
OJ
2919static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2920 size_t reserve, size_t len,
2921 size_t linear, int noblock,
2922 int *err)
bfd5f4a3
SS
2923{
2924 struct sk_buff *skb;
2925
2926 /* Under a page? Don't bother with paged skb. */
2927 if (prepad + len < PAGE_SIZE || !linear)
2928 linear = len;
2929
2930 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2931 err, 0);
bfd5f4a3
SS
2932 if (!skb)
2933 return NULL;
2934
2935 skb_reserve(skb, reserve);
2936 skb_put(skb, linear);
2937 skb->data_len = len - linear;
2938 skb->len += len - linear;
2939
2940 return skb;
2941}
2942
d346a3fa 2943static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2944{
2945 struct sock *sk = sock->sk;
342dfc30 2946 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2947 struct sk_buff *skb;
2948 struct net_device *dev;
0e11c91e 2949 __be16 proto;
486efdc8 2950 unsigned char *addr = NULL;
827d9780 2951 int err, reserve = 0;
c7d39e32 2952 struct sockcm_cookie sockc;
bfd5f4a3
SS
2953 struct virtio_net_hdr vnet_hdr = { 0 };
2954 int offset = 0;
bfd5f4a3 2955 struct packet_sock *po = pkt_sk(sk);
dfc39d40 2956 int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
57031eb7 2957 int hlen, tlen, linear;
3bdc0eba 2958 int extra_len = 0;
1da177e4
LT
2959
2960 /*
1ce4f28b 2961 * Get and verify the address.
1da177e4 2962 */
1ce4f28b 2963
66e56cd4 2964 if (likely(saddr == NULL)) {
e40526cb 2965 dev = packet_cached_dev_get(po);
c7d2ef5d 2966 proto = READ_ONCE(po->num);
1da177e4
LT
2967 } else {
2968 err = -EINVAL;
2969 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2970 goto out;
0fb375fb
EB
2971 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2972 goto out;
1da177e4 2973 proto = saddr->sll_protocol;
827d9780 2974 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2975 if (sock->type == SOCK_DGRAM) {
2976 if (dev && msg->msg_namelen < dev->addr_len +
2977 offsetof(struct sockaddr_ll, sll_addr))
2978 goto out_unlock;
2979 addr = saddr->sll_addr;
2980 }
1da177e4
LT
2981 }
2982
1da177e4 2983 err = -ENXIO;
e40526cb 2984 if (unlikely(dev == NULL))
1da177e4 2985 goto out_unlock;
d5e76b0a 2986 err = -ENETDOWN;
e40526cb 2987 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2988 goto out_unlock;
2989
657a0667 2990 sockcm_init(&sockc, sk);
c7d39e32
EJ
2991 sockc.mark = sk->sk_mark;
2992 if (msg->msg_controllen) {
2993 err = sock_cmsg_send(sk, msg, &sockc);
2994 if (unlikely(err))
2995 goto out_unlock;
2996 }
2997
e40526cb
DB
2998 if (sock->type == SOCK_RAW)
2999 reserve = dev->hard_header_len;
dfc39d40
JT
3000 if (vnet_hdr_sz) {
3001 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
16cc1400 3002 if (err)
bfd5f4a3 3003 goto out_unlock;
bfd5f4a3
SS
3004 }
3005
3bdc0eba
BG
3006 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
3007 if (!netif_supports_nofcs(dev)) {
3008 err = -EPROTONOSUPPORT;
3009 goto out_unlock;
3010 }
3011 extra_len = 4; /* We're doing our own CRC */
3012 }
3013
1da177e4 3014 err = -EMSGSIZE;
16cc1400
WB
3015 if (!vnet_hdr.gso_type &&
3016 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
3017 goto out_unlock;
3018
bfd5f4a3 3019 err = -ENOBUFS;
ae641949
HX
3020 hlen = LL_RESERVED_SPACE(dev);
3021 tlen = dev->needed_tailroom;
57031eb7
WB
3022 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
3023 linear = max(linear, min_t(int, len, dev->hard_header_len));
3024 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 3025 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 3026 if (skb == NULL)
1da177e4
LT
3027 goto out_unlock;
3028
b84bbaf7 3029 skb_reset_network_header(skb);
1da177e4 3030
0c4e8581 3031 err = -EINVAL;
9c707762
WB
3032 if (sock->type == SOCK_DGRAM) {
3033 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 3034 if (unlikely(offset < 0))
9c707762 3035 goto out_free;
b84bbaf7 3036 } else if (reserve) {
9aad13b0 3037 skb_reserve(skb, -reserve);
88a8121d
ND
3038 if (len < reserve + sizeof(struct ipv6hdr) &&
3039 dev->min_header_len != dev->hard_header_len)
993675a3 3040 skb_reset_network_header(skb);
9c707762 3041 }
1da177e4
LT
3042
3043 /* Returns -EFAULT on error */
c0371da6 3044 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
3045 if (err)
3046 goto out_free;
bf84a010 3047
dc633700
ZS
3048 if ((sock->type == SOCK_RAW &&
3049 !dev_validate_header(dev, skb->data, len)) || !skb->len) {
9ed988cd
WB
3050 err = -EINVAL;
3051 goto out_free;
3052 }
3053
8f932f76 3054 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 3055
16cc1400 3056 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
3057 !packet_extra_vlan_len_allowed(dev, skb)) {
3058 err = -EMSGSIZE;
3059 goto out_free;
57f89bfa
BG
3060 }
3061
09effa67
DM
3062 skb->protocol = proto;
3063 skb->dev = dev;
1da177e4 3064 skb->priority = sk->sk_priority;
c7d39e32 3065 skb->mark = sockc.mark;
3d0ba8c0 3066 skb->tstamp = sockc.transmit_time;
0fd5d57b 3067
dfed913e
HL
3068 if (unlikely(extra_len == 4))
3069 skb->no_fcs = 1;
3070
3071 packet_parse_headers(skb, sock);
3072
dfc39d40 3073 if (vnet_hdr_sz) {
db60eb5f 3074 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
3075 if (err)
3076 goto out_free;
dfc39d40 3077 len += vnet_hdr_sz;
9d2f67e4 3078 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
3079 }
3080
105a201e
ED
3081 err = packet_xmit(po, skb);
3082
29e8e659
HL
3083 if (unlikely(err != 0)) {
3084 if (err > 0)
3085 err = net_xmit_errno(err);
3086 if (err)
3087 goto out_unlock;
3088 }
1da177e4 3089
e40526cb 3090 dev_put(dev);
1da177e4 3091
40d4e3df 3092 return len;
1da177e4
LT
3093
3094out_free:
3095 kfree_skb(skb);
3096out_unlock:
1160dfa1 3097 dev_put(dev);
1da177e4
LT
3098out:
3099 return err;
3100}
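/*
 * Illustrative userspace sketch of the non-ring transmit path handled by
 * packet_snd() above, using a SOCK_DGRAM packet socket so that
 * dev_hard_header() builds the Ethernet header from the sockaddr_ll.
 * The interface name "eth0" and EtherType are assumptions.
 */
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t dgram_send_payload(int fd, const void *payload, size_t len,
				  const unsigned char dst_mac[ETH_ALEN])
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_ifindex  = if_nametoindex("eth0");
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

	/* packet_snd() prepends the link-layer header before queuing */
	return sendto(fd, payload, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}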
3101
1b784140 3102static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 3103{
69e3c75f
JB
3104 struct sock *sk = sock->sk;
3105 struct packet_sock *po = pkt_sk(sk);
d346a3fa 3106
d1b5bee4
ED
3107 /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
3108 * tpacket_snd() will redo the check safely.
3109 */
3110 if (data_race(po->tx_ring.pg_vec))
69e3c75f 3111 return tpacket_snd(po, msg);
d1b5bee4
ED
3112
3113 return packet_snd(sock, msg, len);
69e3c75f
JB
3114}
3115
1da177e4
LT
3116/*
3117 * Close a PACKET socket. This is fairly simple. We immediately go
3118 * to 'closed' state and remove our protocol entry in the device list.
3119 */
3120
3121static int packet_release(struct socket *sock)
3122{
3123 struct sock *sk = sock->sk;
3124 struct packet_sock *po;
2bd624b4 3125 struct packet_fanout *f;
d12d01d6 3126 struct net *net;
f6fb8f10 3127 union tpacket_req_u req_u;
1da177e4
LT
3128
3129 if (!sk)
3130 return 0;
3131
3b1e0a65 3132 net = sock_net(sk);
1da177e4
LT
3133 po = pkt_sk(sk);
3134
0fa7fa98 3135 mutex_lock(&net->packet.sklist_lock);
808f5114 3136 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3137 mutex_unlock(&net->packet.sklist_lock);
3138
920de804 3139 sock_prot_inuse_add(net, sk->sk_prot, -1);
1da177e4 3140
808f5114 3141 spin_lock(&po->bind_lock);
ce06b03e 3142 unregister_prot_hook(sk, false);
66e56cd4
DB
3143 packet_cached_dev_reset(po);
3144
160ff18a 3145 if (po->prot_hook.dev) {
d62607c3 3146 netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
160ff18a
BG
3147 po->prot_hook.dev = NULL;
3148 }
808f5114 3149 spin_unlock(&po->bind_lock);
1da177e4 3150
1da177e4 3151 packet_flush_mclist(sk);
1da177e4 3152
5171b37d 3153 lock_sock(sk);
9665d5d6
PS
3154 if (po->rx_ring.pg_vec) {
3155 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3156 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3157 }
69e3c75f 3158
9665d5d6
PS
3159 if (po->tx_ring.pg_vec) {
3160 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3161 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3162 }
5171b37d 3163 release_sock(sk);
1da177e4 3164
2bd624b4 3165 f = fanout_release(sk);
dc99f600 3166
808f5114 3167 synchronize_net();
2bd624b4 3168
afa0925c 3169 kfree(po->rollover);
2bd624b4
AS
3170 if (f) {
3171 fanout_release_data(f);
9c661b0b 3172 kvfree(f);
2bd624b4 3173 }
1da177e4
LT
3174 /*
3175 * Now the socket is dead. No more input will appear.
3176 */
1da177e4
LT
3177 sock_orphan(sk);
3178 sock->sk = NULL;
3179
3180 /* Purge queues */
3181
3182 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3183 packet_free_pending(po);
1da177e4
LT
3184
3185 sock_put(sk);
3186 return 0;
3187}
3188
3189/*
3190 * Attach a packet hook.
3191 */
3192
30f7ea1c
FR
3193static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3194 __be16 proto)
1da177e4
LT
3195{
3196 struct packet_sock *po = pkt_sk(sk);
30f7ea1c 3197 struct net_device *dev = NULL;
30f7ea1c 3198 bool unlisted = false;
bf44077c
ED
3199 bool need_rehook;
3200 int ret = 0;
dc99f600 3201
1da177e4 3202 lock_sock(sk);
1da177e4 3203 spin_lock(&po->bind_lock);
30f7ea1c
FR
3204 rcu_read_lock();
3205
4971613c
WB
3206 if (po->fanout) {
3207 ret = -EINVAL;
3208 goto out_unlock;
3209 }
3210
30f7ea1c
FR
3211 if (name) {
3212 dev = dev_get_by_name_rcu(sock_net(sk), name);
3213 if (!dev) {
3214 ret = -ENODEV;
3215 goto out_unlock;
3216 }
3217 } else if (ifindex) {
3218 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3219 if (!dev) {
3220 ret = -ENODEV;
3221 goto out_unlock;
3222 }
3223 }
3224
bf44077c 3225 need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
902fefb8
DB
3226
3227 if (need_rehook) {
bf44077c 3228 dev_hold(dev);
61edf479 3229 if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
30f7ea1c 3230 rcu_read_unlock();
15fe076e
ED
3231 /* prevents packet_notifier() from calling
3232 * register_prot_hook()
3233 */
c7d2ef5d 3234 WRITE_ONCE(po->num, 0);
30f7ea1c
FR
3235 __unregister_prot_hook(sk, true);
3236 rcu_read_lock();
30f7ea1c
FR
3237 if (dev)
3238 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3239 dev->ifindex);
3240 }
1da177e4 3241
61edf479 3242 BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
c7d2ef5d 3243 WRITE_ONCE(po->num, proto);
902fefb8 3244 po->prot_hook.type = proto;
902fefb8 3245
d62607c3 3246 netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
f1d9268e 3247
30f7ea1c 3248 if (unlikely(unlisted)) {
30f7ea1c 3249 po->prot_hook.dev = NULL;
e032f7c9 3250 WRITE_ONCE(po->ifindex, -1);
30f7ea1c
FR
3251 packet_cached_dev_reset(po);
3252 } else {
d62607c3
JK
3253 netdev_hold(dev, &po->prot_hook.dev_tracker,
3254 GFP_ATOMIC);
30f7ea1c 3255 po->prot_hook.dev = dev;
e032f7c9 3256 WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
30f7ea1c
FR
3257 packet_cached_dev_assign(po, dev);
3258 }
bf44077c 3259 dev_put(dev);
902fefb8 3260 }
66e56cd4 3261
902fefb8 3262 if (proto == 0 || !need_rehook)
1da177e4
LT
3263 goto out_unlock;
3264
30f7ea1c 3265 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3266 register_prot_hook(sk);
be85d4ad
UT
3267 } else {
3268 sk->sk_err = ENETDOWN;
3269 if (!sock_flag(sk, SOCK_DEAD))
e3ae2365 3270 sk_error_report(sk);
1da177e4
LT
3271 }
3272
3273out_unlock:
30f7ea1c 3274 rcu_read_unlock();
1da177e4
LT
3275 spin_unlock(&po->bind_lock);
3276 release_sock(sk);
30f7ea1c 3277 return ret;
1da177e4
LT
3278}
3279
3280/*
3281 * Bind a packet socket to a device
3282 */
3283
40d4e3df
ED
3284static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3285 int addr_len)
1da177e4 3286{
40d4e3df 3287 struct sock *sk = sock->sk;
b5f0de6d 3288 char name[sizeof(uaddr->sa_data_min) + 1];
1ce4f28b 3289
1da177e4
LT
3290 /*
3291 * Check legality
3292 */
1ce4f28b 3293
8ae55f04 3294 if (addr_len != sizeof(struct sockaddr))
1da177e4 3295 return -EINVAL;
540e2894
AP
 3296 /* uaddr->sa_data comes from userspace; it's not guaranteed to be
3297 * zero-terminated.
3298 */
b5f0de6d
KC
3299 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
3300 name[sizeof(uaddr->sa_data_min)] = 0;
1da177e4 3301
30f7ea1c 3302 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3303}
1da177e4
LT
3304
3305static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3306{
40d4e3df
ED
3307 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3308 struct sock *sk = sock->sk;
1da177e4
LT
3309
3310 /*
3311 * Check legality
3312 */
1ce4f28b 3313
1da177e4
LT
3314 if (addr_len < sizeof(struct sockaddr_ll))
3315 return -EINVAL;
3316 if (sll->sll_family != AF_PACKET)
3317 return -EINVAL;
3318
30f7ea1c
FR
3319 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3320 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3321}
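/*
 * Illustrative userspace sketch of creating and binding the socket types
 * accepted by packet_create() and packet_bind() above. CAP_NET_RAW is
 * required; the interface name is an assumption supplied by the caller.
 */
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_bound_packet_socket(const char *ifname)
{
	struct sockaddr_ll sll;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* what packet_do_bind() hooks */
	sll.sll_ifindex  = if_nametoindex(ifname);

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return -1;
	return fd;
}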
3322
3323static struct proto packet_proto = {
3324 .name = "PACKET",
3325 .owner = THIS_MODULE,
3326 .obj_size = sizeof(struct packet_sock),
3327};
3328
3329/*
1ce4f28b 3330 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3331 */
3332
3f378b68
EP
3333static int packet_create(struct net *net, struct socket *sock, int protocol,
3334 int kern)
1da177e4
LT
3335{
3336 struct sock *sk;
3337 struct packet_sock *po;
0e11c91e 3338 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3339 int err;
3340
df008c91 3341 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3342 return -EPERM;
be02097c
DM
3343 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3344 sock->type != SOCK_PACKET)
1da177e4
LT
3345 return -ESOCKTNOSUPPORT;
3346
3347 sock->state = SS_UNCONNECTED;
3348
3349 err = -ENOBUFS;
11aa9c28 3350 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3351 if (sk == NULL)
3352 goto out;
3353
3354 sock->ops = &packet_ops;
1da177e4
LT
3355 if (sock->type == SOCK_PACKET)
3356 sock->ops = &packet_ops_spkt;
be02097c 3357
1da177e4
LT
3358 sock_init_data(sock, sk);
3359
3360 po = pkt_sk(sk);
89ed5b51 3361 init_completion(&po->skb_completion);
1da177e4 3362 sk->sk_family = PF_PACKET;
0e11c91e 3363 po->num = proto;
66e56cd4 3364
b0138408
DB
3365 err = packet_alloc_pending(po);
3366 if (err)
3367 goto out2;
3368
66e56cd4 3369 packet_cached_dev_reset(po);
1da177e4
LT
3370
3371 sk->sk_destruct = packet_sock_destruct;
1da177e4
LT
3372
3373 /*
3374 * Attach a protocol block
3375 */
3376
3377 spin_lock_init(&po->bind_lock);
905db440 3378 mutex_init(&po->pg_vec_lock);
0648ab70 3379 po->rollover = NULL;
1da177e4 3380 po->prot_hook.func = packet_rcv;
be02097c 3381
1da177e4
LT
3382 if (sock->type == SOCK_PACKET)
3383 po->prot_hook.func = packet_rcv_spkt;
be02097c 3384
1da177e4 3385 po->prot_hook.af_packet_priv = sk;
47934e06 3386 po->prot_hook.af_packet_net = sock_net(sk);
1da177e4 3387
0e11c91e
AV
3388 if (proto) {
3389 po->prot_hook.type = proto;
a6361f0c 3390 __register_prot_hook(sk);
1da177e4
LT
3391 }
3392
0fa7fa98 3393 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3394 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3395 mutex_unlock(&net->packet.sklist_lock);
3396
3680453c 3397 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 3398
40d4e3df 3399 return 0;
b0138408
DB
3400out2:
3401 sk_free(sk);
1da177e4
LT
3402out:
3403 return err;
3404}
3405
3406/*
3407 * Pull a packet from our receive queue and hand it to the user.
3408 * If necessary we block.
3409 */
3410
1b784140
YX
3411static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3412 int flags)
1da177e4
LT
3413{
3414 struct sock *sk = sock->sk;
3415 struct sk_buff *skb;
3416 int copied, err;
dfc39d40 3417 int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
2472d761 3418 unsigned int origlen = 0;
1da177e4
LT
3419
3420 err = -EINVAL;
ed85b565 3421 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3422 goto out;
3423
3424#if 0
3425 /* What error should we return now? EUNATTACH? */
3426 if (pkt_sk(sk)->ifindex < 0)
3427 return -ENODEV;
3428#endif
3429
ed85b565 3430 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3431 err = sock_recv_errqueue(sk, msg, len,
3432 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3433 goto out;
3434 }
3435
1da177e4
LT
3436 /*
3437 * Call the generic datagram receiver. This handles all sorts
3438 * of horrible races and re-entrancy so we can forget about it
3439 * in the protocol layers.
3440 *
 3441 * Now it will return ENETDOWN if the device has just gone down,
 3442 * but then it will block.
3443 */
3444
f4b41f06 3445 skb = skb_recv_datagram(sk, flags, &err);
1da177e4
LT
3446
3447 /*
1ce4f28b 3448 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
 3449 * handles the blocking, we don't need to see or worry about
 3450 * blocking retries.
3451 */
3452
8ae55f04 3453 if (skb == NULL)
1da177e4
LT
3454 goto out;
3455
9bb6cd65 3456 packet_rcv_try_clear_pressure(pkt_sk(sk));
2ccdbaa6 3457
dfc39d40
JT
3458 if (vnet_hdr_len) {
3459 err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
16cc1400 3460 if (err)
bfd5f4a3
SS
3461 goto out_free;
3462 }
3463
f3d33426
HFS
3464 /* You lose any data beyond the buffer you gave. If it worries
3465 * a user program they can ask the device for its MTU
3466 * anyway.
1da177e4 3467 */
1da177e4 3468 copied = skb->len;
40d4e3df
ED
3469 if (copied > len) {
3470 copied = len;
3471 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3472 }
3473
51f3d02b 3474 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3475 if (err)
3476 goto out_free;
3477
2472d761
EB
3478 if (sock->type != SOCK_PACKET) {
3479 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3480
3481 /* Original length was stored in sockaddr_ll fields */
3482 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3483 sll->sll_family = AF_PACKET;
3484 sll->sll_protocol = skb->protocol;
3485 }
3486
6fd1d51c 3487 sock_recv_cmsgs(msg, sk, skb);
1da177e4 3488
f3d33426 3489 if (msg->msg_name) {
c700525f
ED
3490 const size_t max_len = min(sizeof(skb->cb),
3491 sizeof(struct sockaddr_storage));
b2cf86e1
WB
3492 int copy_len;
3493
f3d33426
HFS
3494 /* If the address length field is there to be filled
3495 * in, we fill it in now.
3496 */
3497 if (sock->type == SOCK_PACKET) {
342dfc30 3498 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3499 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3500 copy_len = msg->msg_namelen;
f3d33426
HFS
3501 } else {
3502 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3503
f3d33426
HFS
3504 msg->msg_namelen = sll->sll_halen +
3505 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3506 copy_len = msg->msg_namelen;
3507 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3508 memset(msg->msg_name +
3509 offsetof(struct sockaddr_ll, sll_addr),
3510 0, sizeof(sll->sll_addr));
3511 msg->msg_namelen = sizeof(struct sockaddr_ll);
3512 }
f3d33426 3513 }
c700525f
ED
3514 if (WARN_ON_ONCE(copy_len > max_len)) {
3515 copy_len = max_len;
3516 msg->msg_namelen = copy_len;
3517 }
b2cf86e1 3518 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3519 }
1da177e4 3520
fd53c297 3521 if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
ffbc6111
HX
3522 struct tpacket_auxdata aux;
3523
3524 aux.tp_status = TP_STATUS_USER;
3525 if (skb->ip_summed == CHECKSUM_PARTIAL)
3526 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b 3527 else if (skb->pkt_type != PACKET_OUTGOING &&
b85f628a 3528 skb_csum_unnecessary(skb))
682f048b 3529 aux.tp_status |= TP_STATUS_CSUM_VALID;
8e08bb75
XL
3530 if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
3531 aux.tp_status |= TP_STATUS_GSO_TCP;
682f048b 3532
2472d761 3533 aux.tp_len = origlen;
ffbc6111
HX
3534 aux.tp_snaplen = skb->len;
3535 aux.tp_mac = 0;
bbe735e4 3536 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3537 if (skb_vlan_tag_present(skb)) {
3538 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3539 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3540 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3541 } else {
3542 aux.tp_vlan_tci = 0;
a0cdfcf3 3543 aux.tp_vlan_tpid = 0;
a3bcc23e 3544 }
ffbc6111 3545 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3546 }
3547
1da177e4
LT
3548 /*
3549 * Free or return the buffer as appropriate. Again this
3550 * hides all the races and re-entrancy issues from us.
3551 */
bfd5f4a3 3552 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3553
3554out_free:
3555 skb_free_datagram(sk, skb);
3556out:
3557 return err;
3558}
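
packet_recvmsg() above delivers the frame and, when the PACKET_AUXDATA option is set, attaches a struct tpacket_auxdata control message. What follows is a minimal userspace sketch (not part of this file) of how that cmsg is typically consumed; the buffer sizes and printed fields are illustrative assumptions, and CAP_NET_RAW is required.

/* Hedged userspace sketch: read one frame plus its PACKET_AUXDATA cmsg. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	int one = 1;
	char frame[2048];
	char ctrl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *c;
	ssize_t len;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	len = recvmsg(fd, &msg, 0);
	if (len < 0)
		return 1;

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
		if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(c), sizeof(aux));
			printf("len=%zd snaplen=%u vlan_tci=%u\n",
			       len, aux.tp_snaplen, aux.tp_vlan_tci);
		}
	}
	close(fd);
	return 0;
}

Note that tp_vlan_tci is only meaningful when TP_STATUS_VLAN_VALID is set in aux.tp_status, matching the skb_vlan_tag_present() branch in the kernel code above.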
3559
1da177e4 3560static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3561 int peer)
1da177e4
LT
3562{
3563 struct net_device *dev;
3564 struct sock *sk = sock->sk;
3565
3566 if (peer)
3567 return -EOPNOTSUPP;
3568
3569 uaddr->sa_family = AF_PACKET;
b5f0de6d 3570 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
654d1f8a 3571 rcu_read_lock();
e032f7c9 3572 dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
654d1f8a 3573 if (dev)
b5f0de6d 3574 strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
654d1f8a 3575 rcu_read_unlock();
1da177e4 3576
9b2c45d4 3577 return sizeof(*uaddr);
1da177e4 3578}
1da177e4
LT
3579
3580static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3581 int peer)
1da177e4
LT
3582{
3583 struct net_device *dev;
3584 struct sock *sk = sock->sk;
3585 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3586 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
e032f7c9 3587 int ifindex;
1da177e4
LT
3588
3589 if (peer)
3590 return -EOPNOTSUPP;
3591
e032f7c9 3592 ifindex = READ_ONCE(po->ifindex);
1da177e4 3593 sll->sll_family = AF_PACKET;
e032f7c9 3594 sll->sll_ifindex = ifindex;
c7d2ef5d 3595 sll->sll_protocol = READ_ONCE(po->num);
67286640 3596 sll->sll_pkttype = 0;
654d1f8a 3597 rcu_read_lock();
e032f7c9 3598 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
1da177e4
LT
3599 if (dev) {
3600 sll->sll_hatype = dev->type;
3601 sll->sll_halen = dev->addr_len;
3602 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3603 } else {
3604 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3605 sll->sll_halen = 0;
3606 }
654d1f8a 3607 rcu_read_unlock();
1da177e4 3608
9b2c45d4 3609 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3610}
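
packet_getname() reports the bound interface index, hardware type and hardware address through struct sockaddr_ll. A hedged userspace sketch of that round trip; the interface name "eth0" is an assumption and CAP_NET_RAW is required.

/* Hedged userspace sketch: bind, then read back what packet_getname() fills in. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	struct sockaddr_ll sll, out;
	socklen_t olen = sizeof(out);
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex("eth0");	/* assumed name */
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return 1;

	if (getsockname(fd, (struct sockaddr *)&out, &olen) == 0)
		printf("ifindex=%d hatype=%u halen=%u\n",
		       out.sll_ifindex, out.sll_hatype, out.sll_halen);
	close(fd);
	return 0;
}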
3611
2aeb0b88
WC
3612static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3613 int what)
1da177e4
LT
3614{
3615 switch (i->type) {
3616 case PACKET_MR_MULTICAST:
1162563f
JP
3617 if (i->alen != dev->addr_len)
3618 return -EINVAL;
1da177e4 3619 if (what > 0)
22bedad3 3620 return dev_mc_add(dev, i->addr);
1da177e4 3621 else
22bedad3 3622 return dev_mc_del(dev, i->addr);
1da177e4
LT
3623 break;
3624 case PACKET_MR_PROMISC:
2aeb0b88 3625 return dev_set_promiscuity(dev, what);
1da177e4 3626 case PACKET_MR_ALLMULTI:
2aeb0b88 3627 return dev_set_allmulti(dev, what);
d95ed927 3628 case PACKET_MR_UNICAST:
1162563f
JP
3629 if (i->alen != dev->addr_len)
3630 return -EINVAL;
d95ed927 3631 if (what > 0)
a748ee24 3632 return dev_uc_add(dev, i->addr);
d95ed927 3633 else
a748ee24 3634 return dev_uc_del(dev, i->addr);
d95ed927 3635 break;
40d4e3df
ED
3636 default:
3637 break;
1da177e4 3638 }
2aeb0b88 3639 return 0;
1da177e4
LT
3640}
3641
82f17091
FR
3642static void packet_dev_mclist_delete(struct net_device *dev,
3643 struct packet_mclist **mlp)
1da177e4 3644{
82f17091
FR
3645 struct packet_mclist *ml;
3646
3647 while ((ml = *mlp) != NULL) {
3648 if (ml->ifindex == dev->ifindex) {
3649 packet_dev_mc(dev, ml, -1);
3650 *mlp = ml->next;
3651 kfree(ml);
3652 } else
3653 mlp = &ml->next;
1da177e4
LT
3654 }
3655}
3656
0fb375fb 3657static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3658{
3659 struct packet_sock *po = pkt_sk(sk);
3660 struct packet_mclist *ml, *i;
3661 struct net_device *dev;
3662 int err;
3663
3664 rtnl_lock();
3665
3666 err = -ENODEV;
3b1e0a65 3667 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3668 if (!dev)
3669 goto done;
3670
3671 err = -EINVAL;
1162563f 3672 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3673 goto done;
3674
3675 err = -ENOBUFS;
8b3a7005 3676 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3677 if (i == NULL)
3678 goto done;
3679
3680 err = 0;
3681 for (ml = po->mclist; ml; ml = ml->next) {
3682 if (ml->ifindex == mreq->mr_ifindex &&
3683 ml->type == mreq->mr_type &&
3684 ml->alen == mreq->mr_alen &&
3685 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3686 ml->count++;
3687 /* Free the new element ... */
3688 kfree(i);
3689 goto done;
3690 }
3691 }
3692
3693 i->type = mreq->mr_type;
3694 i->ifindex = mreq->mr_ifindex;
3695 i->alen = mreq->mr_alen;
3696 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3697 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3698 i->count = 1;
3699 i->next = po->mclist;
3700 po->mclist = i;
2aeb0b88
WC
3701 err = packet_dev_mc(dev, i, 1);
3702 if (err) {
3703 po->mclist = i->next;
3704 kfree(i);
3705 }
1da177e4
LT
3706
3707done:
3708 rtnl_unlock();
3709 return err;
3710}
3711
0fb375fb 3712static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3713{
3714 struct packet_mclist *ml, **mlp;
3715
3716 rtnl_lock();
3717
3718 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3719 if (ml->ifindex == mreq->mr_ifindex &&
3720 ml->type == mreq->mr_type &&
3721 ml->alen == mreq->mr_alen &&
3722 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3723 if (--ml->count == 0) {
3724 struct net_device *dev;
3725 *mlp = ml->next;
ad959e76
ED
3726 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3727 if (dev)
1da177e4 3728 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3729 kfree(ml);
3730 }
82f17091 3731 break;
1da177e4
LT
3732 }
3733 }
3734 rtnl_unlock();
82f17091 3735 return 0;
1da177e4
LT
3736}
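
packet_mc_add() and packet_mc_drop() implement the PACKET_ADD_MEMBERSHIP and PACKET_DROP_MEMBERSHIP options, reference-counting multicast, unicast, allmulti and promiscuous filters per interface. A minimal sketch of the promiscuous case; the interface name is an assumption.

/* Hedged userspace sketch: reference-counted promiscuous mode via packet_mreq. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	struct packet_mreq mr;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;

	memset(&mr, 0, sizeof(mr));
	mr.mr_ifindex = if_nametoindex("eth0");	/* assumed name */
	mr.mr_type    = PACKET_MR_PROMISC;
	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr)) < 0)
		return 1;

	/* ... receive traffic ... */

	setsockopt(fd, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mr, sizeof(mr));
	close(fd);
	return 0;
}

Closing the socket flushes any remaining memberships (see packet_flush_mclist() below), so the explicit drop is only needed while the socket stays open.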
3737
3738static void packet_flush_mclist(struct sock *sk)
3739{
3740 struct packet_sock *po = pkt_sk(sk);
3741 struct packet_mclist *ml;
3742
3743 if (!po->mclist)
3744 return;
3745
3746 rtnl_lock();
3747 while ((ml = po->mclist) != NULL) {
3748 struct net_device *dev;
3749
3750 po->mclist = ml->next;
ad959e76
ED
3751 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3752 if (dev != NULL)
1da177e4 3753 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3754 kfree(ml);
3755 }
3756 rtnl_unlock();
3757}
1da177e4
LT
3758
3759static int
a7b75c5a
CH
3760packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3761 unsigned int optlen)
1da177e4
LT
3762{
3763 struct sock *sk = sock->sk;
8dc41944 3764 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3765 int ret;
3766
3767 if (level != SOL_PACKET)
3768 return -ENOPROTOOPT;
3769
69e3c75f 3770 switch (optname) {
1ce4f28b 3771 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3772 case PACKET_DROP_MEMBERSHIP:
3773 {
0fb375fb
EB
3774 struct packet_mreq_max mreq;
3775 int len = optlen;
3776 memset(&mreq, 0, sizeof(mreq));
3777 if (len < sizeof(struct packet_mreq))
1da177e4 3778 return -EINVAL;
0fb375fb
EB
3779 if (len > sizeof(mreq))
3780 len = sizeof(mreq);
a7b75c5a 3781 if (copy_from_sockptr(&mreq, optval, len))
1da177e4 3782 return -EFAULT;
0fb375fb
EB
3783 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3784 return -EINVAL;
1da177e4
LT
3785 if (optname == PACKET_ADD_MEMBERSHIP)
3786 ret = packet_mc_add(sk, &mreq);
3787 else
3788 ret = packet_mc_drop(sk, &mreq);
3789 return ret;
3790 }
a2efcfa0 3791
1da177e4 3792 case PACKET_RX_RING:
69e3c75f 3793 case PACKET_TX_RING:
1da177e4 3794 {
f6fb8f10 3795 union tpacket_req_u req_u;
3796 int len;
1da177e4 3797
5171b37d 3798 lock_sock(sk);
f6fb8f10 3799 switch (po->tp_version) {
3800 case TPACKET_V1:
3801 case TPACKET_V2:
3802 len = sizeof(req_u.req);
3803 break;
3804 case TPACKET_V3:
3805 default:
3806 len = sizeof(req_u.req3);
3807 break;
3808 }
5171b37d
ED
3809 if (optlen < len) {
3810 ret = -EINVAL;
3811 } else {
a7b75c5a 3812 if (copy_from_sockptr(&req_u.req, optval, len))
5171b37d
ED
3813 ret = -EFAULT;
3814 else
3815 ret = packet_set_ring(sk, &req_u, 0,
3816 optname == PACKET_TX_RING);
3817 }
3818 release_sock(sk);
3819 return ret;
1da177e4
LT
3820 }
3821 case PACKET_COPY_THRESH:
3822 {
3823 int val;
3824
40d4e3df 3825 if (optlen != sizeof(val))
1da177e4 3826 return -EINVAL;
a7b75c5a 3827 if (copy_from_sockptr(&val, optval, sizeof(val)))
1da177e4
LT
3828 return -EFAULT;
3829
3830 pkt_sk(sk)->copy_thresh = val;
3831 return 0;
3832 }
bbd6ef87
PM
3833 case PACKET_VERSION:
3834 {
3835 int val;
3836
3837 if (optlen != sizeof(val))
3838 return -EINVAL;
a7b75c5a 3839 if (copy_from_sockptr(&val, optval, sizeof(val)))
bbd6ef87
PM
3840 return -EFAULT;
3841 switch (val) {
3842 case TPACKET_V1:
3843 case TPACKET_V2:
f6fb8f10 3844 case TPACKET_V3:
84ac7260 3845 break;
bbd6ef87
PM
3846 default:
3847 return -EINVAL;
3848 }
84ac7260
PP
3849 lock_sock(sk);
3850 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3851 ret = -EBUSY;
3852 } else {
3853 po->tp_version = val;
3854 ret = 0;
3855 }
3856 release_sock(sk);
3857 return ret;
bbd6ef87 3858 }
8913336a
PM
3859 case PACKET_RESERVE:
3860 {
3861 unsigned int val;
3862
3863 if (optlen != sizeof(val))
3864 return -EINVAL;
a7b75c5a 3865 if (copy_from_sockptr(&val, optval, sizeof(val)))
8913336a 3866 return -EFAULT;
bcc5364b
AK
3867 if (val > INT_MAX)
3868 return -EINVAL;
c27927e3
WB
3869 lock_sock(sk);
3870 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3871 ret = -EBUSY;
3872 } else {
3873 po->tp_reserve = val;
3874 ret = 0;
3875 }
3876 release_sock(sk);
3877 return ret;
8913336a 3878 }
69e3c75f
JB
3879 case PACKET_LOSS:
3880 {
3881 unsigned int val;
3882
3883 if (optlen != sizeof(val))
3884 return -EINVAL;
a7b75c5a 3885 if (copy_from_sockptr(&val, optval, sizeof(val)))
69e3c75f 3886 return -EFAULT;
a6361f0c
WB
3887
3888 lock_sock(sk);
3889 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3890 ret = -EBUSY;
3891 } else {
164bddac 3892 packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
a6361f0c
WB
3893 ret = 0;
3894 }
3895 release_sock(sk);
3896 return ret;
69e3c75f 3897 }
8dc41944
HX
3898 case PACKET_AUXDATA:
3899 {
3900 int val;
3901
3902 if (optlen < sizeof(val))
3903 return -EINVAL;
a7b75c5a 3904 if (copy_from_sockptr(&val, optval, sizeof(val)))
8dc41944
HX
3905 return -EFAULT;
3906
fd53c297 3907 packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
8dc41944
HX
3908 return 0;
3909 }
80feaacb
PWJ
3910 case PACKET_ORIGDEV:
3911 {
3912 int val;
3913
3914 if (optlen < sizeof(val))
3915 return -EINVAL;
a7b75c5a 3916 if (copy_from_sockptr(&val, optval, sizeof(val)))
80feaacb
PWJ
3917 return -EFAULT;
3918
ee5675ec 3919 packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
80feaacb
PWJ
3920 return 0;
3921 }
bfd5f4a3 3922 case PACKET_VNET_HDR:
dfc39d40 3923 case PACKET_VNET_HDR_SZ:
bfd5f4a3 3924 {
dfc39d40 3925 int val, hdr_len;
bfd5f4a3
SS
3926
3927 if (sock->type != SOCK_RAW)
3928 return -EINVAL;
bfd5f4a3
SS
3929 if (optlen < sizeof(val))
3930 return -EINVAL;
a7b75c5a 3931 if (copy_from_sockptr(&val, optval, sizeof(val)))
bfd5f4a3
SS
3932 return -EFAULT;
3933
dfc39d40
JT
3934 if (optname == PACKET_VNET_HDR_SZ) {
3935 if (val && val != sizeof(struct virtio_net_hdr) &&
3936 val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
3937 return -EINVAL;
3938 hdr_len = val;
3939 } else {
3940 hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
3941 }
a6361f0c
WB
3942 lock_sock(sk);
3943 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3944 ret = -EBUSY;
3945 } else {
dfc39d40 3946 WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
a6361f0c
WB
3947 ret = 0;
3948 }
3949 release_sock(sk);
3950 return ret;
bfd5f4a3 3951 }
614f60fa
SM
3952 case PACKET_TIMESTAMP:
3953 {
3954 int val;
3955
3956 if (optlen != sizeof(val))
3957 return -EINVAL;
a7b75c5a 3958 if (copy_from_sockptr(&val, optval, sizeof(val)))
614f60fa
SM
3959 return -EFAULT;
3960
1051ce4a 3961 WRITE_ONCE(po->tp_tstamp, val);
614f60fa
SM
3962 return 0;
3963 }
dc99f600
DM
3964 case PACKET_FANOUT:
3965 {
9c661b0b 3966 struct fanout_args args = { 0 };
dc99f600 3967
9c661b0b 3968 if (optlen != sizeof(int) && optlen != sizeof(args))
dc99f600 3969 return -EINVAL;
9c661b0b 3970 if (copy_from_sockptr(&args, optval, optlen))
dc99f600
DM
3971 return -EFAULT;
3972
9c661b0b 3973 return fanout_add(sk, &args);
dc99f600 3974 }
47dceb8e
WB
3975 case PACKET_FANOUT_DATA:
3976 {
e42e70ad
ED
3977 /* Paired with the WRITE_ONCE() in fanout_add() */
3978 if (!READ_ONCE(po->fanout))
47dceb8e
WB
3979 return -EINVAL;
3980
3981 return fanout_set_data(po, optval, optlen);
3982 }
fa788d98
VW
3983 case PACKET_IGNORE_OUTGOING:
3984 {
3985 int val;
3986
3987 if (optlen != sizeof(val))
3988 return -EINVAL;
a7b75c5a 3989 if (copy_from_sockptr(&val, optval, sizeof(val)))
fa788d98
VW
3990 return -EFAULT;
3991 if (val < 0 || val > 1)
3992 return -EINVAL;
3993
3994 po->prot_hook.ignore_outgoing = !!val;
3995 return 0;
3996 }
5920cd3a
PC
3997 case PACKET_TX_HAS_OFF:
3998 {
3999 unsigned int val;
4000
4001 if (optlen != sizeof(val))
4002 return -EINVAL;
a7b75c5a 4003 if (copy_from_sockptr(&val, optval, sizeof(val)))
5920cd3a 4004 return -EFAULT;
a6361f0c
WB
4005
4006 lock_sock(sk);
25c55b38 4007 if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
74383446 4008 packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);
25c55b38 4009
a6361f0c 4010 release_sock(sk);
5920cd3a
PC
4011 return 0;
4012 }
d346a3fa
DB
4013 case PACKET_QDISC_BYPASS:
4014 {
4015 int val;
4016
4017 if (optlen != sizeof(val))
4018 return -EINVAL;
a7b75c5a 4019 if (copy_from_sockptr(&val, optval, sizeof(val)))
d346a3fa
DB
4020 return -EFAULT;
4021
105a201e 4022 packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
d346a3fa
DB
4023 return 0;
4024 }
1da177e4
LT
4025 default:
4026 return -ENOPROTOOPT;
4027 }
4028}
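
Of the options handled above, PACKET_VERSION followed by PACKET_RX_RING is the usual sequence for configuring a memory-mapped receive ring. A hedged sketch with illustrative (assumed) geometry that satisfies the sanity checks in packet_set_ring() further below.

/* Hedged userspace sketch: request a TPACKET_V3 receive ring. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	int version = TPACKET_V3;
	struct tpacket_req3 req;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 22;	/* 4 MiB, page aligned (assumed) */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 1 << 11;	/* 2 KiB, multiple of TPACKET_ALIGNMENT */
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;	/* retire a block after 60 ms */

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
		return 1;
	close(fd);
	return 0;
}

The version must be chosen before the ring exists: once rx_ring.pg_vec or tx_ring.pg_vec is set, the PACKET_VERSION handler above returns -EBUSY.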
4029
4030static int packet_getsockopt(struct socket *sock, int level, int optname,
4031 char __user *optval, int __user *optlen)
4032{
4033 int len;
c06fff6e 4034 int val, lv = sizeof(val);
1da177e4
LT
4035 struct sock *sk = sock->sk;
4036 struct packet_sock *po = pkt_sk(sk);
c06fff6e 4037 void *data = &val;
ee80fbf3 4038 union tpacket_stats_u st;
a9b63918 4039 struct tpacket_rollover_stats rstats;
8e8e2951 4040 int drops;
1da177e4
LT
4041
4042 if (level != SOL_PACKET)
4043 return -ENOPROTOOPT;
4044
8ae55f04
KK
4045 if (get_user(len, optlen))
4046 return -EFAULT;
1da177e4
LT
4047
4048 if (len < 0)
4049 return -EINVAL;
1ce4f28b 4050
69e3c75f 4051 switch (optname) {
1da177e4 4052 case PACKET_STATISTICS:
1da177e4 4053 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
4054 memcpy(&st, &po->stats, sizeof(st));
4055 memset(&po->stats, 0, sizeof(po->stats));
4056 spin_unlock_bh(&sk->sk_receive_queue.lock);
8e8e2951 4057 drops = atomic_xchg(&po->tp_drops, 0);
ee80fbf3 4058
f6fb8f10 4059 if (po->tp_version == TPACKET_V3) {
c06fff6e 4060 lv = sizeof(struct tpacket_stats_v3);
8e8e2951
ED
4061 st.stats3.tp_drops = drops;
4062 st.stats3.tp_packets += drops;
ee80fbf3 4063 data = &st.stats3;
f6fb8f10 4064 } else {
c06fff6e 4065 lv = sizeof(struct tpacket_stats);
8e8e2951
ED
4066 st.stats1.tp_drops = drops;
4067 st.stats1.tp_packets += drops;
ee80fbf3 4068 data = &st.stats1;
f6fb8f10 4069 }
ee80fbf3 4070
8dc41944
HX
4071 break;
4072 case PACKET_AUXDATA:
fd53c297 4073 val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
80feaacb
PWJ
4074 break;
4075 case PACKET_ORIGDEV:
ee5675ec 4076 val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
bfd5f4a3
SS
4077 break;
4078 case PACKET_VNET_HDR:
dfc39d40
JT
4079 val = !!READ_ONCE(po->vnet_hdr_sz);
4080 break;
4081 case PACKET_VNET_HDR_SZ:
4082 val = READ_ONCE(po->vnet_hdr_sz);
1da177e4 4083 break;
bbd6ef87 4084 case PACKET_VERSION:
bbd6ef87 4085 val = po->tp_version;
bbd6ef87
PM
4086 break;
4087 case PACKET_HDRLEN:
4088 if (len > sizeof(int))
4089 len = sizeof(int);
fd2c83b3
AP
4090 if (len < sizeof(int))
4091 return -EINVAL;
bbd6ef87
PM
4092 if (copy_from_user(&val, optval, len))
4093 return -EFAULT;
4094 switch (val) {
4095 case TPACKET_V1:
4096 val = sizeof(struct tpacket_hdr);
4097 break;
4098 case TPACKET_V2:
4099 val = sizeof(struct tpacket2_hdr);
4100 break;
f6fb8f10 4101 case TPACKET_V3:
4102 val = sizeof(struct tpacket3_hdr);
4103 break;
bbd6ef87
PM
4104 default:
4105 return -EINVAL;
4106 }
bbd6ef87 4107 break;
8913336a 4108 case PACKET_RESERVE:
8913336a 4109 val = po->tp_reserve;
8913336a 4110 break;
69e3c75f 4111 case PACKET_LOSS:
164bddac 4112 val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
69e3c75f 4113 break;
614f60fa 4114 case PACKET_TIMESTAMP:
1051ce4a 4115 val = READ_ONCE(po->tp_tstamp);
614f60fa 4116 break;
dc99f600 4117 case PACKET_FANOUT:
dc99f600
DM
4118 val = (po->fanout ?
4119 ((u32)po->fanout->id |
77f65ebd
WB
4120 ((u32)po->fanout->type << 16) |
4121 ((u32)po->fanout->flags << 24)) :
dc99f600 4122 0);
dc99f600 4123 break;
fa788d98
VW
4124 case PACKET_IGNORE_OUTGOING:
4125 val = po->prot_hook.ignore_outgoing;
4126 break;
a9b63918 4127 case PACKET_ROLLOVER_STATS:
57f015f5 4128 if (!po->rollover)
a9b63918 4129 return -EINVAL;
57f015f5
MM
4130 rstats.tp_all = atomic_long_read(&po->rollover->num);
4131 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4132 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4133 data = &rstats;
4134 lv = sizeof(rstats);
a9b63918 4135 break;
5920cd3a 4136 case PACKET_TX_HAS_OFF:
74383446 4137 val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
5920cd3a 4138 break;
d346a3fa 4139 case PACKET_QDISC_BYPASS:
105a201e 4140 val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
d346a3fa 4141 break;
1da177e4
LT
4142 default:
4143 return -ENOPROTOOPT;
4144 }
4145
c06fff6e
ED
4146 if (len > lv)
4147 len = lv;
8ae55f04
KK
4148 if (put_user(len, optlen))
4149 return -EFAULT;
8dc41944
HX
4150 if (copy_to_user(optval, data, len))
4151 return -EFAULT;
8ae55f04 4152 return 0;
1da177e4
LT
4153}
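
The PACKET_STATISTICS branch above copies the counters out under the receive-queue lock and then zeroes them, so each call reports packets and drops accumulated since the previous read. A minimal sketch for a socket left at the default TPACKET_V1 layout.

/* Hedged userspace sketch: read-and-reset drop counters. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0)
		return 1;
	printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
	close(fd);
	return 0;
}

On a TPACKET_V3 socket the same option returns the larger struct tpacket_stats_v3, as the lv/data switch above shows.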
4154
351638e7
JP
4155static int packet_notifier(struct notifier_block *this,
4156 unsigned long msg, void *ptr)
1da177e4
LT
4157{
4158 struct sock *sk;
351638e7 4159 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4160 struct net *net = dev_net(dev);
1da177e4 4161
808f5114 4162 rcu_read_lock();
b67bfe0d 4163 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4164 struct packet_sock *po = pkt_sk(sk);
4165
4166 switch (msg) {
4167 case NETDEV_UNREGISTER:
1da177e4 4168 if (po->mclist)
82f17091 4169 packet_dev_mclist_delete(dev, &po->mclist);
df561f66 4170 fallthrough;
a2efcfa0 4171
1da177e4
LT
4172 case NETDEV_DOWN:
4173 if (dev->ifindex == po->ifindex) {
4174 spin_lock(&po->bind_lock);
61edf479 4175 if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
ce06b03e 4176 __unregister_prot_hook(sk, false);
1da177e4
LT
4177 sk->sk_err = ENETDOWN;
4178 if (!sock_flag(sk, SOCK_DEAD))
e3ae2365 4179 sk_error_report(sk);
1da177e4
LT
4180 }
4181 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4182 packet_cached_dev_reset(po);
e032f7c9 4183 WRITE_ONCE(po->ifindex, -1);
d62607c3
JK
4184 netdev_put(po->prot_hook.dev,
4185 &po->prot_hook.dev_tracker);
1da177e4
LT
4186 po->prot_hook.dev = NULL;
4187 }
4188 spin_unlock(&po->bind_lock);
4189 }
4190 break;
4191 case NETDEV_UP:
808f5114 4192 if (dev->ifindex == po->ifindex) {
4193 spin_lock(&po->bind_lock);
ce06b03e
DM
4194 if (po->num)
4195 register_prot_hook(sk);
808f5114 4196 spin_unlock(&po->bind_lock);
1da177e4 4197 }
1da177e4
LT
4198 break;
4199 }
4200 }
808f5114 4201 rcu_read_unlock();
1da177e4
LT
4202 return NOTIFY_DONE;
4203}
4204
4205
4206static int packet_ioctl(struct socket *sock, unsigned int cmd,
4207 unsigned long arg)
4208{
4209 struct sock *sk = sock->sk;
4210
69e3c75f 4211 switch (cmd) {
40d4e3df
ED
4212 case SIOCOUTQ:
4213 {
4214 int amount = sk_wmem_alloc_get(sk);
31e6d363 4215
40d4e3df
ED
4216 return put_user(amount, (int __user *)arg);
4217 }
4218 case SIOCINQ:
4219 {
4220 struct sk_buff *skb;
4221 int amount = 0;
4222
4223 spin_lock_bh(&sk->sk_receive_queue.lock);
4224 skb = skb_peek(&sk->sk_receive_queue);
4225 if (skb)
4226 amount = skb->len;
4227 spin_unlock_bh(&sk->sk_receive_queue.lock);
4228 return put_user(amount, (int __user *)arg);
4229 }
1da177e4 4230#ifdef CONFIG_INET
40d4e3df
ED
4231 case SIOCADDRT:
4232 case SIOCDELRT:
4233 case SIOCDARP:
4234 case SIOCGARP:
4235 case SIOCSARP:
4236 case SIOCGIFADDR:
4237 case SIOCSIFADDR:
4238 case SIOCGIFBRDADDR:
4239 case SIOCSIFBRDADDR:
4240 case SIOCGIFNETMASK:
4241 case SIOCSIFNETMASK:
4242 case SIOCGIFDSTADDR:
4243 case SIOCSIFDSTADDR:
4244 case SIOCSIFFLAGS:
40d4e3df 4245 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4246#endif
4247
40d4e3df
ED
4248 default:
4249 return -ENOIOCTLCMD;
1da177e4
LT
4250 }
4251 return 0;
4252}
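
For packet sockets, SIOCINQ reports the length of the next queued frame (the skb_peek() above, 0 if the queue is empty) and SIOCOUTQ the bytes still queued for transmit. A hedged sketch; it assumes SIOCINQ/SIOCOUTQ are visible via <linux/sockios.h>.

/* Hedged userspace sketch: queue occupancy via ioctl. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <linux/sockios.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	int inq = 0, outq = 0;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;
	ioctl(fd, SIOCINQ, &inq);	/* length of next queued frame */
	ioctl(fd, SIOCOUTQ, &outq);	/* bytes not yet handed to the device */
	printf("inq=%d outq=%d\n", inq, outq);
	close(fd);
	return 0;
}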
4253
a11e1d43
LT
4254static __poll_t packet_poll(struct file *file, struct socket *sock,
4255 poll_table *wait)
1da177e4
LT
4256{
4257 struct sock *sk = sock->sk;
4258 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4259 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4260
4261 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4262 if (po->rx_ring.pg_vec) {
f6fb8f10 4263 if (!packet_previous_rx_frame(po, &po->rx_ring,
4264 TP_STATUS_KERNEL))
a9a08845 4265 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4266 }
9bb6cd65 4267 packet_rcv_try_clear_pressure(po);
1da177e4 4268 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4269 spin_lock_bh(&sk->sk_write_queue.lock);
4270 if (po->tx_ring.pg_vec) {
4271 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4272 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4273 }
4274 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4275 return mask;
4276}
4277
4278
 4279/* Dirty? Well, I still have not learned a better way to account
4280 * for user mmaps.
4281 */
4282
4283static void packet_mm_open(struct vm_area_struct *vma)
4284{
4285 struct file *file = vma->vm_file;
40d4e3df 4286 struct socket *sock = file->private_data;
1da177e4 4287 struct sock *sk = sock->sk;
1ce4f28b 4288
1da177e4
LT
4289 if (sk)
4290 atomic_inc(&pkt_sk(sk)->mapped);
4291}
4292
4293static void packet_mm_close(struct vm_area_struct *vma)
4294{
4295 struct file *file = vma->vm_file;
40d4e3df 4296 struct socket *sock = file->private_data;
1da177e4 4297 struct sock *sk = sock->sk;
1ce4f28b 4298
1da177e4
LT
4299 if (sk)
4300 atomic_dec(&pkt_sk(sk)->mapped);
4301}
4302
f0f37e2f 4303static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4304 .open = packet_mm_open,
4305 .close = packet_mm_close,
1da177e4
LT
4306};
4307
3a7ad063
ED
4308static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4309 unsigned int len)
1da177e4
LT
4310{
4311 int i;
4312
4ebf0ae2 4313 for (i = 0; i < len; i++) {
0e3125c7 4314 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4315 if (is_vmalloc_addr(pg_vec[i].buffer))
4316 vfree(pg_vec[i].buffer);
4317 else
4318 free_pages((unsigned long)pg_vec[i].buffer,
4319 order);
0e3125c7
NH
4320 pg_vec[i].buffer = NULL;
4321 }
1da177e4
LT
4322 }
4323 kfree(pg_vec);
4324}
4325
3a7ad063 4326static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4327{
f0d4eb29 4328 char *buffer;
3a7ad063
ED
4329 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4330 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4331
3a7ad063 4332 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4333 if (buffer)
4334 return buffer;
4335
3a7ad063
ED
4336 /* __get_free_pages failed, fall back to vmalloc */
4337 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4338 if (buffer)
4339 return buffer;
0e3125c7 4340
3a7ad063
ED
 4341 /* vmalloc failed, let's dig into swap here */
4342 gfp_flags &= ~__GFP_NORETRY;
4343 buffer = (char *) __get_free_pages(gfp_flags, order);
4344 if (buffer)
4345 return buffer;
4346
4347 /* complete and utter failure */
4348 return NULL;
4ebf0ae2
DM
4349}
4350
3a7ad063 4351static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4352{
4353 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4354 struct pgv *pg_vec;
4ebf0ae2
DM
4355 int i;
4356
398f0132 4357 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4358 if (unlikely(!pg_vec))
4359 goto out;
4360
4361 for (i = 0; i < block_nr; i++) {
3a7ad063 4362 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4363 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4364 goto out_free_pgvec;
4365 }
4366
4367out:
4368 return pg_vec;
4369
4370out_free_pgvec:
3a7ad063 4371 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4372 pg_vec = NULL;
4373 goto out;
4374}
1da177e4 4375
f6fb8f10 4376static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4377 int closing, int tx_ring)
1da177e4 4378{
0e3125c7 4379 struct pgv *pg_vec = NULL;
1da177e4 4380 struct packet_sock *po = pkt_sk(sk);
61fad681 4381 unsigned long *rx_owner_map = NULL;
3a7ad063 4382 int was_running, order = 0;
69e3c75f
JB
4383 struct packet_ring_buffer *rb;
4384 struct sk_buff_head *rb_queue;
0e11c91e 4385 __be16 num;
2a6d6c31 4386 int err;
f6fb8f10 4387 /* Added to keep code churn minimal */
4388 struct tpacket_req *req = &req_u->req;
4389
69e3c75f
JB
4390 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4391 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4392
69e3c75f
JB
4393 err = -EBUSY;
4394 if (!closing) {
4395 if (atomic_read(&po->mapped))
4396 goto out;
b0138408 4397 if (packet_read_pending(rb))
69e3c75f
JB
4398 goto out;
4399 }
1da177e4 4400
69e3c75f 4401 if (req->tp_block_nr) {
4576cd46
WB
4402 unsigned int min_frame_size;
4403
69e3c75f
JB
4404 /* Sanity tests and some calculations */
4405 err = -EBUSY;
4406 if (unlikely(rb->pg_vec))
4407 goto out;
1da177e4 4408
bbd6ef87
PM
4409 switch (po->tp_version) {
4410 case TPACKET_V1:
4411 po->tp_hdrlen = TPACKET_HDRLEN;
4412 break;
4413 case TPACKET_V2:
4414 po->tp_hdrlen = TPACKET2_HDRLEN;
4415 break;
f6fb8f10 4416 case TPACKET_V3:
4417 po->tp_hdrlen = TPACKET3_HDRLEN;
4418 break;
bbd6ef87
PM
4419 }
4420
69e3c75f 4421 err = -EINVAL;
4ebf0ae2 4422 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4423 goto out;
90836b67 4424 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4425 goto out;
4576cd46 4426 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4427 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4428 req->tp_block_size <
4429 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4430 goto out;
4576cd46 4431 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4432 goto out;
4ebf0ae2 4433 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4434 goto out;
1da177e4 4435
4194b491
TK
4436 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4437 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4438 goto out;
fc62814d 4439 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4440 goto out;
69e3c75f
JB
4441 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4442 req->tp_frame_nr))
4443 goto out;
1da177e4
LT
4444
4445 err = -ENOMEM;
3a7ad063
ED
4446 order = get_order(req->tp_block_size);
4447 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4448 if (unlikely(!pg_vec))
1da177e4 4449 goto out;
f6fb8f10 4450 switch (po->tp_version) {
4451 case TPACKET_V3:
7f953ab2
SV
4452 /* Block transmit is not supported yet */
4453 if (!tx_ring) {
e8e85cc5 4454 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4455 } else {
4456 struct tpacket_req3 *req3 = &req_u->req3;
4457
4458 if (req3->tp_retire_blk_tov ||
4459 req3->tp_sizeof_priv ||
4460 req3->tp_feature_req_word) {
4461 err = -EINVAL;
55655e3d 4462 goto out_free_pg_vec;
7f953ab2
SV
4463 }
4464 }
d7cf0c34 4465 break;
f6fb8f10 4466 default:
61fad681
WB
4467 if (!tx_ring) {
4468 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4469 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4470 if (!rx_owner_map)
4471 goto out_free_pg_vec;
4472 }
f6fb8f10 4473 break;
4474 }
69e3c75f
JB
4475 }
4476 /* Done */
4477 else {
4478 err = -EINVAL;
4ebf0ae2 4479 if (unlikely(req->tp_frame_nr))
69e3c75f 4480 goto out;
1da177e4
LT
4481 }
4482
1da177e4
LT
4483
4484 /* Detach socket from network */
4485 spin_lock(&po->bind_lock);
61edf479 4486 was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
1da177e4
LT
4487 num = po->num;
4488 if (was_running) {
c7d2ef5d 4489 WRITE_ONCE(po->num, 0);
ce06b03e 4490 __unregister_prot_hook(sk, false);
1da177e4
LT
4491 }
4492 spin_unlock(&po->bind_lock);
1ce4f28b 4493
1da177e4
LT
4494 synchronize_net();
4495
4496 err = -EBUSY;
905db440 4497 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4498 if (closing || atomic_read(&po->mapped) == 0) {
4499 err = 0;
69e3c75f 4500 spin_lock_bh(&rb_queue->lock);
c053fd96 4501 swap(rb->pg_vec, pg_vec);
61fad681
WB
4502 if (po->tp_version <= TPACKET_V2)
4503 swap(rb->rx_owner_map, rx_owner_map);
69e3c75f
JB
4504 rb->frame_max = (req->tp_frame_nr - 1);
4505 rb->head = 0;
4506 rb->frame_size = req->tp_frame_size;
4507 spin_unlock_bh(&rb_queue->lock);
4508
3a7ad063 4509 swap(rb->pg_vec_order, order);
c053fd96 4510 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4511
4512 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4513 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4514 tpacket_rcv : packet_rcv;
4515 skb_queue_purge(rb_queue);
1da177e4 4516 if (atomic_read(&po->mapped))
40d4e3df
ED
4517 pr_err("packet_mmap: vma is busy: %d\n",
4518 atomic_read(&po->mapped));
1da177e4 4519 }
905db440 4520 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4521
4522 spin_lock(&po->bind_lock);
ce06b03e 4523 if (was_running) {
c7d2ef5d 4524 WRITE_ONCE(po->num, num);
ce06b03e 4525 register_prot_hook(sk);
1da177e4
LT
4526 }
4527 spin_unlock(&po->bind_lock);
c800aaf8 4528 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4529 /* Because we don't support block-based V3 on tx-ring */
4530 if (!tx_ring)
73d0fcf2 4531 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4532 }
1da177e4 4533
55655e3d 4534out_free_pg_vec:
ec6af094
WB
4535 if (pg_vec) {
4536 bitmap_free(rx_owner_map);
3a7ad063 4537 free_pg_vec(pg_vec, order, req->tp_block_nr);
ec6af094 4538 }
1da177e4
LT
4539out:
4540 return err;
4541}
4542
69e3c75f
JB
4543static int packet_mmap(struct file *file, struct socket *sock,
4544 struct vm_area_struct *vma)
1da177e4
LT
4545{
4546 struct sock *sk = sock->sk;
4547 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4548 unsigned long size, expected_size;
4549 struct packet_ring_buffer *rb;
1da177e4
LT
4550 unsigned long start;
4551 int err = -EINVAL;
4552 int i;
4553
4554 if (vma->vm_pgoff)
4555 return -EINVAL;
4556
905db440 4557 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4558
4559 expected_size = 0;
4560 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4561 if (rb->pg_vec) {
4562 expected_size += rb->pg_vec_len
4563 * rb->pg_vec_pages
4564 * PAGE_SIZE;
4565 }
4566 }
4567
4568 if (expected_size == 0)
1da177e4 4569 goto out;
69e3c75f
JB
4570
4571 size = vma->vm_end - vma->vm_start;
4572 if (size != expected_size)
1da177e4
LT
4573 goto out;
4574
1da177e4 4575 start = vma->vm_start;
69e3c75f
JB
4576 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4577 if (rb->pg_vec == NULL)
4578 continue;
4579
4580 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4581 struct page *page;
4582 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4583 int pg_num;
4584
c56b4d90
CG
4585 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4586 page = pgv_to_page(kaddr);
69e3c75f
JB
4587 err = vm_insert_page(vma, start, page);
4588 if (unlikely(err))
4589 goto out;
4590 start += PAGE_SIZE;
0e3125c7 4591 kaddr += PAGE_SIZE;
69e3c75f 4592 }
4ebf0ae2 4593 }
1da177e4 4594 }
69e3c75f 4595
4ebf0ae2 4596 atomic_inc(&po->mapped);
1da177e4
LT
4597 vma->vm_ops = &packet_mmap_ops;
4598 err = 0;
4599
4600out:
905db440 4601 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4602 return err;
4603}
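
packet_mmap() maps the receive ring followed by the transmit ring as one contiguous area, so vm_pgoff must be zero and the mapping length must equal the sum of both rings. A hedged sketch that sets up and maps a small TPACKET_V2 receive ring, then polls the first frame's status word; the geometry is an illustrative assumption.

/* Hedged userspace sketch: mmap a TPACKET_V2 RX ring and check frame 0. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 1 << 16,	/* 64 KiB, page aligned (assumed) */
		.tp_block_nr   = 4,
		.tp_frame_size = 1 << 11,	/* 2 KiB frames */
		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 4,
	};
	struct tpacket2_hdr *hdr;
	struct pollfd pfd;
	size_t maplen;
	void *ring;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 ||
	    setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0 ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
		return 1;

	maplen = (size_t)req.tp_block_size * req.tp_block_nr;
	ring = mmap(NULL, maplen, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return 1;

	pfd.fd = fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, 1000);

	hdr = ring;				/* first frame of block 0 */
	if (hdr->tp_status & TP_STATUS_USER) {
		printf("got frame, snaplen=%u\n", hdr->tp_snaplen);
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
	}
	munmap(ring, maplen);
	close(fd);
	return 0;
}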
1da177e4 4604
90ddc4f0 4605static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4606 .family = PF_PACKET,
4607 .owner = THIS_MODULE,
4608 .release = packet_release,
4609 .bind = packet_bind_spkt,
4610 .connect = sock_no_connect,
4611 .socketpair = sock_no_socketpair,
4612 .accept = sock_no_accept,
4613 .getname = packet_getname_spkt,
a11e1d43 4614 .poll = datagram_poll,
1da177e4 4615 .ioctl = packet_ioctl,
c7cbdbf2 4616 .gettstamp = sock_gettstamp,
1da177e4
LT
4617 .listen = sock_no_listen,
4618 .shutdown = sock_no_shutdown,
1da177e4
LT
4619 .sendmsg = packet_sendmsg_spkt,
4620 .recvmsg = packet_recvmsg,
4621 .mmap = sock_no_mmap,
4622 .sendpage = sock_no_sendpage,
4623};
1da177e4 4624
90ddc4f0 4625static const struct proto_ops packet_ops = {
1da177e4
LT
4626 .family = PF_PACKET,
4627 .owner = THIS_MODULE,
4628 .release = packet_release,
4629 .bind = packet_bind,
4630 .connect = sock_no_connect,
4631 .socketpair = sock_no_socketpair,
4632 .accept = sock_no_accept,
1ce4f28b 4633 .getname = packet_getname,
a11e1d43 4634 .poll = packet_poll,
1da177e4 4635 .ioctl = packet_ioctl,
c7cbdbf2 4636 .gettstamp = sock_gettstamp,
1da177e4
LT
4637 .listen = sock_no_listen,
4638 .shutdown = sock_no_shutdown,
4639 .setsockopt = packet_setsockopt,
4640 .getsockopt = packet_getsockopt,
4641 .sendmsg = packet_sendmsg,
4642 .recvmsg = packet_recvmsg,
4643 .mmap = packet_mmap,
4644 .sendpage = sock_no_sendpage,
4645};
4646
ec1b4cf7 4647static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4648 .family = PF_PACKET,
4649 .create = packet_create,
4650 .owner = THIS_MODULE,
4651};
4652
4653static struct notifier_block packet_netdev_notifier = {
40d4e3df 4654 .notifier_call = packet_notifier,
1da177e4
LT
4655};
4656
4657#ifdef CONFIG_PROC_FS
1da177e4
LT
4658
4659static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4660 __acquires(RCU)
1da177e4 4661{
e372c414 4662 struct net *net = seq_file_net(seq);
808f5114 4663
4664 rcu_read_lock();
4665 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4666}
4667
4668static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4669{
1bf40954 4670 struct net *net = seq_file_net(seq);
808f5114 4671 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4672}
4673
4674static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4675 __releases(RCU)
1da177e4 4676{
808f5114 4677 rcu_read_unlock();
1da177e4
LT
4678}
4679
1ce4f28b 4680static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4681{
4682 if (v == SEQ_START_TOKEN)
abdcd06c
BS
4683 seq_printf(seq,
4684 "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
4685 IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
1da177e4 4686 else {
b7ceabd9 4687 struct sock *s = sk_entry(v);
1da177e4
LT
4688 const struct packet_sock *po = pkt_sk(s);
4689
4690 seq_printf(seq,
71338aa7 4691 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4692 s,
41c6d650 4693 refcount_read(&s->sk_refcnt),
1da177e4 4694 s->sk_type,
c7d2ef5d 4695 ntohs(READ_ONCE(po->num)),
e032f7c9 4696 READ_ONCE(po->ifindex),
61edf479 4697 packet_sock_flag(po, PACKET_SOCK_RUNNING),
1da177e4 4698 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4699 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4700 sock_i_ino(s));
1da177e4
LT
4701 }
4702
4703 return 0;
4704}
4705
56b3d975 4706static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4707 .start = packet_seq_start,
4708 .next = packet_seq_next,
4709 .stop = packet_seq_stop,
4710 .show = packet_seq_show,
4711};
1da177e4
LT
4712#endif
4713
2c8c1e72 4714static int __net_init packet_net_init(struct net *net)
d12d01d6 4715{
0fa7fa98 4716 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4717 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4718
a268e0f2 4719#ifdef CONFIG_PROC_FS
c3506372
CH
4720 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4721 sizeof(struct seq_net_private)))
d12d01d6 4722 return -ENOMEM;
a268e0f2 4723#endif /* CONFIG_PROC_FS */
d12d01d6
DL
4724
4725 return 0;
4726}
4727
2c8c1e72 4728static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4729{
ece31ffd 4730 remove_proc_entry("packet", net->proc_net);
669f8f1a 4731 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4732}
4733
4734static struct pernet_operations packet_net_ops = {
4735 .init = packet_net_init,
4736 .exit = packet_net_exit,
4737};
4738
4739
1da177e4
LT
4740static void __exit packet_exit(void)
4741{
1da177e4
LT
4742 sock_unregister(PF_PACKET);
4743 proto_unregister(&packet_proto);
63b7c2eb
ZX
4744 unregister_netdevice_notifier(&packet_netdev_notifier);
4745 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4746}
4747
4748static int __init packet_init(void)
4749{
36096f2f 4750 int rc;
1da177e4 4751
36096f2f
Y
4752 rc = register_pernet_subsys(&packet_net_ops);
4753 if (rc)
63b7c2eb 4754 goto out;
36096f2f
Y
4755 rc = register_netdevice_notifier(&packet_netdev_notifier);
4756 if (rc)
4757 goto out_pernet;
63b7c2eb
ZX
4758 rc = proto_register(&packet_proto, 0);
4759 if (rc)
4760 goto out_notifier;
4761 rc = sock_register(&packet_family_ops);
4762 if (rc)
4763 goto out_proto;
1da177e4 4764
36096f2f
Y
4765 return 0;
4766
36096f2f
Y
4767out_proto:
4768 proto_unregister(&packet_proto);
63b7c2eb
ZX
4769out_notifier:
4770 unregister_netdevice_notifier(&packet_netdev_notifier);
4771out_pernet:
4772 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4773out:
4774 return rc;
4775}
4776
4777module_init(packet_init);
4778module_exit(packet_exit);
4779MODULE_LICENSE("GPL");
4780MODULE_ALIAS_NETPROTO(PF_PACKET);