// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov	:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski	:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit to reserved space (tunnel), other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header.  PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   packet classifier depends on it.
 */

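/*
 * Illustrative sketch only (not from the original source, rough usage
 * assumed from the standard packet(7) interface): a SOCK_RAW packet
 * socket of the kind handled here is typically opened and bound from
 * user space along the lines of
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * and the mmap()ed RX/TX rings implemented below are requested with
 * setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)).
 */
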
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

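/*
 * Illustrative sketch only (layout inferred from the macros above, not a
 * comment from the original source): a TPACKET_V3 ring block is laid out
 * roughly as
 *
 *	+-------------------------------+ <- block start (tpacket_block_desc)
 *	| block descriptor, BLK_HDR_LEN |
 *	+-------------------------------+ <- BLOCK_PRIV(): blk_sizeof_priv
 *	| user-requested private area   |
 *	+-------------------------------+ <- BLOCK_O2FP(): first tpacket3_hdr
 *	| packets, each V3_ALIGNMENT-   |
 *	| aligned, chained via          |
 *	| tp_next_offset                |
 *	+-------------------------------+
 *
 * so BLK_PLUS_PRIV(sz) is the offset of the first packet in the block.
 */
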
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

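/*
 * Illustrative note (not part of the original source): GET_NEXT_PRB_BLK_NUM()
 * simply walks the block ring with wraparound, e.g. with knum_blocks == 4
 * the active block index advances 0 -> 1 -> 2 -> 3 -> 0 -> ...
 */
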
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	int cpu = raw_smp_processor_id();
	u16 queue_index;

#ifdef CONFIG_XPS
	skb->sender_cpu = cpu + 1;
#endif
	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = netdev_pick_tx(dev, skb, NULL);
	}

	return queue_index;
}

/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(const struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

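/*
 * Illustrative sketch only (not from the original source): the two helpers
 * above implement the kernel<->user ownership handshake on each ring frame.
 * A user-space reader of a TPACKET_V2 RX ring is expected to do, roughly,
 *
 *	struct tpacket2_hdr *hdr = frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(...);			// wait until the kernel fills it
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *
 * The smp_wmb()/smp_rmb() barriers here pair with the corresponding
 * loads and stores on the user side of the shared mapping.
 */
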
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec64_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec64 ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	/*
	 * versions 1 through 3 overflow the timestamps in y2106, since they
	 * all store the seconds in a 32-bit unsigned integer.
	 * If we create a version 4, that should have a 64-bit timestamp,
	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
	 * nanoseconds.
	 */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(const struct packet_sock *po,
				 const struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

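/*
 * Worked example (illustration only, not from the original source): with
 * frame_size = 2048 and frames_per_block = 2 (4 KiB blocks), frame position
 * 5 maps to pg_vec_pos = 5 / 2 = 2 and frame_offset = 5 % 2 = 1, i.e. the
 * second 2 KiB frame inside the third buffer of pg_vec[].
 */
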
static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits, div;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (err)
		return DEFAULT_PRB_RETIRE_TOV;

	/* If the link speed is so slow you don't really
	 * need to worry about perf anyways
	 */
	if (ecmd.base.speed < SPEED_1000 ||
	    ecmd.base.speed == SPEED_UNKNOWN)
		return DEFAULT_PRB_RETIRE_TOV;

	div = ecmd.base.speed / 1000;
	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	if (div)
		return mbits + 1;
	return mbits;
}

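/*
 * Worked example (illustration only, not from the original source): for a
 * 1 MiB block on a 1 Gb/s link, mbits = (1048576 * 8) / (1024 * 1024) = 8
 * and div = 1000 / 1000 = 1, so the computed retire timeout is
 * 8 / 1 + 1 = 9 ms -- roughly the time needed to fill the block, plus a
 * millisecond of slack.  This matches the "~8 ms to fill a block" figure
 * quoted in the timer-logic comment further below.
 */
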
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
	rwlock_init(&p1->blk_fill_in_prog_lock);

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		/* Waiting for skb_copy_bits to finish... */
		write_lock(&pkc->blk_fill_in_prog_lock);
		write_unlock(&pkc->blk_fill_in_prog_lock);
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
			       /* Case 2. queue was frozen,user-space caught up,
				* now the link went idle && the timer fired.
				* We don't have a block to close.So we open this
				* block and restart the timer.
				* opening a block thaws the queue,restarts timer
				* Thawing/timer-refresh is a side effect.
				*/
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (atomic_read(&po->tp_drops))
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec64 ts;
		ktime_get_real_ts64(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec64 ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	ktime_get_real_ts64(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			/* Waiting for skb_copy_bits to finish... */
			write_lock(&pkc->blk_fill_in_prog_lock);
			write_unlock(&pkc->blk_fill_in_prog_lock);
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
	__releases(&pkc->blk_fill_in_prog_lock)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);

	read_unlock(&pkc->blk_fill_in_prog_lock);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
	__acquires(&pkc->blk_fill_in_prog_lock)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	read_lock(&pkc->blk_fill_in_prog_lock);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available.user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(const struct packet_sock *po,
			      const struct packet_ring_buffer *rb,
			      unsigned int idx,
			      int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.frame_max) + 1;
	idx = READ_ONCE(po->rx_ring.head);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(const struct packet_sock *po,
				 const struct sk_buff *skb)
{
	const struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
				   - (skb ? skb->truesize : 0);

		if (avail > (rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int pressure, ret;

	ret = __packet_rcv_has_room(po, skb);
	pressure = ret != ROOM_NORMAL;

	if (READ_ONCE(po->pressure) != pressure)
		WRITE_ONCE(po->pressure, pressure);

	return ret;
}

static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
	if (READ_ONCE(po->pressure) &&
	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		WRITE_ONCE(po->pressure, 0);
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 *history = po->rollover->history;
	u32 victim, rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (READ_ONCE(history[i]) == rxhash)
			count++;

	victim = prandom_u32() % ROLLOVER_HLEN;

	/* Avoid dirtying the cache line if possible */
	if (READ_ONCE(history[victim]) != rxhash)
		WRITE_ONCE(history[victim], rxhash);

	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	ret = copy_bpf_fprog_from_user(&fprog, data, len);
	if (ret)
		return ret;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_sockptr(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, sockptr_t data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	}
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	}
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			return false;
		}
	}
	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			po->rollover = rollover;
			rollover = NULL;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	kfree(rollover);
	mutex_unlock(&fanout_mutex);
	return err;
}

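/*
 * Illustrative sketch only (not from the original source): from user space a
 * socket joins a fanout group handled by fanout_add() roughly like this,
 * assuming an already bound AF_PACKET socket fd and a caller-chosen group_id:
 *
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * The low 16 bits carry the group id, and the upper 16 bits the type plus
 * any PACKET_FANOUT_FLAG_* bits, which arrive here as the type_flags
 * argument of fanout_add().
 */
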
2bd624b4
AS
1753/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1754 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1755 * It is the responsibility of the caller to call fanout_release_data() and
1756 * free the returned packet_fanout (after synchronize_net())
1757 */
1758static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1759{
1760 struct packet_sock *po = pkt_sk(sk);
1761 struct packet_fanout *f;
1762
fff3321d 1763 mutex_lock(&fanout_mutex);
d199fab6
ED
1764 f = po->fanout;
1765 if (f) {
1766 po->fanout = NULL;
1767
fb5c2c17 1768 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1769 list_del(&f->list);
2bd624b4
AS
1770 else
1771 f = NULL;
dc99f600
DM
1772 }
1773 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1774
1775 return f;
dc99f600 1776}
1da177e4 1777
3c70c132
DB
1778static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1779 struct sk_buff *skb)
1780{
1781 /* Earlier code assumed this would be a VLAN pkt, double-check
1782 * this now that we have the actual packet in hand. We can only
1783 * do this check on Ethernet devices.
1784 */
1785 if (unlikely(dev->type != ARPHRD_ETHER))
1786 return false;
1787
1788 skb_reset_mac_header(skb);
1789 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1790}
1791
90ddc4f0 1792static const struct proto_ops packet_ops;
1da177e4 1793
90ddc4f0 1794static const struct proto_ops packet_ops_spkt;
1da177e4 1795
40d4e3df
ED
1796static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1797 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1798{
1799 struct sock *sk;
1800 struct sockaddr_pkt *spkt;
1801
1802 /*
1803 * When we registered the protocol we saved the socket in the data
1804 * field for just this event.
1805 */
1806
1807 sk = pt->af_packet_priv;
1ce4f28b 1808
1da177e4
LT
1809 /*
1810 * Yank back the headers [hope the device set this
1811 * right or kerboom...]
1812 *
1813 * Incoming packets have ll header pulled,
1814 * push it back.
1815 *
98e399f8 1816 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1817 * so that this procedure is a no-op.
1818 */
1819
1820 if (skb->pkt_type == PACKET_LOOPBACK)
1821 goto out;
1822
09ad9bc7 1823 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1824 goto out;
1825
40d4e3df
ED
1826 skb = skb_share_check(skb, GFP_ATOMIC);
1827 if (skb == NULL)
1da177e4
LT
1828 goto oom;
1829
1830 /* drop any routing info */
adf30907 1831 skb_dst_drop(skb);
1da177e4 1832
84531c24 1833 /* drop conntrack reference */
895b5c9f 1834 nf_reset_ct(skb);
84531c24 1835
ffbc6111 1836 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1837
98e399f8 1838 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1839
1840 /*
1841 * The SOCK_PACKET socket receives _all_ frames.
1842 */
1843
1844 spkt->spkt_family = dev->type;
1845 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1846 spkt->spkt_protocol = skb->protocol;
1847
1848 /*
1849 * Charge the memory to the socket. This is done specifically
1850 * to prevent a socket from using up all the memory.
1851 */
1852
40d4e3df 1853 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1854 return 0;
1855
1856out:
1857 kfree_skb(skb);
1858oom:
1859 return 0;
1860}
1861
75c65772
MM
1862static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1863{
18bed891
YK
1864 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1865 sock->type == SOCK_RAW) {
75c65772
MM
1866 skb_reset_mac_header(skb);
1867 skb->protocol = dev_parse_header_protocol(skb);
1868 }
1869
1870 skb_probe_transport_header(skb);
1871}
1da177e4
LT
1872
1873/*
1874 * Output a raw packet to a device layer. This bypasses all the other
1875 * protocol layers and you must therefore supply it with a complete frame
1876 */
1ce4f28b 1877
1b784140
YX
1878static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1879 size_t len)
1da177e4
LT
1880{
1881 struct sock *sk = sock->sk;
342dfc30 1882 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1883 struct sk_buff *skb = NULL;
1da177e4 1884 struct net_device *dev;
c14ac945 1885 struct sockcm_cookie sockc;
40d4e3df 1886 __be16 proto = 0;
1da177e4 1887 int err;
3bdc0eba 1888 int extra_len = 0;
1ce4f28b 1889
1da177e4 1890 /*
1ce4f28b 1891 * Get and verify the address.
1da177e4
LT
1892 */
1893
40d4e3df 1894 if (saddr) {
1da177e4 1895 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1896 return -EINVAL;
1897 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1898 proto = saddr->spkt_protocol;
1899 } else
1900 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1901
1902 /*
1ce4f28b 1903 * Find the device first to size check it
1da177e4
LT
1904 */
1905
de74e92a 1906 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1907retry:
654d1f8a
ED
1908 rcu_read_lock();
1909 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1910 err = -ENODEV;
1911 if (dev == NULL)
1912 goto out_unlock;
1ce4f28b 1913
d5e76b0a
DM
1914 err = -ENETDOWN;
1915 if (!(dev->flags & IFF_UP))
1916 goto out_unlock;
1917
1da177e4 1918 /*
40d4e3df
ED
1919 * You may not queue a frame bigger than the mtu. This is the lowest level
1920 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1921 */
1ce4f28b 1922
3bdc0eba
BG
1923 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1924 if (!netif_supports_nofcs(dev)) {
1925 err = -EPROTONOSUPPORT;
1926 goto out_unlock;
1927 }
1928 extra_len = 4; /* We're doing our own CRC */
1929 }
1930
1da177e4 1931 err = -EMSGSIZE;
3bdc0eba 1932 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1933 goto out_unlock;
1934
1a35ca80
ED
1935 if (!skb) {
1936 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1937 int tlen = dev->needed_tailroom;
1a35ca80
ED
1938 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1939
1940 rcu_read_unlock();
4ce40912 1941 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1942 if (skb == NULL)
1943 return -ENOBUFS;
1944 /* FIXME: Save some space for broken drivers that write a hard
1945 * header at transmission time by themselves. PPP is the notable
1946 * one here. This should really be fixed at the driver level.
1947 */
1948 skb_reserve(skb, reserved);
1949 skb_reset_network_header(skb);
1950
1951 /* Try to align data part correctly */
1952 if (hhlen) {
1953 skb->data -= hhlen;
1954 skb->tail -= hhlen;
1955 if (len < hhlen)
1956 skb_reset_network_header(skb);
1957 }
6ce8e9ce 1958 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1959 if (err)
1960 goto out_free;
1961 goto retry;
1da177e4
LT
1962 }
1963
9ed988cd
WB
1964 if (!dev_validate_header(dev, skb->data, len)) {
1965 err = -EINVAL;
1966 goto out_unlock;
1967 }
3c70c132
DB
1968 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1969 !packet_extra_vlan_len_allowed(dev, skb)) {
1970 err = -EMSGSIZE;
1971 goto out_unlock;
57f89bfa 1972 }
1a35ca80 1973
657a0667 1974 sockcm_init(&sockc, sk);
c14ac945
SHY
1975 if (msg->msg_controllen) {
1976 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1977 if (unlikely(err))
c14ac945 1978 goto out_unlock;
c14ac945
SHY
1979 }
1980
1da177e4
LT
1981 skb->protocol = proto;
1982 skb->dev = dev;
1983 skb->priority = sk->sk_priority;
2d37a186 1984 skb->mark = sk->sk_mark;
3d0ba8c0 1985 skb->tstamp = sockc.transmit_time;
bf84a010 1986
8f932f76 1987 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1988
3bdc0eba
BG
1989 if (unlikely(extra_len == 4))
1990 skb->no_fcs = 1;
1991
75c65772 1992 packet_parse_headers(skb, sock);
c1aad275 1993
1da177e4 1994 dev_queue_xmit(skb);
654d1f8a 1995 rcu_read_unlock();
40d4e3df 1996 return len;
1da177e4 1997
1da177e4 1998out_unlock:
654d1f8a 1999 rcu_read_unlock();
1a35ca80
ED
2000out_free:
2001 kfree_skb(skb);
1da177e4
LT
2002 return err;
2003}
1da177e4 2004
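/*
 * Userspace sketch (illustrative only): packet_sendmsg_spkt() above serves
 * the legacy SOCK_PACKET interface. Assuming CAP_NET_RAW, a complete
 * link-layer frame already built in buf/len, and an interface really named
 * "eth0" (both are assumptions of this example):
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family   = AF_PACKET,
 *				     .spkt_protocol = htons(ETH_P_ALL) };
 *
 *	strncpy((char *)spkt.spkt_device, "eth0",
 *		sizeof(spkt.spkt_device) - 1);
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&spkt, sizeof(spkt));
 */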
ff936a04
AS
2005static unsigned int run_filter(struct sk_buff *skb,
2006 const struct sock *sk,
2007 unsigned int res)
1da177e4
LT
2008{
2009 struct sk_filter *filter;
fda9ef5d 2010
80f8f102
ED
2011 rcu_read_lock();
2012 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2013 if (filter != NULL)
ff936a04 2014 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2015 rcu_read_unlock();
1da177e4 2016
dbcb5855 2017 return res;
1da177e4
LT
2018}
2019
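/*
 * Userspace sketch (illustrative only): run_filter() above executes whatever
 * classic BPF program was attached to the socket with SO_ATTACH_FILTER.
 * A minimal accept-everything filter, assuming <linux/filter.h> and a packet
 * socket fd; the single statement is BPF_RET|BPF_K returning 0x40000, i.e.
 * "keep up to 256 KiB of each packet". Real filters are usually generated
 * with tcpdump -dd:
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 0x00040000 },
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */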
16cc1400
WB
2020static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2021 size_t *len)
2022{
2023 struct virtio_net_hdr vnet_hdr;
2024
2025 if (*len < sizeof(vnet_hdr))
2026 return -EINVAL;
2027 *len -= sizeof(vnet_hdr);
2028
fd3a8862 2029 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2030 return -EINVAL;
2031
2032 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2033}
2034
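/*
 * Userspace sketch (illustrative only): packet_rcv_vnet() above only runs
 * once the socket has enabled PACKET_VNET_HDR, after which every read is
 * prefixed by a struct virtio_net_hdr. Assuming a SOCK_RAW packet socket fd
 * and <linux/virtio_net.h>:
 *
 *	int on = 1;
 *	unsigned char buf[sizeof(struct virtio_net_hdr) + 2048];
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
 *
 * The frame itself then starts at buf + sizeof(*vh).
 */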
1da177e4 2035/*
62ab0812
ED
2036 * This function does lazy skb cloning in the hope that most packets
2037 * are discarded by BPF.
2038 *
2039 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2040 * and skb->cb are mangled. It works because (and until) packets
2041 * falling here are owned by the current CPU. Output packets are cloned
2042 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2043 * sequentially, so that if we return the skb to its original state on
2044 * exit, we will not harm anyone.
1da177e4
LT
2045 */
2046
40d4e3df
ED
2047static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2048 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2049{
2050 struct sock *sk;
2051 struct sockaddr_ll *sll;
2052 struct packet_sock *po;
40d4e3df 2053 u8 *skb_head = skb->data;
1da177e4 2054 int skb_len = skb->len;
dbcb5855 2055 unsigned int snaplen, res;
da37845f 2056 bool is_drop_n_account = false;
1da177e4
LT
2057
2058 if (skb->pkt_type == PACKET_LOOPBACK)
2059 goto drop;
2060
2061 sk = pt->af_packet_priv;
2062 po = pkt_sk(sk);
2063
09ad9bc7 2064 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2065 goto drop;
2066
1da177e4
LT
2067 skb->dev = dev;
2068
3b04ddde 2069 if (dev->header_ops) {
1da177e4 2070 /* The device has an explicit notion of ll header,
62ab0812
ED
2071 * exported to higher levels.
2072 *
2073 * Otherwise, the device hides details of its frame
2074 * structure, so that the corresponding packet head is
2075 * never delivered to the user.
1da177e4
LT
2076 */
2077 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2078 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2079 else if (skb->pkt_type == PACKET_OUTGOING) {
2080 /* Special case: outgoing packets have ll header at head */
bbe735e4 2081 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2082 }
2083 }
2084
2085 snaplen = skb->len;
2086
dbcb5855
DM
2087 res = run_filter(skb, sk, snaplen);
2088 if (!res)
fda9ef5d 2089 goto drop_n_restore;
dbcb5855
DM
2090 if (snaplen > res)
2091 snaplen = res;
1da177e4 2092
0fd7bac6 2093 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2094 goto drop_n_acct;
2095
2096 if (skb_shared(skb)) {
2097 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2098 if (nskb == NULL)
2099 goto drop_n_acct;
2100
2101 if (skb_head != skb->data) {
2102 skb->data = skb_head;
2103 skb->len = skb_len;
2104 }
abc4e4fa 2105 consume_skb(skb);
1da177e4
LT
2106 skb = nskb;
2107 }
2108
b4772ef8 2109 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2110
2111 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2112 sll->sll_hatype = dev->type;
1da177e4 2113 sll->sll_pkttype = skb->pkt_type;
8032b464 2114 if (unlikely(po->origdev))
80feaacb
PWJ
2115 sll->sll_ifindex = orig_dev->ifindex;
2116 else
2117 sll->sll_ifindex = dev->ifindex;
1da177e4 2118
b95cce35 2119 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2120
2472d761
EB
2121 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2122 * Use their space for storing the original skb length.
2123 */
2124 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2125
1da177e4
LT
2126 if (pskb_trim(skb, snaplen))
2127 goto drop_n_acct;
2128
2129 skb_set_owner_r(skb, sk);
2130 skb->dev = NULL;
adf30907 2131 skb_dst_drop(skb);
1da177e4 2132
84531c24 2133 /* drop conntrack reference */
895b5c9f 2134 nf_reset_ct(skb);
84531c24 2135
1da177e4 2136 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2137 po->stats.stats1.tp_packets++;
3bc3b96f 2138 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2139 __skb_queue_tail(&sk->sk_receive_queue, skb);
2140 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2141 sk->sk_data_ready(sk);
1da177e4
LT
2142 return 0;
2143
2144drop_n_acct:
da37845f 2145 is_drop_n_account = true;
8e8e2951 2146 atomic_inc(&po->tp_drops);
7091fbd8 2147 atomic_inc(&sk->sk_drops);
1da177e4
LT
2148
2149drop_n_restore:
2150 if (skb_head != skb->data && skb_shared(skb)) {
2151 skb->data = skb_head;
2152 skb->len = skb_len;
2153 }
2154drop:
da37845f
WJ
2155 if (!is_drop_n_account)
2156 consume_skb(skb);
2157 else
2158 kfree_skb(skb);
1da177e4
LT
2159 return 0;
2160}
2161
40d4e3df
ED
2162static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2163 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2164{
2165 struct sock *sk;
2166 struct packet_sock *po;
2167 struct sockaddr_ll *sll;
184f489e 2168 union tpacket_uhdr h;
40d4e3df 2169 u8 *skb_head = skb->data;
1da177e4 2170 int skb_len = skb->len;
dbcb5855 2171 unsigned int snaplen, res;
f6fb8f10 2172 unsigned long status = TP_STATUS_USER;
bbd6ef87 2173 unsigned short macoff, netoff, hdrlen;
1da177e4 2174 struct sk_buff *copy_skb = NULL;
d413fcb4 2175 struct timespec64 ts;
b9c32fb2 2176 __u32 ts_status;
da37845f 2177 bool is_drop_n_account = false;
61fad681 2178 unsigned int slot_id = 0;
edbd58be 2179 bool do_vnet = false;
1da177e4 2180
51846355
AW
2181 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2182 * We may add members to them up to the current aligned size without forcing
2183 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2184 */
2185 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2186 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2187
1da177e4
LT
2188 if (skb->pkt_type == PACKET_LOOPBACK)
2189 goto drop;
2190
2191 sk = pt->af_packet_priv;
2192 po = pkt_sk(sk);
2193
09ad9bc7 2194 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2195 goto drop;
2196
3b04ddde 2197 if (dev->header_ops) {
1da177e4 2198 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2199 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2200 else if (skb->pkt_type == PACKET_OUTGOING) {
2201 /* Special case: outgoing packets have ll header at head */
bbe735e4 2202 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2203 }
2204 }
2205
2206 snaplen = skb->len;
2207
dbcb5855
DM
2208 res = run_filter(skb, sk, snaplen);
2209 if (!res)
fda9ef5d 2210 goto drop_n_restore;
68c2e5de 2211
2c51c627
ED
2212 /* If we are flooded, just give up */
2213 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2214 atomic_inc(&po->tp_drops);
2215 goto drop_n_restore;
2216 }
2217
68c2e5de
AD
2218 if (skb->ip_summed == CHECKSUM_PARTIAL)
2219 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2220 else if (skb->pkt_type != PACKET_OUTGOING &&
2221 (skb->ip_summed == CHECKSUM_COMPLETE ||
2222 skb_csum_unnecessary(skb)))
2223 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2224
dbcb5855
DM
2225 if (snaplen > res)
2226 snaplen = res;
1da177e4
LT
2227
2228 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2229 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2230 po->tp_reserve;
1da177e4 2231 } else {
95c96174 2232 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2233 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2234 (maclen < 16 ? 16 : maclen)) +
58d19b19 2235 po->tp_reserve;
edbd58be 2236 if (po->has_vnet_hdr) {
58d19b19 2237 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2238 do_vnet = true;
2239 }
1da177e4
LT
2240 macoff = netoff - maclen;
2241 }
f6fb8f10 2242 if (po->tp_version <= TPACKET_V2) {
2243 if (macoff + snaplen > po->rx_ring.frame_size) {
2244 if (po->copy_thresh &&
0fd7bac6 2245 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2246 if (skb_shared(skb)) {
2247 copy_skb = skb_clone(skb, GFP_ATOMIC);
2248 } else {
2249 copy_skb = skb_get(skb);
2250 skb_head = skb->data;
2251 }
2252 if (copy_skb)
2253 skb_set_owner_r(copy_skb, sk);
1da177e4 2254 }
f6fb8f10 2255 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2256 if ((int)snaplen < 0) {
f6fb8f10 2257 snaplen = 0;
edbd58be
BP
2258 do_vnet = false;
2259 }
1da177e4 2260 }
dc808110
ED
2261 } else if (unlikely(macoff + snaplen >
2262 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2263 u32 nval;
2264
2265 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2266 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2267 snaplen, nval, macoff);
2268 snaplen = nval;
2269 if (unlikely((int)snaplen < 0)) {
2270 snaplen = 0;
2271 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2272 do_vnet = false;
dc808110 2273 }
1da177e4 2274 }
1da177e4 2275 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2276 h.raw = packet_current_rx_frame(po, skb,
2277 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2278 if (!h.raw)
58d19b19 2279 goto drop_n_account;
46e4c421 2280
61fad681
WB
2281 if (po->tp_version <= TPACKET_V2) {
2282 slot_id = po->rx_ring.head;
2283 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2284 goto drop_n_account;
2285 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2286 }
2287
46e4c421
WB
2288 if (do_vnet &&
2289 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2290 sizeof(struct virtio_net_hdr),
88fd1cb8
JO
2291 vio_le(), true, 0)) {
2292 if (po->tp_version == TPACKET_V3)
2293 prb_clear_blk_fill_status(&po->rx_ring);
46e4c421 2294 goto drop_n_account;
88fd1cb8 2295 }
46e4c421 2296
f6fb8f10 2297 if (po->tp_version <= TPACKET_V2) {
2298 packet_increment_rx_head(po, &po->rx_ring);
2299 /*
2300 * LOSING will be reported till you read the stats,
2301 * because it's COR - Clear On Read.
2302 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2303 * at the packet level.
2304 */
8e8e2951 2305 if (atomic_read(&po->tp_drops))
f6fb8f10 2306 status |= TP_STATUS_LOSING;
2307 }
945d015e 2308
ee80fbf3 2309 po->stats.stats1.tp_packets++;
1da177e4
LT
2310 if (copy_skb) {
2311 status |= TP_STATUS_COPY;
2312 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2313 }
1da177e4
LT
2314 spin_unlock(&sk->sk_receive_queue.lock);
2315
bbd6ef87 2316 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2317
2318 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
d413fcb4 2319 ktime_get_real_ts64(&ts);
1da177e4 2320
b9c32fb2
DB
2321 status |= ts_status;
2322
bbd6ef87
PM
2323 switch (po->tp_version) {
2324 case TPACKET_V1:
2325 h.h1->tp_len = skb->len;
2326 h.h1->tp_snaplen = snaplen;
2327 h.h1->tp_mac = macoff;
2328 h.h1->tp_net = netoff;
4b457bdf
DB
2329 h.h1->tp_sec = ts.tv_sec;
2330 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2331 hdrlen = sizeof(*h.h1);
2332 break;
2333 case TPACKET_V2:
2334 h.h2->tp_len = skb->len;
2335 h.h2->tp_snaplen = snaplen;
2336 h.h2->tp_mac = macoff;
2337 h.h2->tp_net = netoff;
bbd6ef87
PM
2338 h.h2->tp_sec = ts.tv_sec;
2339 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2340 if (skb_vlan_tag_present(skb)) {
2341 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2342 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2343 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2344 } else {
2345 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2346 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2347 }
e4d26f4b 2348 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2349 hdrlen = sizeof(*h.h2);
2350 break;
f6fb8f10 2351 case TPACKET_V3:
2352 /* tp_next_offset and vlan are already populated above,
2353 * so don't clear those fields here.
2354 */
2355 h.h3->tp_status |= status;
2356 h.h3->tp_len = skb->len;
2357 h.h3->tp_snaplen = snaplen;
2358 h.h3->tp_mac = macoff;
2359 h.h3->tp_net = netoff;
f6fb8f10 2360 h.h3->tp_sec = ts.tv_sec;
2361 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2362 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2363 hdrlen = sizeof(*h.h3);
2364 break;
bbd6ef87
PM
2365 default:
2366 BUG();
2367 }
1da177e4 2368
bbd6ef87 2369 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2370 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2371 sll->sll_family = AF_PACKET;
2372 sll->sll_hatype = dev->type;
2373 sll->sll_protocol = skb->protocol;
2374 sll->sll_pkttype = skb->pkt_type;
8032b464 2375 if (unlikely(po->origdev))
80feaacb
PWJ
2376 sll->sll_ifindex = orig_dev->ifindex;
2377 else
2378 sll->sll_ifindex = dev->ifindex;
1da177e4 2379
e16aa207 2380 smp_mb();
f0d4eb29 2381
f6dafa95 2382#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2383 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2384 u8 *start, *end;
2385
f0d4eb29
DB
2386 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2387 macoff + snaplen);
2388
2389 for (start = h.raw; start < end; start += PAGE_SIZE)
2390 flush_dcache_page(pgv_to_page(start));
1da177e4 2391 }
f0d4eb29 2392 smp_wmb();
f6dafa95 2393#endif
f0d4eb29 2394
da413eec 2395 if (po->tp_version <= TPACKET_V2) {
61fad681 2396 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2397 __packet_set_status(po, h.raw, status);
61fad681
WB
2398 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2399 spin_unlock(&sk->sk_receive_queue.lock);
da413eec 2400 sk->sk_data_ready(sk);
88fd1cb8 2401 } else if (po->tp_version == TPACKET_V3) {
f6fb8f10 2402 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2403 }
1da177e4
LT
2404
2405drop_n_restore:
2406 if (skb_head != skb->data && skb_shared(skb)) {
2407 skb->data = skb_head;
2408 skb->len = skb_len;
2409 }
2410drop:
da37845f
WJ
2411 if (!is_drop_n_account)
2412 consume_skb(skb);
2413 else
2414 kfree_skb(skb);
1da177e4
LT
2415 return 0;
2416
58d19b19 2417drop_n_account:
1da177e4 2418 spin_unlock(&sk->sk_receive_queue.lock);
8e8e2951
ED
2419 atomic_inc(&po->tp_drops);
2420 is_drop_n_account = true;
1da177e4 2421
676d2369 2422 sk->sk_data_ready(sk);
acb5d75b 2423 kfree_skb(copy_skb);
1da177e4
LT
2424 goto drop_n_restore;
2425}
2426
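/*
 * Userspace sketch (illustrative only): tpacket_rcv() above fills the
 * mmap()ed ring that userspace configures with PACKET_VERSION plus
 * PACKET_RX_RING. The sizes below are arbitrary but satisfy the constraints
 * enforced by packet_set_ring() (page-multiple blocks, 16-byte-aligned
 * frames, tp_frame_nr == tp_block_nr * frames-per-block):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	int ver = TPACKET_V2;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * A frame slot is ready for userspace once its tp_status has TP_STATUS_USER
 * set; writing TP_STATUS_KERNEL back hands the slot to the kernel again.
 */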
69e3c75f
JB
2427static void tpacket_destruct_skb(struct sk_buff *skb)
2428{
2429 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2430
69e3c75f 2431 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2432 void *ph;
b9c32fb2
DB
2433 __u32 ts;
2434
5cd8d46e 2435 ph = skb_zcopy_get_nouarg(skb);
b0138408 2436 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2437
2438 ts = __packet_set_timestamp(po, ph, skb);
2439 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
89ed5b51
NH
2440
2441 if (!packet_read_pending(&po->tx_ring))
2442 complete(&po->skb_completion);
69e3c75f
JB
2443 }
2444
2445 sock_wfree(skb);
2446}
2447
16cc1400
WB
2448static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2449{
16cc1400
WB
2450 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2451 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2452 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2453 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2454 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2455 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2456 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2457
2458 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2459 return -EINVAL;
2460
16cc1400
WB
2461 return 0;
2462}
2463
2464static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2465 struct virtio_net_hdr *vnet_hdr)
2466{
16cc1400
WB
2467 if (*len < sizeof(*vnet_hdr))
2468 return -EINVAL;
2469 *len -= sizeof(*vnet_hdr);
2470
cbbd26b8 2471 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2472 return -EFAULT;
2473
2474 return __packet_snd_vnet_parse(vnet_hdr, *len);
2475}
2476
40d4e3df 2477static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2478 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2479 __be16 proto, unsigned char *addr, int hlen, int copylen,
2480 const struct sockcm_cookie *sockc)
69e3c75f 2481{
184f489e 2482 union tpacket_uhdr ph;
8d39b4a6 2483 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2484 struct socket *sock = po->sk.sk_socket;
2485 struct page *page;
69e3c75f
JB
2486 int err;
2487
2488 ph.raw = frame;
2489
2490 skb->protocol = proto;
2491 skb->dev = dev;
2492 skb->priority = po->sk.sk_priority;
2d37a186 2493 skb->mark = po->sk.sk_mark;
3d0ba8c0 2494 skb->tstamp = sockc->transmit_time;
8f932f76 2495 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2496 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2497
ae641949 2498 skb_reserve(skb, hlen);
69e3c75f 2499 skb_reset_network_header(skb);
c1aad275 2500
69e3c75f
JB
2501 to_write = tp_len;
2502
2503 if (sock->type == SOCK_DGRAM) {
2504 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2505 NULL, tp_len);
2506 if (unlikely(err < 0))
2507 return -EINVAL;
1d036d25 2508 } else if (copylen) {
9ed988cd
WB
2509 int hdrlen = min_t(int, copylen, tp_len);
2510
69e3c75f 2511 skb_push(skb, dev->hard_header_len);
1d036d25 2512 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2513 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2514 if (unlikely(err))
2515 return err;
9ed988cd
WB
2516 if (!dev_validate_header(dev, skb->data, hdrlen))
2517 return -EINVAL;
69e3c75f 2518
9ed988cd
WB
2519 data += hdrlen;
2520 to_write -= hdrlen;
69e3c75f
JB
2521 }
2522
69e3c75f
JB
2523 offset = offset_in_page(data);
2524 len_max = PAGE_SIZE - offset;
2525 len = ((to_write > len_max) ? len_max : to_write);
2526
2527 skb->data_len = to_write;
2528 skb->len += to_write;
2529 skb->truesize += to_write;
14afee4b 2530 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2531
2532 while (likely(to_write)) {
2533 nr_frags = skb_shinfo(skb)->nr_frags;
2534
2535 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2536 pr_err("Packet exceed the number of skb frags(%lu)\n",
2537 MAX_SKB_FRAGS);
69e3c75f
JB
2538 return -EFAULT;
2539 }
2540
0af55bb5
CG
2541 page = pgv_to_page(data);
2542 data += len;
69e3c75f
JB
2543 flush_dcache_page(page);
2544 get_page(page);
0af55bb5 2545 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2546 to_write -= len;
2547 offset = 0;
2548 len_max = PAGE_SIZE;
2549 len = ((to_write > len_max) ? len_max : to_write);
2550 }
2551
75c65772 2552 packet_parse_headers(skb, sock);
efdfa2f7 2553
69e3c75f
JB
2554 return tp_len;
2555}
2556
8d39b4a6
WB
2557static int tpacket_parse_header(struct packet_sock *po, void *frame,
2558 int size_max, void **data)
2559{
2560 union tpacket_uhdr ph;
2561 int tp_len, off;
2562
2563 ph.raw = frame;
2564
2565 switch (po->tp_version) {
7f953ab2
SV
2566 case TPACKET_V3:
2567 if (ph.h3->tp_next_offset != 0) {
2568 pr_warn_once("variable sized slot not supported");
2569 return -EINVAL;
2570 }
2571 tp_len = ph.h3->tp_len;
2572 break;
8d39b4a6
WB
2573 case TPACKET_V2:
2574 tp_len = ph.h2->tp_len;
2575 break;
2576 default:
2577 tp_len = ph.h1->tp_len;
2578 break;
2579 }
2580 if (unlikely(tp_len > size_max)) {
2581 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2582 return -EMSGSIZE;
2583 }
2584
2585 if (unlikely(po->tp_tx_has_off)) {
2586 int off_min, off_max;
2587
2588 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 off_max = po->tx_ring.frame_size - tp_len;
2590 if (po->sk.sk_type == SOCK_DGRAM) {
2591 switch (po->tp_version) {
7f953ab2
SV
2592 case TPACKET_V3:
2593 off = ph.h3->tp_net;
2594 break;
8d39b4a6
WB
2595 case TPACKET_V2:
2596 off = ph.h2->tp_net;
2597 break;
2598 default:
2599 off = ph.h1->tp_net;
2600 break;
2601 }
2602 } else {
2603 switch (po->tp_version) {
7f953ab2
SV
2604 case TPACKET_V3:
2605 off = ph.h3->tp_mac;
2606 break;
8d39b4a6
WB
2607 case TPACKET_V2:
2608 off = ph.h2->tp_mac;
2609 break;
2610 default:
2611 off = ph.h1->tp_mac;
2612 break;
2613 }
2614 }
2615 if (unlikely((off < off_min) || (off_max < off)))
2616 return -EINVAL;
2617 } else {
2618 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2619 }
2620
2621 *data = frame + off;
2622 return tp_len;
2623}
2624
69e3c75f
JB
2625static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2626{
89ed5b51 2627 struct sk_buff *skb = NULL;
69e3c75f 2628 struct net_device *dev;
1d036d25 2629 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2630 struct sockcm_cookie sockc;
69e3c75f 2631 __be16 proto;
09effa67 2632 int err, reserve = 0;
40d4e3df 2633 void *ph;
342dfc30 2634 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2635 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2636 unsigned char *addr = NULL;
69e3c75f 2637 int tp_len, size_max;
8d39b4a6 2638 void *data;
69e3c75f 2639 int len_sum = 0;
9e67030a 2640 int status = TP_STATUS_AVAILABLE;
1d036d25 2641 int hlen, tlen, copylen = 0;
89ed5b51 2642 long timeo = 0;
69e3c75f 2643
69e3c75f
JB
2644 mutex_lock(&po->pg_vec_lock);
2645
32d3182c
ED
2646 /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
2647 * we need to confirm it under protection of pg_vec_lock.
2648 */
2649 if (unlikely(!po->tx_ring.pg_vec)) {
2650 err = -EBUSY;
2651 goto out;
2652 }
66e56cd4 2653 if (likely(saddr == NULL)) {
e40526cb 2654 dev = packet_cached_dev_get(po);
69e3c75f 2655 proto = po->num;
69e3c75f
JB
2656 } else {
2657 err = -EINVAL;
2658 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2659 goto out;
2660 if (msg->msg_namelen < (saddr->sll_halen
2661 + offsetof(struct sockaddr_ll,
2662 sll_addr)))
2663 goto out;
69e3c75f 2664 proto = saddr->sll_protocol;
827d9780 2665 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2666 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2667 if (dev && msg->msg_namelen < dev->addr_len +
2668 offsetof(struct sockaddr_ll, sll_addr))
2669 goto out_put;
2670 addr = saddr->sll_addr;
2671 }
69e3c75f
JB
2672 }
2673
69e3c75f
JB
2674 err = -ENXIO;
2675 if (unlikely(dev == NULL))
2676 goto out;
69e3c75f
JB
2677 err = -ENETDOWN;
2678 if (unlikely(!(dev->flags & IFF_UP)))
2679 goto out_put;
2680
657a0667 2681 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2682 if (msg->msg_controllen) {
2683 err = sock_cmsg_send(&po->sk, msg, &sockc);
2684 if (unlikely(err))
2685 goto out_put;
2686 }
2687
5cfb4c8d
DB
2688 if (po->sk.sk_socket->type == SOCK_RAW)
2689 reserve = dev->hard_header_len;
69e3c75f 2690 size_max = po->tx_ring.frame_size
b5dd884e 2691 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2692
1d036d25 2693 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2694 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2695
89ed5b51
NH
2696 reinit_completion(&po->skb_completion);
2697
69e3c75f
JB
2698 do {
2699 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2700 TP_STATUS_SEND_REQUEST);
69e3c75f 2701 if (unlikely(ph == NULL)) {
89ed5b51
NH
2702 if (need_wait && skb) {
2703 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2704 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2705 if (timeo <= 0) {
2706 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2707 goto out_put;
2708 }
2709 }
2710 /* check for additional frames */
69e3c75f
JB
2711 continue;
2712 }
2713
8d39b4a6
WB
2714 skb = NULL;
2715 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2716 if (tp_len < 0)
2717 goto tpacket_error;
2718
69e3c75f 2719 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2720 hlen = LL_RESERVED_SPACE(dev);
2721 tlen = dev->needed_tailroom;
1d036d25
WB
2722 if (po->has_vnet_hdr) {
2723 vnet_hdr = data;
2724 data += sizeof(*vnet_hdr);
2725 tp_len -= sizeof(*vnet_hdr);
2726 if (tp_len < 0 ||
2727 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2728 tp_len = -EINVAL;
2729 goto tpacket_error;
2730 }
2731 copylen = __virtio16_to_cpu(vio_le(),
2732 vnet_hdr->hdr_len);
2733 }
9ed988cd 2734 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2735 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2736 hlen + tlen + sizeof(struct sockaddr_ll) +
2737 (copylen - dev->hard_header_len),
fbf33a28 2738 !need_wait, &err);
69e3c75f 2739
fbf33a28
KM
2740 if (unlikely(skb == NULL)) {
2741 /* we assume the socket was initially writeable ... */
2742 if (likely(len_sum > 0))
2743 err = len_sum;
69e3c75f 2744 goto out_status;
fbf33a28 2745 }
8d39b4a6 2746 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2747 addr, hlen, copylen, &sockc);
dbd46ab4 2748 if (likely(tp_len >= 0) &&
5cfb4c8d 2749 tp_len > dev->mtu + reserve &&
1d036d25 2750 !po->has_vnet_hdr &&
3c70c132
DB
2751 !packet_extra_vlan_len_allowed(dev, skb))
2752 tp_len = -EMSGSIZE;
69e3c75f
JB
2753
2754 if (unlikely(tp_len < 0)) {
8d39b4a6 2755tpacket_error:
69e3c75f
JB
2756 if (po->tp_loss) {
2757 __packet_set_status(po, ph,
2758 TP_STATUS_AVAILABLE);
2759 packet_increment_head(&po->tx_ring);
2760 kfree_skb(skb);
2761 continue;
2762 } else {
2763 status = TP_STATUS_WRONG_FORMAT;
2764 err = tp_len;
2765 goto out_status;
2766 }
2767 }
2768
9d2f67e4
JT
2769 if (po->has_vnet_hdr) {
2770 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2771 tp_len = -EINVAL;
2772 goto tpacket_error;
2773 }
2774 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2775 }
2776
69e3c75f
JB
2777 skb->destructor = tpacket_destruct_skb;
2778 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2779 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2780
2781 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2782 err = po->xmit(skb);
eb70df13
JP
2783 if (unlikely(err > 0)) {
2784 err = net_xmit_errno(err);
2785 if (err && __packet_get_status(po, ph) ==
2786 TP_STATUS_AVAILABLE) {
2787 /* skb was destructed already */
2788 skb = NULL;
2789 goto out_status;
2790 }
2791 /*
2792 * skb was dropped but not destructed yet;
2793 * let's treat it like congestion or err < 0
2794 */
2795 err = 0;
2796 }
69e3c75f
JB
2797 packet_increment_head(&po->tx_ring);
2798 len_sum += tp_len;
b0138408
DB
2799 } while (likely((ph != NULL) ||
2800 /* Note: packet_read_pending() might be slow if we have
2801 * to call it since it's a per-cpu variable, but in the fast path
2802 * we already short-circuit the loop with the first
2803 * condition, so luckily we don't have to go down that path
2804 * anyway.
2805 */
2806 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2807
2808 err = len_sum;
2809 goto out_put;
2810
69e3c75f
JB
2811out_status:
2812 __packet_set_status(po, ph, status);
2813 kfree_skb(skb);
2814out_put:
e40526cb 2815 dev_put(dev);
69e3c75f
JB
2816out:
2817 mutex_unlock(&po->pg_vec_lock);
2818 return err;
2819}
69e3c75f 2820
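/*
 * Userspace sketch (illustrative only): tpacket_snd() above is the TX mirror
 * of the RX ring. After setsockopt(PACKET_TX_RING) with a struct tpacket_req
 * and mmap() as for the RX ring, userspace copies a frame into a free
 * TPACKET_V2 slot and kicks transmission. first_free_slot() is a hypothetical
 * helper that returns a slot whose tp_status is TP_STATUS_AVAILABLE; frame
 * and frame_len are assumed to hold a complete link-layer frame:
 *
 *	struct tpacket2_hdr *hdr = first_free_slot(ring);
 *	unsigned char *data = (unsigned char *)hdr +
 *			      TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, frame, frame_len);
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);
 *
 * The zero-length send() makes the kernel walk the ring and transmit every
 * slot marked TP_STATUS_SEND_REQUEST.
 */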
eea49cc9
OJ
2821static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2822 size_t reserve, size_t len,
2823 size_t linear, int noblock,
2824 int *err)
bfd5f4a3
SS
2825{
2826 struct sk_buff *skb;
2827
2828 /* Under a page? Don't bother with paged skb. */
2829 if (prepad + len < PAGE_SIZE || !linear)
2830 linear = len;
2831
2832 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2833 err, 0);
bfd5f4a3
SS
2834 if (!skb)
2835 return NULL;
2836
2837 skb_reserve(skb, reserve);
2838 skb_put(skb, linear);
2839 skb->data_len = len - linear;
2840 skb->len += len - linear;
2841
2842 return skb;
2843}
2844
d346a3fa 2845static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2846{
2847 struct sock *sk = sock->sk;
342dfc30 2848 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2849 struct sk_buff *skb;
2850 struct net_device *dev;
0e11c91e 2851 __be16 proto;
486efdc8 2852 unsigned char *addr = NULL;
827d9780 2853 int err, reserve = 0;
c7d39e32 2854 struct sockcm_cookie sockc;
bfd5f4a3
SS
2855 struct virtio_net_hdr vnet_hdr = { 0 };
2856 int offset = 0;
bfd5f4a3 2857 struct packet_sock *po = pkt_sk(sk);
da7c9561 2858 bool has_vnet_hdr = false;
57031eb7 2859 int hlen, tlen, linear;
3bdc0eba 2860 int extra_len = 0;
1da177e4
LT
2861
2862 /*
1ce4f28b 2863 * Get and verify the address.
1da177e4 2864 */
1ce4f28b 2865
66e56cd4 2866 if (likely(saddr == NULL)) {
e40526cb 2867 dev = packet_cached_dev_get(po);
1da177e4 2868 proto = po->num;
1da177e4
LT
2869 } else {
2870 err = -EINVAL;
2871 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2872 goto out;
0fb375fb
EB
2873 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2874 goto out;
1da177e4 2875 proto = saddr->sll_protocol;
827d9780 2876 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2877 if (sock->type == SOCK_DGRAM) {
2878 if (dev && msg->msg_namelen < dev->addr_len +
2879 offsetof(struct sockaddr_ll, sll_addr))
2880 goto out_unlock;
2881 addr = saddr->sll_addr;
2882 }
1da177e4
LT
2883 }
2884
1da177e4 2885 err = -ENXIO;
e40526cb 2886 if (unlikely(dev == NULL))
1da177e4 2887 goto out_unlock;
d5e76b0a 2888 err = -ENETDOWN;
e40526cb 2889 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2890 goto out_unlock;
2891
657a0667 2892 sockcm_init(&sockc, sk);
c7d39e32
EJ
2893 sockc.mark = sk->sk_mark;
2894 if (msg->msg_controllen) {
2895 err = sock_cmsg_send(sk, msg, &sockc);
2896 if (unlikely(err))
2897 goto out_unlock;
2898 }
2899
e40526cb
DB
2900 if (sock->type == SOCK_RAW)
2901 reserve = dev->hard_header_len;
bfd5f4a3 2902 if (po->has_vnet_hdr) {
16cc1400
WB
2903 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2904 if (err)
bfd5f4a3 2905 goto out_unlock;
da7c9561 2906 has_vnet_hdr = true;
bfd5f4a3
SS
2907 }
2908
3bdc0eba
BG
2909 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2910 if (!netif_supports_nofcs(dev)) {
2911 err = -EPROTONOSUPPORT;
2912 goto out_unlock;
2913 }
2914 extra_len = 4; /* We're doing our own CRC */
2915 }
2916
1da177e4 2917 err = -EMSGSIZE;
16cc1400
WB
2918 if (!vnet_hdr.gso_type &&
2919 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2920 goto out_unlock;
2921
bfd5f4a3 2922 err = -ENOBUFS;
ae641949
HX
2923 hlen = LL_RESERVED_SPACE(dev);
2924 tlen = dev->needed_tailroom;
57031eb7
WB
2925 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2926 linear = max(linear, min_t(int, len, dev->hard_header_len));
2927 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2928 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2929 if (skb == NULL)
1da177e4
LT
2930 goto out_unlock;
2931
b84bbaf7 2932 skb_reset_network_header(skb);
1da177e4 2933
0c4e8581 2934 err = -EINVAL;
9c707762
WB
2935 if (sock->type == SOCK_DGRAM) {
2936 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2937 if (unlikely(offset < 0))
9c707762 2938 goto out_free;
b84bbaf7 2939 } else if (reserve) {
9aad13b0 2940 skb_reserve(skb, -reserve);
88a8121d
ND
2941 if (len < reserve + sizeof(struct ipv6hdr) &&
2942 dev->min_header_len != dev->hard_header_len)
993675a3 2943 skb_reset_network_header(skb);
9c707762 2944 }
1da177e4
LT
2945
2946 /* Returns -EFAULT on error */
c0371da6 2947 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2948 if (err)
2949 goto out_free;
bf84a010 2950
9ed988cd
WB
2951 if (sock->type == SOCK_RAW &&
2952 !dev_validate_header(dev, skb->data, len)) {
2953 err = -EINVAL;
2954 goto out_free;
2955 }
2956
8f932f76 2957 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2958
16cc1400 2959 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2960 !packet_extra_vlan_len_allowed(dev, skb)) {
2961 err = -EMSGSIZE;
2962 goto out_free;
57f89bfa
BG
2963 }
2964
09effa67
DM
2965 skb->protocol = proto;
2966 skb->dev = dev;
1da177e4 2967 skb->priority = sk->sk_priority;
c7d39e32 2968 skb->mark = sockc.mark;
3d0ba8c0 2969 skb->tstamp = sockc.transmit_time;
0fd5d57b 2970
da7c9561 2971 if (has_vnet_hdr) {
db60eb5f 2972 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2973 if (err)
2974 goto out_free;
2975 len += sizeof(vnet_hdr);
9d2f67e4 2976 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2977 }
2978
75c65772 2979 packet_parse_headers(skb, sock);
8fd6c80d 2980
3bdc0eba
BG
2981 if (unlikely(extra_len == 4))
2982 skb->no_fcs = 1;
2983
d346a3fa 2984 err = po->xmit(skb);
1da177e4
LT
2985 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2986 goto out_unlock;
2987
e40526cb 2988 dev_put(dev);
1da177e4 2989
40d4e3df 2990 return len;
1da177e4
LT
2991
2992out_free:
2993 kfree_skb(skb);
2994out_unlock:
e40526cb 2995 if (dev)
1da177e4
LT
2996 dev_put(dev);
2997out:
2998 return err;
2999}
3000
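/*
 * Userspace sketch (illustrative only): packet_snd() above is where a plain
 * sendto() on a non-ring packet socket ends up. Assuming a SOCK_RAW socket
 * fd, a complete Ethernet frame in frame/frame_len, and an ifindex obtained
 * with if_nametoindex():
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = ifindex,
 *	};
 *
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */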
1b784140 3001static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 3002{
69e3c75f
JB
3003 struct sock *sk = sock->sk;
3004 struct packet_sock *po = pkt_sk(sk);
d346a3fa 3005
69e3c75f
JB
3006 if (po->tx_ring.pg_vec)
3007 return tpacket_snd(po, msg);
3008 else
69e3c75f
JB
3009 return packet_snd(sock, msg, len);
3010}
3011
1da177e4
LT
3012/*
3013 * Close a PACKET socket. This is fairly simple. We immediately go
3014 * to 'closed' state and remove our protocol entry in the device list.
3015 */
3016
3017static int packet_release(struct socket *sock)
3018{
3019 struct sock *sk = sock->sk;
3020 struct packet_sock *po;
2bd624b4 3021 struct packet_fanout *f;
d12d01d6 3022 struct net *net;
f6fb8f10 3023 union tpacket_req_u req_u;
1da177e4
LT
3024
3025 if (!sk)
3026 return 0;
3027
3b1e0a65 3028 net = sock_net(sk);
1da177e4
LT
3029 po = pkt_sk(sk);
3030
0fa7fa98 3031 mutex_lock(&net->packet.sklist_lock);
808f5114 3032 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3033 mutex_unlock(&net->packet.sklist_lock);
3034
3035 preempt_disable();
920de804 3036 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 3037 preempt_enable();
1da177e4 3038
808f5114 3039 spin_lock(&po->bind_lock);
ce06b03e 3040 unregister_prot_hook(sk, false);
66e56cd4
DB
3041 packet_cached_dev_reset(po);
3042
160ff18a
BG
3043 if (po->prot_hook.dev) {
3044 dev_put(po->prot_hook.dev);
3045 po->prot_hook.dev = NULL;
3046 }
808f5114 3047 spin_unlock(&po->bind_lock);
1da177e4 3048
1da177e4 3049 packet_flush_mclist(sk);
1da177e4 3050
5171b37d 3051 lock_sock(sk);
9665d5d6
PS
3052 if (po->rx_ring.pg_vec) {
3053 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3054 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3055 }
69e3c75f 3056
9665d5d6
PS
3057 if (po->tx_ring.pg_vec) {
3058 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3059 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3060 }
5171b37d 3061 release_sock(sk);
1da177e4 3062
2bd624b4 3063 f = fanout_release(sk);
dc99f600 3064
808f5114 3065 synchronize_net();
2bd624b4 3066
afa0925c 3067 kfree(po->rollover);
2bd624b4
AS
3068 if (f) {
3069 fanout_release_data(f);
3070 kfree(f);
3071 }
1da177e4
LT
3072 /*
3073 * Now the socket is dead. No more input will appear.
3074 */
1da177e4
LT
3075 sock_orphan(sk);
3076 sock->sk = NULL;
3077
3078 /* Purge queues */
3079
3080 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3081 packet_free_pending(po);
17ab56a2 3082 sk_refcnt_debug_release(sk);
1da177e4
LT
3083
3084 sock_put(sk);
3085 return 0;
3086}
3087
3088/*
3089 * Attach a packet hook.
3090 */
3091
30f7ea1c
FR
3092static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3093 __be16 proto)
1da177e4
LT
3094{
3095 struct packet_sock *po = pkt_sk(sk);
158cd4af 3096 struct net_device *dev_curr;
902fefb8
DB
3097 __be16 proto_curr;
3098 bool need_rehook;
30f7ea1c
FR
3099 struct net_device *dev = NULL;
3100 int ret = 0;
3101 bool unlisted = false;
dc99f600 3102
1da177e4 3103 lock_sock(sk);
1da177e4 3104 spin_lock(&po->bind_lock);
30f7ea1c
FR
3105 rcu_read_lock();
3106
4971613c
WB
3107 if (po->fanout) {
3108 ret = -EINVAL;
3109 goto out_unlock;
3110 }
3111
30f7ea1c
FR
3112 if (name) {
3113 dev = dev_get_by_name_rcu(sock_net(sk), name);
3114 if (!dev) {
3115 ret = -ENODEV;
3116 goto out_unlock;
3117 }
3118 } else if (ifindex) {
3119 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3120 if (!dev) {
3121 ret = -ENODEV;
3122 goto out_unlock;
3123 }
3124 }
3125
3126 if (dev)
3127 dev_hold(dev);
66e56cd4 3128
902fefb8
DB
3129 proto_curr = po->prot_hook.type;
3130 dev_curr = po->prot_hook.dev;
3131
3132 need_rehook = proto_curr != proto || dev_curr != dev;
3133
3134 if (need_rehook) {
30f7ea1c
FR
3135 if (po->running) {
3136 rcu_read_unlock();
15fe076e
ED
3137 /* prevents packet_notifier() from calling
3138 * register_prot_hook()
3139 */
3140 po->num = 0;
30f7ea1c
FR
3141 __unregister_prot_hook(sk, true);
3142 rcu_read_lock();
3143 dev_curr = po->prot_hook.dev;
3144 if (dev)
3145 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3146 dev->ifindex);
3147 }
1da177e4 3148
15fe076e 3149 BUG_ON(po->running);
902fefb8
DB
3150 po->num = proto;
3151 po->prot_hook.type = proto;
902fefb8 3152
30f7ea1c
FR
3153 if (unlikely(unlisted)) {
3154 dev_put(dev);
3155 po->prot_hook.dev = NULL;
3156 po->ifindex = -1;
3157 packet_cached_dev_reset(po);
3158 } else {
3159 po->prot_hook.dev = dev;
3160 po->ifindex = dev ? dev->ifindex : 0;
3161 packet_cached_dev_assign(po, dev);
3162 }
902fefb8 3163 }
158cd4af
LW
3164 if (dev_curr)
3165 dev_put(dev_curr);
66e56cd4 3166
902fefb8 3167 if (proto == 0 || !need_rehook)
1da177e4
LT
3168 goto out_unlock;
3169
30f7ea1c 3170 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3171 register_prot_hook(sk);
be85d4ad
UT
3172 } else {
3173 sk->sk_err = ENETDOWN;
3174 if (!sock_flag(sk, SOCK_DEAD))
3175 sk->sk_error_report(sk);
1da177e4
LT
3176 }
3177
3178out_unlock:
30f7ea1c 3179 rcu_read_unlock();
1da177e4
LT
3180 spin_unlock(&po->bind_lock);
3181 release_sock(sk);
30f7ea1c 3182 return ret;
1da177e4
LT
3183}
3184
3185/*
3186 * Bind a packet socket to a device
3187 */
3188
40d4e3df
ED
3189static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3190 int addr_len)
1da177e4 3191{
40d4e3df 3192 struct sock *sk = sock->sk;
540e2894 3193 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3194
1da177e4
LT
3195 /*
3196 * Check legality
3197 */
1ce4f28b 3198
8ae55f04 3199 if (addr_len != sizeof(struct sockaddr))
1da177e4 3200 return -EINVAL;
540e2894
AP
3201 /* uaddr->sa_data comes from userspace; it's not guaranteed to be
3202 * zero-terminated.
3203 */
3204 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3205 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3206
30f7ea1c 3207 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3208}
1da177e4
LT
3209
3210static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3211{
40d4e3df
ED
3212 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3213 struct sock *sk = sock->sk;
1da177e4
LT
3214
3215 /*
3216 * Check legality
3217 */
1ce4f28b 3218
1da177e4
LT
3219 if (addr_len < sizeof(struct sockaddr_ll))
3220 return -EINVAL;
3221 if (sll->sll_family != AF_PACKET)
3222 return -EINVAL;
3223
30f7ea1c
FR
3224 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3225 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3226}
3227
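/*
 * Userspace sketch (illustrative only): packet_bind() above is reached by a
 * normal bind() with a struct sockaddr_ll. Binding a SOCK_RAW socket fd to
 * one interface (ifindex from if_nametoindex(), an assumption of this
 * example) for IPv4 traffic only:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = ifindex,
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */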
3228static struct proto packet_proto = {
3229 .name = "PACKET",
3230 .owner = THIS_MODULE,
3231 .obj_size = sizeof(struct packet_sock),
3232};
3233
3234/*
1ce4f28b 3235 * Create a packet socket (SOCK_RAW, SOCK_DGRAM or the legacy SOCK_PACKET).
1da177e4
LT
3236 */
3237
3f378b68
EP
3238static int packet_create(struct net *net, struct socket *sock, int protocol,
3239 int kern)
1da177e4
LT
3240{
3241 struct sock *sk;
3242 struct packet_sock *po;
0e11c91e 3243 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3244 int err;
3245
df008c91 3246 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3247 return -EPERM;
be02097c
DM
3248 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3249 sock->type != SOCK_PACKET)
1da177e4
LT
3250 return -ESOCKTNOSUPPORT;
3251
3252 sock->state = SS_UNCONNECTED;
3253
3254 err = -ENOBUFS;
11aa9c28 3255 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3256 if (sk == NULL)
3257 goto out;
3258
3259 sock->ops = &packet_ops;
1da177e4
LT
3260 if (sock->type == SOCK_PACKET)
3261 sock->ops = &packet_ops_spkt;
be02097c 3262
1da177e4
LT
3263 sock_init_data(sock, sk);
3264
3265 po = pkt_sk(sk);
89ed5b51 3266 init_completion(&po->skb_completion);
1da177e4 3267 sk->sk_family = PF_PACKET;
0e11c91e 3268 po->num = proto;
d346a3fa 3269 po->xmit = dev_queue_xmit;
66e56cd4 3270
b0138408
DB
3271 err = packet_alloc_pending(po);
3272 if (err)
3273 goto out2;
3274
66e56cd4 3275 packet_cached_dev_reset(po);
1da177e4
LT
3276
3277 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3278 sk_refcnt_debug_inc(sk);
1da177e4
LT
3279
3280 /*
3281 * Attach a protocol block
3282 */
3283
3284 spin_lock_init(&po->bind_lock);
905db440 3285 mutex_init(&po->pg_vec_lock);
0648ab70 3286 po->rollover = NULL;
1da177e4 3287 po->prot_hook.func = packet_rcv;
be02097c 3288
1da177e4
LT
3289 if (sock->type == SOCK_PACKET)
3290 po->prot_hook.func = packet_rcv_spkt;
be02097c 3291
1da177e4
LT
3292 po->prot_hook.af_packet_priv = sk;
3293
0e11c91e
AV
3294 if (proto) {
3295 po->prot_hook.type = proto;
a6361f0c 3296 __register_prot_hook(sk);
1da177e4
LT
3297 }
3298
0fa7fa98 3299 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3300 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3301 mutex_unlock(&net->packet.sklist_lock);
3302
3303 preempt_disable();
3680453c 3304 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3305 preempt_enable();
808f5114 3306
40d4e3df 3307 return 0;
b0138408
DB
3308out2:
3309 sk_free(sk);
1da177e4
LT
3310out:
3311 return err;
3312}
3313
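/*
 * Userspace sketch (illustrative only): packet_create() above backs
 * socket(AF_PACKET, ...), which requires CAP_NET_RAW as checked at the top
 * of the function:
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *
 * SOCK_RAW delivers and expects frames with the link-layer header in place;
 * SOCK_DGRAM is "cooked" mode, where the kernel strips or builds that header.
 */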
3314/*
3315 * Pull a packet from our receive queue and hand it to the user.
3316 * If necessary we block.
3317 */
3318
1b784140
YX
3319static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3320 int flags)
1da177e4
LT
3321{
3322 struct sock *sk = sock->sk;
3323 struct sk_buff *skb;
3324 int copied, err;
bfd5f4a3 3325 int vnet_hdr_len = 0;
2472d761 3326 unsigned int origlen = 0;
1da177e4
LT
3327
3328 err = -EINVAL;
ed85b565 3329 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3330 goto out;
3331
3332#if 0
3333 /* What error should we return now? EUNATTACH? */
3334 if (pkt_sk(sk)->ifindex < 0)
3335 return -ENODEV;
3336#endif
3337
ed85b565 3338 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3339 err = sock_recv_errqueue(sk, msg, len,
3340 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3341 goto out;
3342 }
3343
1da177e4
LT
3344 /*
3345 * Call the generic datagram receiver. This handles all sorts
3346 * of horrible races and re-entrancy so we can forget about it
3347 * in the protocol layers.
3348 *
3349 * Now it will return ENETDOWN if the device has just gone down,
3350 * but then it will block.
3351 */
3352
40d4e3df 3353 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3354
3355 /*
1ce4f28b 3356 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3357 * handles the blocking, we don't need to see or worry about
3358 * blocking retries.
3359 */
3360
8ae55f04 3361 if (skb == NULL)
1da177e4
LT
3362 goto out;
3363
9bb6cd65 3364 packet_rcv_try_clear_pressure(pkt_sk(sk));
2ccdbaa6 3365
bfd5f4a3 3366 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3367 err = packet_rcv_vnet(msg, skb, &len);
3368 if (err)
bfd5f4a3 3369 goto out_free;
16cc1400 3370 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3371 }
3372
f3d33426
HFS
3373 /* You lose any data beyond the buffer you gave. If it worries
3374 * a user program, it can ask the device for its MTU
3375 * anyway.
1da177e4 3376 */
1da177e4 3377 copied = skb->len;
40d4e3df
ED
3378 if (copied > len) {
3379 copied = len;
3380 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3381 }
3382
51f3d02b 3383 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3384 if (err)
3385 goto out_free;
3386
2472d761
EB
3387 if (sock->type != SOCK_PACKET) {
3388 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3389
3390 /* Original length was stored in sockaddr_ll fields */
3391 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3392 sll->sll_family = AF_PACKET;
3393 sll->sll_protocol = skb->protocol;
3394 }
3395
3b885787 3396 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3397
f3d33426 3398 if (msg->msg_name) {
b2cf86e1
WB
3399 int copy_len;
3400
f3d33426
HFS
3401 /* If the address length field is there to be filled
3402 * in, we fill it in now.
3403 */
3404 if (sock->type == SOCK_PACKET) {
342dfc30 3405 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3406 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3407 copy_len = msg->msg_namelen;
f3d33426
HFS
3408 } else {
3409 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3410
f3d33426
HFS
3411 msg->msg_namelen = sll->sll_halen +
3412 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3413 copy_len = msg->msg_namelen;
3414 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3415 memset(msg->msg_name +
3416 offsetof(struct sockaddr_ll, sll_addr),
3417 0, sizeof(sll->sll_addr));
3418 msg->msg_namelen = sizeof(struct sockaddr_ll);
3419 }
f3d33426 3420 }
b2cf86e1 3421 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3422 }
1da177e4 3423
8dc41944 3424 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3425 struct tpacket_auxdata aux;
3426
3427 aux.tp_status = TP_STATUS_USER;
3428 if (skb->ip_summed == CHECKSUM_PARTIAL)
3429 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3430 else if (skb->pkt_type != PACKET_OUTGOING &&
3431 (skb->ip_summed == CHECKSUM_COMPLETE ||
3432 skb_csum_unnecessary(skb)))
3433 aux.tp_status |= TP_STATUS_CSUM_VALID;
3434
2472d761 3435 aux.tp_len = origlen;
ffbc6111
HX
3436 aux.tp_snaplen = skb->len;
3437 aux.tp_mac = 0;
bbe735e4 3438 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3439 if (skb_vlan_tag_present(skb)) {
3440 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3441 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3442 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3443 } else {
3444 aux.tp_vlan_tci = 0;
a0cdfcf3 3445 aux.tp_vlan_tpid = 0;
a3bcc23e 3446 }
ffbc6111 3447 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3448 }
3449
1da177e4
LT
3450 /*
3451 * Free or return the buffer as appropriate. Again this
3452 * hides all the races and re-entrancy issues from us.
3453 */
bfd5f4a3 3454 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3455
3456out_free:
3457 skb_free_datagram(sk, skb);
3458out:
3459 return err;
3460}
3461
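/*
 * Userspace sketch (illustrative only): the tpacket_auxdata control message
 * built above is only delivered when PACKET_AUXDATA is enabled and recvmsg()
 * provides control-message space. Assuming a packet socket fd and a frame[]
 * receive buffer:
 *
 *	int on = 1;
 *	unsigned int vlan_tci = 0;
 *	union {
 *		struct cmsghdr align;
 *		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	} ctl;
 *	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = ctl.buf,
 *			      .msg_controllen = sizeof(ctl.buf) };
 *	struct cmsghdr *c;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *		if (c->cmsg_level == SOL_PACKET &&
 *		    c->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *			vlan_tci = aux->tp_vlan_tci;
 *		}
 *	}
 *
 * aux->tp_status then carries the TP_STATUS_CSUM* and VLAN-valid flags set
 * in packet_recvmsg() above.
 */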
1da177e4 3462static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3463 int peer)
1da177e4
LT
3464{
3465 struct net_device *dev;
3466 struct sock *sk = sock->sk;
3467
3468 if (peer)
3469 return -EOPNOTSUPP;
3470
3471 uaddr->sa_family = AF_PACKET;
2dc85bf3 3472 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3473 rcu_read_lock();
3474 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3475 if (dev)
2dc85bf3 3476 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3477 rcu_read_unlock();
1da177e4 3478
9b2c45d4 3479 return sizeof(*uaddr);
1da177e4 3480}
1da177e4
LT
3481
3482static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3483 int peer)
1da177e4
LT
3484{
3485 struct net_device *dev;
3486 struct sock *sk = sock->sk;
3487 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3488 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3489
3490 if (peer)
3491 return -EOPNOTSUPP;
3492
3493 sll->sll_family = AF_PACKET;
3494 sll->sll_ifindex = po->ifindex;
3495 sll->sll_protocol = po->num;
67286640 3496 sll->sll_pkttype = 0;
654d1f8a
ED
3497 rcu_read_lock();
3498 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3499 if (dev) {
3500 sll->sll_hatype = dev->type;
3501 sll->sll_halen = dev->addr_len;
3502 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3503 } else {
3504 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3505 sll->sll_halen = 0;
3506 }
654d1f8a 3507 rcu_read_unlock();
1da177e4 3508
9b2c45d4 3509 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3510}
3511
2aeb0b88
WC
3512static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3513 int what)
1da177e4
LT
3514{
3515 switch (i->type) {
3516 case PACKET_MR_MULTICAST:
1162563f
JP
3517 if (i->alen != dev->addr_len)
3518 return -EINVAL;
1da177e4 3519 if (what > 0)
22bedad3 3520 return dev_mc_add(dev, i->addr);
1da177e4 3521 else
22bedad3 3522 return dev_mc_del(dev, i->addr);
1da177e4
LT
3523 break;
3524 case PACKET_MR_PROMISC:
2aeb0b88 3525 return dev_set_promiscuity(dev, what);
1da177e4 3526 case PACKET_MR_ALLMULTI:
2aeb0b88 3527 return dev_set_allmulti(dev, what);
d95ed927 3528 case PACKET_MR_UNICAST:
1162563f
JP
3529 if (i->alen != dev->addr_len)
3530 return -EINVAL;
d95ed927 3531 if (what > 0)
a748ee24 3532 return dev_uc_add(dev, i->addr);
d95ed927 3533 else
a748ee24 3534 return dev_uc_del(dev, i->addr);
d95ed927 3535 break;
40d4e3df
ED
3536 default:
3537 break;
1da177e4 3538 }
2aeb0b88 3539 return 0;
1da177e4
LT
3540}
3541
82f17091
FR
3542static void packet_dev_mclist_delete(struct net_device *dev,
3543 struct packet_mclist **mlp)
1da177e4 3544{
82f17091
FR
3545 struct packet_mclist *ml;
3546
3547 while ((ml = *mlp) != NULL) {
3548 if (ml->ifindex == dev->ifindex) {
3549 packet_dev_mc(dev, ml, -1);
3550 *mlp = ml->next;
3551 kfree(ml);
3552 } else
3553 mlp = &ml->next;
1da177e4
LT
3554 }
3555}
3556
0fb375fb 3557static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3558{
3559 struct packet_sock *po = pkt_sk(sk);
3560 struct packet_mclist *ml, *i;
3561 struct net_device *dev;
3562 int err;
3563
3564 rtnl_lock();
3565
3566 err = -ENODEV;
3b1e0a65 3567 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3568 if (!dev)
3569 goto done;
3570
3571 err = -EINVAL;
1162563f 3572 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3573 goto done;
3574
3575 err = -ENOBUFS;
8b3a7005 3576 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3577 if (i == NULL)
3578 goto done;
3579
3580 err = 0;
3581 for (ml = po->mclist; ml; ml = ml->next) {
3582 if (ml->ifindex == mreq->mr_ifindex &&
3583 ml->type == mreq->mr_type &&
3584 ml->alen == mreq->mr_alen &&
3585 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3586 ml->count++;
3587 /* Free the new element ... */
3588 kfree(i);
3589 goto done;
3590 }
3591 }
3592
3593 i->type = mreq->mr_type;
3594 i->ifindex = mreq->mr_ifindex;
3595 i->alen = mreq->mr_alen;
3596 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3597 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3598 i->count = 1;
3599 i->next = po->mclist;
3600 po->mclist = i;
2aeb0b88
WC
3601 err = packet_dev_mc(dev, i, 1);
3602 if (err) {
3603 po->mclist = i->next;
3604 kfree(i);
3605 }
1da177e4
LT
3606
3607done:
3608 rtnl_unlock();
3609 return err;
3610}
3611
0fb375fb 3612static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3613{
3614 struct packet_mclist *ml, **mlp;
3615
3616 rtnl_lock();
3617
3618 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3619 if (ml->ifindex == mreq->mr_ifindex &&
3620 ml->type == mreq->mr_type &&
3621 ml->alen == mreq->mr_alen &&
3622 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3623 if (--ml->count == 0) {
3624 struct net_device *dev;
3625 *mlp = ml->next;
ad959e76
ED
3626 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3627 if (dev)
1da177e4 3628 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3629 kfree(ml);
3630 }
82f17091 3631 break;
1da177e4
LT
3632 }
3633 }
3634 rtnl_unlock();
82f17091 3635 return 0;
1da177e4
LT
3636}
3637
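/*
 * Editor's illustrative sketch, not part of af_packet.c: the userspace side
 * of packet_mc_add()/packet_mc_drop() above. PACKET_MR_PROMISC is used
 * here; the helper name and the if_nametoindex() lookup are assumptions
 * for the example.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>

static int set_promisc(int fd, const char *ifname, int on)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type = PACKET_MR_PROMISC;
	/* The kernel refcounts matching entries in po->mclist, so each
	 * ADD should eventually be paired with a DROP. */
	return setsockopt(fd, SOL_PACKET,
			  on ? PACKET_ADD_MEMBERSHIP : PACKET_DROP_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}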
3638static void packet_flush_mclist(struct sock *sk)
3639{
3640 struct packet_sock *po = pkt_sk(sk);
3641 struct packet_mclist *ml;
3642
3643 if (!po->mclist)
3644 return;
3645
3646 rtnl_lock();
3647 while ((ml = po->mclist) != NULL) {
3648 struct net_device *dev;
3649
3650 po->mclist = ml->next;
ad959e76
ED
3651 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3652 if (dev != NULL)
1da177e4 3653 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3654 kfree(ml);
3655 }
3656 rtnl_unlock();
3657}
1da177e4
LT
3658
3659static int
a7b75c5a
CH
3660packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3661 unsigned int optlen)
1da177e4
LT
3662{
3663 struct sock *sk = sock->sk;
8dc41944 3664 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3665 int ret;
3666
3667 if (level != SOL_PACKET)
3668 return -ENOPROTOOPT;
3669
69e3c75f 3670 switch (optname) {
1ce4f28b 3671 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3672 case PACKET_DROP_MEMBERSHIP:
3673 {
0fb375fb
EB
3674 struct packet_mreq_max mreq;
3675 int len = optlen;
3676 memset(&mreq, 0, sizeof(mreq));
3677 if (len < sizeof(struct packet_mreq))
1da177e4 3678 return -EINVAL;
0fb375fb
EB
3679 if (len > sizeof(mreq))
3680 len = sizeof(mreq);
a7b75c5a 3681 if (copy_from_sockptr(&mreq, optval, len))
1da177e4 3682 return -EFAULT;
0fb375fb
EB
3683 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3684 return -EINVAL;
1da177e4
LT
3685 if (optname == PACKET_ADD_MEMBERSHIP)
3686 ret = packet_mc_add(sk, &mreq);
3687 else
3688 ret = packet_mc_drop(sk, &mreq);
3689 return ret;
3690 }
a2efcfa0 3691
1da177e4 3692 case PACKET_RX_RING:
69e3c75f 3693 case PACKET_TX_RING:
1da177e4 3694 {
f6fb8f10 3695 union tpacket_req_u req_u;
3696 int len;
1da177e4 3697
5171b37d 3698 lock_sock(sk);
f6fb8f10 3699 switch (po->tp_version) {
3700 case TPACKET_V1:
3701 case TPACKET_V2:
3702 len = sizeof(req_u.req);
3703 break;
3704 case TPACKET_V3:
3705 default:
3706 len = sizeof(req_u.req3);
3707 break;
3708 }
5171b37d
ED
3709 if (optlen < len) {
3710 ret = -EINVAL;
3711 } else {
a7b75c5a 3712 if (copy_from_sockptr(&req_u.req, optval, len))
5171b37d
ED
3713 ret = -EFAULT;
3714 else
3715 ret = packet_set_ring(sk, &req_u, 0,
3716 optname == PACKET_TX_RING);
3717 }
3718 release_sock(sk);
3719 return ret;
1da177e4
LT
3720 }
3721 case PACKET_COPY_THRESH:
3722 {
3723 int val;
3724
40d4e3df 3725 if (optlen != sizeof(val))
1da177e4 3726 return -EINVAL;
a7b75c5a 3727 if (copy_from_sockptr(&val, optval, sizeof(val)))
1da177e4
LT
3728 return -EFAULT;
3729
3730 pkt_sk(sk)->copy_thresh = val;
3731 return 0;
3732 }
bbd6ef87
PM
3733 case PACKET_VERSION:
3734 {
3735 int val;
3736
3737 if (optlen != sizeof(val))
3738 return -EINVAL;
a7b75c5a 3739 if (copy_from_sockptr(&val, optval, sizeof(val)))
bbd6ef87
PM
3740 return -EFAULT;
3741 switch (val) {
3742 case TPACKET_V1:
3743 case TPACKET_V2:
f6fb8f10 3744 case TPACKET_V3:
84ac7260 3745 break;
bbd6ef87
PM
3746 default:
3747 return -EINVAL;
3748 }
84ac7260
PP
3749 lock_sock(sk);
3750 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3751 ret = -EBUSY;
3752 } else {
3753 po->tp_version = val;
3754 ret = 0;
3755 }
3756 release_sock(sk);
3757 return ret;
bbd6ef87 3758 }
8913336a
PM
3759 case PACKET_RESERVE:
3760 {
3761 unsigned int val;
3762
3763 if (optlen != sizeof(val))
3764 return -EINVAL;
a7b75c5a 3765 if (copy_from_sockptr(&val, optval, sizeof(val)))
8913336a 3766 return -EFAULT;
bcc5364b
AK
3767 if (val > INT_MAX)
3768 return -EINVAL;
c27927e3
WB
3769 lock_sock(sk);
3770 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3771 ret = -EBUSY;
3772 } else {
3773 po->tp_reserve = val;
3774 ret = 0;
3775 }
3776 release_sock(sk);
3777 return ret;
8913336a 3778 }
69e3c75f
JB
3779 case PACKET_LOSS:
3780 {
3781 unsigned int val;
3782
3783 if (optlen != sizeof(val))
3784 return -EINVAL;
a7b75c5a 3785 if (copy_from_sockptr(&val, optval, sizeof(val)))
69e3c75f 3786 return -EFAULT;
a6361f0c
WB
3787
3788 lock_sock(sk);
3789 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3790 ret = -EBUSY;
3791 } else {
3792 po->tp_loss = !!val;
3793 ret = 0;
3794 }
3795 release_sock(sk);
3796 return ret;
69e3c75f 3797 }
8dc41944
HX
3798 case PACKET_AUXDATA:
3799 {
3800 int val;
3801
3802 if (optlen < sizeof(val))
3803 return -EINVAL;
a7b75c5a 3804 if (copy_from_sockptr(&val, optval, sizeof(val)))
8dc41944
HX
3805 return -EFAULT;
3806
a6361f0c 3807 lock_sock(sk);
8dc41944 3808 po->auxdata = !!val;
a6361f0c 3809 release_sock(sk);
8dc41944
HX
3810 return 0;
3811 }
80feaacb
PWJ
3812 case PACKET_ORIGDEV:
3813 {
3814 int val;
3815
3816 if (optlen < sizeof(val))
3817 return -EINVAL;
a7b75c5a 3818 if (copy_from_sockptr(&val, optval, sizeof(val)))
80feaacb
PWJ
3819 return -EFAULT;
3820
a6361f0c 3821 lock_sock(sk);
80feaacb 3822 po->origdev = !!val;
a6361f0c 3823 release_sock(sk);
80feaacb
PWJ
3824 return 0;
3825 }
bfd5f4a3
SS
3826 case PACKET_VNET_HDR:
3827 {
3828 int val;
3829
3830 if (sock->type != SOCK_RAW)
3831 return -EINVAL;
bfd5f4a3
SS
3832 if (optlen < sizeof(val))
3833 return -EINVAL;
a7b75c5a 3834 if (copy_from_sockptr(&val, optval, sizeof(val)))
bfd5f4a3
SS
3835 return -EFAULT;
3836
a6361f0c
WB
3837 lock_sock(sk);
3838 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3839 ret = -EBUSY;
3840 } else {
3841 po->has_vnet_hdr = !!val;
3842 ret = 0;
3843 }
3844 release_sock(sk);
3845 return ret;
bfd5f4a3 3846 }
614f60fa
SM
3847 case PACKET_TIMESTAMP:
3848 {
3849 int val;
3850
3851 if (optlen != sizeof(val))
3852 return -EINVAL;
a7b75c5a 3853 if (copy_from_sockptr(&val, optval, sizeof(val)))
614f60fa
SM
3854 return -EFAULT;
3855
3856 po->tp_tstamp = val;
3857 return 0;
3858 }
dc99f600
DM
3859 case PACKET_FANOUT:
3860 {
3861 int val;
3862
3863 if (optlen != sizeof(val))
3864 return -EINVAL;
a7b75c5a 3865 if (copy_from_sockptr(&val, optval, sizeof(val)))
dc99f600
DM
3866 return -EFAULT;
3867
3868 return fanout_add(sk, val & 0xffff, val >> 16);
3869 }
47dceb8e
WB
3870 case PACKET_FANOUT_DATA:
3871 {
3872 if (!po->fanout)
3873 return -EINVAL;
3874
3875 return fanout_set_data(po, optval, optlen);
3876 }
fa788d98
VW
3877 case PACKET_IGNORE_OUTGOING:
3878 {
3879 int val;
3880
3881 if (optlen != sizeof(val))
3882 return -EINVAL;
a7b75c5a 3883 if (copy_from_sockptr(&val, optval, sizeof(val)))
fa788d98
VW
3884 return -EFAULT;
3885 if (val < 0 || val > 1)
3886 return -EINVAL;
3887
3888 po->prot_hook.ignore_outgoing = !!val;
3889 return 0;
3890 }
5920cd3a
PC
3891 case PACKET_TX_HAS_OFF:
3892 {
3893 unsigned int val;
3894
3895 if (optlen != sizeof(val))
3896 return -EINVAL;
a7b75c5a 3897 if (copy_from_sockptr(&val, optval, sizeof(val)))
5920cd3a 3898 return -EFAULT;
a6361f0c
WB
3899
3900 lock_sock(sk);
3901 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3902 ret = -EBUSY;
3903 } else {
3904 po->tp_tx_has_off = !!val;
3905 ret = 0;
3906 }
3907 release_sock(sk);
5920cd3a
PC
 3908 		return ret;
3909 }
d346a3fa
DB
3910 case PACKET_QDISC_BYPASS:
3911 {
3912 int val;
3913
3914 if (optlen != sizeof(val))
3915 return -EINVAL;
a7b75c5a 3916 if (copy_from_sockptr(&val, optval, sizeof(val)))
d346a3fa
DB
3917 return -EFAULT;
3918
3919 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3920 return 0;
3921 }
1da177e4
LT
3922 default:
3923 return -ENOPROTOOPT;
3924 }
3925}
3926
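/*
 * Editor's illustrative sketch, not part of af_packet.c: a minimal userspace
 * sequence exercising the PACKET_VERSION and PACKET_RX_RING cases of
 * packet_setsockopt() above. The ring geometry chosen here is an arbitrary
 * assumption (it merely satisfies the sanity checks in packet_set_ring()).
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring_v2(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	/* The version must be chosen before any ring exists; the kernel
	 * returns -EBUSY once an rx_ring/tx_ring pg_vec is allocated. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* must be PAGE_ALIGNED */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 2048;	/* multiple of TPACKET_ALIGNMENT */
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) *
			    req.tp_block_nr;

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}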
3927static int packet_getsockopt(struct socket *sock, int level, int optname,
3928 char __user *optval, int __user *optlen)
3929{
3930 int len;
c06fff6e 3931 int val, lv = sizeof(val);
1da177e4
LT
3932 struct sock *sk = sock->sk;
3933 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3934 void *data = &val;
ee80fbf3 3935 union tpacket_stats_u st;
a9b63918 3936 struct tpacket_rollover_stats rstats;
8e8e2951 3937 int drops;
1da177e4
LT
3938
3939 if (level != SOL_PACKET)
3940 return -ENOPROTOOPT;
3941
8ae55f04
KK
3942 if (get_user(len, optlen))
3943 return -EFAULT;
1da177e4
LT
3944
3945 if (len < 0)
3946 return -EINVAL;
1ce4f28b 3947
69e3c75f 3948 switch (optname) {
1da177e4 3949 case PACKET_STATISTICS:
1da177e4 3950 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3951 memcpy(&st, &po->stats, sizeof(st));
3952 memset(&po->stats, 0, sizeof(po->stats));
3953 spin_unlock_bh(&sk->sk_receive_queue.lock);
8e8e2951 3954 drops = atomic_xchg(&po->tp_drops, 0);
ee80fbf3 3955
f6fb8f10 3956 if (po->tp_version == TPACKET_V3) {
c06fff6e 3957 lv = sizeof(struct tpacket_stats_v3);
8e8e2951
ED
3958 st.stats3.tp_drops = drops;
3959 st.stats3.tp_packets += drops;
ee80fbf3 3960 data = &st.stats3;
f6fb8f10 3961 } else {
c06fff6e 3962 lv = sizeof(struct tpacket_stats);
8e8e2951
ED
3963 st.stats1.tp_drops = drops;
3964 st.stats1.tp_packets += drops;
ee80fbf3 3965 data = &st.stats1;
f6fb8f10 3966 }
ee80fbf3 3967
8dc41944
HX
3968 break;
3969 case PACKET_AUXDATA:
8dc41944 3970 val = po->auxdata;
80feaacb
PWJ
3971 break;
3972 case PACKET_ORIGDEV:
80feaacb 3973 val = po->origdev;
bfd5f4a3
SS
3974 break;
3975 case PACKET_VNET_HDR:
bfd5f4a3 3976 val = po->has_vnet_hdr;
1da177e4 3977 break;
bbd6ef87 3978 case PACKET_VERSION:
bbd6ef87 3979 val = po->tp_version;
bbd6ef87
PM
3980 break;
3981 case PACKET_HDRLEN:
3982 if (len > sizeof(int))
3983 len = sizeof(int);
fd2c83b3
AP
3984 if (len < sizeof(int))
3985 return -EINVAL;
bbd6ef87
PM
3986 if (copy_from_user(&val, optval, len))
3987 return -EFAULT;
3988 switch (val) {
3989 case TPACKET_V1:
3990 val = sizeof(struct tpacket_hdr);
3991 break;
3992 case TPACKET_V2:
3993 val = sizeof(struct tpacket2_hdr);
3994 break;
f6fb8f10 3995 case TPACKET_V3:
3996 val = sizeof(struct tpacket3_hdr);
3997 break;
bbd6ef87
PM
3998 default:
3999 return -EINVAL;
4000 }
bbd6ef87 4001 break;
8913336a 4002 case PACKET_RESERVE:
8913336a 4003 val = po->tp_reserve;
8913336a 4004 break;
69e3c75f 4005 case PACKET_LOSS:
69e3c75f 4006 val = po->tp_loss;
69e3c75f 4007 break;
614f60fa 4008 case PACKET_TIMESTAMP:
614f60fa 4009 val = po->tp_tstamp;
614f60fa 4010 break;
dc99f600 4011 case PACKET_FANOUT:
dc99f600
DM
4012 val = (po->fanout ?
4013 ((u32)po->fanout->id |
77f65ebd
WB
4014 ((u32)po->fanout->type << 16) |
4015 ((u32)po->fanout->flags << 24)) :
dc99f600 4016 0);
dc99f600 4017 break;
fa788d98
VW
4018 case PACKET_IGNORE_OUTGOING:
4019 val = po->prot_hook.ignore_outgoing;
4020 break;
a9b63918 4021 case PACKET_ROLLOVER_STATS:
57f015f5 4022 if (!po->rollover)
a9b63918 4023 return -EINVAL;
57f015f5
MM
4024 rstats.tp_all = atomic_long_read(&po->rollover->num);
4025 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4026 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4027 data = &rstats;
4028 lv = sizeof(rstats);
a9b63918 4029 break;
5920cd3a
PC
4030 case PACKET_TX_HAS_OFF:
4031 val = po->tp_tx_has_off;
4032 break;
d346a3fa
DB
4033 case PACKET_QDISC_BYPASS:
4034 val = packet_use_direct_xmit(po);
4035 break;
1da177e4
LT
4036 default:
4037 return -ENOPROTOOPT;
4038 }
4039
c06fff6e
ED
4040 if (len > lv)
4041 len = lv;
8ae55f04
KK
4042 if (put_user(len, optlen))
4043 return -EFAULT;
8dc41944
HX
4044 if (copy_to_user(optval, data, len))
4045 return -EFAULT;
8ae55f04 4046 return 0;
1da177e4
LT
4047}
4048
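/*
 * Editor's illustrative sketch, not part of af_packet.c: reading the
 * PACKET_STATISTICS counters handled by packet_getsockopt() above. This
 * assumes a TPACKET_V1/V2 socket (struct tpacket_stats); a TPACKET_V3
 * socket would pass struct tpacket_stats_v3 instead.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	/* The kernel zeroes its copy on read, so these are deltas since
	 * the previous PACKET_STATISTICS query. */
	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}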
351638e7
JP
4049static int packet_notifier(struct notifier_block *this,
4050 unsigned long msg, void *ptr)
1da177e4
LT
4051{
4052 struct sock *sk;
351638e7 4053 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4054 struct net *net = dev_net(dev);
1da177e4 4055
808f5114 4056 rcu_read_lock();
b67bfe0d 4057 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4058 struct packet_sock *po = pkt_sk(sk);
4059
4060 switch (msg) {
4061 case NETDEV_UNREGISTER:
1da177e4 4062 if (po->mclist)
82f17091 4063 packet_dev_mclist_delete(dev, &po->mclist);
df561f66 4064 fallthrough;
a2efcfa0 4065
1da177e4
LT
4066 case NETDEV_DOWN:
4067 if (dev->ifindex == po->ifindex) {
4068 spin_lock(&po->bind_lock);
4069 if (po->running) {
ce06b03e 4070 __unregister_prot_hook(sk, false);
1da177e4
LT
4071 sk->sk_err = ENETDOWN;
4072 if (!sock_flag(sk, SOCK_DEAD))
4073 sk->sk_error_report(sk);
4074 }
4075 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4076 packet_cached_dev_reset(po);
1da177e4 4077 po->ifindex = -1;
160ff18a
BG
4078 if (po->prot_hook.dev)
4079 dev_put(po->prot_hook.dev);
1da177e4
LT
4080 po->prot_hook.dev = NULL;
4081 }
4082 spin_unlock(&po->bind_lock);
4083 }
4084 break;
4085 case NETDEV_UP:
808f5114 4086 if (dev->ifindex == po->ifindex) {
4087 spin_lock(&po->bind_lock);
ce06b03e
DM
4088 if (po->num)
4089 register_prot_hook(sk);
808f5114 4090 spin_unlock(&po->bind_lock);
1da177e4 4091 }
1da177e4
LT
4092 break;
4093 }
4094 }
808f5114 4095 rcu_read_unlock();
1da177e4
LT
4096 return NOTIFY_DONE;
4097}
4098
4099
4100static int packet_ioctl(struct socket *sock, unsigned int cmd,
4101 unsigned long arg)
4102{
4103 struct sock *sk = sock->sk;
4104
69e3c75f 4105 switch (cmd) {
40d4e3df
ED
4106 case SIOCOUTQ:
4107 {
4108 int amount = sk_wmem_alloc_get(sk);
31e6d363 4109
40d4e3df
ED
4110 return put_user(amount, (int __user *)arg);
4111 }
4112 case SIOCINQ:
4113 {
4114 struct sk_buff *skb;
4115 int amount = 0;
4116
4117 spin_lock_bh(&sk->sk_receive_queue.lock);
4118 skb = skb_peek(&sk->sk_receive_queue);
4119 if (skb)
4120 amount = skb->len;
4121 spin_unlock_bh(&sk->sk_receive_queue.lock);
4122 return put_user(amount, (int __user *)arg);
4123 }
1da177e4 4124#ifdef CONFIG_INET
40d4e3df
ED
4125 case SIOCADDRT:
4126 case SIOCDELRT:
4127 case SIOCDARP:
4128 case SIOCGARP:
4129 case SIOCSARP:
4130 case SIOCGIFADDR:
4131 case SIOCSIFADDR:
4132 case SIOCGIFBRDADDR:
4133 case SIOCSIFBRDADDR:
4134 case SIOCGIFNETMASK:
4135 case SIOCSIFNETMASK:
4136 case SIOCGIFDSTADDR:
4137 case SIOCSIFDSTADDR:
4138 case SIOCSIFFLAGS:
40d4e3df 4139 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4140#endif
4141
40d4e3df
ED
4142 default:
4143 return -ENOIOCTLCMD;
1da177e4
LT
4144 }
4145 return 0;
4146}
4147
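/*
 * Editor's illustrative sketch, not part of af_packet.c: the SIOCINQ and
 * SIOCOUTQ branches of packet_ioctl() above, as seen from userspace.
 * queue_levels() is a hypothetical helper for this example only.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void queue_levels(int fd)
{
	int inq = 0, outq = 0;

	/* SIOCINQ reports the length of the skb at the head of the receive
	 * queue; SIOCOUTQ reports bytes still charged to sk_wmem_alloc. */
	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("inq=%d outq=%d\n", inq, outq);
}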
a11e1d43
LT
4148static __poll_t packet_poll(struct file *file, struct socket *sock,
4149 poll_table *wait)
1da177e4
LT
4150{
4151 struct sock *sk = sock->sk;
4152 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4153 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4154
4155 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4156 if (po->rx_ring.pg_vec) {
f6fb8f10 4157 if (!packet_previous_rx_frame(po, &po->rx_ring,
4158 TP_STATUS_KERNEL))
a9a08845 4159 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4160 }
9bb6cd65 4161 packet_rcv_try_clear_pressure(po);
1da177e4 4162 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4163 spin_lock_bh(&sk->sk_write_queue.lock);
4164 if (po->tx_ring.pg_vec) {
4165 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4166 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4167 }
4168 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4169 return mask;
4170}
4171
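/*
 * Editor's illustrative sketch, not part of af_packet.c: how an RX-ring
 * consumer typically blocks in poll() until packet_poll() above reports
 * EPOLLIN for a filled frame. The TPACKET_V2 frame header is assumed,
 * matching the setup_rx_ring_v2() sketch earlier.
 */
#include <poll.h>
#include <linux/if_packet.h>

static void wait_for_frame(int fd, volatile struct tpacket2_hdr *hdr)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	/* TP_STATUS_USER is set by tpacket_rcv() once the frame belongs
	 * to userspace; until then, sleep in poll(). */
	while (!(hdr->tp_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);
}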
4172
 4173/* Dirty? Well, I still have not found a better way to account
4174 * for user mmaps.
4175 */
4176
4177static void packet_mm_open(struct vm_area_struct *vma)
4178{
4179 struct file *file = vma->vm_file;
40d4e3df 4180 struct socket *sock = file->private_data;
1da177e4 4181 struct sock *sk = sock->sk;
1ce4f28b 4182
1da177e4
LT
4183 if (sk)
4184 atomic_inc(&pkt_sk(sk)->mapped);
4185}
4186
4187static void packet_mm_close(struct vm_area_struct *vma)
4188{
4189 struct file *file = vma->vm_file;
40d4e3df 4190 struct socket *sock = file->private_data;
1da177e4 4191 struct sock *sk = sock->sk;
1ce4f28b 4192
1da177e4
LT
4193 if (sk)
4194 atomic_dec(&pkt_sk(sk)->mapped);
4195}
4196
f0f37e2f 4197static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4198 .open = packet_mm_open,
4199 .close = packet_mm_close,
1da177e4
LT
4200};
4201
3a7ad063
ED
4202static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4203 unsigned int len)
1da177e4
LT
4204{
4205 int i;
4206
4ebf0ae2 4207 for (i = 0; i < len; i++) {
0e3125c7 4208 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4209 if (is_vmalloc_addr(pg_vec[i].buffer))
4210 vfree(pg_vec[i].buffer);
4211 else
4212 free_pages((unsigned long)pg_vec[i].buffer,
4213 order);
0e3125c7
NH
4214 pg_vec[i].buffer = NULL;
4215 }
1da177e4
LT
4216 }
4217 kfree(pg_vec);
4218}
4219
3a7ad063 4220static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4221{
f0d4eb29 4222 char *buffer;
3a7ad063
ED
4223 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4224 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4225
3a7ad063 4226 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4227 if (buffer)
4228 return buffer;
4229
3a7ad063
ED
4230 /* __get_free_pages failed, fall back to vmalloc */
4231 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4232 if (buffer)
4233 return buffer;
0e3125c7 4234
3a7ad063
ED
 4235 	/* vmalloc failed, let's dig into swap here */
4236 gfp_flags &= ~__GFP_NORETRY;
4237 buffer = (char *) __get_free_pages(gfp_flags, order);
4238 if (buffer)
4239 return buffer;
4240
4241 /* complete and utter failure */
4242 return NULL;
4ebf0ae2
DM
4243}
4244
3a7ad063 4245static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4246{
4247 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4248 struct pgv *pg_vec;
4ebf0ae2
DM
4249 int i;
4250
398f0132 4251 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4252 if (unlikely(!pg_vec))
4253 goto out;
4254
4255 for (i = 0; i < block_nr; i++) {
3a7ad063 4256 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4257 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4258 goto out_free_pgvec;
4259 }
4260
4261out:
4262 return pg_vec;
4263
4264out_free_pgvec:
3a7ad063 4265 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4266 pg_vec = NULL;
4267 goto out;
4268}
1da177e4 4269
f6fb8f10 4270static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4271 int closing, int tx_ring)
1da177e4 4272{
0e3125c7 4273 struct pgv *pg_vec = NULL;
1da177e4 4274 struct packet_sock *po = pkt_sk(sk);
61fad681 4275 unsigned long *rx_owner_map = NULL;
3a7ad063 4276 int was_running, order = 0;
69e3c75f
JB
4277 struct packet_ring_buffer *rb;
4278 struct sk_buff_head *rb_queue;
0e11c91e 4279 __be16 num;
2a6d6c31 4280 int err;
f6fb8f10 4281 	/* Added to keep code churn minimal */
4282 struct tpacket_req *req = &req_u->req;
4283
69e3c75f
JB
4284 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4285 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4286
69e3c75f
JB
4287 err = -EBUSY;
4288 if (!closing) {
4289 if (atomic_read(&po->mapped))
4290 goto out;
b0138408 4291 if (packet_read_pending(rb))
69e3c75f
JB
4292 goto out;
4293 }
1da177e4 4294
69e3c75f 4295 if (req->tp_block_nr) {
4576cd46
WB
4296 unsigned int min_frame_size;
4297
69e3c75f
JB
4298 /* Sanity tests and some calculations */
4299 err = -EBUSY;
4300 if (unlikely(rb->pg_vec))
4301 goto out;
1da177e4 4302
bbd6ef87
PM
4303 switch (po->tp_version) {
4304 case TPACKET_V1:
4305 po->tp_hdrlen = TPACKET_HDRLEN;
4306 break;
4307 case TPACKET_V2:
4308 po->tp_hdrlen = TPACKET2_HDRLEN;
4309 break;
f6fb8f10 4310 case TPACKET_V3:
4311 po->tp_hdrlen = TPACKET3_HDRLEN;
4312 break;
bbd6ef87
PM
4313 }
4314
69e3c75f 4315 err = -EINVAL;
4ebf0ae2 4316 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4317 goto out;
90836b67 4318 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4319 goto out;
4576cd46 4320 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4321 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4322 req->tp_block_size <
4323 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4324 goto out;
4576cd46 4325 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4326 goto out;
4ebf0ae2 4327 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4328 goto out;
1da177e4 4329
4194b491
TK
4330 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4331 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4332 goto out;
fc62814d 4333 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4334 goto out;
69e3c75f
JB
4335 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4336 req->tp_frame_nr))
4337 goto out;
1da177e4
LT
4338
4339 err = -ENOMEM;
3a7ad063
ED
4340 order = get_order(req->tp_block_size);
4341 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4342 if (unlikely(!pg_vec))
1da177e4 4343 goto out;
f6fb8f10 4344 switch (po->tp_version) {
4345 case TPACKET_V3:
7f953ab2
SV
4346 /* Block transmit is not supported yet */
4347 if (!tx_ring) {
e8e85cc5 4348 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4349 } else {
4350 struct tpacket_req3 *req3 = &req_u->req3;
4351
4352 if (req3->tp_retire_blk_tov ||
4353 req3->tp_sizeof_priv ||
4354 req3->tp_feature_req_word) {
4355 err = -EINVAL;
55655e3d 4356 goto out_free_pg_vec;
7f953ab2
SV
4357 }
4358 }
d7cf0c34 4359 break;
f6fb8f10 4360 default:
61fad681
WB
4361 if (!tx_ring) {
4362 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4363 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4364 if (!rx_owner_map)
4365 goto out_free_pg_vec;
4366 }
f6fb8f10 4367 break;
4368 }
69e3c75f
JB
4369 }
4370 /* Done */
4371 else {
4372 err = -EINVAL;
4ebf0ae2 4373 if (unlikely(req->tp_frame_nr))
69e3c75f 4374 goto out;
1da177e4
LT
4375 }
4376
1da177e4
LT
4377
4378 /* Detach socket from network */
4379 spin_lock(&po->bind_lock);
4380 was_running = po->running;
4381 num = po->num;
4382 if (was_running) {
1da177e4 4383 po->num = 0;
ce06b03e 4384 __unregister_prot_hook(sk, false);
1da177e4
LT
4385 }
4386 spin_unlock(&po->bind_lock);
1ce4f28b 4387
1da177e4
LT
4388 synchronize_net();
4389
4390 err = -EBUSY;
905db440 4391 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4392 if (closing || atomic_read(&po->mapped) == 0) {
4393 err = 0;
69e3c75f 4394 spin_lock_bh(&rb_queue->lock);
c053fd96 4395 swap(rb->pg_vec, pg_vec);
61fad681
WB
4396 if (po->tp_version <= TPACKET_V2)
4397 swap(rb->rx_owner_map, rx_owner_map);
69e3c75f
JB
4398 rb->frame_max = (req->tp_frame_nr - 1);
4399 rb->head = 0;
4400 rb->frame_size = req->tp_frame_size;
4401 spin_unlock_bh(&rb_queue->lock);
4402
3a7ad063 4403 swap(rb->pg_vec_order, order);
c053fd96 4404 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4405
4406 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4407 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4408 tpacket_rcv : packet_rcv;
4409 skb_queue_purge(rb_queue);
1da177e4 4410 if (atomic_read(&po->mapped))
40d4e3df
ED
4411 pr_err("packet_mmap: vma is busy: %d\n",
4412 atomic_read(&po->mapped));
1da177e4 4413 }
905db440 4414 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4415
4416 spin_lock(&po->bind_lock);
ce06b03e 4417 if (was_running) {
1da177e4 4418 po->num = num;
ce06b03e 4419 register_prot_hook(sk);
1da177e4
LT
4420 }
4421 spin_unlock(&po->bind_lock);
c800aaf8 4422 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4423 /* Because we don't support block-based V3 on tx-ring */
4424 if (!tx_ring)
73d0fcf2 4425 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4426 }
1da177e4 4427
55655e3d 4428out_free_pg_vec:
61fad681 4429 bitmap_free(rx_owner_map);
1da177e4 4430 if (pg_vec)
3a7ad063 4431 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4432out:
4433 return err;
4434}
4435
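/*
 * Editor's illustrative sketch, not part of af_packet.c: a tpacket_req3
 * filled to satisfy the TPACKET_V3 checks in packet_set_ring() above
 * (block size page-aligned and large enough for BLK_PLUS_PRIV(tp_sizeof_priv)
 * plus tp_hdrlen + tp_reserve; frame_nr consistent with the block layout).
 * The concrete numbers are assumptions for the example.
 */
#include <string.h>
#include <linux/if_packet.h>

static void fill_req3(struct tpacket_req3 *req3)
{
	memset(req3, 0, sizeof(*req3));
	req3->tp_block_size = 1 << 22;		/* 4 MiB per block */
	req3->tp_block_nr = 64;
	req3->tp_frame_size = 2048;
	req3->tp_frame_nr = (req3->tp_block_size / req3->tp_frame_size) *
			    req3->tp_block_nr;
	req3->tp_retire_blk_tov = 60;		/* ms before a partial block retires */
	req3->tp_sizeof_priv = 0;
	req3->tp_feature_req_word = 0;
}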
69e3c75f
JB
4436static int packet_mmap(struct file *file, struct socket *sock,
4437 struct vm_area_struct *vma)
1da177e4
LT
4438{
4439 struct sock *sk = sock->sk;
4440 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4441 unsigned long size, expected_size;
4442 struct packet_ring_buffer *rb;
1da177e4
LT
4443 unsigned long start;
4444 int err = -EINVAL;
4445 int i;
4446
4447 if (vma->vm_pgoff)
4448 return -EINVAL;
4449
905db440 4450 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4451
4452 expected_size = 0;
4453 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4454 if (rb->pg_vec) {
4455 expected_size += rb->pg_vec_len
4456 * rb->pg_vec_pages
4457 * PAGE_SIZE;
4458 }
4459 }
4460
4461 if (expected_size == 0)
1da177e4 4462 goto out;
69e3c75f
JB
4463
4464 size = vma->vm_end - vma->vm_start;
4465 if (size != expected_size)
1da177e4
LT
4466 goto out;
4467
1da177e4 4468 start = vma->vm_start;
69e3c75f
JB
4469 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4470 if (rb->pg_vec == NULL)
4471 continue;
4472
4473 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4474 struct page *page;
4475 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4476 int pg_num;
4477
c56b4d90
CG
4478 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4479 page = pgv_to_page(kaddr);
69e3c75f
JB
4480 err = vm_insert_page(vma, start, page);
4481 if (unlikely(err))
4482 goto out;
4483 start += PAGE_SIZE;
0e3125c7 4484 kaddr += PAGE_SIZE;
69e3c75f 4485 }
4ebf0ae2 4486 }
1da177e4 4487 }
69e3c75f 4488
4ebf0ae2 4489 atomic_inc(&po->mapped);
1da177e4
LT
4490 vma->vm_ops = &packet_mmap_ops;
4491 err = 0;
4492
4493out:
905db440 4494 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4495 return err;
4496}
1da177e4 4497
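/*
 * Editor's illustrative sketch, not part of af_packet.c: mapping the ring
 * created by packet_set_ring(), matching the size check in packet_mmap()
 * above. The mapping must start at offset zero and cover every configured
 * block of the RX ring (followed by the TX ring, if any) exactly.
 */
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *map_ring(int fd, const struct tpacket_req *req)
{
	size_t len = (size_t)req->tp_block_size * req->tp_block_nr;

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}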
90ddc4f0 4498static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4499 .family = PF_PACKET,
4500 .owner = THIS_MODULE,
4501 .release = packet_release,
4502 .bind = packet_bind_spkt,
4503 .connect = sock_no_connect,
4504 .socketpair = sock_no_socketpair,
4505 .accept = sock_no_accept,
4506 .getname = packet_getname_spkt,
a11e1d43 4507 .poll = datagram_poll,
1da177e4 4508 .ioctl = packet_ioctl,
c7cbdbf2 4509 .gettstamp = sock_gettstamp,
1da177e4
LT
4510 .listen = sock_no_listen,
4511 .shutdown = sock_no_shutdown,
1da177e4
LT
4512 .sendmsg = packet_sendmsg_spkt,
4513 .recvmsg = packet_recvmsg,
4514 .mmap = sock_no_mmap,
4515 .sendpage = sock_no_sendpage,
4516};
1da177e4 4517
90ddc4f0 4518static const struct proto_ops packet_ops = {
1da177e4
LT
4519 .family = PF_PACKET,
4520 .owner = THIS_MODULE,
4521 .release = packet_release,
4522 .bind = packet_bind,
4523 .connect = sock_no_connect,
4524 .socketpair = sock_no_socketpair,
4525 .accept = sock_no_accept,
1ce4f28b 4526 .getname = packet_getname,
a11e1d43 4527 .poll = packet_poll,
1da177e4 4528 .ioctl = packet_ioctl,
c7cbdbf2 4529 .gettstamp = sock_gettstamp,
1da177e4
LT
4530 .listen = sock_no_listen,
4531 .shutdown = sock_no_shutdown,
4532 .setsockopt = packet_setsockopt,
4533 .getsockopt = packet_getsockopt,
4534 .sendmsg = packet_sendmsg,
4535 .recvmsg = packet_recvmsg,
4536 .mmap = packet_mmap,
4537 .sendpage = sock_no_sendpage,
4538};
4539
ec1b4cf7 4540static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4541 .family = PF_PACKET,
4542 .create = packet_create,
4543 .owner = THIS_MODULE,
4544};
4545
4546static struct notifier_block packet_netdev_notifier = {
40d4e3df 4547 .notifier_call = packet_notifier,
1da177e4
LT
4548};
4549
4550#ifdef CONFIG_PROC_FS
1da177e4
LT
4551
4552static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4553 __acquires(RCU)
1da177e4 4554{
e372c414 4555 struct net *net = seq_file_net(seq);
808f5114 4556
4557 rcu_read_lock();
4558 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4559}
4560
4561static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4562{
1bf40954 4563 struct net *net = seq_file_net(seq);
808f5114 4564 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4565}
4566
4567static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4568 __releases(RCU)
1da177e4 4569{
808f5114 4570 rcu_read_unlock();
1da177e4
LT
4571}
4572
1ce4f28b 4573static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4574{
4575 if (v == SEQ_START_TOKEN)
4576 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4577 else {
b7ceabd9 4578 struct sock *s = sk_entry(v);
1da177e4
LT
4579 const struct packet_sock *po = pkt_sk(s);
4580
4581 seq_printf(seq,
71338aa7 4582 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4583 s,
41c6d650 4584 refcount_read(&s->sk_refcnt),
1da177e4
LT
4585 s->sk_type,
4586 ntohs(po->num),
4587 po->ifindex,
4588 po->running,
4589 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4590 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4591 sock_i_ino(s));
1da177e4
LT
4592 }
4593
4594 return 0;
4595}
4596
56b3d975 4597static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4598 .start = packet_seq_start,
4599 .next = packet_seq_next,
4600 .stop = packet_seq_stop,
4601 .show = packet_seq_show,
4602};
1da177e4
LT
4603#endif
4604
2c8c1e72 4605static int __net_init packet_net_init(struct net *net)
d12d01d6 4606{
0fa7fa98 4607 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4608 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4609
c3506372
CH
4610 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4611 sizeof(struct seq_net_private)))
d12d01d6
DL
4612 return -ENOMEM;
4613
4614 return 0;
4615}
4616
2c8c1e72 4617static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4618{
ece31ffd 4619 remove_proc_entry("packet", net->proc_net);
669f8f1a 4620 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4621}
4622
4623static struct pernet_operations packet_net_ops = {
4624 .init = packet_net_init,
4625 .exit = packet_net_exit,
4626};
4627
4628
1da177e4
LT
4629static void __exit packet_exit(void)
4630{
1da177e4 4631 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4632 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4633 sock_unregister(PF_PACKET);
4634 proto_unregister(&packet_proto);
4635}
4636
4637static int __init packet_init(void)
4638{
36096f2f 4639 int rc;
1da177e4 4640
36096f2f
Y
4641 rc = proto_register(&packet_proto, 0);
4642 if (rc)
1da177e4 4643 goto out;
36096f2f
Y
4644 rc = sock_register(&packet_family_ops);
4645 if (rc)
4646 goto out_proto;
4647 rc = register_pernet_subsys(&packet_net_ops);
4648 if (rc)
4649 goto out_sock;
4650 rc = register_netdevice_notifier(&packet_netdev_notifier);
4651 if (rc)
4652 goto out_pernet;
1da177e4 4653
36096f2f
Y
4654 return 0;
4655
4656out_pernet:
4657 unregister_pernet_subsys(&packet_net_ops);
4658out_sock:
4659 sock_unregister(PF_PACKET);
4660out_proto:
4661 proto_unregister(&packet_proto);
1da177e4
LT
4662out:
4663 return rc;
4664}
4665
4666module_init(packet_init);
4667module_exit(packet_exit);
4668MODULE_LICENSE("GPL");
4669MODULE_ALIAS_NETPROTO(PF_PACKET);