net/packet/af_packet.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *				packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *	Chetan Loke		:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header will
     not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header pulled, so SOCK_RAW
     should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP makes it so, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. The ll header is still not built!
   data       -> data

Resume
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

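/* Transmit an skb directly on its mapped TX queue, bypassing the qdisc
 * layer.  Used when the socket enabled PACKET_QDISC_BYPASS.
 */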
static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	packet_pick_tx_queue(dev, skb);
	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.   If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * callers responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

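/* Ring memory may come from vmalloc or from the page allocator; map a
 * buffer address back to its struct page either way.
 */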
static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

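/* The tp_status word is the handoff point between the kernel and the
 * mmap()ed ring, so writes are ordered with smp_wmb() and the cacheline
 * is flushed explicitly on architectures that need it.
 */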
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

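/* Return frame 'position' of the ring only if its status matches the
 * expected one (e.g. TP_STATUS_KERNEL for a slot the kernel may fill),
 * otherwise NULL.
 */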
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

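/* TPACKET_V3 retire-block timer: set up and tear down the per-ring timer
 * that closes a partially filled block when the link goes quiet.
 */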
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

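/* When the user did not supply tp_retire_blk_tov, derive a block retire
 * timeout (in msecs) from the link speed: roughly the time needed to fill
 * one block at line rate, so the timer normally never fires first.
 */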
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/*  Do NOT update the last_blk_num first.
 *  Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen,user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close.So we open this
				 * block and restart the timer.
				 * opening a block thaws the queue,restarts timer
				 * Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

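/* Make a finished block visible to user space: flush its pages on
 * architectures with aliasing data caches, then publish the new block
 * status in the block header.
 */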
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available.user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

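/* Return the ring slot where the next received packet should be stored,
 * for any TPACKET version (a frame for V1/V2, an offset inside the
 * current block for V3).
 */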
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

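/* Per-cpu count of TX ring frames handed to the driver and not yet freed;
 * the summed value tells the send path whether transmissions are still
 * outstanding.  The RX ring does not use it.
 */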
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

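/* Receive-ring occupancy levels: ROOM_NORMAL means at least a quarter of
 * the ring (or receive buffer) is still free, ROOM_LOW means only some
 * space is left, ROOM_NONE means the next frame has nowhere to go.
 * Fanout rollover uses these to pick a less loaded socket.
 */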
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

1314
1da177e4
LT
1315static void packet_sock_destruct(struct sock *sk)
1316{
ed85b565
RC
1317 skb_queue_purge(&sk->sk_error_queue);
1318
547b792c 1319 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1320 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1321
1322 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1323 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1324 return;
1325 }
1326
17ab56a2 1327 sk_refcnt_debug_dec(sk);
1da177e4
LT
1328}
1329
3b3a5b0a
WB
1330static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1331{
1332 u32 rxhash;
1333 int i, count = 0;
1334
1335 rxhash = skb_get_hash(skb);
1336 for (i = 0; i < ROLLOVER_HLEN; i++)
1337 if (po->rollover->history[i] == rxhash)
1338 count++;
1339
1340 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1341 return count > (ROLLOVER_HLEN >> 1);
1342}
1343
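/* Fanout demultiplexers: given a packet and the group size 'num', each
 * policy below returns the index of the member socket that should receive
 * the packet (flow hash, round-robin, cpu, random, queue mapping, BPF
 * program, or rollover to a socket that still has room).
 */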
static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

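/* Entry point for packets hitting a fanout group: pick a member socket
 * with the group's demux policy, optionally roll over to a socket with
 * room, and hand the skb to that socket's own protocol hook.
 */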
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			return false;
		}
	}
	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}

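/* PACKET_FANOUT setsockopt(): join the socket to fanout group 'id',
 * creating the group (and, if requested, rollover state or a fresh
 * unique id) as needed, and switch the socket over to the group's
 * shared prot_hook.
 */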
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
		po->rollover = rollover;
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	if (err && rollover) {
		kfree(rollover);
		po->rollover = NULL;
	}
	mutex_unlock(&fanout_mutex);
	return err;
}

1778
2bd624b4
AS
1779/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1780 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1781 * It is the responsibility of the caller to call fanout_release_data() and
1782 * free the returned packet_fanout (after synchronize_net())
1783 */
1784static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1785{
1786 struct packet_sock *po = pkt_sk(sk);
1787 struct packet_fanout *f;
1788
fff3321d 1789 mutex_lock(&fanout_mutex);
d199fab6
ED
1790 f = po->fanout;
1791 if (f) {
1792 po->fanout = NULL;
1793
fb5c2c17 1794 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1795 list_del(&f->list);
2bd624b4
AS
1796 else
1797 f = NULL;
dc99f600 1798
d199fab6
ED
1799 if (po->rollover)
1800 kfree_rcu(po->rollover, rcu);
dc99f600
DM
1801 }
1802 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1803
1804 return f;
dc99f600 1805}
1da177e4 1806
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

40d4e3df
ED
1825static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1826 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1827{
1828 struct sock *sk;
1829 struct sockaddr_pkt *spkt;
1830
1831 /*
1832 * When we registered the protocol we saved the socket in the data
1833 * field for just this event.
1834 */
1835
1836 sk = pt->af_packet_priv;
1ce4f28b 1837
1da177e4
LT
1838 /*
1839 * Yank back the headers [hope the device set this
1840 * right or kerboom...]
1841 *
1842 * Incoming packets have ll header pulled,
1843 * push it back.
1844 *
98e399f8 1845 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1846 * so that this procedure is a no-op.
1847 */
1848
1849 if (skb->pkt_type == PACKET_LOOPBACK)
1850 goto out;
1851
09ad9bc7 1852 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1853 goto out;
1854
40d4e3df
ED
1855 skb = skb_share_check(skb, GFP_ATOMIC);
1856 if (skb == NULL)
1da177e4
LT
1857 goto oom;
1858
1859 /* drop any routing info */
adf30907 1860 skb_dst_drop(skb);
1da177e4 1861
84531c24
PO
1862 /* drop conntrack reference */
1863 nf_reset(skb);
1864
ffbc6111 1865 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1866
98e399f8 1867 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1868
1869 /*
1870 * The SOCK_PACKET socket receives _all_ frames.
1871 */
1872
1873 spkt->spkt_family = dev->type;
1874 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1875 spkt->spkt_protocol = skb->protocol;
1876
1877 /*
1878 * Charge the memory to the socket. This is done specifically
 1879 * to prevent sockets from using up all the memory.
1880 */
1881
40d4e3df 1882 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1883 return 0;
1884
1885out:
1886 kfree_skb(skb);
1887oom:
1888 return 0;
1889}
1890
1891
1892/*
1893 * Output a raw packet to a device layer. This bypasses all the other
 1894 * protocol layers, and you must therefore supply it with a complete frame.
1895 */
1ce4f28b 1896
1b784140
YX
1897static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1898 size_t len)
1da177e4
LT
1899{
1900 struct sock *sk = sock->sk;
342dfc30 1901 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1902 struct sk_buff *skb = NULL;
1da177e4 1903 struct net_device *dev;
c14ac945 1904 struct sockcm_cookie sockc;
40d4e3df 1905 __be16 proto = 0;
1da177e4 1906 int err;
3bdc0eba 1907 int extra_len = 0;
1ce4f28b 1908
1da177e4 1909 /*
1ce4f28b 1910 * Get and verify the address.
1da177e4
LT
1911 */
1912
40d4e3df 1913 if (saddr) {
1da177e4 1914 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1915 return -EINVAL;
1916 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1917 proto = saddr->spkt_protocol;
1918 } else
1919 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1920
1921 /*
1ce4f28b 1922 * Find the device first to size check it
1da177e4
LT
1923 */
1924
de74e92a 1925 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1926retry:
654d1f8a
ED
1927 rcu_read_lock();
1928 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1929 err = -ENODEV;
1930 if (dev == NULL)
1931 goto out_unlock;
1ce4f28b 1932
d5e76b0a
DM
1933 err = -ENETDOWN;
1934 if (!(dev->flags & IFF_UP))
1935 goto out_unlock;
1936
1da177e4 1937 /*
40d4e3df
ED
1938 * You may not queue a frame bigger than the mtu. This is the lowest level
1939 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1940 */
1ce4f28b 1941
3bdc0eba
BG
1942 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1943 if (!netif_supports_nofcs(dev)) {
1944 err = -EPROTONOSUPPORT;
1945 goto out_unlock;
1946 }
1947 extra_len = 4; /* We're doing our own CRC */
1948 }
1949
1da177e4 1950 err = -EMSGSIZE;
3bdc0eba 1951 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1952 goto out_unlock;
1953
1a35ca80
ED
1954 if (!skb) {
1955 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1956 int tlen = dev->needed_tailroom;
1a35ca80
ED
1957 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1958
1959 rcu_read_unlock();
4ce40912 1960 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1961 if (skb == NULL)
1962 return -ENOBUFS;
1963 /* FIXME: Save some space for broken drivers that write a hard
1964 * header at transmission time by themselves. PPP is the notable
1965 * one here. This should really be fixed at the driver level.
1966 */
1967 skb_reserve(skb, reserved);
1968 skb_reset_network_header(skb);
1969
1970 /* Try to align data part correctly */
1971 if (hhlen) {
1972 skb->data -= hhlen;
1973 skb->tail -= hhlen;
1974 if (len < hhlen)
1975 skb_reset_network_header(skb);
1976 }
6ce8e9ce 1977 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1978 if (err)
1979 goto out_free;
1980 goto retry;
1da177e4
LT
1981 }
1982
9ed988cd
WB
1983 if (!dev_validate_header(dev, skb->data, len)) {
1984 err = -EINVAL;
1985 goto out_unlock;
1986 }
3c70c132
DB
1987 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1988 !packet_extra_vlan_len_allowed(dev, skb)) {
1989 err = -EMSGSIZE;
1990 goto out_unlock;
57f89bfa 1991 }
1a35ca80 1992
edbe7746 1993 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1994 if (msg->msg_controllen) {
1995 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1996 if (unlikely(err))
c14ac945 1997 goto out_unlock;
c14ac945
SHY
1998 }
1999
1da177e4
LT
2000 skb->protocol = proto;
2001 skb->dev = dev;
2002 skb->priority = sk->sk_priority;
2d37a186 2003 skb->mark = sk->sk_mark;
bf84a010 2004
c14ac945 2005 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2006
3bdc0eba
BG
2007 if (unlikely(extra_len == 4))
2008 skb->no_fcs = 1;
2009
40893fd0 2010 skb_probe_transport_header(skb, 0);
c1aad275 2011
1da177e4 2012 dev_queue_xmit(skb);
654d1f8a 2013 rcu_read_unlock();
40d4e3df 2014 return len;
1da177e4 2015
1da177e4 2016out_unlock:
654d1f8a 2017 rcu_read_unlock();
1a35ca80
ED
2018out_free:
2019 kfree_skb(skb);
1da177e4
LT
2020 return err;
2021}
1da177e4 2022
ff936a04
AS
2023static unsigned int run_filter(struct sk_buff *skb,
2024 const struct sock *sk,
2025 unsigned int res)
1da177e4
LT
2026{
2027 struct sk_filter *filter;
fda9ef5d 2028
80f8f102
ED
2029 rcu_read_lock();
2030 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2031 if (filter != NULL)
ff936a04 2032 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2033 rcu_read_unlock();
1da177e4 2034
dbcb5855 2035 return res;
1da177e4
LT
2036}
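/* Usage sketch (illustrative): the filter consulted above is the classic
 * BPF program a user attaches with SO_ATTACH_FILTER; its return value
 * becomes the snaplen. A one-instruction "accept the whole packet"
 * program (types from <linux/filter.h>, fd assumed) looks like this:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */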
2037
16cc1400
WB
2038static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2039 size_t *len)
2040{
2041 struct virtio_net_hdr vnet_hdr;
2042
2043 if (*len < sizeof(vnet_hdr))
2044 return -EINVAL;
2045 *len -= sizeof(vnet_hdr);
2046
6391a448 2047 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
2048 return -EINVAL;
2049
2050 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2051}
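/* Usage sketch (illustrative): with PACKET_VNET_HDR enabled on a SOCK_RAW
 * socket, every recvmsg() buffer starts with the struct virtio_net_hdr
 * copied out here, followed by the frame itself (fd assumed):
 *
 *	char buf[65536 + sizeof(struct virtio_net_hdr)];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
 *	char *frame = buf + sizeof(*vh);	/. n - sizeof(*vh) bytes of data ./
 */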
2052
1da177e4 2053/*
62ab0812
ED
 2054 * This function does lazy skb cloning, in the hope that most packets
 2055 * are discarded by BPF.
2056 *
2057 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2058 * and skb->cb are mangled. It works because (and until) packets
2059 * falling here are owned by current CPU. Output packets are cloned
2060 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2061 * sequentially, so that if we return skb to original state on exit,
2062 * we will not harm anyone.
1da177e4
LT
2063 */
2064
40d4e3df
ED
2065static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2066 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2067{
2068 struct sock *sk;
2069 struct sockaddr_ll *sll;
2070 struct packet_sock *po;
40d4e3df 2071 u8 *skb_head = skb->data;
1da177e4 2072 int skb_len = skb->len;
dbcb5855 2073 unsigned int snaplen, res;
da37845f 2074 bool is_drop_n_account = false;
1da177e4
LT
2075
2076 if (skb->pkt_type == PACKET_LOOPBACK)
2077 goto drop;
2078
2079 sk = pt->af_packet_priv;
2080 po = pkt_sk(sk);
2081
09ad9bc7 2082 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2083 goto drop;
2084
1da177e4
LT
2085 skb->dev = dev;
2086
3b04ddde 2087 if (dev->header_ops) {
1da177e4 2088 /* The device has an explicit notion of ll header,
62ab0812
ED
2089 * exported to higher levels.
2090 *
2091 * Otherwise, the device hides details of its frame
2092 * structure, so that corresponding packet head is
2093 * never delivered to user.
1da177e4
LT
2094 */
2095 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2096 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2097 else if (skb->pkt_type == PACKET_OUTGOING) {
2098 /* Special case: outgoing packets have ll header at head */
bbe735e4 2099 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2100 }
2101 }
2102
2103 snaplen = skb->len;
2104
dbcb5855
DM
2105 res = run_filter(skb, sk, snaplen);
2106 if (!res)
fda9ef5d 2107 goto drop_n_restore;
dbcb5855
DM
2108 if (snaplen > res)
2109 snaplen = res;
1da177e4 2110
0fd7bac6 2111 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2112 goto drop_n_acct;
2113
2114 if (skb_shared(skb)) {
2115 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2116 if (nskb == NULL)
2117 goto drop_n_acct;
2118
2119 if (skb_head != skb->data) {
2120 skb->data = skb_head;
2121 skb->len = skb_len;
2122 }
abc4e4fa 2123 consume_skb(skb);
1da177e4
LT
2124 skb = nskb;
2125 }
2126
b4772ef8 2127 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2128
2129 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2130 sll->sll_hatype = dev->type;
1da177e4 2131 sll->sll_pkttype = skb->pkt_type;
8032b464 2132 if (unlikely(po->origdev))
80feaacb
PWJ
2133 sll->sll_ifindex = orig_dev->ifindex;
2134 else
2135 sll->sll_ifindex = dev->ifindex;
1da177e4 2136
b95cce35 2137 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2138
2472d761
EB
2139 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2140 * Use their space for storing the original skb length.
2141 */
2142 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2143
1da177e4
LT
2144 if (pskb_trim(skb, snaplen))
2145 goto drop_n_acct;
2146
2147 skb_set_owner_r(skb, sk);
2148 skb->dev = NULL;
adf30907 2149 skb_dst_drop(skb);
1da177e4 2150
84531c24
PO
2151 /* drop conntrack reference */
2152 nf_reset(skb);
2153
1da177e4 2154 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2155 po->stats.stats1.tp_packets++;
3bc3b96f 2156 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2157 __skb_queue_tail(&sk->sk_receive_queue, skb);
2158 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2159 sk->sk_data_ready(sk);
1da177e4
LT
2160 return 0;
2161
2162drop_n_acct:
da37845f 2163 is_drop_n_account = true;
7091fbd8 2164 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2165 po->stats.stats1.tp_drops++;
7091fbd8
WB
2166 atomic_inc(&sk->sk_drops);
2167 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2168
2169drop_n_restore:
2170 if (skb_head != skb->data && skb_shared(skb)) {
2171 skb->data = skb_head;
2172 skb->len = skb_len;
2173 }
2174drop:
da37845f
WJ
2175 if (!is_drop_n_account)
2176 consume_skb(skb);
2177 else
2178 kfree_skb(skb);
1da177e4
LT
2179 return 0;
2180}
2181
40d4e3df
ED
2182static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2183 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2184{
2185 struct sock *sk;
2186 struct packet_sock *po;
2187 struct sockaddr_ll *sll;
184f489e 2188 union tpacket_uhdr h;
40d4e3df 2189 u8 *skb_head = skb->data;
1da177e4 2190 int skb_len = skb->len;
dbcb5855 2191 unsigned int snaplen, res;
f6fb8f10 2192 unsigned long status = TP_STATUS_USER;
bbd6ef87 2193 unsigned short macoff, netoff, hdrlen;
1da177e4 2194 struct sk_buff *copy_skb = NULL;
bbd6ef87 2195 struct timespec ts;
b9c32fb2 2196 __u32 ts_status;
da37845f 2197 bool is_drop_n_account = false;
edbd58be 2198 bool do_vnet = false;
1da177e4 2199
51846355
AW
2200 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2201 * We may add members to them until current aligned size without forcing
2202 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2203 */
2204 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2205 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2206
1da177e4
LT
2207 if (skb->pkt_type == PACKET_LOOPBACK)
2208 goto drop;
2209
2210 sk = pt->af_packet_priv;
2211 po = pkt_sk(sk);
2212
09ad9bc7 2213 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2214 goto drop;
2215
3b04ddde 2216 if (dev->header_ops) {
1da177e4 2217 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2218 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2219 else if (skb->pkt_type == PACKET_OUTGOING) {
2220 /* Special case: outgoing packets have ll header at head */
bbe735e4 2221 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2222 }
2223 }
2224
2225 snaplen = skb->len;
2226
dbcb5855
DM
2227 res = run_filter(skb, sk, snaplen);
2228 if (!res)
fda9ef5d 2229 goto drop_n_restore;
68c2e5de
AD
2230
2231 if (skb->ip_summed == CHECKSUM_PARTIAL)
2232 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2233 else if (skb->pkt_type != PACKET_OUTGOING &&
2234 (skb->ip_summed == CHECKSUM_COMPLETE ||
2235 skb_csum_unnecessary(skb)))
2236 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2237
dbcb5855
DM
2238 if (snaplen > res)
2239 snaplen = res;
1da177e4
LT
2240
2241 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2242 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2243 po->tp_reserve;
1da177e4 2244 } else {
95c96174 2245 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2246 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2247 (maclen < 16 ? 16 : maclen)) +
58d19b19 2248 po->tp_reserve;
edbd58be 2249 if (po->has_vnet_hdr) {
58d19b19 2250 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2251 do_vnet = true;
2252 }
1da177e4
LT
2253 macoff = netoff - maclen;
2254 }
f6fb8f10 2255 if (po->tp_version <= TPACKET_V2) {
2256 if (macoff + snaplen > po->rx_ring.frame_size) {
2257 if (po->copy_thresh &&
0fd7bac6 2258 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2259 if (skb_shared(skb)) {
2260 copy_skb = skb_clone(skb, GFP_ATOMIC);
2261 } else {
2262 copy_skb = skb_get(skb);
2263 skb_head = skb->data;
2264 }
2265 if (copy_skb)
2266 skb_set_owner_r(copy_skb, sk);
1da177e4 2267 }
f6fb8f10 2268 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2269 if ((int)snaplen < 0) {
f6fb8f10 2270 snaplen = 0;
edbd58be
BP
2271 do_vnet = false;
2272 }
1da177e4 2273 }
dc808110
ED
2274 } else if (unlikely(macoff + snaplen >
2275 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2276 u32 nval;
2277
2278 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2279 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2280 snaplen, nval, macoff);
2281 snaplen = nval;
2282 if (unlikely((int)snaplen < 0)) {
2283 snaplen = 0;
2284 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2285 do_vnet = false;
dc808110 2286 }
1da177e4 2287 }
1da177e4 2288 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2289 h.raw = packet_current_rx_frame(po, skb,
2290 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2291 if (!h.raw)
58d19b19 2292 goto drop_n_account;
f6fb8f10 2293 if (po->tp_version <= TPACKET_V2) {
2294 packet_increment_rx_head(po, &po->rx_ring);
2295 /*
2296 * LOSING will be reported till you read the stats,
2297 * because it's COR - Clear On Read.
 2298 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2299 * at packet level.
2300 */
ee80fbf3 2301 if (po->stats.stats1.tp_drops)
f6fb8f10 2302 status |= TP_STATUS_LOSING;
2303 }
ee80fbf3 2304 po->stats.stats1.tp_packets++;
1da177e4
LT
2305 if (copy_skb) {
2306 status |= TP_STATUS_COPY;
2307 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2308 }
1da177e4
LT
2309 spin_unlock(&sk->sk_receive_queue.lock);
2310
edbd58be 2311 if (do_vnet) {
5a213881
JR
2312 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2313 sizeof(struct virtio_net_hdr),
6391a448 2314 vio_le(), true)) {
58d19b19
WB
2315 spin_lock(&sk->sk_receive_queue.lock);
2316 goto drop_n_account;
2317 }
2318 }
2319
bbd6ef87 2320 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2321
2322 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2323 getnstimeofday(&ts);
1da177e4 2324
b9c32fb2
DB
2325 status |= ts_status;
2326
bbd6ef87
PM
2327 switch (po->tp_version) {
2328 case TPACKET_V1:
2329 h.h1->tp_len = skb->len;
2330 h.h1->tp_snaplen = snaplen;
2331 h.h1->tp_mac = macoff;
2332 h.h1->tp_net = netoff;
4b457bdf
DB
2333 h.h1->tp_sec = ts.tv_sec;
2334 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2335 hdrlen = sizeof(*h.h1);
2336 break;
2337 case TPACKET_V2:
2338 h.h2->tp_len = skb->len;
2339 h.h2->tp_snaplen = snaplen;
2340 h.h2->tp_mac = macoff;
2341 h.h2->tp_net = netoff;
bbd6ef87
PM
2342 h.h2->tp_sec = ts.tv_sec;
2343 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2344 if (skb_vlan_tag_present(skb)) {
2345 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2346 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2347 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2348 } else {
2349 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2350 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2351 }
e4d26f4b 2352 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2353 hdrlen = sizeof(*h.h2);
2354 break;
f6fb8f10 2355 case TPACKET_V3:
 2356 /* tp_next_offset, vlan are already populated above.
 2357 * So DON'T clear those fields here
2358 */
2359 h.h3->tp_status |= status;
2360 h.h3->tp_len = skb->len;
2361 h.h3->tp_snaplen = snaplen;
2362 h.h3->tp_mac = macoff;
2363 h.h3->tp_net = netoff;
f6fb8f10 2364 h.h3->tp_sec = ts.tv_sec;
2365 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2366 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2367 hdrlen = sizeof(*h.h3);
2368 break;
bbd6ef87
PM
2369 default:
2370 BUG();
2371 }
1da177e4 2372
bbd6ef87 2373 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2374 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2375 sll->sll_family = AF_PACKET;
2376 sll->sll_hatype = dev->type;
2377 sll->sll_protocol = skb->protocol;
2378 sll->sll_pkttype = skb->pkt_type;
8032b464 2379 if (unlikely(po->origdev))
80feaacb
PWJ
2380 sll->sll_ifindex = orig_dev->ifindex;
2381 else
2382 sll->sll_ifindex = dev->ifindex;
1da177e4 2383
e16aa207 2384 smp_mb();
f0d4eb29 2385
f6dafa95 2386#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2387 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2388 u8 *start, *end;
2389
f0d4eb29
DB
2390 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2391 macoff + snaplen);
2392
2393 for (start = h.raw; start < end; start += PAGE_SIZE)
2394 flush_dcache_page(pgv_to_page(start));
1da177e4 2395 }
f0d4eb29 2396 smp_wmb();
f6dafa95 2397#endif
f0d4eb29 2398
da413eec 2399 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2400 __packet_set_status(po, h.raw, status);
da413eec
DC
2401 sk->sk_data_ready(sk);
2402 } else {
f6fb8f10 2403 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2404 }
1da177e4
LT
2405
2406drop_n_restore:
2407 if (skb_head != skb->data && skb_shared(skb)) {
2408 skb->data = skb_head;
2409 skb->len = skb_len;
2410 }
2411drop:
da37845f
WJ
2412 if (!is_drop_n_account)
2413 consume_skb(skb);
2414 else
2415 kfree_skb(skb);
1da177e4
LT
2416 return 0;
2417
58d19b19 2418drop_n_account:
da37845f 2419 is_drop_n_account = true;
ee80fbf3 2420 po->stats.stats1.tp_drops++;
1da177e4
LT
2421 spin_unlock(&sk->sk_receive_queue.lock);
2422
676d2369 2423 sk->sk_data_ready(sk);
acb5d75b 2424 kfree_skb(copy_skb);
1da177e4
LT
2425 goto drop_n_restore;
2426}
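/* Usage sketch (illustrative): the RX ring filled above is consumed from
 * userspace by mmap()ing the socket and polling each frame's tp_status
 * for TP_STATUS_USER, then handing the slot back with TP_STATUS_KERNEL.
 * TPACKET_V2 layout, see Documentation/networking/packet_mmap.txt;
 * fd and the slot index i are assumed.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096, .tp_block_nr = 64,
 *		.tp_frame_size = 2048, .tp_frame_nr  = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct tpacket2_hdr *hdr = (void *)(ring + i * req.tp_frame_size);
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		... read hdr->tp_mac / hdr->tp_snaplen ...
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */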
2427
69e3c75f
JB
2428static void tpacket_destruct_skb(struct sk_buff *skb)
2429{
2430 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2431
69e3c75f 2432 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2433 void *ph;
b9c32fb2
DB
2434 __u32 ts;
2435
69e3c75f 2436 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2437 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2438
2439 ts = __packet_set_timestamp(po, ph, skb);
2440 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2441 }
2442
2443 sock_wfree(skb);
2444}
2445
c72219b7
DB
2446static void tpacket_set_protocol(const struct net_device *dev,
2447 struct sk_buff *skb)
2448{
2449 if (dev->type == ARPHRD_ETHER) {
2450 skb_reset_mac_header(skb);
2451 skb->protocol = eth_hdr(skb)->h_proto;
2452 }
2453}
2454
16cc1400
WB
2455static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2456{
16cc1400
WB
2457 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2458 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2459 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2460 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2461 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2462 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2463 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2464
2465 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2466 return -EINVAL;
2467
16cc1400
WB
2468 return 0;
2469}
2470
2471static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2472 struct virtio_net_hdr *vnet_hdr)
2473{
16cc1400
WB
2474 if (*len < sizeof(*vnet_hdr))
2475 return -EINVAL;
2476 *len -= sizeof(*vnet_hdr);
2477
cbbd26b8 2478 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2479 return -EFAULT;
2480
2481 return __packet_snd_vnet_parse(vnet_hdr, *len);
2482}
2483
40d4e3df 2484static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2485 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2486 __be16 proto, unsigned char *addr, int hlen, int copylen,
2487 const struct sockcm_cookie *sockc)
69e3c75f 2488{
184f489e 2489 union tpacket_uhdr ph;
8d39b4a6 2490 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2491 struct socket *sock = po->sk.sk_socket;
2492 struct page *page;
69e3c75f
JB
2493 int err;
2494
2495 ph.raw = frame;
2496
2497 skb->protocol = proto;
2498 skb->dev = dev;
2499 skb->priority = po->sk.sk_priority;
2d37a186 2500 skb->mark = po->sk.sk_mark;
c14ac945 2501 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2502 skb_shinfo(skb)->destructor_arg = ph.raw;
2503
ae641949 2504 skb_reserve(skb, hlen);
69e3c75f 2505 skb_reset_network_header(skb);
c1aad275 2506
69e3c75f
JB
2507 to_write = tp_len;
2508
2509 if (sock->type == SOCK_DGRAM) {
2510 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2511 NULL, tp_len);
2512 if (unlikely(err < 0))
2513 return -EINVAL;
1d036d25 2514 } else if (copylen) {
9ed988cd
WB
2515 int hdrlen = min_t(int, copylen, tp_len);
2516
69e3c75f 2517 skb_push(skb, dev->hard_header_len);
1d036d25 2518 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2519 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2520 if (unlikely(err))
2521 return err;
9ed988cd
WB
2522 if (!dev_validate_header(dev, skb->data, hdrlen))
2523 return -EINVAL;
c72219b7
DB
2524 if (!skb->protocol)
2525 tpacket_set_protocol(dev, skb);
69e3c75f 2526
9ed988cd
WB
2527 data += hdrlen;
2528 to_write -= hdrlen;
69e3c75f
JB
2529 }
2530
69e3c75f
JB
2531 offset = offset_in_page(data);
2532 len_max = PAGE_SIZE - offset;
2533 len = ((to_write > len_max) ? len_max : to_write);
2534
2535 skb->data_len = to_write;
2536 skb->len += to_write;
2537 skb->truesize += to_write;
14afee4b 2538 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2539
2540 while (likely(to_write)) {
2541 nr_frags = skb_shinfo(skb)->nr_frags;
2542
2543 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2544 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2545 MAX_SKB_FRAGS);
69e3c75f
JB
2546 return -EFAULT;
2547 }
2548
0af55bb5
CG
2549 page = pgv_to_page(data);
2550 data += len;
69e3c75f
JB
2551 flush_dcache_page(page);
2552 get_page(page);
0af55bb5 2553 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2554 to_write -= len;
2555 offset = 0;
2556 len_max = PAGE_SIZE;
2557 len = ((to_write > len_max) ? len_max : to_write);
2558 }
2559
8fd6c80d 2560 skb_probe_transport_header(skb, 0);
efdfa2f7 2561
69e3c75f
JB
2562 return tp_len;
2563}
2564
8d39b4a6
WB
2565static int tpacket_parse_header(struct packet_sock *po, void *frame,
2566 int size_max, void **data)
2567{
2568 union tpacket_uhdr ph;
2569 int tp_len, off;
2570
2571 ph.raw = frame;
2572
2573 switch (po->tp_version) {
7f953ab2
SV
2574 case TPACKET_V3:
2575 if (ph.h3->tp_next_offset != 0) {
2576 pr_warn_once("variable sized slot not supported");
2577 return -EINVAL;
2578 }
2579 tp_len = ph.h3->tp_len;
2580 break;
8d39b4a6
WB
2581 case TPACKET_V2:
2582 tp_len = ph.h2->tp_len;
2583 break;
2584 default:
2585 tp_len = ph.h1->tp_len;
2586 break;
2587 }
2588 if (unlikely(tp_len > size_max)) {
2589 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2590 return -EMSGSIZE;
2591 }
2592
2593 if (unlikely(po->tp_tx_has_off)) {
2594 int off_min, off_max;
2595
2596 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2597 off_max = po->tx_ring.frame_size - tp_len;
2598 if (po->sk.sk_type == SOCK_DGRAM) {
2599 switch (po->tp_version) {
7f953ab2
SV
2600 case TPACKET_V3:
2601 off = ph.h3->tp_net;
2602 break;
8d39b4a6
WB
2603 case TPACKET_V2:
2604 off = ph.h2->tp_net;
2605 break;
2606 default:
2607 off = ph.h1->tp_net;
2608 break;
2609 }
2610 } else {
2611 switch (po->tp_version) {
7f953ab2
SV
2612 case TPACKET_V3:
2613 off = ph.h3->tp_mac;
2614 break;
8d39b4a6
WB
2615 case TPACKET_V2:
2616 off = ph.h2->tp_mac;
2617 break;
2618 default:
2619 off = ph.h1->tp_mac;
2620 break;
2621 }
2622 }
2623 if (unlikely((off < off_min) || (off_max < off)))
2624 return -EINVAL;
2625 } else {
2626 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2627 }
2628
2629 *data = frame + off;
2630 return tp_len;
2631}
2632
69e3c75f
JB
2633static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2634{
69e3c75f
JB
2635 struct sk_buff *skb;
2636 struct net_device *dev;
1d036d25 2637 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2638 struct sockcm_cookie sockc;
69e3c75f 2639 __be16 proto;
09effa67 2640 int err, reserve = 0;
40d4e3df 2641 void *ph;
342dfc30 2642 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2643 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2644 int tp_len, size_max;
2645 unsigned char *addr;
8d39b4a6 2646 void *data;
69e3c75f 2647 int len_sum = 0;
9e67030a 2648 int status = TP_STATUS_AVAILABLE;
1d036d25 2649 int hlen, tlen, copylen = 0;
69e3c75f 2650
69e3c75f
JB
2651 mutex_lock(&po->pg_vec_lock);
2652
66e56cd4 2653 if (likely(saddr == NULL)) {
e40526cb 2654 dev = packet_cached_dev_get(po);
69e3c75f
JB
2655 proto = po->num;
2656 addr = NULL;
2657 } else {
2658 err = -EINVAL;
2659 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2660 goto out;
2661 if (msg->msg_namelen < (saddr->sll_halen
2662 + offsetof(struct sockaddr_ll,
2663 sll_addr)))
2664 goto out;
69e3c75f
JB
2665 proto = saddr->sll_protocol;
2666 addr = saddr->sll_addr;
827d9780 2667 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2668 }
2669
69e3c75f
JB
2670 err = -ENXIO;
2671 if (unlikely(dev == NULL))
2672 goto out;
69e3c75f
JB
2673 err = -ENETDOWN;
2674 if (unlikely(!(dev->flags & IFF_UP)))
2675 goto out_put;
2676
d19b183c
DCS
2677 sockc.tsflags = po->sk.sk_tsflags;
2678 if (msg->msg_controllen) {
2679 err = sock_cmsg_send(&po->sk, msg, &sockc);
2680 if (unlikely(err))
2681 goto out_put;
2682 }
2683
5cfb4c8d
DB
2684 if (po->sk.sk_socket->type == SOCK_RAW)
2685 reserve = dev->hard_header_len;
69e3c75f 2686 size_max = po->tx_ring.frame_size
b5dd884e 2687 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2688
1d036d25 2689 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2690 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2691
69e3c75f
JB
2692 do {
2693 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2694 TP_STATUS_SEND_REQUEST);
69e3c75f 2695 if (unlikely(ph == NULL)) {
87a2fd28
DB
2696 if (need_wait && need_resched())
2697 schedule();
69e3c75f
JB
2698 continue;
2699 }
2700
8d39b4a6
WB
2701 skb = NULL;
2702 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2703 if (tp_len < 0)
2704 goto tpacket_error;
2705
69e3c75f 2706 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2707 hlen = LL_RESERVED_SPACE(dev);
2708 tlen = dev->needed_tailroom;
1d036d25
WB
2709 if (po->has_vnet_hdr) {
2710 vnet_hdr = data;
2711 data += sizeof(*vnet_hdr);
2712 tp_len -= sizeof(*vnet_hdr);
2713 if (tp_len < 0 ||
2714 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2715 tp_len = -EINVAL;
2716 goto tpacket_error;
2717 }
2718 copylen = __virtio16_to_cpu(vio_le(),
2719 vnet_hdr->hdr_len);
2720 }
9ed988cd 2721 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2722 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2723 hlen + tlen + sizeof(struct sockaddr_ll) +
2724 (copylen - dev->hard_header_len),
fbf33a28 2725 !need_wait, &err);
69e3c75f 2726
fbf33a28
KM
2727 if (unlikely(skb == NULL)) {
2728 /* we assume the socket was initially writeable ... */
2729 if (likely(len_sum > 0))
2730 err = len_sum;
69e3c75f 2731 goto out_status;
fbf33a28 2732 }
8d39b4a6 2733 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2734 addr, hlen, copylen, &sockc);
dbd46ab4 2735 if (likely(tp_len >= 0) &&
5cfb4c8d 2736 tp_len > dev->mtu + reserve &&
1d036d25 2737 !po->has_vnet_hdr &&
3c70c132
DB
2738 !packet_extra_vlan_len_allowed(dev, skb))
2739 tp_len = -EMSGSIZE;
69e3c75f
JB
2740
2741 if (unlikely(tp_len < 0)) {
8d39b4a6 2742tpacket_error:
69e3c75f
JB
2743 if (po->tp_loss) {
2744 __packet_set_status(po, ph,
2745 TP_STATUS_AVAILABLE);
2746 packet_increment_head(&po->tx_ring);
2747 kfree_skb(skb);
2748 continue;
2749 } else {
2750 status = TP_STATUS_WRONG_FORMAT;
2751 err = tp_len;
2752 goto out_status;
2753 }
2754 }
2755
db60eb5f
JR
2756 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2757 vio_le())) {
1d036d25
WB
2758 tp_len = -EINVAL;
2759 goto tpacket_error;
2760 }
2761
69e3c75f
JB
2762 skb->destructor = tpacket_destruct_skb;
2763 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2764 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2765
2766 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2767 err = po->xmit(skb);
eb70df13
JP
2768 if (unlikely(err > 0)) {
2769 err = net_xmit_errno(err);
2770 if (err && __packet_get_status(po, ph) ==
2771 TP_STATUS_AVAILABLE) {
2772 /* skb was destructed already */
2773 skb = NULL;
2774 goto out_status;
2775 }
2776 /*
2777 * skb was dropped but not destructed yet;
2778 * let's treat it like congestion or err < 0
2779 */
2780 err = 0;
2781 }
69e3c75f
JB
2782 packet_increment_head(&po->tx_ring);
2783 len_sum += tp_len;
b0138408
DB
2784 } while (likely((ph != NULL) ||
2785 /* Note: packet_read_pending() might be slow if we have
 2786 * to call it as it's a per-cpu variable, but in the fast path
2787 * we already short-circuit the loop with the first
2788 * condition, and luckily don't have to go that path
2789 * anyway.
2790 */
2791 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2792
2793 err = len_sum;
2794 goto out_put;
2795
69e3c75f
JB
2796out_status:
2797 __packet_set_status(po, ph, status);
2798 kfree_skb(skb);
2799out_put:
e40526cb 2800 dev_put(dev);
69e3c75f
JB
2801out:
2802 mutex_unlock(&po->pg_vec_lock);
2803 return err;
2804}
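/* Usage sketch (illustrative): the TX ring walked above is driven from
 * userspace by writing a frame into a free slot, marking it
 * TP_STATUS_SEND_REQUEST and kicking the socket with a zero-length send();
 * tpacket_destruct_skb() flips the slot back to TP_STATUS_AVAILABLE once
 * the skb is consumed. TPACKET_V2 layout; ring, req, i, frame and
 * frame_len are assumed from earlier PACKET_TX_RING/mmap setup.
 *
 *	struct tpacket2_hdr *hdr = (void *)(ring + i * req.tp_frame_size);
 *	void *data = (char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *	memcpy(data, frame, frame_len);
 *	hdr->tp_len = frame_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);
 */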
69e3c75f 2805
eea49cc9
OJ
2806static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2807 size_t reserve, size_t len,
2808 size_t linear, int noblock,
2809 int *err)
bfd5f4a3
SS
2810{
2811 struct sk_buff *skb;
2812
2813 /* Under a page? Don't bother with paged skb. */
2814 if (prepad + len < PAGE_SIZE || !linear)
2815 linear = len;
2816
2817 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2818 err, 0);
bfd5f4a3
SS
2819 if (!skb)
2820 return NULL;
2821
2822 skb_reserve(skb, reserve);
2823 skb_put(skb, linear);
2824 skb->data_len = len - linear;
2825 skb->len += len - linear;
2826
2827 return skb;
2828}
2829
d346a3fa 2830static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2831{
2832 struct sock *sk = sock->sk;
342dfc30 2833 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2834 struct sk_buff *skb;
2835 struct net_device *dev;
0e11c91e 2836 __be16 proto;
1da177e4 2837 unsigned char *addr;
827d9780 2838 int err, reserve = 0;
c7d39e32 2839 struct sockcm_cookie sockc;
bfd5f4a3
SS
2840 struct virtio_net_hdr vnet_hdr = { 0 };
2841 int offset = 0;
bfd5f4a3 2842 struct packet_sock *po = pkt_sk(sk);
da7c9561 2843 bool has_vnet_hdr = false;
57031eb7 2844 int hlen, tlen, linear;
3bdc0eba 2845 int extra_len = 0;
1da177e4
LT
2846
2847 /*
1ce4f28b 2848 * Get and verify the address.
1da177e4 2849 */
1ce4f28b 2850
66e56cd4 2851 if (likely(saddr == NULL)) {
e40526cb 2852 dev = packet_cached_dev_get(po);
1da177e4
LT
2853 proto = po->num;
2854 addr = NULL;
2855 } else {
2856 err = -EINVAL;
2857 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2858 goto out;
0fb375fb
EB
2859 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2860 goto out;
1da177e4
LT
2861 proto = saddr->sll_protocol;
2862 addr = saddr->sll_addr;
827d9780 2863 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2864 }
2865
1da177e4 2866 err = -ENXIO;
e40526cb 2867 if (unlikely(dev == NULL))
1da177e4 2868 goto out_unlock;
d5e76b0a 2869 err = -ENETDOWN;
e40526cb 2870 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2871 goto out_unlock;
2872
edbe7746 2873 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2874 sockc.mark = sk->sk_mark;
2875 if (msg->msg_controllen) {
2876 err = sock_cmsg_send(sk, msg, &sockc);
2877 if (unlikely(err))
2878 goto out_unlock;
2879 }
2880
e40526cb
DB
2881 if (sock->type == SOCK_RAW)
2882 reserve = dev->hard_header_len;
bfd5f4a3 2883 if (po->has_vnet_hdr) {
16cc1400
WB
2884 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2885 if (err)
bfd5f4a3 2886 goto out_unlock;
da7c9561 2887 has_vnet_hdr = true;
bfd5f4a3
SS
2888 }
2889
3bdc0eba
BG
2890 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2891 if (!netif_supports_nofcs(dev)) {
2892 err = -EPROTONOSUPPORT;
2893 goto out_unlock;
2894 }
2895 extra_len = 4; /* We're doing our own CRC */
2896 }
2897
1da177e4 2898 err = -EMSGSIZE;
16cc1400
WB
2899 if (!vnet_hdr.gso_type &&
2900 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2901 goto out_unlock;
2902
bfd5f4a3 2903 err = -ENOBUFS;
ae641949
HX
2904 hlen = LL_RESERVED_SPACE(dev);
2905 tlen = dev->needed_tailroom;
57031eb7
WB
2906 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2907 linear = max(linear, min_t(int, len, dev->hard_header_len));
2908 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2909 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2910 if (skb == NULL)
1da177e4
LT
2911 goto out_unlock;
2912
bfd5f4a3 2913 skb_set_network_header(skb, reserve);
1da177e4 2914
0c4e8581 2915 err = -EINVAL;
9c707762
WB
2916 if (sock->type == SOCK_DGRAM) {
2917 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2918 if (unlikely(offset < 0))
9c707762 2919 goto out_free;
9c707762 2920 }
1da177e4
LT
2921
2922 /* Returns -EFAULT on error */
c0371da6 2923 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2924 if (err)
2925 goto out_free;
bf84a010 2926
9ed988cd
WB
2927 if (sock->type == SOCK_RAW &&
2928 !dev_validate_header(dev, skb->data, len)) {
2929 err = -EINVAL;
2930 goto out_free;
2931 }
2932
c14ac945 2933 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2934
16cc1400 2935 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2936 !packet_extra_vlan_len_allowed(dev, skb)) {
2937 err = -EMSGSIZE;
2938 goto out_free;
57f89bfa
BG
2939 }
2940
09effa67
DM
2941 skb->protocol = proto;
2942 skb->dev = dev;
1da177e4 2943 skb->priority = sk->sk_priority;
c7d39e32 2944 skb->mark = sockc.mark;
0fd5d57b 2945
da7c9561 2946 if (has_vnet_hdr) {
db60eb5f 2947 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2948 if (err)
2949 goto out_free;
2950 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2951 }
2952
8fd6c80d
DB
2953 skb_probe_transport_header(skb, reserve);
2954
3bdc0eba
BG
2955 if (unlikely(extra_len == 4))
2956 skb->no_fcs = 1;
2957
d346a3fa 2958 err = po->xmit(skb);
1da177e4
LT
2959 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2960 goto out_unlock;
2961
e40526cb 2962 dev_put(dev);
1da177e4 2963
40d4e3df 2964 return len;
1da177e4
LT
2965
2966out_free:
2967 kfree_skb(skb);
2968out_unlock:
e40526cb 2969 if (dev)
1da177e4
LT
2970 dev_put(dev);
2971out:
2972 return err;
2973}
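/* Usage sketch (illustrative): on a SOCK_DGRAM packet socket the sendmsg
 * path above builds the link-layer header from the sockaddr_ll supplied
 * by the caller; fd, dst_mac, payload and the interface name "eth0" are
 * assumed.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_IP),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *		.sll_halen    = ETH_ALEN,
 *	};
 *	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);
 *	sendto(fd, payload, payload_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 *
 * On SOCK_RAW the buffer must instead start with a complete link-layer
 * header, which dev_validate_header() checks above.
 */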
2974
1b784140 2975static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2976{
69e3c75f
JB
2977 struct sock *sk = sock->sk;
2978 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2979
69e3c75f
JB
2980 if (po->tx_ring.pg_vec)
2981 return tpacket_snd(po, msg);
2982 else
69e3c75f
JB
2983 return packet_snd(sock, msg, len);
2984}
2985
1da177e4
LT
2986/*
2987 * Close a PACKET socket. This is fairly simple. We immediately go
2988 * to 'closed' state and remove our protocol entry in the device list.
2989 */
2990
2991static int packet_release(struct socket *sock)
2992{
2993 struct sock *sk = sock->sk;
2994 struct packet_sock *po;
2bd624b4 2995 struct packet_fanout *f;
d12d01d6 2996 struct net *net;
f6fb8f10 2997 union tpacket_req_u req_u;
1da177e4
LT
2998
2999 if (!sk)
3000 return 0;
3001
3b1e0a65 3002 net = sock_net(sk);
1da177e4
LT
3003 po = pkt_sk(sk);
3004
0fa7fa98 3005 mutex_lock(&net->packet.sklist_lock);
808f5114 3006 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3007 mutex_unlock(&net->packet.sklist_lock);
3008
3009 preempt_disable();
920de804 3010 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 3011 preempt_enable();
1da177e4 3012
808f5114 3013 spin_lock(&po->bind_lock);
ce06b03e 3014 unregister_prot_hook(sk, false);
66e56cd4
DB
3015 packet_cached_dev_reset(po);
3016
160ff18a
BG
3017 if (po->prot_hook.dev) {
3018 dev_put(po->prot_hook.dev);
3019 po->prot_hook.dev = NULL;
3020 }
808f5114 3021 spin_unlock(&po->bind_lock);
1da177e4 3022
1da177e4 3023 packet_flush_mclist(sk);
1da177e4 3024
9665d5d6
PS
3025 if (po->rx_ring.pg_vec) {
3026 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3027 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3028 }
69e3c75f 3029
9665d5d6
PS
3030 if (po->tx_ring.pg_vec) {
3031 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3032 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3033 }
1da177e4 3034
2bd624b4 3035 f = fanout_release(sk);
dc99f600 3036
808f5114 3037 synchronize_net();
2bd624b4
AS
3038
3039 if (f) {
3040 fanout_release_data(f);
3041 kfree(f);
3042 }
1da177e4
LT
3043 /*
3044 * Now the socket is dead. No more input will appear.
3045 */
1da177e4
LT
3046 sock_orphan(sk);
3047 sock->sk = NULL;
3048
3049 /* Purge queues */
3050
3051 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3052 packet_free_pending(po);
17ab56a2 3053 sk_refcnt_debug_release(sk);
1da177e4
LT
3054
3055 sock_put(sk);
3056 return 0;
3057}
3058
3059/*
3060 * Attach a packet hook.
3061 */
3062
30f7ea1c
FR
3063static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3064 __be16 proto)
1da177e4
LT
3065{
3066 struct packet_sock *po = pkt_sk(sk);
158cd4af 3067 struct net_device *dev_curr;
902fefb8
DB
3068 __be16 proto_curr;
3069 bool need_rehook;
30f7ea1c
FR
3070 struct net_device *dev = NULL;
3071 int ret = 0;
3072 bool unlisted = false;
dc99f600 3073
1da177e4 3074 lock_sock(sk);
1da177e4 3075 spin_lock(&po->bind_lock);
30f7ea1c
FR
3076 rcu_read_lock();
3077
4971613c
WB
3078 if (po->fanout) {
3079 ret = -EINVAL;
3080 goto out_unlock;
3081 }
3082
30f7ea1c
FR
3083 if (name) {
3084 dev = dev_get_by_name_rcu(sock_net(sk), name);
3085 if (!dev) {
3086 ret = -ENODEV;
3087 goto out_unlock;
3088 }
3089 } else if (ifindex) {
3090 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3091 if (!dev) {
3092 ret = -ENODEV;
3093 goto out_unlock;
3094 }
3095 }
3096
3097 if (dev)
3098 dev_hold(dev);
66e56cd4 3099
902fefb8
DB
3100 proto_curr = po->prot_hook.type;
3101 dev_curr = po->prot_hook.dev;
3102
3103 need_rehook = proto_curr != proto || dev_curr != dev;
3104
3105 if (need_rehook) {
30f7ea1c
FR
3106 if (po->running) {
3107 rcu_read_unlock();
3108 __unregister_prot_hook(sk, true);
3109 rcu_read_lock();
3110 dev_curr = po->prot_hook.dev;
3111 if (dev)
3112 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3113 dev->ifindex);
3114 }
1da177e4 3115
902fefb8
DB
3116 po->num = proto;
3117 po->prot_hook.type = proto;
902fefb8 3118
30f7ea1c
FR
3119 if (unlikely(unlisted)) {
3120 dev_put(dev);
3121 po->prot_hook.dev = NULL;
3122 po->ifindex = -1;
3123 packet_cached_dev_reset(po);
3124 } else {
3125 po->prot_hook.dev = dev;
3126 po->ifindex = dev ? dev->ifindex : 0;
3127 packet_cached_dev_assign(po, dev);
3128 }
902fefb8 3129 }
158cd4af
LW
3130 if (dev_curr)
3131 dev_put(dev_curr);
66e56cd4 3132
902fefb8 3133 if (proto == 0 || !need_rehook)
1da177e4
LT
3134 goto out_unlock;
3135
30f7ea1c 3136 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3137 register_prot_hook(sk);
be85d4ad
UT
3138 } else {
3139 sk->sk_err = ENETDOWN;
3140 if (!sock_flag(sk, SOCK_DEAD))
3141 sk->sk_error_report(sk);
1da177e4
LT
3142 }
3143
3144out_unlock:
30f7ea1c 3145 rcu_read_unlock();
1da177e4
LT
3146 spin_unlock(&po->bind_lock);
3147 release_sock(sk);
30f7ea1c 3148 return ret;
1da177e4
LT
3149}
3150
3151/*
3152 * Bind a packet socket to a device
3153 */
3154
40d4e3df
ED
3155static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3156 int addr_len)
1da177e4 3157{
40d4e3df 3158 struct sock *sk = sock->sk;
540e2894 3159 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3160
1da177e4
LT
3161 /*
3162 * Check legality
3163 */
1ce4f28b 3164
8ae55f04 3165 if (addr_len != sizeof(struct sockaddr))
1da177e4 3166 return -EINVAL;
540e2894
AP
 3167 /* uaddr->sa_data comes from userspace; it is not guaranteed to be
3168 * zero-terminated.
3169 */
3170 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3171 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3172
30f7ea1c 3173 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3174}
1da177e4
LT
3175
3176static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3177{
40d4e3df
ED
3178 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3179 struct sock *sk = sock->sk;
1da177e4
LT
3180
3181 /*
3182 * Check legality
3183 */
1ce4f28b 3184
1da177e4
LT
3185 if (addr_len < sizeof(struct sockaddr_ll))
3186 return -EINVAL;
3187 if (sll->sll_family != AF_PACKET)
3188 return -EINVAL;
3189
30f7ea1c
FR
3190 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3191 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3192}
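/* Usage sketch (illustrative): binding a packet socket to one interface
 * and protocol via the sockaddr_ll form handled above (fd and "eth0"
 * assumed):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */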
3193
3194static struct proto packet_proto = {
3195 .name = "PACKET",
3196 .owner = THIS_MODULE,
3197 .obj_size = sizeof(struct packet_sock),
3198};
3199
3200/*
1ce4f28b 3201 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3202 */
3203
3f378b68
EP
3204static int packet_create(struct net *net, struct socket *sock, int protocol,
3205 int kern)
1da177e4
LT
3206{
3207 struct sock *sk;
3208 struct packet_sock *po;
0e11c91e 3209 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3210 int err;
3211
df008c91 3212 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3213 return -EPERM;
be02097c
DM
3214 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3215 sock->type != SOCK_PACKET)
1da177e4
LT
3216 return -ESOCKTNOSUPPORT;
3217
3218 sock->state = SS_UNCONNECTED;
3219
3220 err = -ENOBUFS;
11aa9c28 3221 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3222 if (sk == NULL)
3223 goto out;
3224
3225 sock->ops = &packet_ops;
1da177e4
LT
3226 if (sock->type == SOCK_PACKET)
3227 sock->ops = &packet_ops_spkt;
be02097c 3228
1da177e4
LT
3229 sock_init_data(sock, sk);
3230
3231 po = pkt_sk(sk);
3232 sk->sk_family = PF_PACKET;
0e11c91e 3233 po->num = proto;
d346a3fa 3234 po->xmit = dev_queue_xmit;
66e56cd4 3235
b0138408
DB
3236 err = packet_alloc_pending(po);
3237 if (err)
3238 goto out2;
3239
66e56cd4 3240 packet_cached_dev_reset(po);
1da177e4
LT
3241
3242 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3243 sk_refcnt_debug_inc(sk);
1da177e4
LT
3244
3245 /*
3246 * Attach a protocol block
3247 */
3248
3249 spin_lock_init(&po->bind_lock);
905db440 3250 mutex_init(&po->pg_vec_lock);
0648ab70 3251 po->rollover = NULL;
1da177e4 3252 po->prot_hook.func = packet_rcv;
be02097c 3253
1da177e4
LT
3254 if (sock->type == SOCK_PACKET)
3255 po->prot_hook.func = packet_rcv_spkt;
be02097c 3256
1da177e4
LT
3257 po->prot_hook.af_packet_priv = sk;
3258
0e11c91e
AV
3259 if (proto) {
3260 po->prot_hook.type = proto;
ce06b03e 3261 register_prot_hook(sk);
1da177e4
LT
3262 }
3263
0fa7fa98 3264 mutex_lock(&net->packet.sklist_lock);
808f5114 3265 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3266 mutex_unlock(&net->packet.sklist_lock);
3267
3268 preempt_disable();
3680453c 3269 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3270 preempt_enable();
808f5114 3271
40d4e3df 3272 return 0;
b0138408
DB
3273out2:
3274 sk_free(sk);
1da177e4
LT
3275out:
3276 return err;
3277}
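/* Usage sketch (illustrative): the three socket types created here map to
 *
 *	socket(AF_PACKET, SOCK_RAW,    htons(ETH_P_ALL)); // full link-layer frames
 *	socket(AF_PACKET, SOCK_DGRAM,  htons(ETH_P_IP));  // header stripped/built by the kernel
 *	socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); // obsolete SOCK_PACKET interface
 *
 * all of which require CAP_NET_RAW, as checked above. Passing protocol 0
 * leaves the socket unbound to any protocol until bind() supplies one.
 */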
3278
3279/*
3280 * Pull a packet from our receive queue and hand it to the user.
3281 * If necessary we block.
3282 */
3283
1b784140
YX
3284static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3285 int flags)
1da177e4
LT
3286{
3287 struct sock *sk = sock->sk;
3288 struct sk_buff *skb;
3289 int copied, err;
bfd5f4a3 3290 int vnet_hdr_len = 0;
2472d761 3291 unsigned int origlen = 0;
1da177e4
LT
3292
3293 err = -EINVAL;
ed85b565 3294 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3295 goto out;
3296
3297#if 0
3298 /* What error should we return now? EUNATTACH? */
3299 if (pkt_sk(sk)->ifindex < 0)
3300 return -ENODEV;
3301#endif
3302
ed85b565 3303 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3304 err = sock_recv_errqueue(sk, msg, len,
3305 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3306 goto out;
3307 }
3308
1da177e4
LT
3309 /*
3310 * Call the generic datagram receiver. This handles all sorts
3311 * of horrible races and re-entrancy so we can forget about it
3312 * in the protocol layers.
3313 *
 3314 * Now it will return ENETDOWN if the device has just gone down,
3315 * but then it will block.
3316 */
3317
40d4e3df 3318 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3319
3320 /*
1ce4f28b 3321 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3322 * handles the blocking, we don't need to see or worry about blocking
3323 * retries.
3324 */
3325
8ae55f04 3326 if (skb == NULL)
1da177e4
LT
3327 goto out;
3328
2ccdbaa6
WB
3329 if (pkt_sk(sk)->pressure)
3330 packet_rcv_has_room(pkt_sk(sk), NULL);
3331
bfd5f4a3 3332 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3333 err = packet_rcv_vnet(msg, skb, &len);
3334 if (err)
bfd5f4a3 3335 goto out_free;
16cc1400 3336 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3337 }
3338
f3d33426
HFS
3339 /* You lose any data beyond the buffer you gave. If it worries
3340 * a user program they can ask the device for its MTU
3341 * anyway.
1da177e4 3342 */
1da177e4 3343 copied = skb->len;
40d4e3df
ED
3344 if (copied > len) {
3345 copied = len;
3346 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3347 }
3348
51f3d02b 3349 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3350 if (err)
3351 goto out_free;
3352
2472d761
EB
3353 if (sock->type != SOCK_PACKET) {
3354 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3355
3356 /* Original length was stored in sockaddr_ll fields */
3357 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3358 sll->sll_family = AF_PACKET;
3359 sll->sll_protocol = skb->protocol;
3360 }
3361
3b885787 3362 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3363
f3d33426
HFS
3364 if (msg->msg_name) {
3365 /* If the address length field is there to be filled
3366 * in, we fill it in now.
3367 */
3368 if (sock->type == SOCK_PACKET) {
342dfc30 3369 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3370 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3371 } else {
3372 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3373
f3d33426
HFS
3374 msg->msg_namelen = sll->sll_halen +
3375 offsetof(struct sockaddr_ll, sll_addr);
3376 }
ffbc6111
HX
3377 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3378 msg->msg_namelen);
f3d33426 3379 }
1da177e4 3380
8dc41944 3381 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3382 struct tpacket_auxdata aux;
3383
3384 aux.tp_status = TP_STATUS_USER;
3385 if (skb->ip_summed == CHECKSUM_PARTIAL)
3386 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3387 else if (skb->pkt_type != PACKET_OUTGOING &&
3388 (skb->ip_summed == CHECKSUM_COMPLETE ||
3389 skb_csum_unnecessary(skb)))
3390 aux.tp_status |= TP_STATUS_CSUM_VALID;
3391
2472d761 3392 aux.tp_len = origlen;
ffbc6111
HX
3393 aux.tp_snaplen = skb->len;
3394 aux.tp_mac = 0;
bbe735e4 3395 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3396 if (skb_vlan_tag_present(skb)) {
3397 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3398 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3399 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3400 } else {
3401 aux.tp_vlan_tci = 0;
a0cdfcf3 3402 aux.tp_vlan_tpid = 0;
a3bcc23e 3403 }
ffbc6111 3404 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3405 }
3406
1da177e4
LT
3407 /*
3408 * Free or return the buffer as appropriate. Again this
3409 * hides all the races and re-entrancy issues from us.
3410 */
bfd5f4a3 3411 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3412
3413out_free:
3414 skb_free_datagram(sk, skb);
3415out:
3416 return err;
3417}
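/* Usage sketch (illustrative): with PACKET_AUXDATA enabled, the cmsg
 * emitted above is read back from the recvmsg() control buffer; msg is
 * the struct msghdr passed to recvmsg() and is assumed here.
 *
 *	struct cmsghdr *cmsg;
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			... aux->tp_status, aux->tp_len, aux->tp_vlan_tci ...
 *		}
 *	}
 */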
3418
1da177e4
LT
3419static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3420 int *uaddr_len, int peer)
3421{
3422 struct net_device *dev;
3423 struct sock *sk = sock->sk;
3424
3425 if (peer)
3426 return -EOPNOTSUPP;
3427
3428 uaddr->sa_family = AF_PACKET;
2dc85bf3 3429 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3430 rcu_read_lock();
3431 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3432 if (dev)
2dc85bf3 3433 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3434 rcu_read_unlock();
1da177e4
LT
3435 *uaddr_len = sizeof(*uaddr);
3436
3437 return 0;
3438}
1da177e4
LT
3439
3440static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3441 int *uaddr_len, int peer)
3442{
3443 struct net_device *dev;
3444 struct sock *sk = sock->sk;
3445 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3446 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3447
3448 if (peer)
3449 return -EOPNOTSUPP;
3450
3451 sll->sll_family = AF_PACKET;
3452 sll->sll_ifindex = po->ifindex;
3453 sll->sll_protocol = po->num;
67286640 3454 sll->sll_pkttype = 0;
654d1f8a
ED
3455 rcu_read_lock();
3456 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3457 if (dev) {
3458 sll->sll_hatype = dev->type;
3459 sll->sll_halen = dev->addr_len;
3460 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3461 } else {
3462 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3463 sll->sll_halen = 0;
3464 }
654d1f8a 3465 rcu_read_unlock();
0fb375fb 3466 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3467
3468 return 0;
3469}
3470
2aeb0b88
WC
3471static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3472 int what)
1da177e4
LT
3473{
3474 switch (i->type) {
3475 case PACKET_MR_MULTICAST:
1162563f
JP
3476 if (i->alen != dev->addr_len)
3477 return -EINVAL;
1da177e4 3478 if (what > 0)
22bedad3 3479 return dev_mc_add(dev, i->addr);
1da177e4 3480 else
22bedad3 3481 return dev_mc_del(dev, i->addr);
1da177e4
LT
3482 break;
3483 case PACKET_MR_PROMISC:
2aeb0b88 3484 return dev_set_promiscuity(dev, what);
1da177e4 3485 case PACKET_MR_ALLMULTI:
2aeb0b88 3486 return dev_set_allmulti(dev, what);
d95ed927 3487 case PACKET_MR_UNICAST:
1162563f
JP
3488 if (i->alen != dev->addr_len)
3489 return -EINVAL;
d95ed927 3490 if (what > 0)
a748ee24 3491 return dev_uc_add(dev, i->addr);
d95ed927 3492 else
a748ee24 3493 return dev_uc_del(dev, i->addr);
d95ed927 3494 break;
40d4e3df
ED
3495 default:
3496 break;
1da177e4 3497 }
2aeb0b88 3498 return 0;
1da177e4
LT
3499}
3500
82f17091
FR
3501static void packet_dev_mclist_delete(struct net_device *dev,
3502 struct packet_mclist **mlp)
1da177e4 3503{
82f17091
FR
3504 struct packet_mclist *ml;
3505
3506 while ((ml = *mlp) != NULL) {
3507 if (ml->ifindex == dev->ifindex) {
3508 packet_dev_mc(dev, ml, -1);
3509 *mlp = ml->next;
3510 kfree(ml);
3511 } else
3512 mlp = &ml->next;
1da177e4
LT
3513 }
3514}
3515
0fb375fb 3516static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3517{
3518 struct packet_sock *po = pkt_sk(sk);
3519 struct packet_mclist *ml, *i;
3520 struct net_device *dev;
3521 int err;
3522
3523 rtnl_lock();
3524
3525 err = -ENODEV;
3b1e0a65 3526 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3527 if (!dev)
3528 goto done;
3529
3530 err = -EINVAL;
1162563f 3531 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3532 goto done;
3533
3534 err = -ENOBUFS;
8b3a7005 3535 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3536 if (i == NULL)
3537 goto done;
3538
3539 err = 0;
3540 for (ml = po->mclist; ml; ml = ml->next) {
3541 if (ml->ifindex == mreq->mr_ifindex &&
3542 ml->type == mreq->mr_type &&
3543 ml->alen == mreq->mr_alen &&
3544 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3545 ml->count++;
3546 /* Free the new element ... */
3547 kfree(i);
3548 goto done;
3549 }
3550 }
3551
3552 i->type = mreq->mr_type;
3553 i->ifindex = mreq->mr_ifindex;
3554 i->alen = mreq->mr_alen;
3555 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3556 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3557 i->count = 1;
3558 i->next = po->mclist;
3559 po->mclist = i;
2aeb0b88
WC
3560 err = packet_dev_mc(dev, i, 1);
3561 if (err) {
3562 po->mclist = i->next;
3563 kfree(i);
3564 }
1da177e4
LT
3565
3566done:
3567 rtnl_unlock();
3568 return err;
3569}
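/* Usage sketch (illustrative): the common way to put an interface into
 * promiscuous mode through this path is PACKET_ADD_MEMBERSHIP with
 * PACKET_MR_PROMISC, which is reference-counted per socket; fd and
 * "eth0" are assumed.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */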
3570
0fb375fb 3571static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3572{
3573 struct packet_mclist *ml, **mlp;
3574
3575 rtnl_lock();
3576
3577 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3578 if (ml->ifindex == mreq->mr_ifindex &&
3579 ml->type == mreq->mr_type &&
3580 ml->alen == mreq->mr_alen &&
3581 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3582 if (--ml->count == 0) {
3583 struct net_device *dev;
3584 *mlp = ml->next;
ad959e76
ED
3585 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3586 if (dev)
1da177e4 3587 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3588 kfree(ml);
3589 }
82f17091 3590 break;
1da177e4
LT
3591 }
3592 }
3593 rtnl_unlock();
82f17091 3594 return 0;
1da177e4
LT
3595}
3596
3597static void packet_flush_mclist(struct sock *sk)
3598{
3599 struct packet_sock *po = pkt_sk(sk);
3600 struct packet_mclist *ml;
3601
3602 if (!po->mclist)
3603 return;
3604
3605 rtnl_lock();
3606 while ((ml = po->mclist) != NULL) {
3607 struct net_device *dev;
3608
3609 po->mclist = ml->next;
ad959e76
ED
3610 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3611 if (dev != NULL)
1da177e4 3612 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3613 kfree(ml);
3614 }
3615 rtnl_unlock();
3616}
1da177e4
LT
3617
3618static int
b7058842 3619packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3620{
3621 struct sock *sk = sock->sk;
8dc41944 3622 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3623 int ret;
3624
3625 if (level != SOL_PACKET)
3626 return -ENOPROTOOPT;
3627
69e3c75f 3628 switch (optname) {
1ce4f28b 3629 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3630 case PACKET_DROP_MEMBERSHIP:
3631 {
0fb375fb
EB
3632 struct packet_mreq_max mreq;
3633 int len = optlen;
3634 memset(&mreq, 0, sizeof(mreq));
3635 if (len < sizeof(struct packet_mreq))
1da177e4 3636 return -EINVAL;
0fb375fb
EB
3637 if (len > sizeof(mreq))
3638 len = sizeof(mreq);
40d4e3df 3639 if (copy_from_user(&mreq, optval, len))
1da177e4 3640 return -EFAULT;
0fb375fb
EB
3641 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3642 return -EINVAL;
1da177e4
LT
3643 if (optname == PACKET_ADD_MEMBERSHIP)
3644 ret = packet_mc_add(sk, &mreq);
3645 else
3646 ret = packet_mc_drop(sk, &mreq);
3647 return ret;
3648 }
a2efcfa0 3649
1da177e4 3650 case PACKET_RX_RING:
69e3c75f 3651 case PACKET_TX_RING:
1da177e4 3652 {
f6fb8f10 3653 union tpacket_req_u req_u;
3654 int len;
1da177e4 3655
f6fb8f10 3656 switch (po->tp_version) {
3657 case TPACKET_V1:
3658 case TPACKET_V2:
3659 len = sizeof(req_u.req);
3660 break;
3661 case TPACKET_V3:
3662 default:
3663 len = sizeof(req_u.req3);
3664 break;
3665 }
3666 if (optlen < len)
1da177e4 3667 return -EINVAL;
f6fb8f10 3668 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3669 return -EFAULT;
f6fb8f10 3670 return packet_set_ring(sk, &req_u, 0,
3671 optname == PACKET_TX_RING);
1da177e4
LT
3672 }
3673 case PACKET_COPY_THRESH:
3674 {
3675 int val;
3676
40d4e3df 3677 if (optlen != sizeof(val))
1da177e4 3678 return -EINVAL;
40d4e3df 3679 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3680 return -EFAULT;
3681
3682 pkt_sk(sk)->copy_thresh = val;
3683 return 0;
3684 }
bbd6ef87
PM
3685 case PACKET_VERSION:
3686 {
3687 int val;
3688
3689 if (optlen != sizeof(val))
3690 return -EINVAL;
bbd6ef87
PM
3691 if (copy_from_user(&val, optval, sizeof(val)))
3692 return -EFAULT;
3693 switch (val) {
3694 case TPACKET_V1:
3695 case TPACKET_V2:
f6fb8f10 3696 case TPACKET_V3:
84ac7260 3697 break;
bbd6ef87
PM
3698 default:
3699 return -EINVAL;
3700 }
84ac7260
PP
3701 lock_sock(sk);
3702 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3703 ret = -EBUSY;
3704 } else {
3705 po->tp_version = val;
3706 ret = 0;
3707 }
3708 release_sock(sk);
3709 return ret;
bbd6ef87 3710 }
8913336a
PM
3711 case PACKET_RESERVE:
3712 {
3713 unsigned int val;
3714
3715 if (optlen != sizeof(val))
3716 return -EINVAL;
8913336a
PM
3717 if (copy_from_user(&val, optval, sizeof(val)))
3718 return -EFAULT;
bcc5364b
AK
3719 if (val > INT_MAX)
3720 return -EINVAL;
c27927e3
WB
3721 lock_sock(sk);
3722 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3723 ret = -EBUSY;
3724 } else {
3725 po->tp_reserve = val;
3726 ret = 0;
3727 }
3728 release_sock(sk);
3729 return ret;
8913336a 3730 }
69e3c75f
JB
3731 case PACKET_LOSS:
3732 {
3733 unsigned int val;
3734
3735 if (optlen != sizeof(val))
3736 return -EINVAL;
3737 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3738 return -EBUSY;
3739 if (copy_from_user(&val, optval, sizeof(val)))
3740 return -EFAULT;
3741 po->tp_loss = !!val;
3742 return 0;
3743 }
8dc41944
HX
3744 case PACKET_AUXDATA:
3745 {
3746 int val;
3747
3748 if (optlen < sizeof(val))
3749 return -EINVAL;
3750 if (copy_from_user(&val, optval, sizeof(val)))
3751 return -EFAULT;
3752
3753 po->auxdata = !!val;
3754 return 0;
3755 }
80feaacb
PWJ
3756 case PACKET_ORIGDEV:
3757 {
3758 int val;
3759
3760 if (optlen < sizeof(val))
3761 return -EINVAL;
3762 if (copy_from_user(&val, optval, sizeof(val)))
3763 return -EFAULT;
3764
3765 po->origdev = !!val;
3766 return 0;
3767 }
bfd5f4a3
SS
3768 case PACKET_VNET_HDR:
3769 {
3770 int val;
3771
3772 if (sock->type != SOCK_RAW)
3773 return -EINVAL;
3774 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3775 return -EBUSY;
3776 if (optlen < sizeof(val))
3777 return -EINVAL;
3778 if (copy_from_user(&val, optval, sizeof(val)))
3779 return -EFAULT;
3780
3781 po->has_vnet_hdr = !!val;
3782 return 0;
3783 }
614f60fa
SM
3784 case PACKET_TIMESTAMP:
3785 {
3786 int val;
3787
3788 if (optlen != sizeof(val))
3789 return -EINVAL;
3790 if (copy_from_user(&val, optval, sizeof(val)))
3791 return -EFAULT;
3792
3793 po->tp_tstamp = val;
3794 return 0;
3795 }
dc99f600
DM
3796 case PACKET_FANOUT:
3797 {
3798 int val;
3799
3800 if (optlen != sizeof(val))
3801 return -EINVAL;
3802 if (copy_from_user(&val, optval, sizeof(val)))
3803 return -EFAULT;
3804
3805 return fanout_add(sk, val & 0xffff, val >> 16);
3806 }
47dceb8e
WB
3807 case PACKET_FANOUT_DATA:
3808 {
3809 if (!po->fanout)
3810 return -EINVAL;
3811
3812 return fanout_set_data(po, optval, optlen);
3813 }
5920cd3a
PC
3814 case PACKET_TX_HAS_OFF:
3815 {
3816 unsigned int val;
3817
3818 if (optlen != sizeof(val))
3819 return -EINVAL;
3820 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3821 return -EBUSY;
3822 if (copy_from_user(&val, optval, sizeof(val)))
3823 return -EFAULT;
3824 po->tp_tx_has_off = !!val;
3825 return 0;
3826 }
d346a3fa
DB
3827 case PACKET_QDISC_BYPASS:
3828 {
3829 int val;
3830
3831 if (optlen != sizeof(val))
3832 return -EINVAL;
3833 if (copy_from_user(&val, optval, sizeof(val)))
3834 return -EFAULT;
3835
3836 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3837 return 0;
3838 }
1da177e4
LT
3839 default:
3840 return -ENOPROTOOPT;
3841 }
3842}
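/*
 * Illustrative userspace sketch (not part of af_packet.c): the option
 * ordering that packet_setsockopt() above enforces.  PACKET_VERSION
 * (and PACKET_RESERVE) return -EBUSY once a ring exists, so the version
 * must be chosen before PACKET_RX_RING.  The sizes below are examples
 * only; they satisfy the checks in packet_set_ring() (page-aligned block
 * size, frame size a multiple of TPACKET_ALIGNMENT, and
 * tp_frame_nr == frames-per-block * tp_block_nr).
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_v2_rx_ring(int fd)
{
	int ver = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size	= 1 << 16,	/* 64 KiB */
		.tp_block_nr	= 64,
		.tp_frame_size	= 1 << 11,	/* 2 KiB */
		.tp_frame_nr	= (1 << 16) / (1 << 11) * 64,
	};

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}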
3843
3844static int packet_getsockopt(struct socket *sock, int level, int optname,
3845 char __user *optval, int __user *optlen)
3846{
3847 int len;
c06fff6e 3848 int val, lv = sizeof(val);
1da177e4
LT
3849 struct sock *sk = sock->sk;
3850 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3851 void *data = &val;
ee80fbf3 3852 union tpacket_stats_u st;
a9b63918 3853 struct tpacket_rollover_stats rstats;
1da177e4
LT
3854
3855 if (level != SOL_PACKET)
3856 return -ENOPROTOOPT;
3857
8ae55f04
KK
3858 if (get_user(len, optlen))
3859 return -EFAULT;
1da177e4
LT
3860
3861 if (len < 0)
3862 return -EINVAL;
1ce4f28b 3863
69e3c75f 3864 switch (optname) {
1da177e4 3865 case PACKET_STATISTICS:
1da177e4 3866 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3867 memcpy(&st, &po->stats, sizeof(st));
3868 memset(&po->stats, 0, sizeof(po->stats));
3869 spin_unlock_bh(&sk->sk_receive_queue.lock);
3870
f6fb8f10 3871 if (po->tp_version == TPACKET_V3) {
c06fff6e 3872 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3873 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3874 data = &st.stats3;
f6fb8f10 3875 } else {
c06fff6e 3876 lv = sizeof(struct tpacket_stats);
8bcdeaff 3877 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3878 data = &st.stats1;
f6fb8f10 3879 }
ee80fbf3 3880
8dc41944
HX
3881 break;
3882 case PACKET_AUXDATA:
8dc41944 3883 val = po->auxdata;
80feaacb
PWJ
3884 break;
3885 case PACKET_ORIGDEV:
80feaacb 3886 val = po->origdev;
bfd5f4a3
SS
3887 break;
3888 case PACKET_VNET_HDR:
bfd5f4a3 3889 val = po->has_vnet_hdr;
1da177e4 3890 break;
bbd6ef87 3891 case PACKET_VERSION:
bbd6ef87 3892 val = po->tp_version;
bbd6ef87
PM
3893 break;
3894 case PACKET_HDRLEN:
3895 if (len > sizeof(int))
3896 len = sizeof(int);
fd2c83b3
AP
3897 if (len < sizeof(int))
3898 return -EINVAL;
bbd6ef87
PM
3899 if (copy_from_user(&val, optval, len))
3900 return -EFAULT;
3901 switch (val) {
3902 case TPACKET_V1:
3903 val = sizeof(struct tpacket_hdr);
3904 break;
3905 case TPACKET_V2:
3906 val = sizeof(struct tpacket2_hdr);
3907 break;
f6fb8f10 3908 case TPACKET_V3:
3909 val = sizeof(struct tpacket3_hdr);
3910 break;
bbd6ef87
PM
3911 default:
3912 return -EINVAL;
3913 }
bbd6ef87 3914 break;
8913336a 3915 case PACKET_RESERVE:
8913336a 3916 val = po->tp_reserve;
8913336a 3917 break;
69e3c75f 3918 case PACKET_LOSS:
69e3c75f 3919 val = po->tp_loss;
69e3c75f 3920 break;
614f60fa 3921 case PACKET_TIMESTAMP:
614f60fa 3922 val = po->tp_tstamp;
614f60fa 3923 break;
dc99f600 3924 case PACKET_FANOUT:
dc99f600
DM
3925 val = (po->fanout ?
3926 ((u32)po->fanout->id |
77f65ebd
WB
3927 ((u32)po->fanout->type << 16) |
3928 ((u32)po->fanout->flags << 24)) :
dc99f600 3929 0);
dc99f600 3930 break;
a9b63918
WB
3931 case PACKET_ROLLOVER_STATS:
3932 if (!po->rollover)
3933 return -EINVAL;
3934 rstats.tp_all = atomic_long_read(&po->rollover->num);
3935 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3936 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3937 data = &rstats;
3938 lv = sizeof(rstats);
3939 break;
5920cd3a
PC
3940 case PACKET_TX_HAS_OFF:
3941 val = po->tp_tx_has_off;
3942 break;
d346a3fa
DB
3943 case PACKET_QDISC_BYPASS:
3944 val = packet_use_direct_xmit(po);
3945 break;
1da177e4
LT
3946 default:
3947 return -ENOPROTOOPT;
3948 }
3949
c06fff6e
ED
3950 if (len > lv)
3951 len = lv;
8ae55f04
KK
3952 if (put_user(len, optlen))
3953 return -EFAULT;
8dc41944
HX
3954 if (copy_to_user(optval, data, len))
3955 return -EFAULT;
8ae55f04 3956 return 0;
1da177e4
LT
3957}
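/*
 * Illustrative userspace sketch (not part of af_packet.c): reading the
 * counters served by the PACKET_STATISTICS case above.  For TPACKET_V1/V2
 * sockets the kernel copies out struct tpacket_stats and zeroes its own
 * copy, so each call returns deltas since the previous one; note that
 * tp_packets already includes tp_drops, as added above.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets %u dropped %u\n", st.tp_packets, st.tp_drops);
}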
3958
3959
719c44d3
WB
3960#ifdef CONFIG_COMPAT
3961static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3962 char __user *optval, unsigned int optlen)
3963{
3964 struct packet_sock *po = pkt_sk(sock->sk);
3965
3966 if (level != SOL_PACKET)
3967 return -ENOPROTOOPT;
3968
3969 if (optname == PACKET_FANOUT_DATA &&
3970 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3971 optval = (char __user *)get_compat_bpf_fprog(optval);
3972 if (!optval)
3973 return -EFAULT;
3974 optlen = sizeof(struct sock_fprog);
3975 }
3976
3977 return packet_setsockopt(sock, level, optname, optval, optlen);
3978}
3979#endif
3980
351638e7
JP
3981static int packet_notifier(struct notifier_block *this,
3982 unsigned long msg, void *ptr)
1da177e4
LT
3983{
3984 struct sock *sk;
351638e7 3985 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3986 struct net *net = dev_net(dev);
1da177e4 3987
808f5114 3988 rcu_read_lock();
b67bfe0d 3989 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3990 struct packet_sock *po = pkt_sk(sk);
3991
3992 switch (msg) {
3993 case NETDEV_UNREGISTER:
1da177e4 3994 if (po->mclist)
82f17091 3995 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3996 /* fallthrough */
3997
1da177e4
LT
3998 case NETDEV_DOWN:
3999 if (dev->ifindex == po->ifindex) {
4000 spin_lock(&po->bind_lock);
4001 if (po->running) {
ce06b03e 4002 __unregister_prot_hook(sk, false);
1da177e4
LT
4003 sk->sk_err = ENETDOWN;
4004 if (!sock_flag(sk, SOCK_DEAD))
4005 sk->sk_error_report(sk);
4006 }
4007 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4008 packet_cached_dev_reset(po);
1da177e4 4009 po->ifindex = -1;
160ff18a
BG
4010 if (po->prot_hook.dev)
4011 dev_put(po->prot_hook.dev);
1da177e4
LT
4012 po->prot_hook.dev = NULL;
4013 }
4014 spin_unlock(&po->bind_lock);
4015 }
4016 break;
4017 case NETDEV_UP:
808f5114 4018 if (dev->ifindex == po->ifindex) {
4019 spin_lock(&po->bind_lock);
ce06b03e
DM
4020 if (po->num)
4021 register_prot_hook(sk);
808f5114 4022 spin_unlock(&po->bind_lock);
1da177e4 4023 }
1da177e4
LT
4024 break;
4025 }
4026 }
808f5114 4027 rcu_read_unlock();
1da177e4
LT
4028 return NOTIFY_DONE;
4029}
4030
4031
4032static int packet_ioctl(struct socket *sock, unsigned int cmd,
4033 unsigned long arg)
4034{
4035 struct sock *sk = sock->sk;
4036
69e3c75f 4037 switch (cmd) {
40d4e3df
ED
4038 case SIOCOUTQ:
4039 {
4040 int amount = sk_wmem_alloc_get(sk);
31e6d363 4041
40d4e3df
ED
4042 return put_user(amount, (int __user *)arg);
4043 }
4044 case SIOCINQ:
4045 {
4046 struct sk_buff *skb;
4047 int amount = 0;
4048
4049 spin_lock_bh(&sk->sk_receive_queue.lock);
4050 skb = skb_peek(&sk->sk_receive_queue);
4051 if (skb)
4052 amount = skb->len;
4053 spin_unlock_bh(&sk->sk_receive_queue.lock);
4054 return put_user(amount, (int __user *)arg);
4055 }
4056 case SIOCGSTAMP:
4057 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4058 case SIOCGSTAMPNS:
4059 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4060
1da177e4 4061#ifdef CONFIG_INET
40d4e3df
ED
4062 case SIOCADDRT:
4063 case SIOCDELRT:
4064 case SIOCDARP:
4065 case SIOCGARP:
4066 case SIOCSARP:
4067 case SIOCGIFADDR:
4068 case SIOCSIFADDR:
4069 case SIOCGIFBRDADDR:
4070 case SIOCSIFBRDADDR:
4071 case SIOCGIFNETMASK:
4072 case SIOCSIFNETMASK:
4073 case SIOCGIFDSTADDR:
4074 case SIOCSIFDSTADDR:
4075 case SIOCSIFFLAGS:
40d4e3df 4076 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4077#endif
4078
40d4e3df
ED
4079 default:
4080 return -ENOIOCTLCMD;
1da177e4
LT
4081 }
4082 return 0;
4083}
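/*
 * Illustrative userspace sketch (not part of af_packet.c): SIOCINQ as
 * handled above reports the length of the packet at the head of the
 * receive queue (0 when the queue is empty), which can be used to size
 * a recv() buffer.  Error handling is minimal.
 */
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>

static ssize_t recv_next(int fd, void **bufp)
{
	int len = 0;

	if (ioctl(fd, SIOCINQ, &len) < 0 || len == 0)
		return 0;
	*bufp = malloc(len);
	if (!*bufp)
		return -1;
	return recv(fd, *bufp, len, 0);
}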
4084
40d4e3df 4085static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4086 poll_table *wait)
4087{
4088 struct sock *sk = sock->sk;
4089 struct packet_sock *po = pkt_sk(sk);
4090 unsigned int mask = datagram_poll(file, sock, wait);
4091
4092 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4093 if (po->rx_ring.pg_vec) {
f6fb8f10 4094 if (!packet_previous_rx_frame(po, &po->rx_ring,
4095 TP_STATUS_KERNEL))
1da177e4
LT
4096 mask |= POLLIN | POLLRDNORM;
4097 }
2ccdbaa6 4098 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4099 po->pressure = 0;
1da177e4 4100 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4101 spin_lock_bh(&sk->sk_write_queue.lock);
4102 if (po->tx_ring.pg_vec) {
4103 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4104 mask |= POLLOUT | POLLWRNORM;
4105 }
4106 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4107 return mask;
4108}
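/*
 * Illustrative userspace sketch (not part of af_packet.c): the consumer
 * loop packet_poll() above is written for.  poll() reports POLLIN once a
 * ring frame has been handed to userspace; the consumer processes frames
 * whose tp_status contains TP_STATUS_USER and returns them with
 * TP_STATUS_KERNEL.  This assumes a TPACKET_V2 ring whose block size is
 * an exact multiple of the frame size (so frames tile the mapping) and
 * omits the memory barriers production code would add around tp_status.
 */
#include <poll.h>
#include <linux/if_packet.h>

static void rx_loop(int fd, char *ring, unsigned int frame_nr,
		    unsigned int frame_size)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	unsigned int i = 0;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			poll(&pfd, 1, -1);
			continue;
		}
		/* frame data starts at (char *)hdr + hdr->tp_mac */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand frame back */
		i = (i + 1) % frame_nr;
	}
}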
4109
4110
4111 /* Dirty? Well, I still have not found a better way to account
4112 * for user mmaps.
4113 */
4114
4115static void packet_mm_open(struct vm_area_struct *vma)
4116{
4117 struct file *file = vma->vm_file;
40d4e3df 4118 struct socket *sock = file->private_data;
1da177e4 4119 struct sock *sk = sock->sk;
1ce4f28b 4120
1da177e4
LT
4121 if (sk)
4122 atomic_inc(&pkt_sk(sk)->mapped);
4123}
4124
4125static void packet_mm_close(struct vm_area_struct *vma)
4126{
4127 struct file *file = vma->vm_file;
40d4e3df 4128 struct socket *sock = file->private_data;
1da177e4 4129 struct sock *sk = sock->sk;
1ce4f28b 4130
1da177e4
LT
4131 if (sk)
4132 atomic_dec(&pkt_sk(sk)->mapped);
4133}
4134
f0f37e2f 4135static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4136 .open = packet_mm_open,
4137 .close = packet_mm_close,
1da177e4
LT
4138};
4139
0e3125c7
NH
4140static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4141 unsigned int len)
1da177e4
LT
4142{
4143 int i;
4144
4ebf0ae2 4145 for (i = 0; i < len; i++) {
0e3125c7 4146 if (likely(pg_vec[i].buffer)) {
c56b4d90 4147 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4148 vfree(pg_vec[i].buffer);
4149 else
4150 free_pages((unsigned long)pg_vec[i].buffer,
4151 order);
4152 pg_vec[i].buffer = NULL;
4153 }
1da177e4
LT
4154 }
4155 kfree(pg_vec);
4156}
4157
eea49cc9 4158static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4159{
f0d4eb29 4160 char *buffer;
0e3125c7
NH
4161 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4162 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4163
4164 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4165 if (buffer)
4166 return buffer;
4167
f0d4eb29 4168 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4169 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4170 if (buffer)
4171 return buffer;
4172
f0d4eb29 4173 /* vmalloc failed, let's dig into swap here */
0e3125c7 4174 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4175 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4176 if (buffer)
4177 return buffer;
4178
f0d4eb29 4179 /* complete and utter failure */
0e3125c7 4180 return NULL;
4ebf0ae2
DM
4181}
4182
0e3125c7 4183static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4184{
4185 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4186 struct pgv *pg_vec;
4ebf0ae2
DM
4187 int i;
4188
0e3125c7 4189 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4190 if (unlikely(!pg_vec))
4191 goto out;
4192
4193 for (i = 0; i < block_nr; i++) {
c56b4d90 4194 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4195 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4196 goto out_free_pgvec;
4197 }
4198
4199out:
4200 return pg_vec;
4201
4202out_free_pgvec:
4203 free_pg_vec(pg_vec, order, block_nr);
4204 pg_vec = NULL;
4205 goto out;
4206}
1da177e4 4207
f6fb8f10 4208static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4209 int closing, int tx_ring)
1da177e4 4210{
0e3125c7 4211 struct pgv *pg_vec = NULL;
1da177e4 4212 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4213 int was_running, order = 0;
69e3c75f
JB
4214 struct packet_ring_buffer *rb;
4215 struct sk_buff_head *rb_queue;
0e11c91e 4216 __be16 num;
f6fb8f10 4217 int err = -EINVAL;
 4218 /* Local alias added to keep code churn minimal */
4219 struct tpacket_req *req = &req_u->req;
4220
84ac7260 4221 lock_sock(sk);
1ce4f28b 4222
69e3c75f
JB
4223 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4224 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4225
69e3c75f
JB
4226 err = -EBUSY;
4227 if (!closing) {
4228 if (atomic_read(&po->mapped))
4229 goto out;
b0138408 4230 if (packet_read_pending(rb))
69e3c75f
JB
4231 goto out;
4232 }
1da177e4 4233
69e3c75f
JB
4234 if (req->tp_block_nr) {
4235 /* Sanity tests and some calculations */
4236 err = -EBUSY;
4237 if (unlikely(rb->pg_vec))
4238 goto out;
1da177e4 4239
bbd6ef87
PM
4240 switch (po->tp_version) {
4241 case TPACKET_V1:
4242 po->tp_hdrlen = TPACKET_HDRLEN;
4243 break;
4244 case TPACKET_V2:
4245 po->tp_hdrlen = TPACKET2_HDRLEN;
4246 break;
f6fb8f10 4247 case TPACKET_V3:
4248 po->tp_hdrlen = TPACKET3_HDRLEN;
4249 break;
bbd6ef87
PM
4250 }
4251
69e3c75f 4252 err = -EINVAL;
4ebf0ae2 4253 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4254 goto out;
90836b67 4255 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4256 goto out;
dc808110 4257 if (po->tp_version >= TPACKET_V3 &&
2b6867c2
AK
4258 req->tp_block_size <=
4259 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
dc808110 4260 goto out;
8913336a 4261 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4262 po->tp_reserve))
4263 goto out;
4ebf0ae2 4264 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4265 goto out;
1da177e4 4266
4194b491
TK
4267 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4268 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4269 goto out;
8f8d28e4
AK
4270 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4271 goto out;
69e3c75f
JB
4272 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4273 req->tp_frame_nr))
4274 goto out;
1da177e4
LT
4275
4276 err = -ENOMEM;
4ebf0ae2
DM
4277 order = get_order(req->tp_block_size);
4278 pg_vec = alloc_pg_vec(req, order);
4279 if (unlikely(!pg_vec))
1da177e4 4280 goto out;
f6fb8f10 4281 switch (po->tp_version) {
4282 case TPACKET_V3:
7f953ab2
SV
4283 /* Block transmit is not supported yet */
4284 if (!tx_ring) {
e8e85cc5 4285 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4286 } else {
4287 struct tpacket_req3 *req3 = &req_u->req3;
4288
4289 if (req3->tp_retire_blk_tov ||
4290 req3->tp_sizeof_priv ||
4291 req3->tp_feature_req_word) {
4292 err = -EINVAL;
4293 goto out;
4294 }
4295 }
d7cf0c34 4296 break;
f6fb8f10 4297 default:
4298 break;
4299 }
69e3c75f
JB
4300 }
4301 /* Done */
4302 else {
4303 err = -EINVAL;
4ebf0ae2 4304 if (unlikely(req->tp_frame_nr))
69e3c75f 4305 goto out;
1da177e4
LT
4306 }
4307
1da177e4
LT
4308
4309 /* Detach socket from network */
4310 spin_lock(&po->bind_lock);
4311 was_running = po->running;
4312 num = po->num;
4313 if (was_running) {
1da177e4 4314 po->num = 0;
ce06b03e 4315 __unregister_prot_hook(sk, false);
1da177e4
LT
4316 }
4317 spin_unlock(&po->bind_lock);
1ce4f28b 4318
1da177e4
LT
4319 synchronize_net();
4320
4321 err = -EBUSY;
905db440 4322 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4323 if (closing || atomic_read(&po->mapped) == 0) {
4324 err = 0;
69e3c75f 4325 spin_lock_bh(&rb_queue->lock);
c053fd96 4326 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4327 rb->frame_max = (req->tp_frame_nr - 1);
4328 rb->head = 0;
4329 rb->frame_size = req->tp_frame_size;
4330 spin_unlock_bh(&rb_queue->lock);
4331
c053fd96
CG
4332 swap(rb->pg_vec_order, order);
4333 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4334
4335 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4336 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4337 tpacket_rcv : packet_rcv;
4338 skb_queue_purge(rb_queue);
1da177e4 4339 if (atomic_read(&po->mapped))
40d4e3df
ED
4340 pr_err("packet_mmap: vma is busy: %d\n",
4341 atomic_read(&po->mapped));
1da177e4 4342 }
905db440 4343 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4344
4345 spin_lock(&po->bind_lock);
ce06b03e 4346 if (was_running) {
1da177e4 4347 po->num = num;
ce06b03e 4348 register_prot_hook(sk);
1da177e4
LT
4349 }
4350 spin_unlock(&po->bind_lock);
c800aaf8 4351 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4352 /* Because we don't support block-based V3 on tx-ring */
4353 if (!tx_ring)
73d0fcf2 4354 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4355 }
1da177e4 4356
1da177e4
LT
4357 if (pg_vec)
4358 free_pg_vec(pg_vec, order, req->tp_block_nr);
4359out:
84ac7260 4360 release_sock(sk);
1da177e4
LT
4361 return err;
4362}
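/*
 * Illustrative userspace sketch (not part of af_packet.c): a TPACKET_V3
 * receive ring as consumed by packet_set_ring() above.  V3 adds the
 * block-retire timeout (in milliseconds) and a per-block private area;
 * for a V3 TX ring those extra fields must stay zero, as enforced above.
 * Sizes are examples only.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_v3_rx_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req = {
		.tp_block_size	   = 1 << 22,	/* 4 MiB per block */
		.tp_block_nr	   = 64,
		.tp_frame_size	   = 1 << 11,
		.tp_frame_nr	   = (1 << 22) / (1 << 11) * 64,
		.tp_retire_blk_tov = 60,
	};

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}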
4363
69e3c75f
JB
4364static int packet_mmap(struct file *file, struct socket *sock,
4365 struct vm_area_struct *vma)
1da177e4
LT
4366{
4367 struct sock *sk = sock->sk;
4368 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4369 unsigned long size, expected_size;
4370 struct packet_ring_buffer *rb;
1da177e4
LT
4371 unsigned long start;
4372 int err = -EINVAL;
4373 int i;
4374
4375 if (vma->vm_pgoff)
4376 return -EINVAL;
4377
905db440 4378 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4379
4380 expected_size = 0;
4381 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4382 if (rb->pg_vec) {
4383 expected_size += rb->pg_vec_len
4384 * rb->pg_vec_pages
4385 * PAGE_SIZE;
4386 }
4387 }
4388
4389 if (expected_size == 0)
1da177e4 4390 goto out;
69e3c75f
JB
4391
4392 size = vma->vm_end - vma->vm_start;
4393 if (size != expected_size)
1da177e4
LT
4394 goto out;
4395
1da177e4 4396 start = vma->vm_start;
69e3c75f
JB
4397 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4398 if (rb->pg_vec == NULL)
4399 continue;
4400
4401 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4402 struct page *page;
4403 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4404 int pg_num;
4405
c56b4d90
CG
4406 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4407 page = pgv_to_page(kaddr);
69e3c75f
JB
4408 err = vm_insert_page(vma, start, page);
4409 if (unlikely(err))
4410 goto out;
4411 start += PAGE_SIZE;
0e3125c7 4412 kaddr += PAGE_SIZE;
69e3c75f 4413 }
4ebf0ae2 4414 }
1da177e4 4415 }
69e3c75f 4416
4ebf0ae2 4417 atomic_inc(&po->mapped);
1da177e4
LT
4418 vma->vm_ops = &packet_mmap_ops;
4419 err = 0;
4420
4421out:
905db440 4422 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4423 return err;
4424}
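/*
 * Illustrative userspace sketch (not part of af_packet.c): mapping the
 * rings that packet_mmap() above exposes.  The mapping must start at
 * offset 0 and cover the RX and TX rings exactly; RX pages are inserted
 * first, so a TX ring (if configured) starts rx_bytes into the mapping.
 */
#include <stddef.h>
#include <sys/mman.h>

static void *map_rings(int fd, size_t rx_bytes, size_t tx_bytes,
		       void **tx_start)
{
	char *base;

	base = mmap(NULL, rx_bytes + tx_bytes, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;
	if (tx_start)
		*tx_start = base + rx_bytes;	/* TX ring follows RX ring */
	return base;
}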
1da177e4 4425
90ddc4f0 4426static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4427 .family = PF_PACKET,
4428 .owner = THIS_MODULE,
4429 .release = packet_release,
4430 .bind = packet_bind_spkt,
4431 .connect = sock_no_connect,
4432 .socketpair = sock_no_socketpair,
4433 .accept = sock_no_accept,
4434 .getname = packet_getname_spkt,
4435 .poll = datagram_poll,
4436 .ioctl = packet_ioctl,
4437 .listen = sock_no_listen,
4438 .shutdown = sock_no_shutdown,
4439 .setsockopt = sock_no_setsockopt,
4440 .getsockopt = sock_no_getsockopt,
4441 .sendmsg = packet_sendmsg_spkt,
4442 .recvmsg = packet_recvmsg,
4443 .mmap = sock_no_mmap,
4444 .sendpage = sock_no_sendpage,
4445};
1da177e4 4446
90ddc4f0 4447static const struct proto_ops packet_ops = {
1da177e4
LT
4448 .family = PF_PACKET,
4449 .owner = THIS_MODULE,
4450 .release = packet_release,
4451 .bind = packet_bind,
4452 .connect = sock_no_connect,
4453 .socketpair = sock_no_socketpair,
4454 .accept = sock_no_accept,
1ce4f28b 4455 .getname = packet_getname,
1da177e4
LT
4456 .poll = packet_poll,
4457 .ioctl = packet_ioctl,
4458 .listen = sock_no_listen,
4459 .shutdown = sock_no_shutdown,
4460 .setsockopt = packet_setsockopt,
4461 .getsockopt = packet_getsockopt,
719c44d3
WB
4462#ifdef CONFIG_COMPAT
4463 .compat_setsockopt = compat_packet_setsockopt,
4464#endif
1da177e4
LT
4465 .sendmsg = packet_sendmsg,
4466 .recvmsg = packet_recvmsg,
4467 .mmap = packet_mmap,
4468 .sendpage = sock_no_sendpage,
4469};
4470
ec1b4cf7 4471static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4472 .family = PF_PACKET,
4473 .create = packet_create,
4474 .owner = THIS_MODULE,
4475};
4476
4477static struct notifier_block packet_netdev_notifier = {
40d4e3df 4478 .notifier_call = packet_notifier,
1da177e4
LT
4479};
4480
4481#ifdef CONFIG_PROC_FS
1da177e4
LT
4482
4483static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4484 __acquires(RCU)
1da177e4 4485{
e372c414 4486 struct net *net = seq_file_net(seq);
808f5114 4487
4488 rcu_read_lock();
4489 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4490}
4491
4492static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4493{
1bf40954 4494 struct net *net = seq_file_net(seq);
808f5114 4495 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4496}
4497
4498static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4499 __releases(RCU)
1da177e4 4500{
808f5114 4501 rcu_read_unlock();
1da177e4
LT
4502}
4503
1ce4f28b 4504static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4505{
4506 if (v == SEQ_START_TOKEN)
4507 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4508 else {
b7ceabd9 4509 struct sock *s = sk_entry(v);
1da177e4
LT
4510 const struct packet_sock *po = pkt_sk(s);
4511
4512 seq_printf(seq,
71338aa7 4513 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4514 s,
41c6d650 4515 refcount_read(&s->sk_refcnt),
1da177e4
LT
4516 s->sk_type,
4517 ntohs(po->num),
4518 po->ifindex,
4519 po->running,
4520 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4521 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4522 sock_i_ino(s));
1da177e4
LT
4523 }
4524
4525 return 0;
4526}
4527
56b3d975 4528static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4529 .start = packet_seq_start,
4530 .next = packet_seq_next,
4531 .stop = packet_seq_stop,
4532 .show = packet_seq_show,
4533};
4534
4535static int packet_seq_open(struct inode *inode, struct file *file)
4536{
e372c414
DL
4537 return seq_open_net(inode, file, &packet_seq_ops,
4538 sizeof(struct seq_net_private));
1da177e4
LT
4539}
4540
da7071d7 4541static const struct file_operations packet_seq_fops = {
1da177e4
LT
4542 .owner = THIS_MODULE,
4543 .open = packet_seq_open,
4544 .read = seq_read,
4545 .llseek = seq_lseek,
e372c414 4546 .release = seq_release_net,
1da177e4
LT
4547};
4548
4549#endif
4550
2c8c1e72 4551static int __net_init packet_net_init(struct net *net)
d12d01d6 4552{
0fa7fa98 4553 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4554 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4555
d4beaa66 4556 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4557 return -ENOMEM;
4558
4559 return 0;
4560}
4561
2c8c1e72 4562static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4563{
ece31ffd 4564 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4565}
4566
4567static struct pernet_operations packet_net_ops = {
4568 .init = packet_net_init,
4569 .exit = packet_net_exit,
4570};
4571
4572
1da177e4
LT
4573static void __exit packet_exit(void)
4574{
1da177e4 4575 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4576 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4577 sock_unregister(PF_PACKET);
4578 proto_unregister(&packet_proto);
4579}
4580
4581static int __init packet_init(void)
4582{
4583 int rc = proto_register(&packet_proto, 0);
4584
4585 if (rc != 0)
4586 goto out;
4587
4588 sock_register(&packet_family_ops);
d12d01d6 4589 register_pernet_subsys(&packet_net_ops);
1da177e4 4590 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4591out:
4592 return rc;
4593}
4594
4595module_init(packet_init);
4596module_exit(packet_exit);
4597MODULE_LICENSE("GPL");
4598MODULE_ALIAS_NETPROTO(PF_PACKET);