[linux-2.6-block.git] / net / packet / af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
 100 - if a device has no dev->hard_header routine, it adds and removes the ll header
 101 itself. In this case the ll header is invisible outside of the device,
 102 but higher levels should still reserve dev->hard_header_len.
 103 Some devices are clever enough to reallocate the skb when the header
 104 will not fit into the reserved space (tunnels); others are silly
 105 (PPP).
 106 - the packet socket receives packets with the ll header already pulled,
 107 so SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
 121 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 122 header. PPP does this, which is wrong, because it introduces
db0c58f9 123 asymmetry between the rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
 130Summary
 131 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
 145 We should set nh.raw on output to the correct position;
 146 the packet classifier depends on it.
147 */
148
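/* Illustrative user-space sketch (not part of the kernel source; error
 * handling and the ifindex lookup are assumed to be done elsewhere): a
 * minimal SOCK_RAW packet socket.  As described above, SOCK_RAW pushes the
 * ll header back, so every frame read from this socket starts at the
 * link-layer header.
 */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int open_raw_packet_socket(int ifindex)
{
	struct sockaddr_ll sll = {
		.sll_family   = AF_PACKET,
		.sll_protocol = htons(ETH_P_ALL),
		.sll_ifindex  = ifindex,
	};
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0)
		return -1;
	return fd;	/* recv() now returns frames including the ll header */
}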
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
a2efcfa0 160
184f489e
DB
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f
JB
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
1da177e4
LT
216static void packet_flush_mclist(struct sock *sk);
217
ffbc6111 218struct packet_skb_cb {
ffbc6111
HX
219 union {
220 struct sockaddr_pkt pkt;
2472d761
EB
221 union {
222 /* Trick: alias skb original length with
223 * ll.sll_family and ll.protocol in order
224 * to save room.
225 */
226 unsigned int origlen;
227 struct sockaddr_ll ll;
228 };
ffbc6111
HX
229 } sa;
230};
231
232#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 233
bc59ba39 234#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 235#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 237#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 238 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 239#define GET_NEXT_PRB_BLK_NUM(x) \
240 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
241 ((x)->kactive_blk_num+1) : 0)
242
dc99f600
DM
243static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
244static void __fanout_link(struct sock *sk, struct packet_sock *po);
245
d346a3fa
DB
246static int packet_direct_xmit(struct sk_buff *skb)
247{
248 struct net_device *dev = skb->dev;
d346a3fa
DB
249 netdev_features_t features;
250 struct netdev_queue *txq;
43279500 251 int ret = NETDEV_TX_BUSY;
d346a3fa
DB
252
253 if (unlikely(!netif_running(dev) ||
43279500
DB
254 !netif_carrier_ok(dev)))
255 goto drop;
d346a3fa
DB
256
257 features = netif_skb_features(skb);
258 if (skb_needs_linearize(skb, features) &&
43279500
DB
259 __skb_linearize(skb))
260 goto drop;
d346a3fa 261
10c51b56 262 txq = skb_get_tx_queue(dev, skb);
d346a3fa 263
43279500
DB
264 local_bh_disable();
265
266 HARD_TX_LOCK(dev, txq, smp_processor_id());
10b3ad8c 267 if (!netif_xmit_frozen_or_drv_stopped(txq))
fa2dbdc2 268 ret = netdev_start_xmit(skb, dev, txq, false);
43279500 269 HARD_TX_UNLOCK(dev, txq);
d346a3fa 270
43279500
DB
271 local_bh_enable();
272
273 if (!dev_xmit_complete(ret))
d346a3fa 274 kfree_skb(skb);
43279500 275
d346a3fa 276 return ret;
43279500 277drop:
0f97ede4 278 atomic_long_inc(&dev->tx_dropped);
43279500
DB
279 kfree_skb(skb);
280 return NET_XMIT_DROP;
d346a3fa
DB
281}
282
66e56cd4
DB
283static struct net_device *packet_cached_dev_get(struct packet_sock *po)
284{
285 struct net_device *dev;
286
287 rcu_read_lock();
288 dev = rcu_dereference(po->cached_dev);
289 if (likely(dev))
290 dev_hold(dev);
291 rcu_read_unlock();
292
293 return dev;
294}
295
296static void packet_cached_dev_assign(struct packet_sock *po,
297 struct net_device *dev)
298{
299 rcu_assign_pointer(po->cached_dev, dev);
300}
301
302static void packet_cached_dev_reset(struct packet_sock *po)
303{
304 RCU_INIT_POINTER(po->cached_dev, NULL);
305}
306
d346a3fa
DB
307static bool packet_use_direct_xmit(const struct packet_sock *po)
308{
309 return po->xmit == packet_direct_xmit;
310}
311
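/* A hedged note rather than kernel code: po->xmit is expected to point at
 * packet_direct_xmit() once user space enables qdisc bypass, e.g.
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * after which frames sent on the socket skip the queueing discipline and are
 * handed straight to the driver by packet_direct_xmit() above.
 */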
0fd5d57b 312static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 313{
1cbac010 314 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
315}
316
0fd5d57b
DB
317static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
318{
319 const struct net_device_ops *ops = dev->netdev_ops;
320 u16 queue_index;
321
322 if (ops->ndo_select_queue) {
323 queue_index = ops->ndo_select_queue(dev, skb, NULL,
324 __packet_pick_tx_queue);
325 queue_index = netdev_cap_txqueue(dev, queue_index);
326 } else {
327 queue_index = __packet_pick_tx_queue(dev, skb);
328 }
329
330 skb_set_queue_mapping(skb, queue_index);
331}
332
ce06b03e
DM
333/* register_prot_hook must be invoked with the po->bind_lock held,
334 * or from a context in which asynchronous accesses to the packet
 335 * socket are not possible (packet_create()).
336 */
337static void register_prot_hook(struct sock *sk)
338{
339 struct packet_sock *po = pkt_sk(sk);
e40526cb 340
ce06b03e 341 if (!po->running) {
66e56cd4 342 if (po->fanout)
dc99f600 343 __fanout_link(sk, po);
66e56cd4 344 else
dc99f600 345 dev_add_pack(&po->prot_hook);
e40526cb 346
ce06b03e
DM
347 sock_hold(sk);
348 po->running = 1;
349 }
350}
351
352/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
353 * held. If the sync parameter is true, we will temporarily drop
354 * the po->bind_lock and do a synchronize_net to make sure no
355 * asynchronous packet processing paths still refer to the elements
356 * of po->prot_hook. If the sync parameter is false, it is the
 357 * caller's responsibility to take care of this.
358 */
359static void __unregister_prot_hook(struct sock *sk, bool sync)
360{
361 struct packet_sock *po = pkt_sk(sk);
362
363 po->running = 0;
66e56cd4
DB
364
365 if (po->fanout)
dc99f600 366 __fanout_unlink(sk, po);
66e56cd4 367 else
dc99f600 368 __dev_remove_pack(&po->prot_hook);
e40526cb 369
ce06b03e
DM
370 __sock_put(sk);
371
372 if (sync) {
373 spin_unlock(&po->bind_lock);
374 synchronize_net();
375 spin_lock(&po->bind_lock);
376 }
377}
378
379static void unregister_prot_hook(struct sock *sk, bool sync)
380{
381 struct packet_sock *po = pkt_sk(sk);
382
383 if (po->running)
384 __unregister_prot_hook(sk, sync);
385}
386
6e58040b 387static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
388{
389 if (is_vmalloc_addr(addr))
390 return vmalloc_to_page(addr);
391 return virt_to_page(addr);
392}
393
69e3c75f 394static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 395{
184f489e 396 union tpacket_uhdr h;
1da177e4 397
69e3c75f 398 h.raw = frame;
bbd6ef87
PM
399 switch (po->tp_version) {
400 case TPACKET_V1:
69e3c75f 401 h.h1->tp_status = status;
0af55bb5 402 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
403 break;
404 case TPACKET_V2:
69e3c75f 405 h.h2->tp_status = status;
0af55bb5 406 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 407 break;
f6fb8f10 408 case TPACKET_V3:
69e3c75f 409 default:
f6fb8f10 410 WARN(1, "TPACKET version not supported.\n");
69e3c75f 411 BUG();
bbd6ef87 412 }
69e3c75f
JB
413
414 smp_wmb();
bbd6ef87
PM
415}
416
69e3c75f 417static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 418{
184f489e 419 union tpacket_uhdr h;
bbd6ef87 420
69e3c75f
JB
421 smp_rmb();
422
bbd6ef87
PM
423 h.raw = frame;
424 switch (po->tp_version) {
425 case TPACKET_V1:
0af55bb5 426 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 427 return h.h1->tp_status;
bbd6ef87 428 case TPACKET_V2:
0af55bb5 429 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 430 return h.h2->tp_status;
f6fb8f10 431 case TPACKET_V3:
69e3c75f 432 default:
f6fb8f10 433 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
434 BUG();
435 return 0;
bbd6ef87 436 }
1da177e4 437}
69e3c75f 438
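/* Illustrative user-space counterpart of the status handshake above (a
 * sketch, assuming a TPACKET_V2 RX ring mapped at 'ring' with tp_block_size
 * a multiple of tp_frame_size, so frames are contiguous): the kernel hands a
 * frame over by setting TP_STATUS_USER, and user space returns it by writing
 * TP_STATUS_KERNEL back.
 */
#include <linux/if_packet.h>

static void drain_v2_ring(char *ring, unsigned int frame_nr,
			  unsigned int frame_size)
{
	unsigned int i;

	for (i = 0; i < frame_nr; i++) {
		struct tpacket2_hdr *hdr = (void *)(ring + i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			continue;	/* frame still owned by the kernel */
		/* ... consume hdr->tp_snaplen bytes at (char *)hdr + hdr->tp_mac ... */
		__sync_synchronize();	/* pair with the kernel's smp_wmb() */
		hdr->tp_status = TP_STATUS_KERNEL;
	}
}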
b9c32fb2
DB
439static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
440 unsigned int flags)
7a51384c
DB
441{
442 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
443
68a360e8
WB
444 if (shhwtstamps &&
445 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
446 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
447 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
448
449 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 450 return TP_STATUS_TS_SOFTWARE;
7a51384c 451
b9c32fb2 452 return 0;
7a51384c
DB
453}
454
b9c32fb2
DB
455static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
456 struct sk_buff *skb)
2e31396f
WB
457{
458 union tpacket_uhdr h;
459 struct timespec ts;
b9c32fb2 460 __u32 ts_status;
2e31396f 461
b9c32fb2
DB
462 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
463 return 0;
2e31396f
WB
464
465 h.raw = frame;
466 switch (po->tp_version) {
467 case TPACKET_V1:
468 h.h1->tp_sec = ts.tv_sec;
469 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
470 break;
471 case TPACKET_V2:
472 h.h2->tp_sec = ts.tv_sec;
473 h.h2->tp_nsec = ts.tv_nsec;
474 break;
475 case TPACKET_V3:
476 default:
477 WARN(1, "TPACKET version not supported.\n");
478 BUG();
479 }
480
481 /* one flush is safe, as both fields always lie on the same cacheline */
482 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
483 smp_wmb();
b9c32fb2
DB
484
485 return ts_status;
2e31396f
WB
486}
487
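/* A hedged user-space note (assuming the NIC has hardware timestamping
 * enabled, e.g. via SIOCSHWTSTAMP): which clock feeds
 * tpacket_get_timestamp() above is selected with PACKET_TIMESTAMP, e.g.
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * Frames stamped by hardware then carry TP_STATUS_TS_RAW_HARDWARE in
 * tp_status; software stamps are reported as TP_STATUS_TS_SOFTWARE.
 */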
69e3c75f
JB
488static void *packet_lookup_frame(struct packet_sock *po,
489 struct packet_ring_buffer *rb,
490 unsigned int position,
491 int status)
492{
493 unsigned int pg_vec_pos, frame_offset;
184f489e 494 union tpacket_uhdr h;
69e3c75f
JB
495
496 pg_vec_pos = position / rb->frames_per_block;
497 frame_offset = position % rb->frames_per_block;
498
0e3125c7
NH
499 h.raw = rb->pg_vec[pg_vec_pos].buffer +
500 (frame_offset * rb->frame_size);
69e3c75f
JB
501
502 if (status != __packet_get_status(po, h.raw))
503 return NULL;
504
505 return h.raw;
506}
507
eea49cc9 508static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
509 struct packet_ring_buffer *rb,
510 int status)
511{
512 return packet_lookup_frame(po, rb, rb->head, status);
513}
514
bc59ba39 515static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 516{
517 del_timer_sync(&pkc->retire_blk_timer);
518}
519
520static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
521 int tx_ring,
522 struct sk_buff_head *rb_queue)
523{
bc59ba39 524 struct tpacket_kbdq_core *pkc;
f6fb8f10 525
22781a5b
DJ
526 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
527 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 528
ec6f809f 529 spin_lock_bh(&rb_queue->lock);
f6fb8f10 530 pkc->delete_blk_timer = 1;
ec6f809f 531 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 532
533 prb_del_retire_blk_timer(pkc);
534}
535
536static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 537 struct tpacket_kbdq_core *pkc,
f6fb8f10 538 void (*func) (unsigned long))
539{
540 init_timer(&pkc->retire_blk_timer);
541 pkc->retire_blk_timer.data = (long)po;
542 pkc->retire_blk_timer.function = func;
543 pkc->retire_blk_timer.expires = jiffies;
544}
545
e8e85cc5 546static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 547{
bc59ba39 548 struct tpacket_kbdq_core *pkc;
f6fb8f10 549
e8e85cc5 550 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 551 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
552}
553
554static int prb_calc_retire_blk_tmo(struct packet_sock *po,
555 int blk_size_in_bytes)
556{
557 struct net_device *dev;
558 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
559 struct ethtool_cmd ecmd;
560 int err;
e440cf2c 561 u32 speed;
f6fb8f10 562
4bc71cb9
JP
563 rtnl_lock();
564 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
565 if (unlikely(!dev)) {
566 rtnl_unlock();
f6fb8f10 567 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
568 }
569 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 570 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
571 rtnl_unlock();
572 if (!err) {
4bc71cb9
JP
573 /*
574 * If the link speed is so slow you don't really
575 * need to worry about perf anyways
576 */
e440cf2c 577 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 578 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 579 } else {
580 msec = 1;
581 div = speed / 1000;
f6fb8f10 582 }
583 }
584
585 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
586
587 if (div)
588 mbits /= div;
589
590 tmo = mbits * msec;
591
592 if (div)
593 return tmo+1;
594 return tmo;
595}
596
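/* Worked example for the timeout calculation above (my arithmetic, not text
 * from the original source): with tp_block_size = 1 MiB on a 1 Gb/s link,
 * mbits = (1048576 * 8) / (1024 * 1024) = 8, div = 1000 / 1000 = 1, so
 * tmo = 8 * 1 msec and the function returns 9 ms -- in line with the
 * "~8 ms to fill a block" figure quoted in the timer comment further down.
 */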
bc59ba39 597static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 598 union tpacket_req_u *req_u)
599{
600 p1->feature_req_word = req_u->req3.tp_feature_req_word;
601}
602
603static void init_prb_bdqc(struct packet_sock *po,
604 struct packet_ring_buffer *rb,
605 struct pgv *pg_vec,
e8e85cc5 606 union tpacket_req_u *req_u)
f6fb8f10 607{
22781a5b 608 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 609 struct tpacket_block_desc *pbd;
f6fb8f10 610
611 memset(p1, 0x0, sizeof(*p1));
612
613 p1->knxt_seq_num = 1;
614 p1->pkbdq = pg_vec;
bc59ba39 615 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 616 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 617 p1->kblk_size = req_u->req3.tp_block_size;
618 p1->knum_blocks = req_u->req3.tp_block_nr;
619 p1->hdrlen = po->tp_hdrlen;
620 p1->version = po->tp_version;
621 p1->last_kactive_blk_num = 0;
ee80fbf3 622 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 623 if (req_u->req3.tp_retire_blk_tov)
624 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
625 else
626 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
627 req_u->req3.tp_block_size);
628 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
629 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
630
dc808110 631 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 632 prb_init_ft_ops(p1, req_u);
e8e85cc5 633 prb_setup_retire_blk_timer(po);
f6fb8f10 634 prb_open_block(p1, pbd);
635}
636
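/* Illustrative user-space sketch (an assumed but typical TPACKET_V3 setup,
 * not copied from any in-tree sample): the req3 fields below are exactly the
 * ones consumed by init_prb_bdqc() above; a zero tp_retire_blk_tov lets
 * prb_calc_retire_blk_tmo() derive the timeout from the link speed.
 */
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void *setup_v3_rx_ring(int fd, struct tpacket_req3 *req3)
{
	int version = TPACKET_V3;
	void *ring;

	req3->tp_block_size = 1 << 20;			/* 1 MiB blocks */
	req3->tp_block_nr = 64;
	req3->tp_frame_size = 2048;
	req3->tp_frame_nr = (req3->tp_block_size / req3->tp_frame_size) *
			    req3->tp_block_nr;
	req3->tp_retire_blk_tov = 60;			/* ms; 0 = derive from link speed */
	req3->tp_sizeof_priv = 0;
	req3->tp_feature_req_word = 0;			/* or TP_FT_REQ_FILL_RXHASH */

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0)
		return NULL;
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req3, sizeof(*req3)) < 0)
		return NULL;
	ring = mmap(NULL, (size_t)req3->tp_block_size * req3->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return ring == MAP_FAILED ? NULL : ring;
}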
637/* Do NOT update the last_blk_num first.
638 * Assumes sk_buff_head lock is held.
639 */
bc59ba39 640static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 641{
642 mod_timer(&pkc->retire_blk_timer,
643 jiffies + pkc->tov_in_jiffies);
644 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
645}
646
647/*
648 * Timer logic:
649 * 1) We refresh the timer only when we open a block.
650 * By doing this we don't waste cycles refreshing the timer
 651 * on a packet-by-packet basis.
652 *
653 * With a 1MB block-size, on a 1Gbps line, it will take
654 * i) ~8 ms to fill a block + ii) memcpy etc.
655 * In this cut we are not accounting for the memcpy time.
656 *
657 * So, if the user sets the 'tmo' to 10ms then the timer
658 * will never fire while the block is still getting filled
659 * (which is what we want). However, the user could choose
660 * to close a block early and that's fine.
661 *
662 * But when the timer does fire, we check whether or not to refresh it.
663 * Since the tmo granularity is in msecs, it is not too expensive
 664 * to refresh the timer, let's say every '8' msecs.
665 * Either the user can set the 'tmo' or we can derive it based on
666 * a) line-speed and b) block-size.
667 * prb_calc_retire_blk_tmo() calculates the tmo.
668 *
669 */
670static void prb_retire_rx_blk_timer_expired(unsigned long data)
671{
672 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 673 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 674 unsigned int frozen;
bc59ba39 675 struct tpacket_block_desc *pbd;
f6fb8f10 676
677 spin_lock(&po->sk.sk_receive_queue.lock);
678
679 frozen = prb_queue_frozen(pkc);
680 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
681
682 if (unlikely(pkc->delete_blk_timer))
683 goto out;
684
685 /* We only need to plug the race when the block is partially filled.
686 * tpacket_rcv:
687 * lock(); increment BLOCK_NUM_PKTS; unlock()
688 * copy_bits() is in progress ...
689 * timer fires on other cpu:
690 * we can't retire the current block because copy_bits
691 * is in progress.
692 *
693 */
694 if (BLOCK_NUM_PKTS(pbd)) {
695 while (atomic_read(&pkc->blk_fill_in_prog)) {
696 /* Waiting for skb_copy_bits to finish... */
697 cpu_relax();
698 }
699 }
700
701 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
702 if (!frozen) {
41a50d62
AD
703 if (!BLOCK_NUM_PKTS(pbd)) {
704 /* An empty block. Just refresh the timer. */
705 goto refresh_timer;
706 }
f6fb8f10 707 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
708 if (!prb_dispatch_next_block(pkc, po))
709 goto refresh_timer;
710 else
711 goto out;
712 } else {
713 /* Case 1. Queue was frozen because user-space was
714 * lagging behind.
715 */
716 if (prb_curr_blk_in_use(pkc, pbd)) {
717 /*
718 * Ok, user-space is still behind.
719 * So just refresh the timer.
720 */
721 goto refresh_timer;
722 } else {
 723 /* Case 2. The queue was frozen, user-space caught up,
 724 * now the link went idle && the timer fired.
 725 * We don't have a block to close, so we open this
 726 * block and restart the timer.
 727 * Opening a block thaws the queue and restarts the timer;
 728 * thawing/timer-refresh is a side effect.
729 */
730 prb_open_block(pkc, pbd);
731 goto out;
732 }
733 }
734 }
735
736refresh_timer:
737 _prb_refresh_rx_retire_blk_timer(pkc);
738
739out:
740 spin_unlock(&po->sk.sk_receive_queue.lock);
741}
742
eea49cc9 743static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 744 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 745{
746 /* Flush everything minus the block header */
747
748#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
749 u8 *start, *end;
750
751 start = (u8 *)pbd1;
752
 753 /* Skip the block header (we know the header WILL fit in 4K) */
754 start += PAGE_SIZE;
755
756 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
757 for (; start < end; start += PAGE_SIZE)
758 flush_dcache_page(pgv_to_page(start));
759
760 smp_wmb();
761#endif
762
763 /* Now update the block status. */
764
765 BLOCK_STATUS(pbd1) = status;
766
767 /* Flush the block header */
768
769#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
770 start = (u8 *)pbd1;
771 flush_dcache_page(pgv_to_page(start));
772
773 smp_wmb();
774#endif
775}
776
777/*
778 * Side effect:
779 *
780 * 1) flush the block
781 * 2) Increment active_blk_num
782 *
 783 * Note: we DON'T refresh the timer on purpose,
 784 * because almost always the next block will be opened.
785 */
bc59ba39 786static void prb_close_block(struct tpacket_kbdq_core *pkc1,
787 struct tpacket_block_desc *pbd1,
f6fb8f10 788 struct packet_sock *po, unsigned int stat)
789{
790 __u32 status = TP_STATUS_USER | stat;
791
792 struct tpacket3_hdr *last_pkt;
bc59ba39 793 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 794 struct sock *sk = &po->sk;
f6fb8f10 795
ee80fbf3 796 if (po->stats.stats3.tp_drops)
f6fb8f10 797 status |= TP_STATUS_LOSING;
798
799 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
800 last_pkt->tp_next_offset = 0;
801
802 /* Get the ts of the last pkt */
803 if (BLOCK_NUM_PKTS(pbd1)) {
804 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
805 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
806 } else {
41a50d62
AD
807 /* Ok, we tmo'd - so get the current time.
808 *
809 * It shouldn't really happen as we don't close empty
810 * blocks. See prb_retire_rx_blk_timer_expired().
811 */
f6fb8f10 812 struct timespec ts;
813 getnstimeofday(&ts);
814 h1->ts_last_pkt.ts_sec = ts.tv_sec;
815 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
816 }
817
818 smp_wmb();
819
820 /* Flush the block */
821 prb_flush_block(pkc1, pbd1, status);
822
da413eec
DC
823 sk->sk_data_ready(sk);
824
f6fb8f10 825 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
826}
827
eea49cc9 828static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 829{
830 pkc->reset_pending_on_curr_blk = 0;
831}
832
833/*
834 * Side effect of opening a block:
835 *
836 * 1) prb_queue is thawed.
837 * 2) retire_blk_timer is refreshed.
838 *
839 */
bc59ba39 840static void prb_open_block(struct tpacket_kbdq_core *pkc1,
841 struct tpacket_block_desc *pbd1)
f6fb8f10 842{
843 struct timespec ts;
bc59ba39 844 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 845
846 smp_rmb();
847
8da3056c
DB
 848 /* We could have just memset this, but we would lose the
 849 * flexibility of making the priv area sticky.
850 */
f6fb8f10 851
8da3056c
DB
852 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
853 BLOCK_NUM_PKTS(pbd1) = 0;
854 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 855
8da3056c
DB
856 getnstimeofday(&ts);
857
858 h1->ts_first_pkt.ts_sec = ts.tv_sec;
859 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 860
8da3056c
DB
861 pkc1->pkblk_start = (char *)pbd1;
862 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
863
864 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
865 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
866
867 pbd1->version = pkc1->version;
868 pkc1->prev = pkc1->nxt_offset;
869 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
870
871 prb_thaw_queue(pkc1);
872 _prb_refresh_rx_retire_blk_timer(pkc1);
873
874 smp_wmb();
f6fb8f10 875}
876
877/*
878 * Queue freeze logic:
879 * 1) Assume tp_block_nr = 8 blocks.
880 * 2) At time 't0', user opens Rx ring.
881 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
882 * 4) user-space is either sleeping or processing block '0'.
883 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 884 * it will close block-7, loop around, and try to fill block '0'.
885 * call-flow:
886 * __packet_lookup_frame_in_block
887 * prb_retire_current_block()
888 * prb_dispatch_next_block()
889 * |->(BLOCK_STATUS == USER) evaluates to true
890 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
891 * 6) Now there are two cases:
892 * 6.1) Link goes idle right after the queue is frozen.
893 * But remember, the last open_block() refreshed the timer.
 894 * When this timer expires, it will refresh itself so that we can
 895 * re-open block-0 in the near future.
896 * 6.2) Link is busy and keeps on receiving packets. This is a simple
897 * case and __packet_lookup_frame_in_block will check if block-0
898 * is free and can now be re-used.
899 */
eea49cc9 900static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 901 struct packet_sock *po)
902{
903 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 904 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 905}
906
907#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
908
909/*
910 * If the next block is free then we will dispatch it
911 * and return a good offset.
912 * Else, we will freeze the queue.
913 * So, caller must check the return value.
914 */
bc59ba39 915static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 916 struct packet_sock *po)
917{
bc59ba39 918 struct tpacket_block_desc *pbd;
f6fb8f10 919
920 smp_rmb();
921
922 /* 1. Get current block num */
923 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
924
925 /* 2. If this block is currently in_use then freeze the queue */
926 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
927 prb_freeze_queue(pkc, po);
928 return NULL;
929 }
930
931 /*
932 * 3.
933 * open this block and return the offset where the first packet
934 * needs to get stored.
935 */
936 prb_open_block(pkc, pbd);
937 return (void *)pkc->nxt_offset;
938}
939
bc59ba39 940static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 941 struct packet_sock *po, unsigned int status)
942{
bc59ba39 943 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 944
945 /* retire/close the current block */
946 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
947 /*
948 * Plug the case where copy_bits() is in progress on
949 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
950 * have space to copy the pkt in the current block and
951 * called prb_retire_current_block()
952 *
953 * We don't need to worry about the TMO case because
954 * the timer-handler already handled this case.
955 */
956 if (!(status & TP_STATUS_BLK_TMO)) {
957 while (atomic_read(&pkc->blk_fill_in_prog)) {
958 /* Waiting for skb_copy_bits to finish... */
959 cpu_relax();
960 }
961 }
962 prb_close_block(pkc, pbd, po, status);
963 return;
964 }
f6fb8f10 965}
966
eea49cc9 967static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 968 struct tpacket_block_desc *pbd)
f6fb8f10 969{
970 return TP_STATUS_USER & BLOCK_STATUS(pbd);
971}
972
eea49cc9 973static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 974{
975 return pkc->reset_pending_on_curr_blk;
976}
977
eea49cc9 978static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 979{
bc59ba39 980 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 981 atomic_dec(&pkc->blk_fill_in_prog);
982}
983
eea49cc9 984static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 985 struct tpacket3_hdr *ppd)
986{
3958afa1 987 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 988}
989
eea49cc9 990static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 991 struct tpacket3_hdr *ppd)
992{
993 ppd->hv1.tp_rxhash = 0;
994}
995
eea49cc9 996static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 997 struct tpacket3_hdr *ppd)
998{
df8a39de
JP
999 if (skb_vlan_tag_present(pkc->skb)) {
1000 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
1001 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1002 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1003 } else {
9e67030a 1004 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1005 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1006 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1007 }
1008}
1009
bc59ba39 1010static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1011 struct tpacket3_hdr *ppd)
1012{
a0cdfcf3 1013 ppd->hv1.tp_padding = 0;
f6fb8f10 1014 prb_fill_vlan_info(pkc, ppd);
1015
1016 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1017 prb_fill_rxhash(pkc, ppd);
1018 else
1019 prb_clear_rxhash(pkc, ppd);
1020}
1021
eea49cc9 1022static void prb_fill_curr_block(char *curr,
bc59ba39 1023 struct tpacket_kbdq_core *pkc,
1024 struct tpacket_block_desc *pbd,
f6fb8f10 1025 unsigned int len)
1026{
1027 struct tpacket3_hdr *ppd;
1028
1029 ppd = (struct tpacket3_hdr *)curr;
1030 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1031 pkc->prev = curr;
1032 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1033 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1034 BLOCK_NUM_PKTS(pbd) += 1;
1035 atomic_inc(&pkc->blk_fill_in_prog);
1036 prb_run_all_ft_ops(pkc, ppd);
1037}
1038
1039/* Assumes caller has the sk->rx_queue.lock */
1040static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1041 struct sk_buff *skb,
1042 int status,
1043 unsigned int len
1044 )
1045{
bc59ba39 1046 struct tpacket_kbdq_core *pkc;
1047 struct tpacket_block_desc *pbd;
f6fb8f10 1048 char *curr, *end;
1049
e3192690 1050 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1051 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1052
1053 /* Queue is frozen when user space is lagging behind */
1054 if (prb_queue_frozen(pkc)) {
1055 /*
 1056 * Check if that last block which caused the queue to freeze
1057 * is still in_use by user-space.
1058 */
1059 if (prb_curr_blk_in_use(pkc, pbd)) {
1060 /* Can't record this packet */
1061 return NULL;
1062 } else {
1063 /*
1064 * Ok, the block was released by user-space.
1065 * Now let's open that block.
1066 * opening a block also thaws the queue.
1067 * Thawing is a side effect.
1068 */
1069 prb_open_block(pkc, pbd);
1070 }
1071 }
1072
1073 smp_mb();
1074 curr = pkc->nxt_offset;
1075 pkc->skb = skb;
e3192690 1076 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1077
1078 /* first try the current block */
1079 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1080 prb_fill_curr_block(curr, pkc, pbd, len);
1081 return (void *)curr;
1082 }
1083
1084 /* Ok, close the current block */
1085 prb_retire_current_block(pkc, po, 0);
1086
1087 /* Now, try to dispatch the next block */
1088 curr = (char *)prb_dispatch_next_block(pkc, po);
1089 if (curr) {
1090 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1091 prb_fill_curr_block(curr, pkc, pbd, len);
1092 return (void *)curr;
1093 }
1094
1095 /*
 1096 * No free blocks are available. user-space hasn't caught up yet.
1097 * Queue was just frozen and now this packet will get dropped.
1098 */
1099 return NULL;
1100}
1101
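/* Illustrative user-space side of the freeze/thaw logic above (a sketch;
 * 'pbd' is assumed to point at one block of the mmap()ed V3 ring): a block is
 * handed back to the kernel by rewriting block_status, which is what lets
 * prb_open_block() thaw a frozen queue on the next lookup.
 */
#include <linux/if_packet.h>

static void walk_v3_block(struct tpacket_block_desc *pbd)
{
	struct tpacket3_hdr *ppd;
	unsigned int i;

	if (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
		return;		/* block is still being filled by the kernel */

	ppd = (struct tpacket3_hdr *)((char *)pbd +
				      pbd->hdr.bh1.offset_to_first_pkt);
	for (i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
		/* ... consume ppd->tp_snaplen bytes at (char *)ppd + ppd->tp_mac ... */
		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
	}

	__sync_synchronize();
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* hand the block back */
}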
eea49cc9 1102static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1103 struct sk_buff *skb,
1104 int status, unsigned int len)
1105{
1106 char *curr = NULL;
1107 switch (po->tp_version) {
1108 case TPACKET_V1:
1109 case TPACKET_V2:
1110 curr = packet_lookup_frame(po, &po->rx_ring,
1111 po->rx_ring.head, status);
1112 return curr;
1113 case TPACKET_V3:
1114 return __packet_lookup_frame_in_block(po, skb, status, len);
1115 default:
1116 WARN(1, "TPACKET version not supported\n");
1117 BUG();
99aa3473 1118 return NULL;
f6fb8f10 1119 }
1120}
1121
eea49cc9 1122static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1123 struct packet_ring_buffer *rb,
77f65ebd 1124 unsigned int idx,
f6fb8f10 1125 int status)
1126{
bc59ba39 1127 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1128 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1129
1130 if (status != BLOCK_STATUS(pbd))
1131 return NULL;
1132 return pbd;
1133}
1134
eea49cc9 1135static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1136{
1137 unsigned int prev;
1138 if (rb->prb_bdqc.kactive_blk_num)
1139 prev = rb->prb_bdqc.kactive_blk_num-1;
1140 else
1141 prev = rb->prb_bdqc.knum_blocks-1;
1142 return prev;
1143}
1144
1145/* Assumes caller has held the rx_queue.lock */
eea49cc9 1146static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1147 struct packet_ring_buffer *rb,
1148 int status)
1149{
1150 unsigned int previous = prb_previous_blk_num(rb);
1151 return prb_lookup_block(po, rb, previous, status);
1152}
1153
eea49cc9 1154static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1155 struct packet_ring_buffer *rb,
1156 int status)
1157{
1158 if (po->tp_version <= TPACKET_V2)
1159 return packet_previous_frame(po, rb, status);
1160
1161 return __prb_previous_block(po, rb, status);
1162}
1163
eea49cc9 1164static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1165 struct packet_ring_buffer *rb)
1166{
1167 switch (po->tp_version) {
1168 case TPACKET_V1:
1169 case TPACKET_V2:
1170 return packet_increment_head(rb);
1171 case TPACKET_V3:
1172 default:
1173 WARN(1, "TPACKET version not supported.\n");
1174 BUG();
1175 return;
1176 }
1177}
1178
eea49cc9 1179static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1180 struct packet_ring_buffer *rb,
1181 int status)
1182{
1183 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1184 return packet_lookup_frame(po, rb, previous, status);
1185}
1186
eea49cc9 1187static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1188{
1189 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1190}
1191
b0138408
DB
1192static void packet_inc_pending(struct packet_ring_buffer *rb)
1193{
1194 this_cpu_inc(*rb->pending_refcnt);
1195}
1196
1197static void packet_dec_pending(struct packet_ring_buffer *rb)
1198{
1199 this_cpu_dec(*rb->pending_refcnt);
1200}
1201
1202static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1203{
1204 unsigned int refcnt = 0;
1205 int cpu;
1206
1207 /* We don't use pending refcount in rx_ring. */
1208 if (rb->pending_refcnt == NULL)
1209 return 0;
1210
1211 for_each_possible_cpu(cpu)
1212 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1213
1214 return refcnt;
1215}
1216
1217static int packet_alloc_pending(struct packet_sock *po)
1218{
1219 po->rx_ring.pending_refcnt = NULL;
1220
1221 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1222 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1223 return -ENOBUFS;
1224
1225 return 0;
1226}
1227
1228static void packet_free_pending(struct packet_sock *po)
1229{
1230 free_percpu(po->tx_ring.pending_refcnt);
1231}
1232
9954729b
WB
1233#define ROOM_POW_OFF 2
1234#define ROOM_NONE 0x0
1235#define ROOM_LOW 0x1
1236#define ROOM_NORMAL 0x2
1237
1238static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1239{
9954729b
WB
1240 int idx, len;
1241
1242 len = po->rx_ring.frame_max + 1;
1243 idx = po->rx_ring.head;
1244 if (pow_off)
1245 idx += len >> pow_off;
1246 if (idx >= len)
1247 idx -= len;
1248 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1249}
1250
1251static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1252{
1253 int idx, len;
1254
1255 len = po->rx_ring.prb_bdqc.knum_blocks;
1256 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1257 if (pow_off)
1258 idx += len >> pow_off;
1259 if (idx >= len)
1260 idx -= len;
1261 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1262}
77f65ebd 1263
2ccdbaa6 1264static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1265{
1266 struct sock *sk = &po->sk;
1267 int ret = ROOM_NONE;
1268
1269 if (po->prot_hook.func != tpacket_rcv) {
1270 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1271 - (skb ? skb->truesize : 0);
9954729b
WB
1272 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1273 return ROOM_NORMAL;
1274 else if (avail > 0)
1275 return ROOM_LOW;
1276 else
1277 return ROOM_NONE;
1278 }
77f65ebd 1279
9954729b
WB
1280 if (po->tp_version == TPACKET_V3) {
1281 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1282 ret = ROOM_NORMAL;
1283 else if (__tpacket_v3_has_room(po, 0))
1284 ret = ROOM_LOW;
1285 } else {
1286 if (__tpacket_has_room(po, ROOM_POW_OFF))
1287 ret = ROOM_NORMAL;
1288 else if (__tpacket_has_room(po, 0))
1289 ret = ROOM_LOW;
1290 }
2ccdbaa6
WB
1291
1292 return ret;
1293}
1294
1295static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 int ret;
1298 bool has_room;
1299
54d7c01d
WB
1300 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1301 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1302 has_room = ret == ROOM_NORMAL;
1303 if (po->pressure == has_room)
54d7c01d
WB
1304 po->pressure = !has_room;
1305 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1306
9954729b 1307 return ret;
77f65ebd
WB
1308}
1309
1da177e4
LT
1310static void packet_sock_destruct(struct sock *sk)
1311{
ed85b565
RC
1312 skb_queue_purge(&sk->sk_error_queue);
1313
547b792c
IJ
1314 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1315 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1316
1317 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1318 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1319 return;
1320 }
1321
17ab56a2 1322 sk_refcnt_debug_dec(sk);
1da177e4
LT
1323}
1324
3b3a5b0a
WB
1325static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1326{
1327 u32 rxhash;
1328 int i, count = 0;
1329
1330 rxhash = skb_get_hash(skb);
1331 for (i = 0; i < ROLLOVER_HLEN; i++)
1332 if (po->rollover->history[i] == rxhash)
1333 count++;
1334
1335 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1336 return count > (ROLLOVER_HLEN >> 1);
1337}
1338
77f65ebd
WB
1339static unsigned int fanout_demux_hash(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int num)
dc99f600 1342{
61b905da 1343 return reciprocal_scale(skb_get_hash(skb), num);
dc99f600
DM
1344}
1345
77f65ebd
WB
1346static unsigned int fanout_demux_lb(struct packet_fanout *f,
1347 struct sk_buff *skb,
1348 unsigned int num)
dc99f600 1349{
468479e6 1350 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1351
468479e6 1352 return val % num;
77f65ebd
WB
1353}
1354
1355static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int num)
1358{
1359 return smp_processor_id() % num;
dc99f600
DM
1360}
1361
5df0ddfb
DB
1362static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365{
f337db64 1366 return prandom_u32_max(num);
5df0ddfb
DB
1367}
1368
77f65ebd
WB
1369static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1370 struct sk_buff *skb,
ad377cab 1371 unsigned int idx, bool try_self,
77f65ebd 1372 unsigned int num)
95ec3eb4 1373{
4633c9e0 1374 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1375 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1376
0648ab70 1377 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1378
1379 if (try_self) {
1380 room = packet_rcv_has_room(po, skb);
1381 if (room == ROOM_NORMAL ||
1382 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1383 return idx;
4633c9e0 1384 po_skip = po;
3b3a5b0a 1385 }
ad377cab 1386
0648ab70 1387 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1388 do {
2ccdbaa6 1389 po_next = pkt_sk(f->arr[i]);
4633c9e0 1390 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1391 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1392 if (i != j)
0648ab70 1393 po->rollover->sock = i;
a9b63918
WB
1394 atomic_long_inc(&po->rollover->num);
1395 if (room == ROOM_LOW)
1396 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1397 return i;
1398 }
ad377cab 1399
77f65ebd
WB
1400 if (++i == num)
1401 i = 0;
1402 } while (i != j);
1403
a9b63918 1404 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1405 return idx;
1406}
1407
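/* A hedged user-space note (assuming the rollover counters bumped above are
 * exported through the PACKET_ROLLOVER_STATS getsockopt and struct
 * tpacket_rollover_stats; both are assumptions about the uapi, not shown in
 * this file):
 *
 *	struct tpacket_rollover_stats rstats;
 *	socklen_t len = sizeof(rstats);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_ROLLOVER_STATS, &rstats, &len) == 0)
 *		printf("rollovers: %llu (huge %llu, failed %llu)\n",
 *		       (unsigned long long)rstats.tp_all,
 *		       (unsigned long long)rstats.tp_huge,
 *		       (unsigned long long)rstats.tp_failed);
 */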
2d36097d
NH
1408static unsigned int fanout_demux_qm(struct packet_fanout *f,
1409 struct sk_buff *skb,
1410 unsigned int num)
1411{
1412 return skb_get_queue_mapping(skb) % num;
1413}
1414
77f65ebd
WB
1415static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1416{
1417 return f->flags & (flag >> 8);
95ec3eb4
DM
1418}
1419
95ec3eb4
DM
1420static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1421 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1422{
1423 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1424 unsigned int num = READ_ONCE(f->num_members);
dc99f600 1425 struct packet_sock *po;
77f65ebd 1426 unsigned int idx;
dc99f600
DM
1427
1428 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1429 !num) {
1430 kfree_skb(skb);
1431 return 0;
1432 }
1433
3f34b24a
AD
1434 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1435 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1436 if (!skb)
1437 return 0;
1438 }
95ec3eb4
DM
1439 switch (f->type) {
1440 case PACKET_FANOUT_HASH:
1441 default:
77f65ebd 1442 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1443 break;
1444 case PACKET_FANOUT_LB:
77f65ebd 1445 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1446 break;
1447 case PACKET_FANOUT_CPU:
77f65ebd
WB
1448 idx = fanout_demux_cpu(f, skb, num);
1449 break;
5df0ddfb
DB
1450 case PACKET_FANOUT_RND:
1451 idx = fanout_demux_rnd(f, skb, num);
1452 break;
2d36097d
NH
1453 case PACKET_FANOUT_QM:
1454 idx = fanout_demux_qm(f, skb, num);
1455 break;
77f65ebd 1456 case PACKET_FANOUT_ROLLOVER:
ad377cab 1457 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1458 break;
dc99f600
DM
1459 }
1460
ad377cab
WB
1461 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1462 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1463
ad377cab 1464 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1465 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1466}
1467
fff3321d
PE
1468DEFINE_MUTEX(fanout_mutex);
1469EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1470static LIST_HEAD(fanout_list);
1471
1472static void __fanout_link(struct sock *sk, struct packet_sock *po)
1473{
1474 struct packet_fanout *f = po->fanout;
1475
1476 spin_lock(&f->lock);
1477 f->arr[f->num_members] = sk;
1478 smp_wmb();
1479 f->num_members++;
1480 spin_unlock(&f->lock);
1481}
1482
1483static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1484{
1485 struct packet_fanout *f = po->fanout;
1486 int i;
1487
1488 spin_lock(&f->lock);
1489 for (i = 0; i < f->num_members; i++) {
1490 if (f->arr[i] == sk)
1491 break;
1492 }
1493 BUG_ON(i >= f->num_members);
1494 f->arr[i] = f->arr[f->num_members - 1];
1495 f->num_members--;
1496 spin_unlock(&f->lock);
1497}
1498
d4dd8aee 1499static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1500{
d4dd8aee 1501 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
c0de08d0
EL
1502 return true;
1503
1504 return false;
1505}
1506
7736d33f 1507static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1508{
1509 struct packet_sock *po = pkt_sk(sk);
1510 struct packet_fanout *f, *match;
7736d33f 1511 u8 type = type_flags & 0xff;
77f65ebd 1512 u8 flags = type_flags >> 8;
dc99f600
DM
1513 int err;
1514
1515 switch (type) {
77f65ebd
WB
1516 case PACKET_FANOUT_ROLLOVER:
1517 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1518 return -EINVAL;
dc99f600
DM
1519 case PACKET_FANOUT_HASH:
1520 case PACKET_FANOUT_LB:
95ec3eb4 1521 case PACKET_FANOUT_CPU:
5df0ddfb 1522 case PACKET_FANOUT_RND:
2d36097d 1523 case PACKET_FANOUT_QM:
dc99f600
DM
1524 break;
1525 default:
1526 return -EINVAL;
1527 }
1528
1529 if (!po->running)
1530 return -EINVAL;
1531
1532 if (po->fanout)
1533 return -EALREADY;
1534
4633c9e0
WB
1535 if (type == PACKET_FANOUT_ROLLOVER ||
1536 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
0648ab70
WB
1537 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1538 if (!po->rollover)
1539 return -ENOMEM;
a9b63918
WB
1540 atomic_long_set(&po->rollover->num, 0);
1541 atomic_long_set(&po->rollover->num_huge, 0);
1542 atomic_long_set(&po->rollover->num_failed, 0);
0648ab70
WB
1543 }
1544
dc99f600
DM
1545 mutex_lock(&fanout_mutex);
1546 match = NULL;
1547 list_for_each_entry(f, &fanout_list, list) {
1548 if (f->id == id &&
1549 read_pnet(&f->net) == sock_net(sk)) {
1550 match = f;
1551 break;
1552 }
1553 }
afe62c68 1554 err = -EINVAL;
77f65ebd 1555 if (match && match->flags != flags)
afe62c68 1556 goto out;
dc99f600 1557 if (!match) {
afe62c68 1558 err = -ENOMEM;
dc99f600 1559 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1560 if (!match)
1561 goto out;
1562 write_pnet(&match->net, sock_net(sk));
1563 match->id = id;
1564 match->type = type;
77f65ebd 1565 match->flags = flags;
afe62c68
ED
1566 atomic_set(&match->rr_cur, 0);
1567 INIT_LIST_HEAD(&match->list);
1568 spin_lock_init(&match->lock);
1569 atomic_set(&match->sk_ref, 0);
1570 match->prot_hook.type = po->prot_hook.type;
1571 match->prot_hook.dev = po->prot_hook.dev;
1572 match->prot_hook.func = packet_rcv_fanout;
1573 match->prot_hook.af_packet_priv = match;
c0de08d0 1574 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1575 dev_add_pack(&match->prot_hook);
1576 list_add(&match->list, &fanout_list);
dc99f600 1577 }
afe62c68
ED
1578 err = -EINVAL;
1579 if (match->type == type &&
1580 match->prot_hook.type == po->prot_hook.type &&
1581 match->prot_hook.dev == po->prot_hook.dev) {
1582 err = -ENOSPC;
1583 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1584 __dev_remove_pack(&po->prot_hook);
1585 po->fanout = match;
1586 atomic_inc(&match->sk_ref);
1587 __fanout_link(sk, po);
1588 err = 0;
dc99f600
DM
1589 }
1590 }
afe62c68 1591out:
dc99f600 1592 mutex_unlock(&fanout_mutex);
0648ab70
WB
1593 if (err) {
1594 kfree(po->rollover);
1595 po->rollover = NULL;
1596 }
dc99f600
DM
1597 return err;
1598}
1599
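/* Illustrative user-space sketch (one possible encoding of the option value;
 * the helper name is made up): each socket that should join the group built
 * by fanout_add() issues the same PACKET_FANOUT setsockopt, with the group id
 * in the low 16 bits and the mode plus flags in the high 16 bits.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout_group(int fd, unsigned short id)
{
	int arg = id | ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER) << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}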
1600static void fanout_release(struct sock *sk)
1601{
1602 struct packet_sock *po = pkt_sk(sk);
1603 struct packet_fanout *f;
1604
1605 f = po->fanout;
1606 if (!f)
1607 return;
1608
fff3321d 1609 mutex_lock(&fanout_mutex);
dc99f600
DM
1610 po->fanout = NULL;
1611
dc99f600
DM
1612 if (atomic_dec_and_test(&f->sk_ref)) {
1613 list_del(&f->list);
1614 dev_remove_pack(&f->prot_hook);
1615 kfree(f);
1616 }
1617 mutex_unlock(&fanout_mutex);
0648ab70 1618
59f21118
WB
1619 if (po->rollover)
1620 kfree_rcu(po->rollover, rcu);
dc99f600 1621}
1da177e4 1622
90ddc4f0 1623static const struct proto_ops packet_ops;
1da177e4 1624
90ddc4f0 1625static const struct proto_ops packet_ops_spkt;
1da177e4 1626
40d4e3df
ED
1627static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1628 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1629{
1630 struct sock *sk;
1631 struct sockaddr_pkt *spkt;
1632
1633 /*
1634 * When we registered the protocol we saved the socket in the data
1635 * field for just this event.
1636 */
1637
1638 sk = pt->af_packet_priv;
1ce4f28b 1639
1da177e4
LT
1640 /*
1641 * Yank back the headers [hope the device set this
1642 * right or kerboom...]
1643 *
1644 * Incoming packets have ll header pulled,
1645 * push it back.
1646 *
98e399f8 1647 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1648 * so that this procedure is a noop.
1649 */
1650
1651 if (skb->pkt_type == PACKET_LOOPBACK)
1652 goto out;
1653
09ad9bc7 1654 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1655 goto out;
1656
40d4e3df
ED
1657 skb = skb_share_check(skb, GFP_ATOMIC);
1658 if (skb == NULL)
1da177e4
LT
1659 goto oom;
1660
1661 /* drop any routing info */
adf30907 1662 skb_dst_drop(skb);
1da177e4 1663
84531c24
PO
1664 /* drop conntrack reference */
1665 nf_reset(skb);
1666
ffbc6111 1667 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1668
98e399f8 1669 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1670
1671 /*
1672 * The SOCK_PACKET socket receives _all_ frames.
1673 */
1674
1675 spkt->spkt_family = dev->type;
1676 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1677 spkt->spkt_protocol = skb->protocol;
1678
1679 /*
1680 * Charge the memory to the socket. This is done specifically
 1681 * to prevent sockets from using up all the memory.
1682 */
1683
40d4e3df 1684 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1685 return 0;
1686
1687out:
1688 kfree_skb(skb);
1689oom:
1690 return 0;
1691}
1692
1693
1694/*
1695 * Output a raw packet to a device layer. This bypasses all the other
1696 * protocol layers and you must therefore supply it with a complete frame
1697 */
1ce4f28b 1698
1b784140
YX
1699static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1700 size_t len)
1da177e4
LT
1701{
1702 struct sock *sk = sock->sk;
342dfc30 1703 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1704 struct sk_buff *skb = NULL;
1da177e4 1705 struct net_device *dev;
40d4e3df 1706 __be16 proto = 0;
1da177e4 1707 int err;
3bdc0eba 1708 int extra_len = 0;
1ce4f28b 1709
1da177e4 1710 /*
1ce4f28b 1711 * Get and verify the address.
1da177e4
LT
1712 */
1713
40d4e3df 1714 if (saddr) {
1da177e4 1715 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1716 return -EINVAL;
1717 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1718 proto = saddr->spkt_protocol;
1719 } else
1720 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1721
1722 /*
1ce4f28b 1723 * Find the device first to size check it
1da177e4
LT
1724 */
1725
de74e92a 1726 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1727retry:
654d1f8a
ED
1728 rcu_read_lock();
1729 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1730 err = -ENODEV;
1731 if (dev == NULL)
1732 goto out_unlock;
1ce4f28b 1733
d5e76b0a
DM
1734 err = -ENETDOWN;
1735 if (!(dev->flags & IFF_UP))
1736 goto out_unlock;
1737
1da177e4 1738 /*
40d4e3df
ED
1739 * You may not queue a frame bigger than the mtu. This is the lowest level
1740 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1741 */
1ce4f28b 1742
3bdc0eba
BG
1743 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1744 if (!netif_supports_nofcs(dev)) {
1745 err = -EPROTONOSUPPORT;
1746 goto out_unlock;
1747 }
1748 extra_len = 4; /* We're doing our own CRC */
1749 }
1750
1da177e4 1751 err = -EMSGSIZE;
3bdc0eba 1752 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1753 goto out_unlock;
1754
1a35ca80
ED
1755 if (!skb) {
1756 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1757 int tlen = dev->needed_tailroom;
1a35ca80
ED
1758 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1759
1760 rcu_read_unlock();
4ce40912 1761 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1762 if (skb == NULL)
1763 return -ENOBUFS;
1764 /* FIXME: Save some space for broken drivers that write a hard
1765 * header at transmission time by themselves. PPP is the notable
1766 * one here. This should really be fixed at the driver level.
1767 */
1768 skb_reserve(skb, reserved);
1769 skb_reset_network_header(skb);
1770
1771 /* Try to align data part correctly */
1772 if (hhlen) {
1773 skb->data -= hhlen;
1774 skb->tail -= hhlen;
1775 if (len < hhlen)
1776 skb_reset_network_header(skb);
1777 }
6ce8e9ce 1778 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1779 if (err)
1780 goto out_free;
1781 goto retry;
1da177e4
LT
1782 }
1783
3bdc0eba 1784 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1785 /* Earlier code assumed this would be a VLAN pkt,
1786 * double-check this now that we have the actual
1787 * packet in hand.
1788 */
1789 struct ethhdr *ehdr;
1790 skb_reset_mac_header(skb);
1791 ehdr = eth_hdr(skb);
1792 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1793 err = -EMSGSIZE;
1794 goto out_unlock;
1795 }
1796 }
1a35ca80 1797
1da177e4
LT
1798 skb->protocol = proto;
1799 skb->dev = dev;
1800 skb->priority = sk->sk_priority;
2d37a186 1801 skb->mark = sk->sk_mark;
bf84a010
DB
1802
1803 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1804
3bdc0eba
BG
1805 if (unlikely(extra_len == 4))
1806 skb->no_fcs = 1;
1807
40893fd0 1808 skb_probe_transport_header(skb, 0);
c1aad275 1809
1da177e4 1810 dev_queue_xmit(skb);
654d1f8a 1811 rcu_read_unlock();
40d4e3df 1812 return len;
1da177e4 1813
1da177e4 1814out_unlock:
654d1f8a 1815 rcu_read_unlock();
1a35ca80
ED
1816out_free:
1817 kfree_skb(skb);
1da177e4
LT
1818 return err;
1819}
1da177e4 1820
eea49cc9 1821static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1822 const struct sock *sk,
dbcb5855 1823 unsigned int res)
1da177e4
LT
1824{
1825 struct sk_filter *filter;
fda9ef5d 1826
80f8f102
ED
1827 rcu_read_lock();
1828 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1829 if (filter != NULL)
0a14842f 1830 res = SK_RUN_FILTER(filter, skb);
80f8f102 1831 rcu_read_unlock();
1da177e4 1832
dbcb5855 1833 return res;
1da177e4
LT
1834}
1835
1836/*
62ab0812
ED
 1837 * This function does lazy skb cloning in the hope that most packets
 1838 * are discarded by BPF.
 1839 *
 1840 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 1841 * and skb->cb are mangled. It works because (and until) packets
 1842 * falling here are owned by the current CPU. Output packets are cloned
 1843 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 1844 * sequentially, so if we return the skb to its original state on exit,
 1845 * we will not harm anyone.
1da177e4
LT
1846 */
1847
40d4e3df
ED
1848static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1849 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1850{
1851 struct sock *sk;
1852 struct sockaddr_ll *sll;
1853 struct packet_sock *po;
40d4e3df 1854 u8 *skb_head = skb->data;
1da177e4 1855 int skb_len = skb->len;
dbcb5855 1856 unsigned int snaplen, res;
1da177e4
LT
1857
1858 if (skb->pkt_type == PACKET_LOOPBACK)
1859 goto drop;
1860
1861 sk = pt->af_packet_priv;
1862 po = pkt_sk(sk);
1863
09ad9bc7 1864 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1865 goto drop;
1866
1da177e4
LT
1867 skb->dev = dev;
1868
3b04ddde 1869 if (dev->header_ops) {
1da177e4 1870 /* The device has an explicit notion of ll header,
62ab0812
ED
1871 * exported to higher levels.
1872 *
1873 * Otherwise, the device hides details of its frame
 1874	 * structure, so that the corresponding packet header is
 1875	 * never delivered to the user.
1da177e4
LT
1876 */
1877 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1878 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1879 else if (skb->pkt_type == PACKET_OUTGOING) {
1880 /* Special case: outgoing packets have ll header at head */
bbe735e4 1881 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1882 }
1883 }
1884
1885 snaplen = skb->len;
1886
dbcb5855
DM
1887 res = run_filter(skb, sk, snaplen);
1888 if (!res)
fda9ef5d 1889 goto drop_n_restore;
dbcb5855
DM
1890 if (snaplen > res)
1891 snaplen = res;
1da177e4 1892
0fd7bac6 1893 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1894 goto drop_n_acct;
1895
1896 if (skb_shared(skb)) {
1897 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1898 if (nskb == NULL)
1899 goto drop_n_acct;
1900
1901 if (skb_head != skb->data) {
1902 skb->data = skb_head;
1903 skb->len = skb_len;
1904 }
abc4e4fa 1905 consume_skb(skb);
1da177e4
LT
1906 skb = nskb;
1907 }
1908
b4772ef8 1909 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
1910
1911 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 1912 sll->sll_hatype = dev->type;
1da177e4 1913 sll->sll_pkttype = skb->pkt_type;
8032b464 1914 if (unlikely(po->origdev))
80feaacb
PWJ
1915 sll->sll_ifindex = orig_dev->ifindex;
1916 else
1917 sll->sll_ifindex = dev->ifindex;
1da177e4 1918
b95cce35 1919 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1920
2472d761
EB
1921 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
1922 * Use their space for storing the original skb length.
1923 */
1924 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 1925
1da177e4
LT
1926 if (pskb_trim(skb, snaplen))
1927 goto drop_n_acct;
1928
1929 skb_set_owner_r(skb, sk);
1930 skb->dev = NULL;
adf30907 1931 skb_dst_drop(skb);
1da177e4 1932
84531c24
PO
1933 /* drop conntrack reference */
1934 nf_reset(skb);
1935
1da177e4 1936 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1937 po->stats.stats1.tp_packets++;
3bc3b96f 1938 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
1939 __skb_queue_tail(&sk->sk_receive_queue, skb);
1940 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 1941 sk->sk_data_ready(sk);
1da177e4
LT
1942 return 0;
1943
1944drop_n_acct:
7091fbd8 1945 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1946 po->stats.stats1.tp_drops++;
7091fbd8
WB
1947 atomic_inc(&sk->sk_drops);
1948 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1949
1950drop_n_restore:
1951 if (skb_head != skb->data && skb_shared(skb)) {
1952 skb->data = skb_head;
1953 skb->len = skb_len;
1954 }
1955drop:
ead2ceb0 1956 consume_skb(skb);
1da177e4
LT
1957 return 0;
1958}
1959
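/* Ring-buffer receive path (PACKET_RX_RING): instead of queueing the skb,
 * copy the packet into the next free frame of the mmapped ring, fill in the
 * tpacket_hdr/tpacket2_hdr/tpacket3_hdr metadata, and hand the frame to
 * userspace by flipping its status word.
 */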
40d4e3df
ED
1960static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1961 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1962{
1963 struct sock *sk;
1964 struct packet_sock *po;
1965 struct sockaddr_ll *sll;
184f489e 1966 union tpacket_uhdr h;
40d4e3df 1967 u8 *skb_head = skb->data;
1da177e4 1968 int skb_len = skb->len;
dbcb5855 1969 unsigned int snaplen, res;
f6fb8f10 1970 unsigned long status = TP_STATUS_USER;
bbd6ef87 1971 unsigned short macoff, netoff, hdrlen;
1da177e4 1972 struct sk_buff *copy_skb = NULL;
bbd6ef87 1973 struct timespec ts;
b9c32fb2 1974 __u32 ts_status;
1da177e4 1975
51846355
AW
1976 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 1977	 * We may add members to them until the current aligned size without forcing
1978 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1979 */
1980 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1981 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1982
1da177e4
LT
1983 if (skb->pkt_type == PACKET_LOOPBACK)
1984 goto drop;
1985
1986 sk = pt->af_packet_priv;
1987 po = pkt_sk(sk);
1988
09ad9bc7 1989 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1990 goto drop;
1991
3b04ddde 1992 if (dev->header_ops) {
1da177e4 1993 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1994 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1995 else if (skb->pkt_type == PACKET_OUTGOING) {
1996 /* Special case: outgoing packets have ll header at head */
bbe735e4 1997 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1998 }
1999 }
2000
2001 snaplen = skb->len;
2002
dbcb5855
DM
2003 res = run_filter(skb, sk, snaplen);
2004 if (!res)
fda9ef5d 2005 goto drop_n_restore;
68c2e5de
AD
2006
2007 if (skb->ip_summed == CHECKSUM_PARTIAL)
2008 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2009 else if (skb->pkt_type != PACKET_OUTGOING &&
2010 (skb->ip_summed == CHECKSUM_COMPLETE ||
2011 skb_csum_unnecessary(skb)))
2012 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2013
dbcb5855
DM
2014 if (snaplen > res)
2015 snaplen = res;
1da177e4
LT
2016
2017 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2018 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2019 po->tp_reserve;
1da177e4 2020 } else {
95c96174 2021 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2022 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
2023 (maclen < 16 ? 16 : maclen)) +
2024 po->tp_reserve;
1da177e4
LT
2025 macoff = netoff - maclen;
2026 }
f6fb8f10 2027 if (po->tp_version <= TPACKET_V2) {
2028 if (macoff + snaplen > po->rx_ring.frame_size) {
2029 if (po->copy_thresh &&
0fd7bac6 2030 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2031 if (skb_shared(skb)) {
2032 copy_skb = skb_clone(skb, GFP_ATOMIC);
2033 } else {
2034 copy_skb = skb_get(skb);
2035 skb_head = skb->data;
2036 }
2037 if (copy_skb)
2038 skb_set_owner_r(copy_skb, sk);
1da177e4 2039 }
f6fb8f10 2040 snaplen = po->rx_ring.frame_size - macoff;
2041 if ((int)snaplen < 0)
2042 snaplen = 0;
1da177e4 2043 }
dc808110
ED
2044 } else if (unlikely(macoff + snaplen >
2045 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2046 u32 nval;
2047
2048 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2049 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2050 snaplen, nval, macoff);
2051 snaplen = nval;
2052 if (unlikely((int)snaplen < 0)) {
2053 snaplen = 0;
2054 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2055 }
1da177e4 2056 }
1da177e4 2057 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2058 h.raw = packet_current_rx_frame(po, skb,
2059 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2060 if (!h.raw)
1da177e4 2061 goto ring_is_full;
f6fb8f10 2062 if (po->tp_version <= TPACKET_V2) {
2063 packet_increment_rx_head(po, &po->rx_ring);
2064 /*
 2065		 * LOSING will be reported until you read the stats,
 2066		 * because it's COR - Clear On Read.
 2067		 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
 2068		 * at the packet level.
2069 */
ee80fbf3 2070 if (po->stats.stats1.tp_drops)
f6fb8f10 2071 status |= TP_STATUS_LOSING;
2072 }
ee80fbf3 2073 po->stats.stats1.tp_packets++;
1da177e4
LT
2074 if (copy_skb) {
2075 status |= TP_STATUS_COPY;
2076 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2077 }
1da177e4
LT
2078 spin_unlock(&sk->sk_receive_queue.lock);
2079
bbd6ef87 2080 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2081
2082 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2083 getnstimeofday(&ts);
1da177e4 2084
b9c32fb2
DB
2085 status |= ts_status;
2086
bbd6ef87
PM
2087 switch (po->tp_version) {
2088 case TPACKET_V1:
2089 h.h1->tp_len = skb->len;
2090 h.h1->tp_snaplen = snaplen;
2091 h.h1->tp_mac = macoff;
2092 h.h1->tp_net = netoff;
4b457bdf
DB
2093 h.h1->tp_sec = ts.tv_sec;
2094 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2095 hdrlen = sizeof(*h.h1);
2096 break;
2097 case TPACKET_V2:
2098 h.h2->tp_len = skb->len;
2099 h.h2->tp_snaplen = snaplen;
2100 h.h2->tp_mac = macoff;
2101 h.h2->tp_net = netoff;
bbd6ef87
PM
2102 h.h2->tp_sec = ts.tv_sec;
2103 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2104 if (skb_vlan_tag_present(skb)) {
2105 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2106 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2107 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2108 } else {
2109 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2110 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2111 }
e4d26f4b 2112 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2113 hdrlen = sizeof(*h.h2);
2114 break;
f6fb8f10 2115 case TPACKET_V3:
 2116		/* tp_next_offset and the vlan fields are already populated above,
 2117		 * so DON'T clear those fields here.
2118 */
2119 h.h3->tp_status |= status;
2120 h.h3->tp_len = skb->len;
2121 h.h3->tp_snaplen = snaplen;
2122 h.h3->tp_mac = macoff;
2123 h.h3->tp_net = netoff;
f6fb8f10 2124 h.h3->tp_sec = ts.tv_sec;
2125 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2126 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2127 hdrlen = sizeof(*h.h3);
2128 break;
bbd6ef87
PM
2129 default:
2130 BUG();
2131 }
1da177e4 2132
bbd6ef87 2133 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2134 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2135 sll->sll_family = AF_PACKET;
2136 sll->sll_hatype = dev->type;
2137 sll->sll_protocol = skb->protocol;
2138 sll->sll_pkttype = skb->pkt_type;
8032b464 2139 if (unlikely(po->origdev))
80feaacb
PWJ
2140 sll->sll_ifindex = orig_dev->ifindex;
2141 else
2142 sll->sll_ifindex = dev->ifindex;
1da177e4 2143
e16aa207 2144 smp_mb();
f0d4eb29 2145
f6dafa95 2146#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2147 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2148 u8 *start, *end;
2149
f0d4eb29
DB
2150 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2151 macoff + snaplen);
2152
2153 for (start = h.raw; start < end; start += PAGE_SIZE)
2154 flush_dcache_page(pgv_to_page(start));
1da177e4 2155 }
f0d4eb29 2156 smp_wmb();
f6dafa95 2157#endif
f0d4eb29 2158
da413eec 2159 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2160 __packet_set_status(po, h.raw, status);
da413eec
DC
2161 sk->sk_data_ready(sk);
2162 } else {
f6fb8f10 2163 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2164 }
1da177e4
LT
2165
2166drop_n_restore:
2167 if (skb_head != skb->data && skb_shared(skb)) {
2168 skb->data = skb_head;
2169 skb->len = skb_len;
2170 }
2171drop:
1ce4f28b 2172 kfree_skb(skb);
1da177e4
LT
2173 return 0;
2174
2175ring_is_full:
ee80fbf3 2176 po->stats.stats1.tp_drops++;
1da177e4
LT
2177 spin_unlock(&sk->sk_receive_queue.lock);
2178
676d2369 2179 sk->sk_data_ready(sk);
acb5d75b 2180 kfree_skb(copy_skb);
1da177e4
LT
2181 goto drop_n_restore;
2182}
2183
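/* Destructor for skbs sent from the TX ring: mark the originating ring frame
 * TP_STATUS_AVAILABLE again (optionally carrying a TX timestamp) and drop the
 * pending-transmission count used by tpacket_snd().
 */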
69e3c75f
JB
2184static void tpacket_destruct_skb(struct sk_buff *skb)
2185{
2186 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2187
69e3c75f 2188 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2189 void *ph;
b9c32fb2
DB
2190 __u32 ts;
2191
69e3c75f 2192 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2193 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2194
2195 ts = __packet_set_timestamp(po, ph, skb);
2196 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2197 }
2198
2199 sock_wfree(skb);
2200}
2201
9c707762
WB
2202static bool ll_header_truncated(const struct net_device *dev, int len)
2203{
 2204	/* the net device won't accept a packet shorter than its hard header */
2205 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2206 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2207 current->comm, len, dev->hard_header_len);
2208 return true;
2209 }
2210
2211 return false;
2212}
2213
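/* Build an skb from one TX ring frame: construct the link-layer header via
 * dev_hard_header() (SOCK_DGRAM) or copy it from the frame into the linear
 * area (SOCK_RAW), then attach the remaining frame data page by page as
 * fragments. Returns the frame's tp_len on success or a negative error.
 */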
40d4e3df
ED
2214static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2215 void *frame, struct net_device *dev, int size_max,
ae641949 2216 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2217{
184f489e 2218 union tpacket_uhdr ph;
09effa67 2219 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2220 struct socket *sock = po->sk.sk_socket;
2221 struct page *page;
2222 void *data;
2223 int err;
2224
2225 ph.raw = frame;
2226
2227 skb->protocol = proto;
2228 skb->dev = dev;
2229 skb->priority = po->sk.sk_priority;
2d37a186 2230 skb->mark = po->sk.sk_mark;
2e31396f 2231 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2232 skb_shinfo(skb)->destructor_arg = ph.raw;
2233
2234 switch (po->tp_version) {
2235 case TPACKET_V2:
2236 tp_len = ph.h2->tp_len;
2237 break;
2238 default:
2239 tp_len = ph.h1->tp_len;
2240 break;
2241 }
09effa67
DM
2242 if (unlikely(tp_len > size_max)) {
2243 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2244 return -EMSGSIZE;
2245 }
69e3c75f 2246
ae641949 2247 skb_reserve(skb, hlen);
69e3c75f 2248 skb_reset_network_header(skb);
c1aad275 2249
d346a3fa
DB
2250 if (!packet_use_direct_xmit(po))
2251 skb_probe_transport_header(skb, 0);
2252 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2253 int off_min, off_max, off;
2254 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2255 off_max = po->tx_ring.frame_size - tp_len;
2256 if (sock->type == SOCK_DGRAM) {
2257 switch (po->tp_version) {
2258 case TPACKET_V2:
2259 off = ph.h2->tp_net;
2260 break;
2261 default:
2262 off = ph.h1->tp_net;
2263 break;
2264 }
2265 } else {
2266 switch (po->tp_version) {
2267 case TPACKET_V2:
2268 off = ph.h2->tp_mac;
2269 break;
2270 default:
2271 off = ph.h1->tp_mac;
2272 break;
2273 }
2274 }
2275 if (unlikely((off < off_min) || (off_max < off)))
2276 return -EINVAL;
2277 data = ph.raw + off;
2278 } else {
2279 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2280 }
69e3c75f
JB
2281 to_write = tp_len;
2282
2283 if (sock->type == SOCK_DGRAM) {
2284 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2285 NULL, tp_len);
2286 if (unlikely(err < 0))
2287 return -EINVAL;
40d4e3df 2288 } else if (dev->hard_header_len) {
9c707762 2289 if (ll_header_truncated(dev, tp_len))
69e3c75f 2290 return -EINVAL;
69e3c75f
JB
2291
2292 skb_push(skb, dev->hard_header_len);
2293 err = skb_store_bits(skb, 0, data,
2294 dev->hard_header_len);
2295 if (unlikely(err))
2296 return err;
2297
2298 data += dev->hard_header_len;
2299 to_write -= dev->hard_header_len;
2300 }
2301
69e3c75f
JB
2302 offset = offset_in_page(data);
2303 len_max = PAGE_SIZE - offset;
2304 len = ((to_write > len_max) ? len_max : to_write);
2305
2306 skb->data_len = to_write;
2307 skb->len += to_write;
2308 skb->truesize += to_write;
2309 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2310
2311 while (likely(to_write)) {
2312 nr_frags = skb_shinfo(skb)->nr_frags;
2313
2314 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2315 pr_err("Packet exceed the number of skb frags(%lu)\n",
2316 MAX_SKB_FRAGS);
69e3c75f
JB
2317 return -EFAULT;
2318 }
2319
0af55bb5
CG
2320 page = pgv_to_page(data);
2321 data += len;
69e3c75f
JB
2322 flush_dcache_page(page);
2323 get_page(page);
0af55bb5 2324 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2325 to_write -= len;
2326 offset = 0;
2327 len_max = PAGE_SIZE;
2328 len = ((to_write > len_max) ? len_max : to_write);
2329 }
2330
2331 return tp_len;
2332}
2333
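/* TX ring send loop: walk the ring, turn every frame marked
 * TP_STATUS_SEND_REQUEST into an skb via tpacket_fill_skb() and transmit it,
 * waiting (unless MSG_DONTWAIT) until all pending frames have been consumed.
 * Returns the total number of bytes queued for transmission.
 */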
2334static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2335{
69e3c75f
JB
2336 struct sk_buff *skb;
2337 struct net_device *dev;
2338 __be16 proto;
09effa67 2339 int err, reserve = 0;
40d4e3df 2340 void *ph;
342dfc30 2341 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2342 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2343 int tp_len, size_max;
2344 unsigned char *addr;
2345 int len_sum = 0;
9e67030a 2346 int status = TP_STATUS_AVAILABLE;
ae641949 2347 int hlen, tlen;
69e3c75f 2348
69e3c75f
JB
2349 mutex_lock(&po->pg_vec_lock);
2350
66e56cd4 2351 if (likely(saddr == NULL)) {
e40526cb 2352 dev = packet_cached_dev_get(po);
69e3c75f
JB
2353 proto = po->num;
2354 addr = NULL;
2355 } else {
2356 err = -EINVAL;
2357 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2358 goto out;
2359 if (msg->msg_namelen < (saddr->sll_halen
2360 + offsetof(struct sockaddr_ll,
2361 sll_addr)))
2362 goto out;
69e3c75f
JB
2363 proto = saddr->sll_protocol;
2364 addr = saddr->sll_addr;
827d9780 2365 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2366 }
2367
69e3c75f
JB
2368 err = -ENXIO;
2369 if (unlikely(dev == NULL))
2370 goto out;
69e3c75f
JB
2371 err = -ENETDOWN;
2372 if (unlikely(!(dev->flags & IFF_UP)))
2373 goto out_put;
2374
52f1454f 2375 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2376 size_max = po->tx_ring.frame_size
b5dd884e 2377 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2378
09effa67
DM
2379 if (size_max > dev->mtu + reserve)
2380 size_max = dev->mtu + reserve;
2381
69e3c75f
JB
2382 do {
2383 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2384 TP_STATUS_SEND_REQUEST);
69e3c75f 2385 if (unlikely(ph == NULL)) {
87a2fd28
DB
2386 if (need_wait && need_resched())
2387 schedule();
69e3c75f
JB
2388 continue;
2389 }
2390
2391 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2392 hlen = LL_RESERVED_SPACE(dev);
2393 tlen = dev->needed_tailroom;
69e3c75f 2394 skb = sock_alloc_send_skb(&po->sk,
ae641949 2395 hlen + tlen + sizeof(struct sockaddr_ll),
fbf33a28 2396 !need_wait, &err);
69e3c75f 2397
fbf33a28
KM
2398 if (unlikely(skb == NULL)) {
2399 /* we assume the socket was initially writeable ... */
2400 if (likely(len_sum > 0))
2401 err = len_sum;
69e3c75f 2402 goto out_status;
fbf33a28 2403 }
69e3c75f 2404 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f 2405 addr, hlen);
dbd46ab4
AD
2406 if (likely(tp_len >= 0) &&
2407 tp_len > dev->mtu + dev->hard_header_len) {
52f1454f
DB
2408 struct ethhdr *ehdr;
2409 /* Earlier code assumed this would be a VLAN pkt,
2410 * double-check this now that we have the actual
2411 * packet in hand.
2412 */
69e3c75f 2413
52f1454f
DB
2414 skb_reset_mac_header(skb);
2415 ehdr = eth_hdr(skb);
2416 if (ehdr->h_proto != htons(ETH_P_8021Q))
2417 tp_len = -EMSGSIZE;
2418 }
69e3c75f
JB
2419 if (unlikely(tp_len < 0)) {
2420 if (po->tp_loss) {
2421 __packet_set_status(po, ph,
2422 TP_STATUS_AVAILABLE);
2423 packet_increment_head(&po->tx_ring);
2424 kfree_skb(skb);
2425 continue;
2426 } else {
2427 status = TP_STATUS_WRONG_FORMAT;
2428 err = tp_len;
2429 goto out_status;
2430 }
2431 }
2432
0fd5d57b
DB
2433 packet_pick_tx_queue(dev, skb);
2434
69e3c75f
JB
2435 skb->destructor = tpacket_destruct_skb;
2436 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2437 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2438
2439 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2440 err = po->xmit(skb);
eb70df13
JP
2441 if (unlikely(err > 0)) {
2442 err = net_xmit_errno(err);
2443 if (err && __packet_get_status(po, ph) ==
2444 TP_STATUS_AVAILABLE) {
2445 /* skb was destructed already */
2446 skb = NULL;
2447 goto out_status;
2448 }
2449 /*
2450 * skb was dropped but not destructed yet;
2451 * let's treat it like congestion or err < 0
2452 */
2453 err = 0;
2454 }
69e3c75f
JB
2455 packet_increment_head(&po->tx_ring);
2456 len_sum += tp_len;
b0138408
DB
2457 } while (likely((ph != NULL) ||
2458 /* Note: packet_read_pending() might be slow if we have
 2460		 * to call it, as it's a per-cpu variable, but in the fast path
2460 * we already short-circuit the loop with the first
2461 * condition, and luckily don't have to go that path
2462 * anyway.
2463 */
2464 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2465
2466 err = len_sum;
2467 goto out_put;
2468
69e3c75f
JB
2469out_status:
2470 __packet_set_status(po, ph, status);
2471 kfree_skb(skb);
2472out_put:
e40526cb 2473 dev_put(dev);
69e3c75f
JB
2474out:
2475 mutex_unlock(&po->pg_vec_lock);
2476 return err;
2477}
69e3c75f 2478
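/* Allocate an skb for the non-ring send path: 'linear' bytes (a hint taken
 * from the virtio_net header, if present) go into the linear area, the rest
 * of 'len' is allocated as paged data.
 */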
eea49cc9
OJ
2479static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2480 size_t reserve, size_t len,
2481 size_t linear, int noblock,
2482 int *err)
bfd5f4a3
SS
2483{
2484 struct sk_buff *skb;
2485
2486 /* Under a page? Don't bother with paged skb. */
2487 if (prepad + len < PAGE_SIZE || !linear)
2488 linear = len;
2489
2490 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2491 err, 0);
bfd5f4a3
SS
2492 if (!skb)
2493 return NULL;
2494
2495 skb_reserve(skb, reserve);
2496 skb_put(skb, linear);
2497 skb->data_len = len - linear;
2498 skb->len += len - linear;
2499
2500 return skb;
2501}
2502
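/* Non-ring send path (packet_sendmsg() without a TX ring): copies the payload
 * from the msghdr, optionally consuming a leading virtio_net_hdr when
 * PACKET_VNET_HDR is enabled so checksum/GSO offload metadata can be passed
 * through to the device.
 */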
d346a3fa 2503static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2504{
2505 struct sock *sk = sock->sk;
342dfc30 2506 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2507 struct sk_buff *skb;
2508 struct net_device *dev;
0e11c91e 2509 __be16 proto;
1da177e4 2510 unsigned char *addr;
827d9780 2511 int err, reserve = 0;
bfd5f4a3
SS
2512 struct virtio_net_hdr vnet_hdr = { 0 };
2513 int offset = 0;
2514 int vnet_hdr_len;
2515 struct packet_sock *po = pkt_sk(sk);
2516 unsigned short gso_type = 0;
ae641949 2517 int hlen, tlen;
3bdc0eba 2518 int extra_len = 0;
8feb2fb2 2519 ssize_t n;
1da177e4
LT
2520
2521 /*
1ce4f28b 2522 * Get and verify the address.
1da177e4 2523 */
1ce4f28b 2524
66e56cd4 2525 if (likely(saddr == NULL)) {
e40526cb 2526 dev = packet_cached_dev_get(po);
1da177e4
LT
2527 proto = po->num;
2528 addr = NULL;
2529 } else {
2530 err = -EINVAL;
2531 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2532 goto out;
0fb375fb
EB
2533 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2534 goto out;
1da177e4
LT
2535 proto = saddr->sll_protocol;
2536 addr = saddr->sll_addr;
827d9780 2537 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2538 }
2539
1da177e4 2540 err = -ENXIO;
e40526cb 2541 if (unlikely(dev == NULL))
1da177e4 2542 goto out_unlock;
d5e76b0a 2543 err = -ENETDOWN;
e40526cb 2544 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2545 goto out_unlock;
2546
e40526cb
DB
2547 if (sock->type == SOCK_RAW)
2548 reserve = dev->hard_header_len;
bfd5f4a3
SS
2549 if (po->has_vnet_hdr) {
2550 vnet_hdr_len = sizeof(vnet_hdr);
2551
2552 err = -EINVAL;
2553 if (len < vnet_hdr_len)
2554 goto out_unlock;
2555
2556 len -= vnet_hdr_len;
2557
8feb2fb2 2558 err = -EFAULT;
c0371da6 2559 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2560 if (n != vnet_hdr_len)
bfd5f4a3
SS
2561 goto out_unlock;
2562
2563 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2564 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2565 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2566 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2567 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2568 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2569 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2570
2571 err = -EINVAL;
dc9e5153 2572 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2573 goto out_unlock;
2574
2575 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2576 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2577 case VIRTIO_NET_HDR_GSO_TCPV4:
2578 gso_type = SKB_GSO_TCPV4;
2579 break;
2580 case VIRTIO_NET_HDR_GSO_TCPV6:
2581 gso_type = SKB_GSO_TCPV6;
2582 break;
2583 case VIRTIO_NET_HDR_GSO_UDP:
2584 gso_type = SKB_GSO_UDP;
2585 break;
2586 default:
2587 goto out_unlock;
2588 }
2589
2590 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2591 gso_type |= SKB_GSO_TCP_ECN;
2592
2593 if (vnet_hdr.gso_size == 0)
2594 goto out_unlock;
2595
2596 }
2597 }
2598
3bdc0eba
BG
2599 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2600 if (!netif_supports_nofcs(dev)) {
2601 err = -EPROTONOSUPPORT;
2602 goto out_unlock;
2603 }
2604 extra_len = 4; /* We're doing our own CRC */
2605 }
2606
1da177e4 2607 err = -EMSGSIZE;
3bdc0eba 2608 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2609 goto out_unlock;
2610
bfd5f4a3 2611 err = -ENOBUFS;
ae641949
HX
2612 hlen = LL_RESERVED_SPACE(dev);
2613 tlen = dev->needed_tailroom;
dc9e5153
MT
2614 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2615 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2616 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2617 if (skb == NULL)
1da177e4
LT
2618 goto out_unlock;
2619
bfd5f4a3 2620 skb_set_network_header(skb, reserve);
1da177e4 2621
0c4e8581 2622 err = -EINVAL;
9c707762
WB
2623 if (sock->type == SOCK_DGRAM) {
2624 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2625 if (unlikely(offset < 0))
9c707762
WB
2626 goto out_free;
2627 } else {
2628 if (ll_header_truncated(dev, len))
2629 goto out_free;
2630 }
1da177e4
LT
2631
2632 /* Returns -EFAULT on error */
c0371da6 2633 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2634 if (err)
2635 goto out_free;
bf84a010
DB
2636
2637 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2638
3bdc0eba 2639 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2640 /* Earlier code assumed this would be a VLAN pkt,
2641 * double-check this now that we have the actual
2642 * packet in hand.
2643 */
2644 struct ethhdr *ehdr;
2645 skb_reset_mac_header(skb);
2646 ehdr = eth_hdr(skb);
2647 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2648 err = -EMSGSIZE;
2649 goto out_free;
2650 }
57f89bfa
BG
2651 }
2652
09effa67
DM
2653 skb->protocol = proto;
2654 skb->dev = dev;
1da177e4 2655 skb->priority = sk->sk_priority;
2d37a186 2656 skb->mark = sk->sk_mark;
0fd5d57b
DB
2657
2658 packet_pick_tx_queue(dev, skb);
1da177e4 2659
bfd5f4a3
SS
2660 if (po->has_vnet_hdr) {
2661 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2662 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2663 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2664 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2665 err = -EINVAL;
2666 goto out_free;
2667 }
2668 }
2669
dc9e5153
MT
2670 skb_shinfo(skb)->gso_size =
2671 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2672 skb_shinfo(skb)->gso_type = gso_type;
2673
2674 /* Header must be checked, and gso_segs computed. */
2675 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2676 skb_shinfo(skb)->gso_segs = 0;
2677
2678 len += vnet_hdr_len;
2679 }
2680
d346a3fa
DB
2681 if (!packet_use_direct_xmit(po))
2682 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2683 if (unlikely(extra_len == 4))
2684 skb->no_fcs = 1;
2685
d346a3fa 2686 err = po->xmit(skb);
1da177e4
LT
2687 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2688 goto out_unlock;
2689
e40526cb 2690 dev_put(dev);
1da177e4 2691
40d4e3df 2692 return len;
1da177e4
LT
2693
2694out_free:
2695 kfree_skb(skb);
2696out_unlock:
e40526cb 2697 if (dev)
1da177e4
LT
2698 dev_put(dev);
2699out:
2700 return err;
2701}
2702
1b784140 2703static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2704{
69e3c75f
JB
2705 struct sock *sk = sock->sk;
2706 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2707
69e3c75f
JB
2708 if (po->tx_ring.pg_vec)
2709 return tpacket_snd(po, msg);
2710 else
69e3c75f
JB
2711 return packet_snd(sock, msg, len);
2712}
2713
1da177e4
LT
2714/*
2715 * Close a PACKET socket. This is fairly simple. We immediately go
2716 * to 'closed' state and remove our protocol entry in the device list.
2717 */
2718
2719static int packet_release(struct socket *sock)
2720{
2721 struct sock *sk = sock->sk;
2722 struct packet_sock *po;
d12d01d6 2723 struct net *net;
f6fb8f10 2724 union tpacket_req_u req_u;
1da177e4
LT
2725
2726 if (!sk)
2727 return 0;
2728
3b1e0a65 2729 net = sock_net(sk);
1da177e4
LT
2730 po = pkt_sk(sk);
2731
0fa7fa98 2732 mutex_lock(&net->packet.sklist_lock);
808f5114 2733 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2734 mutex_unlock(&net->packet.sklist_lock);
2735
2736 preempt_disable();
920de804 2737 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2738 preempt_enable();
1da177e4 2739
808f5114 2740 spin_lock(&po->bind_lock);
ce06b03e 2741 unregister_prot_hook(sk, false);
66e56cd4
DB
2742 packet_cached_dev_reset(po);
2743
160ff18a
BG
2744 if (po->prot_hook.dev) {
2745 dev_put(po->prot_hook.dev);
2746 po->prot_hook.dev = NULL;
2747 }
808f5114 2748 spin_unlock(&po->bind_lock);
1da177e4 2749
1da177e4 2750 packet_flush_mclist(sk);
1da177e4 2751
9665d5d6
PS
2752 if (po->rx_ring.pg_vec) {
2753 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2754 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2755 }
69e3c75f 2756
9665d5d6
PS
2757 if (po->tx_ring.pg_vec) {
2758 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2759 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2760 }
1da177e4 2761
dc99f600
DM
2762 fanout_release(sk);
2763
808f5114 2764 synchronize_net();
1da177e4
LT
2765 /*
2766 * Now the socket is dead. No more input will appear.
2767 */
1da177e4
LT
2768 sock_orphan(sk);
2769 sock->sk = NULL;
2770
2771 /* Purge queues */
2772
2773 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2774 packet_free_pending(po);
17ab56a2 2775 sk_refcnt_debug_release(sk);
1da177e4
LT
2776
2777 sock_put(sk);
2778 return 0;
2779}
2780
2781/*
2782 * Attach a packet hook.
2783 */
2784
902fefb8 2785static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2786{
2787 struct packet_sock *po = pkt_sk(sk);
158cd4af 2788 struct net_device *dev_curr;
902fefb8
DB
2789 __be16 proto_curr;
2790 bool need_rehook;
dc99f600 2791
aef950b4
WY
2792 if (po->fanout) {
2793 if (dev)
2794 dev_put(dev);
2795
dc99f600 2796 return -EINVAL;
aef950b4 2797 }
1da177e4
LT
2798
2799 lock_sock(sk);
1da177e4 2800 spin_lock(&po->bind_lock);
66e56cd4 2801
902fefb8
DB
2802 proto_curr = po->prot_hook.type;
2803 dev_curr = po->prot_hook.dev;
2804
2805 need_rehook = proto_curr != proto || dev_curr != dev;
2806
2807 if (need_rehook) {
2808 unregister_prot_hook(sk, true);
1da177e4 2809
902fefb8
DB
2810 po->num = proto;
2811 po->prot_hook.type = proto;
902fefb8
DB
2812 po->prot_hook.dev = dev;
2813
2814 po->ifindex = dev ? dev->ifindex : 0;
2815 packet_cached_dev_assign(po, dev);
2816 }
158cd4af
LW
2817 if (dev_curr)
2818 dev_put(dev_curr);
66e56cd4 2819
902fefb8 2820 if (proto == 0 || !need_rehook)
1da177e4
LT
2821 goto out_unlock;
2822
be85d4ad 2823 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2824 register_prot_hook(sk);
be85d4ad
UT
2825 } else {
2826 sk->sk_err = ENETDOWN;
2827 if (!sock_flag(sk, SOCK_DEAD))
2828 sk->sk_error_report(sk);
1da177e4
LT
2829 }
2830
2831out_unlock:
2832 spin_unlock(&po->bind_lock);
2833 release_sock(sk);
2834 return 0;
2835}
2836
2837/*
2838 * Bind a packet socket to a device
2839 */
2840
40d4e3df
ED
2841static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2842 int addr_len)
1da177e4 2843{
40d4e3df 2844 struct sock *sk = sock->sk;
1da177e4
LT
2845 char name[15];
2846 struct net_device *dev;
2847 int err = -ENODEV;
1ce4f28b 2848
1da177e4
LT
2849 /*
2850 * Check legality
2851 */
1ce4f28b 2852
8ae55f04 2853 if (addr_len != sizeof(struct sockaddr))
1da177e4 2854 return -EINVAL;
40d4e3df 2855 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2856
3b1e0a65 2857 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2858 if (dev)
1da177e4 2859 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2860 return err;
2861}
1da177e4
LT
2862
2863static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2864{
40d4e3df
ED
2865 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2866 struct sock *sk = sock->sk;
1da177e4
LT
2867 struct net_device *dev = NULL;
2868 int err;
2869
2870
2871 /*
2872 * Check legality
2873 */
1ce4f28b 2874
1da177e4
LT
2875 if (addr_len < sizeof(struct sockaddr_ll))
2876 return -EINVAL;
2877 if (sll->sll_family != AF_PACKET)
2878 return -EINVAL;
2879
2880 if (sll->sll_ifindex) {
2881 err = -ENODEV;
3b1e0a65 2882 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2883 if (dev == NULL)
2884 goto out;
2885 }
2886 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2887
2888out:
2889 return err;
2890}
2891
2892static struct proto packet_proto = {
2893 .name = "PACKET",
2894 .owner = THIS_MODULE,
2895 .obj_size = sizeof(struct packet_sock),
2896};
2897
2898/*
1ce4f28b 2899 *	Create a packet socket (SOCK_RAW, SOCK_DGRAM, or the legacy SOCK_PACKET).
1da177e4
LT
2900 */
2901
3f378b68
EP
2902static int packet_create(struct net *net, struct socket *sock, int protocol,
2903 int kern)
1da177e4
LT
2904{
2905 struct sock *sk;
2906 struct packet_sock *po;
0e11c91e 2907 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2908 int err;
2909
df008c91 2910 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2911 return -EPERM;
be02097c
DM
2912 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2913 sock->type != SOCK_PACKET)
1da177e4
LT
2914 return -ESOCKTNOSUPPORT;
2915
2916 sock->state = SS_UNCONNECTED;
2917
2918 err = -ENOBUFS;
11aa9c28 2919 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
2920 if (sk == NULL)
2921 goto out;
2922
2923 sock->ops = &packet_ops;
1da177e4
LT
2924 if (sock->type == SOCK_PACKET)
2925 sock->ops = &packet_ops_spkt;
be02097c 2926
1da177e4
LT
2927 sock_init_data(sock, sk);
2928
2929 po = pkt_sk(sk);
2930 sk->sk_family = PF_PACKET;
0e11c91e 2931 po->num = proto;
d346a3fa 2932 po->xmit = dev_queue_xmit;
66e56cd4 2933
b0138408
DB
2934 err = packet_alloc_pending(po);
2935 if (err)
2936 goto out2;
2937
66e56cd4 2938 packet_cached_dev_reset(po);
1da177e4
LT
2939
2940 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2941 sk_refcnt_debug_inc(sk);
1da177e4
LT
2942
2943 /*
2944 * Attach a protocol block
2945 */
2946
2947 spin_lock_init(&po->bind_lock);
905db440 2948 mutex_init(&po->pg_vec_lock);
0648ab70 2949 po->rollover = NULL;
1da177e4 2950 po->prot_hook.func = packet_rcv;
be02097c 2951
1da177e4
LT
2952 if (sock->type == SOCK_PACKET)
2953 po->prot_hook.func = packet_rcv_spkt;
be02097c 2954
1da177e4
LT
2955 po->prot_hook.af_packet_priv = sk;
2956
0e11c91e
AV
2957 if (proto) {
2958 po->prot_hook.type = proto;
ce06b03e 2959 register_prot_hook(sk);
1da177e4
LT
2960 }
2961
0fa7fa98 2962 mutex_lock(&net->packet.sklist_lock);
808f5114 2963 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2964 mutex_unlock(&net->packet.sklist_lock);
2965
2966 preempt_disable();
3680453c 2967 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2968 preempt_enable();
808f5114 2969
40d4e3df 2970 return 0;
b0138408
DB
2971out2:
2972 sk_free(sk);
1da177e4
LT
2973out:
2974 return err;
2975}
2976
2977/*
2978 * Pull a packet from our receive queue and hand it to the user.
2979 * If necessary we block.
2980 */
2981
1b784140
YX
2982static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2983 int flags)
1da177e4
LT
2984{
2985 struct sock *sk = sock->sk;
2986 struct sk_buff *skb;
2987 int copied, err;
bfd5f4a3 2988 int vnet_hdr_len = 0;
2472d761 2989 unsigned int origlen = 0;
1da177e4
LT
2990
2991 err = -EINVAL;
ed85b565 2992 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2993 goto out;
2994
2995#if 0
2996 /* What error should we return now? EUNATTACH? */
2997 if (pkt_sk(sk)->ifindex < 0)
2998 return -ENODEV;
2999#endif
3000
ed85b565 3001 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3002 err = sock_recv_errqueue(sk, msg, len,
3003 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3004 goto out;
3005 }
3006
1da177e4
LT
3007 /*
3008 * Call the generic datagram receiver. This handles all sorts
3009 * of horrible races and re-entrancy so we can forget about it
3010 * in the protocol layers.
3011 *
 3012	 *	Now it will return ENETDOWN if the device has just gone down,
3013 * but then it will block.
3014 */
3015
40d4e3df 3016 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3017
3018 /*
1ce4f28b 3019 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3020	 *	handles the blocking, we don't need to see or worry about blocking
3021 * retries.
3022 */
3023
8ae55f04 3024 if (skb == NULL)
1da177e4
LT
3025 goto out;
3026
2ccdbaa6
WB
3027 if (pkt_sk(sk)->pressure)
3028 packet_rcv_has_room(pkt_sk(sk), NULL);
3029
bfd5f4a3
SS
3030 if (pkt_sk(sk)->has_vnet_hdr) {
3031 struct virtio_net_hdr vnet_hdr = { 0 };
3032
3033 err = -EINVAL;
3034 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 3035 if (len < vnet_hdr_len)
bfd5f4a3
SS
3036 goto out_free;
3037
1f18b717
MK
3038 len -= vnet_hdr_len;
3039
bfd5f4a3
SS
3040 if (skb_is_gso(skb)) {
3041 struct skb_shared_info *sinfo = skb_shinfo(skb);
3042
3043 /* This is a hint as to how much should be linear. */
dc9e5153
MT
3044 vnet_hdr.hdr_len =
3045 __cpu_to_virtio16(false, skb_headlen(skb));
3046 vnet_hdr.gso_size =
3047 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
3048 if (sinfo->gso_type & SKB_GSO_TCPV4)
3049 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
3050 else if (sinfo->gso_type & SKB_GSO_TCPV6)
3051 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
3052 else if (sinfo->gso_type & SKB_GSO_UDP)
3053 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
3054 else if (sinfo->gso_type & SKB_GSO_FCOE)
3055 goto out_free;
3056 else
3057 BUG();
3058 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
3059 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
3060 } else
3061 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
3062
3063 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3064 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
3065 vnet_hdr.csum_start = __cpu_to_virtio16(false,
3066 skb_checksum_start_offset(skb));
3067 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
3068 skb->csum_offset);
10a8d94a
JW
3069 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
3070 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
3071 } /* else everything is zero */
3072
7eab8d9e 3073 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
3074 if (err < 0)
3075 goto out_free;
3076 }
3077
f3d33426
HFS
3078 /* You lose any data beyond the buffer you gave. If it worries
 3079	 * a user program, it can ask the device for its MTU
3080 * anyway.
1da177e4 3081 */
1da177e4 3082 copied = skb->len;
40d4e3df
ED
3083 if (copied > len) {
3084 copied = len;
3085 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3086 }
3087
51f3d02b 3088 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3089 if (err)
3090 goto out_free;
3091
2472d761
EB
3092 if (sock->type != SOCK_PACKET) {
3093 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3094
3095 /* Original length was stored in sockaddr_ll fields */
3096 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3097 sll->sll_family = AF_PACKET;
3098 sll->sll_protocol = skb->protocol;
3099 }
3100
3b885787 3101 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3102
f3d33426
HFS
3103 if (msg->msg_name) {
3104 /* If the address length field is there to be filled
3105 * in, we fill it in now.
3106 */
3107 if (sock->type == SOCK_PACKET) {
342dfc30 3108 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3109 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3110 } else {
3111 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3112
f3d33426
HFS
3113 msg->msg_namelen = sll->sll_halen +
3114 offsetof(struct sockaddr_ll, sll_addr);
3115 }
ffbc6111
HX
3116 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3117 msg->msg_namelen);
f3d33426 3118 }
1da177e4 3119
8dc41944 3120 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3121 struct tpacket_auxdata aux;
3122
3123 aux.tp_status = TP_STATUS_USER;
3124 if (skb->ip_summed == CHECKSUM_PARTIAL)
3125 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3126 else if (skb->pkt_type != PACKET_OUTGOING &&
3127 (skb->ip_summed == CHECKSUM_COMPLETE ||
3128 skb_csum_unnecessary(skb)))
3129 aux.tp_status |= TP_STATUS_CSUM_VALID;
3130
2472d761 3131 aux.tp_len = origlen;
ffbc6111
HX
3132 aux.tp_snaplen = skb->len;
3133 aux.tp_mac = 0;
bbe735e4 3134 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3135 if (skb_vlan_tag_present(skb)) {
3136 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3137 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3138 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3139 } else {
3140 aux.tp_vlan_tci = 0;
a0cdfcf3 3141 aux.tp_vlan_tpid = 0;
a3bcc23e 3142 }
ffbc6111 3143 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3144 }
3145
1da177e4
LT
3146 /*
3147 * Free or return the buffer as appropriate. Again this
3148 * hides all the races and re-entrancy issues from us.
3149 */
bfd5f4a3 3150 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3151
3152out_free:
3153 skb_free_datagram(sk, skb);
3154out:
3155 return err;
3156}
3157
1da177e4
LT
3158static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3159 int *uaddr_len, int peer)
3160{
3161 struct net_device *dev;
3162 struct sock *sk = sock->sk;
3163
3164 if (peer)
3165 return -EOPNOTSUPP;
3166
3167 uaddr->sa_family = AF_PACKET;
2dc85bf3 3168 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3169 rcu_read_lock();
3170 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3171 if (dev)
2dc85bf3 3172 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3173 rcu_read_unlock();
1da177e4
LT
3174 *uaddr_len = sizeof(*uaddr);
3175
3176 return 0;
3177}
1da177e4
LT
3178
3179static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3180 int *uaddr_len, int peer)
3181{
3182 struct net_device *dev;
3183 struct sock *sk = sock->sk;
3184 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3185 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3186
3187 if (peer)
3188 return -EOPNOTSUPP;
3189
3190 sll->sll_family = AF_PACKET;
3191 sll->sll_ifindex = po->ifindex;
3192 sll->sll_protocol = po->num;
67286640 3193 sll->sll_pkttype = 0;
654d1f8a
ED
3194 rcu_read_lock();
3195 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3196 if (dev) {
3197 sll->sll_hatype = dev->type;
3198 sll->sll_halen = dev->addr_len;
3199 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3200 } else {
3201 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3202 sll->sll_halen = 0;
3203 }
654d1f8a 3204 rcu_read_unlock();
0fb375fb 3205 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3206
3207 return 0;
3208}
3209
2aeb0b88
WC
3210static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3211 int what)
1da177e4
LT
3212{
3213 switch (i->type) {
3214 case PACKET_MR_MULTICAST:
1162563f
JP
3215 if (i->alen != dev->addr_len)
3216 return -EINVAL;
1da177e4 3217 if (what > 0)
22bedad3 3218 return dev_mc_add(dev, i->addr);
1da177e4 3219 else
22bedad3 3220 return dev_mc_del(dev, i->addr);
1da177e4
LT
3221 break;
3222 case PACKET_MR_PROMISC:
2aeb0b88 3223 return dev_set_promiscuity(dev, what);
1da177e4 3224 case PACKET_MR_ALLMULTI:
2aeb0b88 3225 return dev_set_allmulti(dev, what);
d95ed927 3226 case PACKET_MR_UNICAST:
1162563f
JP
3227 if (i->alen != dev->addr_len)
3228 return -EINVAL;
d95ed927 3229 if (what > 0)
a748ee24 3230 return dev_uc_add(dev, i->addr);
d95ed927 3231 else
a748ee24 3232 return dev_uc_del(dev, i->addr);
d95ed927 3233 break;
40d4e3df
ED
3234 default:
3235 break;
1da177e4 3236 }
2aeb0b88 3237 return 0;
1da177e4
LT
3238}
3239
82f17091
FR
3240static void packet_dev_mclist_delete(struct net_device *dev,
3241 struct packet_mclist **mlp)
1da177e4 3242{
82f17091
FR
3243 struct packet_mclist *ml;
3244
3245 while ((ml = *mlp) != NULL) {
3246 if (ml->ifindex == dev->ifindex) {
3247 packet_dev_mc(dev, ml, -1);
3248 *mlp = ml->next;
3249 kfree(ml);
3250 } else
3251 mlp = &ml->next;
1da177e4
LT
3252 }
3253}
3254
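/* Add a PACKET_ADD_MEMBERSHIP entry: record it on the socket's mclist and
 * program the device accordingly (multicast/unicast address, allmulti or
 * promiscuous mode) via packet_dev_mc().
 */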
0fb375fb 3255static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3256{
3257 struct packet_sock *po = pkt_sk(sk);
3258 struct packet_mclist *ml, *i;
3259 struct net_device *dev;
3260 int err;
3261
3262 rtnl_lock();
3263
3264 err = -ENODEV;
3b1e0a65 3265 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3266 if (!dev)
3267 goto done;
3268
3269 err = -EINVAL;
1162563f 3270 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3271 goto done;
3272
3273 err = -ENOBUFS;
8b3a7005 3274 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3275 if (i == NULL)
3276 goto done;
3277
3278 err = 0;
3279 for (ml = po->mclist; ml; ml = ml->next) {
3280 if (ml->ifindex == mreq->mr_ifindex &&
3281 ml->type == mreq->mr_type &&
3282 ml->alen == mreq->mr_alen &&
3283 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3284 ml->count++;
3285 /* Free the new element ... */
3286 kfree(i);
3287 goto done;
3288 }
3289 }
3290
3291 i->type = mreq->mr_type;
3292 i->ifindex = mreq->mr_ifindex;
3293 i->alen = mreq->mr_alen;
3294 memcpy(i->addr, mreq->mr_address, i->alen);
3295 i->count = 1;
3296 i->next = po->mclist;
3297 po->mclist = i;
2aeb0b88
WC
3298 err = packet_dev_mc(dev, i, 1);
3299 if (err) {
3300 po->mclist = i->next;
3301 kfree(i);
3302 }
1da177e4
LT
3303
3304done:
3305 rtnl_unlock();
3306 return err;
3307}
3308
0fb375fb 3309static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3310{
3311 struct packet_mclist *ml, **mlp;
3312
3313 rtnl_lock();
3314
3315 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3316 if (ml->ifindex == mreq->mr_ifindex &&
3317 ml->type == mreq->mr_type &&
3318 ml->alen == mreq->mr_alen &&
3319 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3320 if (--ml->count == 0) {
3321 struct net_device *dev;
3322 *mlp = ml->next;
ad959e76
ED
3323 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3324 if (dev)
1da177e4 3325 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3326 kfree(ml);
3327 }
82f17091 3328 break;
1da177e4
LT
3329 }
3330 }
3331 rtnl_unlock();
82f17091 3332 return 0;
1da177e4
LT
3333}
3334
3335static void packet_flush_mclist(struct sock *sk)
3336{
3337 struct packet_sock *po = pkt_sk(sk);
3338 struct packet_mclist *ml;
3339
3340 if (!po->mclist)
3341 return;
3342
3343 rtnl_lock();
3344 while ((ml = po->mclist) != NULL) {
3345 struct net_device *dev;
3346
3347 po->mclist = ml->next;
ad959e76
ED
3348 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3349 if (dev != NULL)
1da177e4 3350 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3351 kfree(ml);
3352 }
3353 rtnl_unlock();
3354}
1da177e4
LT
3355
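/* Illustrative userspace sketch (not part of this file; the sizes below are
 * arbitrary example values): a typical caller selects a header version and
 * sets up a receive ring roughly like
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,	.tp_block_nr = 64,
 *		.tp_frame_size = 2048,	.tp_frame_nr = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * which lands in the PACKET_VERSION and PACKET_RX_RING cases below.
 */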
3356static int
b7058842 3357packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3358{
3359 struct sock *sk = sock->sk;
8dc41944 3360 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3361 int ret;
3362
3363 if (level != SOL_PACKET)
3364 return -ENOPROTOOPT;
3365
69e3c75f 3366 switch (optname) {
1ce4f28b 3367 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3368 case PACKET_DROP_MEMBERSHIP:
3369 {
0fb375fb
EB
3370 struct packet_mreq_max mreq;
3371 int len = optlen;
3372 memset(&mreq, 0, sizeof(mreq));
3373 if (len < sizeof(struct packet_mreq))
1da177e4 3374 return -EINVAL;
0fb375fb
EB
3375 if (len > sizeof(mreq))
3376 len = sizeof(mreq);
40d4e3df 3377 if (copy_from_user(&mreq, optval, len))
1da177e4 3378 return -EFAULT;
0fb375fb
EB
3379 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3380 return -EINVAL;
1da177e4
LT
3381 if (optname == PACKET_ADD_MEMBERSHIP)
3382 ret = packet_mc_add(sk, &mreq);
3383 else
3384 ret = packet_mc_drop(sk, &mreq);
3385 return ret;
3386 }
a2efcfa0 3387
1da177e4 3388 case PACKET_RX_RING:
69e3c75f 3389 case PACKET_TX_RING:
1da177e4 3390 {
f6fb8f10 3391 union tpacket_req_u req_u;
3392 int len;
1da177e4 3393
f6fb8f10 3394 switch (po->tp_version) {
3395 case TPACKET_V1:
3396 case TPACKET_V2:
3397 len = sizeof(req_u.req);
3398 break;
3399 case TPACKET_V3:
3400 default:
3401 len = sizeof(req_u.req3);
3402 break;
3403 }
3404 if (optlen < len)
1da177e4 3405 return -EINVAL;
bfd5f4a3
SS
3406 if (pkt_sk(sk)->has_vnet_hdr)
3407 return -EINVAL;
f6fb8f10 3408 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3409 return -EFAULT;
f6fb8f10 3410 return packet_set_ring(sk, &req_u, 0,
3411 optname == PACKET_TX_RING);
1da177e4
LT
3412 }
3413 case PACKET_COPY_THRESH:
3414 {
3415 int val;
3416
40d4e3df 3417 if (optlen != sizeof(val))
1da177e4 3418 return -EINVAL;
40d4e3df 3419 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3420 return -EFAULT;
3421
3422 pkt_sk(sk)->copy_thresh = val;
3423 return 0;
3424 }
bbd6ef87
PM
3425 case PACKET_VERSION:
3426 {
3427 int val;
3428
3429 if (optlen != sizeof(val))
3430 return -EINVAL;
69e3c75f 3431 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3432 return -EBUSY;
3433 if (copy_from_user(&val, optval, sizeof(val)))
3434 return -EFAULT;
3435 switch (val) {
3436 case TPACKET_V1:
3437 case TPACKET_V2:
f6fb8f10 3438 case TPACKET_V3:
bbd6ef87
PM
3439 po->tp_version = val;
3440 return 0;
3441 default:
3442 return -EINVAL;
3443 }
3444 }
8913336a
PM
3445 case PACKET_RESERVE:
3446 {
3447 unsigned int val;
3448
3449 if (optlen != sizeof(val))
3450 return -EINVAL;
69e3c75f 3451 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3452 return -EBUSY;
3453 if (copy_from_user(&val, optval, sizeof(val)))
3454 return -EFAULT;
3455 po->tp_reserve = val;
3456 return 0;
3457 }
69e3c75f
JB
3458 case PACKET_LOSS:
3459 {
3460 unsigned int val;
3461
3462 if (optlen != sizeof(val))
3463 return -EINVAL;
3464 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3465 return -EBUSY;
3466 if (copy_from_user(&val, optval, sizeof(val)))
3467 return -EFAULT;
3468 po->tp_loss = !!val;
3469 return 0;
3470 }
8dc41944
HX
3471 case PACKET_AUXDATA:
3472 {
3473 int val;
3474
3475 if (optlen < sizeof(val))
3476 return -EINVAL;
3477 if (copy_from_user(&val, optval, sizeof(val)))
3478 return -EFAULT;
3479
3480 po->auxdata = !!val;
3481 return 0;
3482 }
80feaacb
PWJ
3483 case PACKET_ORIGDEV:
3484 {
3485 int val;
3486
3487 if (optlen < sizeof(val))
3488 return -EINVAL;
3489 if (copy_from_user(&val, optval, sizeof(val)))
3490 return -EFAULT;
3491
3492 po->origdev = !!val;
3493 return 0;
3494 }
bfd5f4a3
SS
3495 case PACKET_VNET_HDR:
3496 {
3497 int val;
3498
3499 if (sock->type != SOCK_RAW)
3500 return -EINVAL;
3501 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3502 return -EBUSY;
3503 if (optlen < sizeof(val))
3504 return -EINVAL;
3505 if (copy_from_user(&val, optval, sizeof(val)))
3506 return -EFAULT;
3507
3508 po->has_vnet_hdr = !!val;
3509 return 0;
3510 }
614f60fa
SM
3511 case PACKET_TIMESTAMP:
3512 {
3513 int val;
3514
3515 if (optlen != sizeof(val))
3516 return -EINVAL;
3517 if (copy_from_user(&val, optval, sizeof(val)))
3518 return -EFAULT;
3519
3520 po->tp_tstamp = val;
3521 return 0;
3522 }
dc99f600
DM
3523 case PACKET_FANOUT:
3524 {
3525 int val;
3526
3527 if (optlen != sizeof(val))
3528 return -EINVAL;
3529 if (copy_from_user(&val, optval, sizeof(val)))
3530 return -EFAULT;
3531
3532 return fanout_add(sk, val & 0xffff, val >> 16);
3533 }
5920cd3a
PC
3534 case PACKET_TX_HAS_OFF:
3535 {
3536 unsigned int val;
3537
3538 if (optlen != sizeof(val))
3539 return -EINVAL;
3540 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3541 return -EBUSY;
3542 if (copy_from_user(&val, optval, sizeof(val)))
3543 return -EFAULT;
3544 po->tp_tx_has_off = !!val;
3545 return 0;
3546 }
d346a3fa
DB
3547 case PACKET_QDISC_BYPASS:
3548 {
3549 int val;
3550
3551 if (optlen != sizeof(val))
3552 return -EINVAL;
3553 if (copy_from_user(&val, optval, sizeof(val)))
3554 return -EFAULT;
3555
3556 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3557 return 0;
3558 }
1da177e4
LT
3559 default:
3560 return -ENOPROTOOPT;
3561 }
3562}
3563
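/* getsockopt(SOL_PACKET): note that PACKET_STATISTICS is clear-on-read --
 * the counters are zeroed once they have been copied to userspace.
 */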
3564static int packet_getsockopt(struct socket *sock, int level, int optname,
3565 char __user *optval, int __user *optlen)
3566{
3567 int len;
c06fff6e 3568 int val, lv = sizeof(val);
1da177e4
LT
3569 struct sock *sk = sock->sk;
3570 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3571 void *data = &val;
ee80fbf3 3572 union tpacket_stats_u st;
a9b63918 3573 struct tpacket_rollover_stats rstats;
1da177e4
LT
3574
3575 if (level != SOL_PACKET)
3576 return -ENOPROTOOPT;
3577
8ae55f04
KK
3578 if (get_user(len, optlen))
3579 return -EFAULT;
1da177e4
LT
3580
3581 if (len < 0)
3582 return -EINVAL;
1ce4f28b 3583
69e3c75f 3584 switch (optname) {
1da177e4 3585 case PACKET_STATISTICS:
1da177e4 3586 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3587 memcpy(&st, &po->stats, sizeof(st));
3588 memset(&po->stats, 0, sizeof(po->stats));
3589 spin_unlock_bh(&sk->sk_receive_queue.lock);
3590
f6fb8f10 3591 if (po->tp_version == TPACKET_V3) {
c06fff6e 3592 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3593 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3594 data = &st.stats3;
f6fb8f10 3595 } else {
c06fff6e 3596 lv = sizeof(struct tpacket_stats);
8bcdeaff 3597 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3598 data = &st.stats1;
f6fb8f10 3599 }
ee80fbf3 3600
8dc41944
HX
3601 break;
3602 case PACKET_AUXDATA:
8dc41944 3603 val = po->auxdata;
80feaacb
PWJ
3604 break;
3605 case PACKET_ORIGDEV:
80feaacb 3606 val = po->origdev;
bfd5f4a3
SS
3607 break;
3608 case PACKET_VNET_HDR:
bfd5f4a3 3609 val = po->has_vnet_hdr;
1da177e4 3610 break;
bbd6ef87 3611 case PACKET_VERSION:
bbd6ef87 3612 val = po->tp_version;
bbd6ef87
PM
3613 break;
3614 case PACKET_HDRLEN:
3615 if (len > sizeof(int))
3616 len = sizeof(int);
3617 if (copy_from_user(&val, optval, len))
3618 return -EFAULT;
3619 switch (val) {
3620 case TPACKET_V1:
3621 val = sizeof(struct tpacket_hdr);
3622 break;
3623 case TPACKET_V2:
3624 val = sizeof(struct tpacket2_hdr);
3625 break;
f6fb8f10 3626 case TPACKET_V3:
3627 val = sizeof(struct tpacket3_hdr);
3628 break;
bbd6ef87
PM
3629 default:
3630 return -EINVAL;
3631 }
bbd6ef87 3632 break;
8913336a 3633 case PACKET_RESERVE:
8913336a 3634 val = po->tp_reserve;
8913336a 3635 break;
69e3c75f 3636 case PACKET_LOSS:
69e3c75f 3637 val = po->tp_loss;
69e3c75f 3638 break;
614f60fa 3639 case PACKET_TIMESTAMP:
614f60fa 3640 val = po->tp_tstamp;
614f60fa 3641 break;
dc99f600 3642 case PACKET_FANOUT:
dc99f600
DM
3643 val = (po->fanout ?
3644 ((u32)po->fanout->id |
77f65ebd
WB
3645 ((u32)po->fanout->type << 16) |
3646 ((u32)po->fanout->flags << 24)) :
dc99f600 3647 0);
dc99f600 3648 break;
a9b63918
WB
3649 case PACKET_ROLLOVER_STATS:
3650 if (!po->rollover)
3651 return -EINVAL;
3652 rstats.tp_all = atomic_long_read(&po->rollover->num);
3653 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3654 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3655 data = &rstats;
3656 lv = sizeof(rstats);
3657 break;
5920cd3a
PC
3658 case PACKET_TX_HAS_OFF:
3659 val = po->tp_tx_has_off;
3660 break;
d346a3fa
DB
3661 case PACKET_QDISC_BYPASS:
3662 val = packet_use_direct_xmit(po);
3663 break;
1da177e4
LT
3664 default:
3665 return -ENOPROTOOPT;
3666 }
3667
c06fff6e
ED
3668 if (len > lv)
3669 len = lv;
8ae55f04
KK
3670 if (put_user(len, optlen))
3671 return -EFAULT;
8dc41944
HX
3672 if (copy_to_user(optval, data, len))
3673 return -EFAULT;
8ae55f04 3674 return 0;
1da177e4
LT
3675}
3676
3677
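/* Netdevice notifier: on NETDEV_DOWN/NETDEV_UNREGISTER unhook any socket
 * bound to the device (flagging ENETDOWN towards userspace); on NETDEV_UP
 * re-register the protocol hook for sockets still bound to it.
 */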
351638e7
JP
3678static int packet_notifier(struct notifier_block *this,
3679 unsigned long msg, void *ptr)
1da177e4
LT
3680{
3681 struct sock *sk;
351638e7 3682 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3683 struct net *net = dev_net(dev);
1da177e4 3684
808f5114 3685 rcu_read_lock();
b67bfe0d 3686 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3687 struct packet_sock *po = pkt_sk(sk);
3688
3689 switch (msg) {
3690 case NETDEV_UNREGISTER:
1da177e4 3691 if (po->mclist)
82f17091 3692 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3693 /* fallthrough */
3694
1da177e4
LT
3695 case NETDEV_DOWN:
3696 if (dev->ifindex == po->ifindex) {
3697 spin_lock(&po->bind_lock);
3698 if (po->running) {
ce06b03e 3699 __unregister_prot_hook(sk, false);
1da177e4
LT
3700 sk->sk_err = ENETDOWN;
3701 if (!sock_flag(sk, SOCK_DEAD))
3702 sk->sk_error_report(sk);
3703 }
3704 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3705 packet_cached_dev_reset(po);
1da177e4 3706 po->ifindex = -1;
160ff18a
BG
3707 if (po->prot_hook.dev)
3708 dev_put(po->prot_hook.dev);
1da177e4
LT
3709 po->prot_hook.dev = NULL;
3710 }
3711 spin_unlock(&po->bind_lock);
3712 }
3713 break;
3714 case NETDEV_UP:
808f5114 3715 if (dev->ifindex == po->ifindex) {
3716 spin_lock(&po->bind_lock);
ce06b03e
DM
3717 if (po->num)
3718 register_prot_hook(sk);
808f5114 3719 spin_unlock(&po->bind_lock);
1da177e4 3720 }
1da177e4
LT
3721 break;
3722 }
3723 }
808f5114 3724 rcu_read_unlock();
1da177e4
LT
3725 return NOTIFY_DONE;
3726}
3727
3728
3729static int packet_ioctl(struct socket *sock, unsigned int cmd,
3730 unsigned long arg)
3731{
3732 struct sock *sk = sock->sk;
3733
69e3c75f 3734 switch (cmd) {
40d4e3df
ED
3735 case SIOCOUTQ:
3736 {
3737 int amount = sk_wmem_alloc_get(sk);
31e6d363 3738
40d4e3df
ED
3739 return put_user(amount, (int __user *)arg);
3740 }
3741 case SIOCINQ:
3742 {
3743 struct sk_buff *skb;
3744 int amount = 0;
3745
3746 spin_lock_bh(&sk->sk_receive_queue.lock);
3747 skb = skb_peek(&sk->sk_receive_queue);
3748 if (skb)
3749 amount = skb->len;
3750 spin_unlock_bh(&sk->sk_receive_queue.lock);
3751 return put_user(amount, (int __user *)arg);
3752 }
3753 case SIOCGSTAMP:
3754 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3755 case SIOCGSTAMPNS:
3756 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3757
1da177e4 3758#ifdef CONFIG_INET
40d4e3df
ED
3759 case SIOCADDRT:
3760 case SIOCDELRT:
3761 case SIOCDARP:
3762 case SIOCGARP:
3763 case SIOCSARP:
3764 case SIOCGIFADDR:
3765 case SIOCSIFADDR:
3766 case SIOCGIFBRDADDR:
3767 case SIOCSIFBRDADDR:
3768 case SIOCGIFNETMASK:
3769 case SIOCSIFNETMASK:
3770 case SIOCGIFDSTADDR:
3771 case SIOCSIFDSTADDR:
3772 case SIOCSIFFLAGS:
40d4e3df 3773 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3774#endif
3775
40d4e3df
ED
3776 default:
3777 return -ENOIOCTLCMD;
1da177e4
LT
3778 }
3779 return 0;
3780}
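
/*
 * A minimal userspace sketch of the SIOCINQ branch above, which reports the
 * length of the frame at the head of the receive queue (0 when the queue is
 * empty).  Assumes an already-open AF_PACKET socket "fd"; error handling
 * omitted:
 *
 *	int pending = 0;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("next frame: %d bytes\n", pending);
 */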

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		po->pressure = 0;
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}


/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, let's dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Alias added to keep the code churn minimal */
	struct tpacket_req *req = &req_u->req;

	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		WARN(1, "Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (po->tp_version >= TPACKET_V3 &&
		    (int)(req->tp_block_size -
			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above, but just being paranoid.
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
	}
	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
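
/*
 * The sanity checks above require tp_block_size to be a positive multiple of
 * PAGE_SIZE, tp_frame_size to be TPACKET_ALIGNMENT-aligned and at least
 * tp_hdrlen + tp_reserve, and tp_frame_nr to equal
 * (tp_block_size / tp_frame_size) * tp_block_nr.  A minimal userspace sketch
 * of a request that satisfies them (TPACKET_V1/V2 layout, 4 KiB pages and
 * purely illustrative sizes assumed):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 1 << 16,	// 64 KiB, i.e. 16 pages
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 1 << 11,	// 2 KiB
 *		.tp_frame_nr	= (1 << 16) / (1 << 11) * 64,	// 32 * 64
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */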

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
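
/*
 * packet_mmap() exposes the rx ring followed by the tx ring as one
 * contiguous area and rejects any mapping length other than their combined
 * size.  A minimal userspace sketch, assuming only an rx ring was configured
 * with the "req" values sketched above (there tp_block_size is an exact
 * multiple of tp_frame_size, so frame i starts at ring + i * tp_frame_size):
 *
 *	size_t ring_sz = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * A reader then polls the socket, consumes frames whose tp_status has left
 * TP_STATUS_KERNEL, and stores TP_STATUS_KERNEL back to hand them to the
 * kernel again.
 */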

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
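
/*
 * packet_create(), earlier in this file, selects between the two ops tables
 * above: legacy SOCK_PACKET sockets are served by packet_ops_spkt, while
 * SOCK_RAW and SOCK_DGRAM sockets get packet_ops and with it the mmap'ed
 * ring support.  A typical userspace opener of the latter kind (CAP_NET_RAW
 * is required):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 */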

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);