net/packet/af_packet.c

/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox        : verify_area() now used correctly
 *		Alan Cox        : new skbuff lists, look ma no backlogs!
 *		Alan Cox        : tidied skbuff lists.
 *		Alan Cox        : Now uses generic datagram routines I
 *		                  added. Also fixed the peek/read crash
 *		                  from all old Linux datagram code.
 *		Alan Cox        : Uses the improved datagram code.
 *		Alan Cox        : Added NULL's for socket options.
 *		Alan Cox        : Re-commented the code.
 *		Alan Cox        : Use new kernel side addressing
 *		Rob Janssen     : Correct MTU usage.
 *		Dave Platt      : Counter leaks caused by incorrect
 *		                  interrupt locking and some slightly
 *		                  dubious gcc output. Can you read
 *		                  compiler: it said _VOLATILE_
 *		Richard Kooijman: Timestamp fixes.
 *		Alan Cox        : New buffers. Use sk->mac.raw.
 *		Alan Cox        : sendmsg/recvmsg support.
 *		Alan Cox        : Protocol setting support
 *		Alexey Kuznetsov: Untied from IPv4 stack.
 *		Cyrus Durgin    : Fixed kerneld for kmod.
 *		Michal Ostrowski: Module initialization cleanup.
 *		Ulises Alonso   : Frame number limit removal and
 *		                  packet_set_ring memory leak.
 *		Eric Biederman  : Allow for > 8 byte hardware addresses.
 *		                  The convention is that longer addresses
 *		                  will simply extend the hardware address
 *		                  byte arrays at the end of sockaddr_ll
 *		                  and packet_mreq.
 *		Johann Baudy    : Added TX RING.
 *		Chetan Loke     : Implemented TPACKET_V3 block abstraction
 *		                  layer.
 *		                  Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     does not fit into the reserved space (tunnels); others are not (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

150/* Private packet socket structures. */
151
152/* identical to struct packet_mreq except it has
153 * a longer address field.
154 */
40d4e3df 155struct packet_mreq_max {
156 int mr_ifindex;
157 unsigned short mr_type;
158 unsigned short mr_alen;
159 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 160};
162union tpacket_uhdr {
163 struct tpacket_hdr *h1;
164 struct tpacket2_hdr *h2;
165 struct tpacket3_hdr *h3;
166 void *raw;
167};
168
f6fb8f10 169static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
170 int closing, int tx_ring);
171
f6fb8f10 172#define V3_ALIGNMENT (8)
173
bc59ba39 174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
f6fb8f10 179#define PGV_FROM_VMALLOC 1
f6fb8f10 181#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
182#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
183#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
184#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
185#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
186#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
187#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
188
189struct packet_sock;
190static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
191static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
192 struct packet_type *pt, struct net_device *orig_dev);
f6fb8f10 194static void *packet_previous_frame(struct packet_sock *po,
195 struct packet_ring_buffer *rb,
196 int status);
197static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 198static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
199 struct tpacket_block_desc *);
200static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 201 struct packet_sock *);
bc59ba39 202static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 203 struct packet_sock *, unsigned int status);
bc59ba39 204static int prb_queue_frozen(struct tpacket_kbdq_core *);
205static void prb_open_block(struct tpacket_kbdq_core *,
206 struct tpacket_block_desc *);
f6fb8f10 207static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 208static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
209static void prb_init_blk_timer(struct packet_sock *,
210 struct tpacket_kbdq_core *,
211 void (*func) (unsigned long));
212static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
213static void prb_clear_rxhash(struct tpacket_kbdq_core *,
214 struct tpacket3_hdr *);
215static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
216 struct tpacket3_hdr *);
217static void packet_flush_mclist(struct sock *sk);
218
ffbc6111 219struct packet_skb_cb {
220 union {
221 struct sockaddr_pkt pkt;
222 union {
223 /* Trick: alias skb original length with
224 * ll.sll_family and ll.protocol in order
225 * to save room.
226 */
227 unsigned int origlen;
228 struct sockaddr_ll ll;
229 };
230 } sa;
231};
232
233#define vio_le() virtio_legacy_is_little_endian()
234
ffbc6111 235#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
bc59ba39 237#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 238#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 239 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 240#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 241 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 242#define GET_NEXT_PRB_BLK_NUM(x) \
243 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
244 ((x)->kactive_blk_num+1) : 0)
245
246static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
247static void __fanout_link(struct sock *sk, struct packet_sock *po);
248
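/* Transmit the skb directly on its mapped device queue, bypassing the
 * qdisc layer. This is the po->xmit path used when the socket enables
 * PACKET_QDISC_BYPASS; the skb is freed (and tx_dropped incremented) if
 * the device is down or has no carrier.
 */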
249static int packet_direct_xmit(struct sk_buff *skb)
250{
251 struct net_device *dev = skb->dev;
252 netdev_features_t features;
253 struct netdev_queue *txq;
43279500 254 int ret = NETDEV_TX_BUSY;
255
256 if (unlikely(!netif_running(dev) ||
257 !netif_carrier_ok(dev)))
258 goto drop;
259
260 features = netif_skb_features(skb);
261 if (skb_needs_linearize(skb, features) &&
262 __skb_linearize(skb))
263 goto drop;
10c51b56 265 txq = skb_get_tx_queue(dev, skb);
267 local_bh_disable();
268
269 HARD_TX_LOCK(dev, txq, smp_processor_id());
10b3ad8c 270 if (!netif_xmit_frozen_or_drv_stopped(txq))
fa2dbdc2 271 ret = netdev_start_xmit(skb, dev, txq, false);
43279500 272 HARD_TX_UNLOCK(dev, txq);
274 local_bh_enable();
275
276 if (!dev_xmit_complete(ret))
d346a3fa 277 kfree_skb(skb);
d346a3fa 279 return ret;
43279500 280drop:
0f97ede4 281 atomic_long_inc(&dev->tx_dropped);
282 kfree_skb(skb);
283 return NET_XMIT_DROP;
284}
285
286static struct net_device *packet_cached_dev_get(struct packet_sock *po)
287{
288 struct net_device *dev;
289
290 rcu_read_lock();
291 dev = rcu_dereference(po->cached_dev);
292 if (likely(dev))
293 dev_hold(dev);
294 rcu_read_unlock();
295
296 return dev;
297}
298
299static void packet_cached_dev_assign(struct packet_sock *po,
300 struct net_device *dev)
301{
302 rcu_assign_pointer(po->cached_dev, dev);
303}
304
305static void packet_cached_dev_reset(struct packet_sock *po)
306{
307 RCU_INIT_POINTER(po->cached_dev, NULL);
308}
309
310static bool packet_use_direct_xmit(const struct packet_sock *po)
311{
312 return po->xmit == packet_direct_xmit;
313}
314
0fd5d57b 315static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 316{
1cbac010 317 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
318}
319
320static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
321{
322 const struct net_device_ops *ops = dev->netdev_ops;
323 u16 queue_index;
324
325 if (ops->ndo_select_queue) {
326 queue_index = ops->ndo_select_queue(dev, skb, NULL,
327 __packet_pick_tx_queue);
328 queue_index = netdev_cap_txqueue(dev, queue_index);
329 } else {
330 queue_index = __packet_pick_tx_queue(dev, skb);
331 }
332
333 skb_set_queue_mapping(skb, queue_index);
334}
335
336/* register_prot_hook must be invoked with the po->bind_lock held,
337 * or from a context in which asynchronous accesses to the packet
338 * socket is not possible (packet_create()).
339 */
340static void register_prot_hook(struct sock *sk)
341{
342 struct packet_sock *po = pkt_sk(sk);
ce06b03e 344 if (!po->running) {
66e56cd4 345 if (po->fanout)
dc99f600 346 __fanout_link(sk, po);
66e56cd4 347 else
dc99f600 348 dev_add_pack(&po->prot_hook);
350 sock_hold(sk);
351 po->running = 1;
352 }
353}
354
355/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
356 * held. If the sync parameter is true, we will temporarily drop
357 * the po->bind_lock and do a synchronize_net to make sure no
358 * asynchronous packet processing paths still refer to the elements
359 * of po->prot_hook. If the sync parameter is false, it is the
360 * callers responsibility to take care of this.
361 */
362static void __unregister_prot_hook(struct sock *sk, bool sync)
363{
364 struct packet_sock *po = pkt_sk(sk);
365
366 po->running = 0;
367
368 if (po->fanout)
dc99f600 369 __fanout_unlink(sk, po);
66e56cd4 370 else
dc99f600 371 __dev_remove_pack(&po->prot_hook);
373 __sock_put(sk);
374
375 if (sync) {
376 spin_unlock(&po->bind_lock);
377 synchronize_net();
378 spin_lock(&po->bind_lock);
379 }
380}
381
382static void unregister_prot_hook(struct sock *sk, bool sync)
383{
384 struct packet_sock *po = pkt_sk(sk);
385
386 if (po->running)
387 __unregister_prot_hook(sk, sync);
388}
389
6e58040b 390static inline struct page * __pure pgv_to_page(void *addr)
391{
392 if (is_vmalloc_addr(addr))
393 return vmalloc_to_page(addr);
394 return virt_to_page(addr);
395}
396
69e3c75f 397static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 398{
184f489e 399 union tpacket_uhdr h;
69e3c75f 401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
69e3c75f 404 h.h1->tp_status = status;
0af55bb5 405 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
406 break;
407 case TPACKET_V2:
69e3c75f 408 h.h2->tp_status = status;
0af55bb5 409 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 410 break;
f6fb8f10 411 case TPACKET_V3:
69e3c75f 412 default:
f6fb8f10 413 WARN(1, "TPACKET version not supported.\n");
69e3c75f 414 BUG();
bbd6ef87 415 }
416
417 smp_wmb();
418}
419
69e3c75f 420static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 421{
184f489e 422 union tpacket_uhdr h;
424 smp_rmb();
425
426 h.raw = frame;
427 switch (po->tp_version) {
428 case TPACKET_V1:
0af55bb5 429 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 430 return h.h1->tp_status;
bbd6ef87 431 case TPACKET_V2:
0af55bb5 432 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 433 return h.h2->tp_status;
f6fb8f10 434 case TPACKET_V3:
69e3c75f 435 default:
f6fb8f10 436 WARN(1, "TPACKET version not supported.\n");
437 BUG();
438 return 0;
bbd6ef87 439 }
1da177e4 440}
442static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
443 unsigned int flags)
444{
445 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
446
447 if (shhwtstamps &&
448 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
449 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
450 return TP_STATUS_TS_RAW_HARDWARE;
451
452 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 453 return TP_STATUS_TS_SOFTWARE;
b9c32fb2 455 return 0;
456}
457
458static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
459 struct sk_buff *skb)
460{
461 union tpacket_uhdr h;
462 struct timespec ts;
b9c32fb2 463 __u32 ts_status;
465 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
466 return 0;
467
468 h.raw = frame;
469 switch (po->tp_version) {
470 case TPACKET_V1:
471 h.h1->tp_sec = ts.tv_sec;
472 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
473 break;
474 case TPACKET_V2:
475 h.h2->tp_sec = ts.tv_sec;
476 h.h2->tp_nsec = ts.tv_nsec;
477 break;
478 case TPACKET_V3:
479 default:
480 WARN(1, "TPACKET version not supported.\n");
481 BUG();
482 }
483
484 /* one flush is safe, as both fields always lie on the same cacheline */
485 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
486 smp_wmb();
487
488 return ts_status;
489}
490
491static void *packet_lookup_frame(struct packet_sock *po,
492 struct packet_ring_buffer *rb,
493 unsigned int position,
494 int status)
495{
496 unsigned int pg_vec_pos, frame_offset;
184f489e 497 union tpacket_uhdr h;
498
499 pg_vec_pos = position / rb->frames_per_block;
500 frame_offset = position % rb->frames_per_block;
501
502 h.raw = rb->pg_vec[pg_vec_pos].buffer +
503 (frame_offset * rb->frame_size);
504
505 if (status != __packet_get_status(po, h.raw))
506 return NULL;
507
508 return h.raw;
509}
510
eea49cc9 511static void *packet_current_frame(struct packet_sock *po,
512 struct packet_ring_buffer *rb,
513 int status)
514{
515 return packet_lookup_frame(po, rb, rb->head, status);
516}
517
bc59ba39 518static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 519{
520 del_timer_sync(&pkc->retire_blk_timer);
521}
522
523static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 524 struct sk_buff_head *rb_queue)
525{
bc59ba39 526 struct tpacket_kbdq_core *pkc;
f6fb8f10 527
73d0fcf2 528 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 529
ec6f809f 530 spin_lock_bh(&rb_queue->lock);
f6fb8f10 531 pkc->delete_blk_timer = 1;
ec6f809f 532 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 533
534 prb_del_retire_blk_timer(pkc);
535}
536
537static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 538 struct tpacket_kbdq_core *pkc,
f6fb8f10 539 void (*func) (unsigned long))
540{
541 init_timer(&pkc->retire_blk_timer);
542 pkc->retire_blk_timer.data = (long)po;
543 pkc->retire_blk_timer.function = func;
544 pkc->retire_blk_timer.expires = jiffies;
545}
546
e8e85cc5 547static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 548{
bc59ba39 549 struct tpacket_kbdq_core *pkc;
f6fb8f10 550
e8e85cc5 551 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 552 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
553}
554
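/* Derive a TPACKET_V3 block retire timeout from the link speed reported by
 * ethtool: roughly the time, in msecs, needed to fill one block at line
 * rate. Falls back to DEFAULT_PRB_RETIRE_TOV when the device cannot be
 * found, or the link is slower than 1Gb/s or of unknown speed.
 */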
555static int prb_calc_retire_blk_tmo(struct packet_sock *po,
556 int blk_size_in_bytes)
557{
558 struct net_device *dev;
559 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 560 struct ethtool_link_ksettings ecmd;
4bc71cb9 561 int err;
563 rtnl_lock();
564 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
565 if (unlikely(!dev)) {
566 rtnl_unlock();
f6fb8f10 567 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 568 }
7cad1bac 569 err = __ethtool_get_link_ksettings(dev, &ecmd);
570 rtnl_unlock();
571 if (!err) {
572 /*
573 * If the link speed is so slow you don't really
574 * need to worry about perf anyways
575 */
576 if (ecmd.base.speed < SPEED_1000 ||
577 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 578 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 579 } else {
580 msec = 1;
7cad1bac 581 div = ecmd.base.speed / 1000;
f6fb8f10 582 }
583 }
584
585 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
586
587 if (div)
588 mbits /= div;
589
590 tmo = mbits * msec;
591
592 if (div)
593 return tmo+1;
594 return tmo;
595}
596
bc59ba39 597static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 598 union tpacket_req_u *req_u)
599{
600 p1->feature_req_word = req_u->req3.tp_feature_req_word;
601}
602
603static void init_prb_bdqc(struct packet_sock *po,
604 struct packet_ring_buffer *rb,
605 struct pgv *pg_vec,
e8e85cc5 606 union tpacket_req_u *req_u)
f6fb8f10 607{
22781a5b 608 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 609 struct tpacket_block_desc *pbd;
f6fb8f10 610
611 memset(p1, 0x0, sizeof(*p1));
612
613 p1->knxt_seq_num = 1;
614 p1->pkbdq = pg_vec;
bc59ba39 615 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 616 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 617 p1->kblk_size = req_u->req3.tp_block_size;
618 p1->knum_blocks = req_u->req3.tp_block_nr;
619 p1->hdrlen = po->tp_hdrlen;
620 p1->version = po->tp_version;
621 p1->last_kactive_blk_num = 0;
ee80fbf3 622 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 623 if (req_u->req3.tp_retire_blk_tov)
624 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
625 else
626 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
627 req_u->req3.tp_block_size);
628 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
629 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
630
dc808110 631 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 632 prb_init_ft_ops(p1, req_u);
e8e85cc5 633 prb_setup_retire_blk_timer(po);
f6fb8f10 634 prb_open_block(p1, pbd);
635}
636
637/* Do NOT update the last_blk_num first.
638 * Assumes sk_buff_head lock is held.
639 */
bc59ba39 640static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 641{
642 mod_timer(&pkc->retire_blk_timer,
643 jiffies + pkc->tov_in_jiffies);
644 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
645}
646
647/*
648 * Timer logic:
649 * 1) We refresh the timer only when we open a block.
650 * By doing this we don't waste cycles refreshing the timer
651 * on packet-by-packet basis.
652 *
653 * With a 1MB block-size, on a 1Gbps line, it will take
654 * i) ~8 ms to fill a block + ii) memcpy etc.
655 * In this cut we are not accounting for the memcpy time.
656 *
657 * So, if the user sets the 'tmo' to 10ms then the timer
658 * will never fire while the block is still getting filled
659 * (which is what we want). However, the user could choose
660 * to close a block early and that's fine.
661 *
662 * But when the timer does fire, we check whether or not to refresh it.
663 * Since the tmo granularity is in msecs, it is not too expensive
664 * to refresh the timer, lets say every '8' msecs.
665 * Either the user can set the 'tmo' or we can derive it based on
666 * a) line-speed and b) block-size.
667 * prb_calc_retire_blk_tmo() calculates the tmo.
668 *
669 */
670static void prb_retire_rx_blk_timer_expired(unsigned long data)
671{
672 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 673 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 674 unsigned int frozen;
bc59ba39 675 struct tpacket_block_desc *pbd;
f6fb8f10 676
677 spin_lock(&po->sk.sk_receive_queue.lock);
678
679 frozen = prb_queue_frozen(pkc);
680 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
681
682 if (unlikely(pkc->delete_blk_timer))
683 goto out;
684
685 /* We only need to plug the race when the block is partially filled.
686 * tpacket_rcv:
687 * lock(); increment BLOCK_NUM_PKTS; unlock()
688 * copy_bits() is in progress ...
689 * timer fires on other cpu:
690 * we can't retire the current block because copy_bits
691 * is in progress.
692 *
693 */
694 if (BLOCK_NUM_PKTS(pbd)) {
695 while (atomic_read(&pkc->blk_fill_in_prog)) {
696 /* Waiting for skb_copy_bits to finish... */
697 cpu_relax();
698 }
699 }
700
701 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
702 if (!frozen) {
703 if (!BLOCK_NUM_PKTS(pbd)) {
704 /* An empty block. Just refresh the timer. */
705 goto refresh_timer;
706 }
f6fb8f10 707 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
708 if (!prb_dispatch_next_block(pkc, po))
709 goto refresh_timer;
710 else
711 goto out;
712 } else {
713 /* Case 1. Queue was frozen because user-space was
714 * lagging behind.
715 */
716 if (prb_curr_blk_in_use(pkc, pbd)) {
717 /*
718 * Ok, user-space is still behind.
719 * So just refresh the timer.
720 */
721 goto refresh_timer;
722 } else {
 723 /* Case 2. Queue was frozen, user-space caught up,
 724 * now the link went idle && the timer fired.
 725 * We don't have a block to close. So we open this
 726 * block and restart the timer.
 727 * Opening a block thaws the queue and restarts the timer;
 728 * thawing/timer-refresh is a side effect.
729 */
730 prb_open_block(pkc, pbd);
731 goto out;
732 }
733 }
734 }
735
736refresh_timer:
737 _prb_refresh_rx_retire_blk_timer(pkc);
738
739out:
740 spin_unlock(&po->sk.sk_receive_queue.lock);
741}
742
eea49cc9 743static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 744 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 745{
746 /* Flush everything minus the block header */
747
748#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
749 u8 *start, *end;
750
751 start = (u8 *)pbd1;
752
753 /* Skip the block header(we know header WILL fit in 4K) */
754 start += PAGE_SIZE;
755
756 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
757 for (; start < end; start += PAGE_SIZE)
758 flush_dcache_page(pgv_to_page(start));
759
760 smp_wmb();
761#endif
762
763 /* Now update the block status. */
764
765 BLOCK_STATUS(pbd1) = status;
766
767 /* Flush the block header */
768
769#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
770 start = (u8 *)pbd1;
771 flush_dcache_page(pgv_to_page(start));
772
773 smp_wmb();
774#endif
775}
776
777/*
778 * Side effect:
779 *
780 * 1) flush the block
781 * 2) Increment active_blk_num
782 *
783 * Note:We DONT refresh the timer on purpose.
784 * Because almost always the next block will be opened.
785 */
bc59ba39 786static void prb_close_block(struct tpacket_kbdq_core *pkc1,
787 struct tpacket_block_desc *pbd1,
f6fb8f10 788 struct packet_sock *po, unsigned int stat)
789{
790 __u32 status = TP_STATUS_USER | stat;
791
792 struct tpacket3_hdr *last_pkt;
bc59ba39 793 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 794 struct sock *sk = &po->sk;
f6fb8f10 795
ee80fbf3 796 if (po->stats.stats3.tp_drops)
f6fb8f10 797 status |= TP_STATUS_LOSING;
798
799 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
800 last_pkt->tp_next_offset = 0;
801
802 /* Get the ts of the last pkt */
803 if (BLOCK_NUM_PKTS(pbd1)) {
804 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
805 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
806 } else {
807 /* Ok, we tmo'd - so get the current time.
808 *
809 * It shouldn't really happen as we don't close empty
810 * blocks. See prb_retire_rx_blk_timer_expired().
811 */
f6fb8f10 812 struct timespec ts;
813 getnstimeofday(&ts);
814 h1->ts_last_pkt.ts_sec = ts.tv_sec;
815 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
816 }
817
818 smp_wmb();
819
820 /* Flush the block */
821 prb_flush_block(pkc1, pbd1, status);
822
823 sk->sk_data_ready(sk);
824
f6fb8f10 825 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
826}
827
eea49cc9 828static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 829{
830 pkc->reset_pending_on_curr_blk = 0;
831}
832
833/*
834 * Side effect of opening a block:
835 *
836 * 1) prb_queue is thawed.
837 * 2) retire_blk_timer is refreshed.
838 *
839 */
bc59ba39 840static void prb_open_block(struct tpacket_kbdq_core *pkc1,
841 struct tpacket_block_desc *pbd1)
f6fb8f10 842{
843 struct timespec ts;
bc59ba39 844 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 845
846 smp_rmb();
847
848 /* We could have just memset this but we will lose the
849 * flexibility of making the priv area sticky
850 */
852 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
853 BLOCK_NUM_PKTS(pbd1) = 0;
854 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
856 getnstimeofday(&ts);
857
858 h1->ts_first_pkt.ts_sec = ts.tv_sec;
859 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
861 pkc1->pkblk_start = (char *)pbd1;
862 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
863
864 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
865 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
866
867 pbd1->version = pkc1->version;
868 pkc1->prev = pkc1->nxt_offset;
869 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
870
871 prb_thaw_queue(pkc1);
872 _prb_refresh_rx_retire_blk_timer(pkc1);
873
874 smp_wmb();
f6fb8f10 875}
876
877/*
878 * Queue freeze logic:
879 * 1) Assume tp_block_nr = 8 blocks.
880 * 2) At time 't0', user opens Rx ring.
881 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
882 * 4) user-space is either sleeping or processing block '0'.
883 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
884 * it will close block-7,loop around and try to fill block '0'.
885 * call-flow:
886 * __packet_lookup_frame_in_block
887 * prb_retire_current_block()
888 * prb_dispatch_next_block()
889 * |->(BLOCK_STATUS == USER) evaluates to true
890 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
891 * 6) Now there are two cases:
892 * 6.1) Link goes idle right after the queue is frozen.
893 * But remember, the last open_block() refreshed the timer.
894 * When this timer expires,it will refresh itself so that we can
895 * re-open block-0 in near future.
896 * 6.2) Link is busy and keeps on receiving packets. This is a simple
897 * case and __packet_lookup_frame_in_block will check if block-0
898 * is free and can now be re-used.
899 */
eea49cc9 900static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 901 struct packet_sock *po)
902{
903 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 904 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 905}
906
907#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
908
909/*
910 * If the next block is free then we will dispatch it
911 * and return a good offset.
912 * Else, we will freeze the queue.
913 * So, caller must check the return value.
914 */
bc59ba39 915static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 916 struct packet_sock *po)
917{
bc59ba39 918 struct tpacket_block_desc *pbd;
f6fb8f10 919
920 smp_rmb();
921
922 /* 1. Get current block num */
923 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
924
925 /* 2. If this block is currently in_use then freeze the queue */
926 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
927 prb_freeze_queue(pkc, po);
928 return NULL;
929 }
930
931 /*
932 * 3.
933 * open this block and return the offset where the first packet
934 * needs to get stored.
935 */
936 prb_open_block(pkc, pbd);
937 return (void *)pkc->nxt_offset;
938}
939
bc59ba39 940static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 941 struct packet_sock *po, unsigned int status)
942{
bc59ba39 943 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 944
945 /* retire/close the current block */
946 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
947 /*
948 * Plug the case where copy_bits() is in progress on
949 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
950 * have space to copy the pkt in the current block and
951 * called prb_retire_current_block()
952 *
953 * We don't need to worry about the TMO case because
954 * the timer-handler already handled this case.
955 */
956 if (!(status & TP_STATUS_BLK_TMO)) {
957 while (atomic_read(&pkc->blk_fill_in_prog)) {
958 /* Waiting for skb_copy_bits to finish... */
959 cpu_relax();
960 }
961 }
962 prb_close_block(pkc, pbd, po, status);
963 return;
964 }
f6fb8f10 965}
966
eea49cc9 967static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 968 struct tpacket_block_desc *pbd)
f6fb8f10 969{
970 return TP_STATUS_USER & BLOCK_STATUS(pbd);
971}
972
eea49cc9 973static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 974{
975 return pkc->reset_pending_on_curr_blk;
976}
977
eea49cc9 978static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 979{
bc59ba39 980 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 981 atomic_dec(&pkc->blk_fill_in_prog);
982}
983
eea49cc9 984static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 985 struct tpacket3_hdr *ppd)
986{
3958afa1 987 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 988}
989
eea49cc9 990static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 991 struct tpacket3_hdr *ppd)
992{
993 ppd->hv1.tp_rxhash = 0;
994}
995
eea49cc9 996static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 997 struct tpacket3_hdr *ppd)
998{
999 if (skb_vlan_tag_present(pkc->skb)) {
1000 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1001 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1002 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 1003 } else {
9e67030a 1004 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 1005 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 1006 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 1007 }
1008}
1009
bc59ba39 1010static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 1011 struct tpacket3_hdr *ppd)
1012{
a0cdfcf3 1013 ppd->hv1.tp_padding = 0;
f6fb8f10 1014 prb_fill_vlan_info(pkc, ppd);
1015
1016 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1017 prb_fill_rxhash(pkc, ppd);
1018 else
1019 prb_clear_rxhash(pkc, ppd);
1020}
1021
eea49cc9 1022static void prb_fill_curr_block(char *curr,
bc59ba39 1023 struct tpacket_kbdq_core *pkc,
1024 struct tpacket_block_desc *pbd,
f6fb8f10 1025 unsigned int len)
1026{
1027 struct tpacket3_hdr *ppd;
1028
1029 ppd = (struct tpacket3_hdr *)curr;
1030 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1031 pkc->prev = curr;
1032 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1033 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1034 BLOCK_NUM_PKTS(pbd) += 1;
1035 atomic_inc(&pkc->blk_fill_in_prog);
1036 prb_run_all_ft_ops(pkc, ppd);
1037}
1038
1039/* Assumes caller has the sk->rx_queue.lock */
1040static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1041 struct sk_buff *skb,
1042 int status,
1043 unsigned int len
1044 )
1045{
bc59ba39 1046 struct tpacket_kbdq_core *pkc;
1047 struct tpacket_block_desc *pbd;
f6fb8f10 1048 char *curr, *end;
1049
e3192690 1050 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1051 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1052
1053 /* Queue is frozen when user space is lagging behind */
1054 if (prb_queue_frozen(pkc)) {
1055 /*
1056 * Check if that last block which caused the queue to freeze,
1057 * is still in_use by user-space.
1058 */
1059 if (prb_curr_blk_in_use(pkc, pbd)) {
1060 /* Can't record this packet */
1061 return NULL;
1062 } else {
1063 /*
1064 * Ok, the block was released by user-space.
1065 * Now let's open that block.
1066 * opening a block also thaws the queue.
1067 * Thawing is a side effect.
1068 */
1069 prb_open_block(pkc, pbd);
1070 }
1071 }
1072
1073 smp_mb();
1074 curr = pkc->nxt_offset;
1075 pkc->skb = skb;
e3192690 1076 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1077
1078 /* first try the current block */
1079 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1080 prb_fill_curr_block(curr, pkc, pbd, len);
1081 return (void *)curr;
1082 }
1083
1084 /* Ok, close the current block */
1085 prb_retire_current_block(pkc, po, 0);
1086
1087 /* Now, try to dispatch the next block */
1088 curr = (char *)prb_dispatch_next_block(pkc, po);
1089 if (curr) {
1090 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1091 prb_fill_curr_block(curr, pkc, pbd, len);
1092 return (void *)curr;
1093 }
1094
1095 /*
1096 * No free blocks are available.user_space hasn't caught up yet.
1097 * Queue was just frozen and now this packet will get dropped.
1098 */
1099 return NULL;
1100}
1101
eea49cc9 1102static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1103 struct sk_buff *skb,
1104 int status, unsigned int len)
1105{
1106 char *curr = NULL;
1107 switch (po->tp_version) {
1108 case TPACKET_V1:
1109 case TPACKET_V2:
1110 curr = packet_lookup_frame(po, &po->rx_ring,
1111 po->rx_ring.head, status);
1112 return curr;
1113 case TPACKET_V3:
1114 return __packet_lookup_frame_in_block(po, skb, status, len);
1115 default:
1116 WARN(1, "TPACKET version not supported\n");
1117 BUG();
99aa3473 1118 return NULL;
f6fb8f10 1119 }
1120}
1121
eea49cc9 1122static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1123 struct packet_ring_buffer *rb,
77f65ebd 1124 unsigned int idx,
f6fb8f10 1125 int status)
1126{
bc59ba39 1127 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1128 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1129
1130 if (status != BLOCK_STATUS(pbd))
1131 return NULL;
1132 return pbd;
1133}
1134
eea49cc9 1135static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1136{
1137 unsigned int prev;
1138 if (rb->prb_bdqc.kactive_blk_num)
1139 prev = rb->prb_bdqc.kactive_blk_num-1;
1140 else
1141 prev = rb->prb_bdqc.knum_blocks-1;
1142 return prev;
1143}
1144
1145/* Assumes caller has held the rx_queue.lock */
eea49cc9 1146static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1147 struct packet_ring_buffer *rb,
1148 int status)
1149{
1150 unsigned int previous = prb_previous_blk_num(rb);
1151 return prb_lookup_block(po, rb, previous, status);
1152}
1153
eea49cc9 1154static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1155 struct packet_ring_buffer *rb,
1156 int status)
1157{
1158 if (po->tp_version <= TPACKET_V2)
1159 return packet_previous_frame(po, rb, status);
1160
1161 return __prb_previous_block(po, rb, status);
1162}
1163
eea49cc9 1164static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1165 struct packet_ring_buffer *rb)
1166{
1167 switch (po->tp_version) {
1168 case TPACKET_V1:
1169 case TPACKET_V2:
1170 return packet_increment_head(rb);
1171 case TPACKET_V3:
1172 default:
1173 WARN(1, "TPACKET version not supported.\n");
1174 BUG();
1175 return;
1176 }
1177}
1178
eea49cc9 1179static void *packet_previous_frame(struct packet_sock *po,
1180 struct packet_ring_buffer *rb,
1181 int status)
1182{
1183 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1184 return packet_lookup_frame(po, rb, previous, status);
1185}
1186
eea49cc9 1187static void packet_increment_head(struct packet_ring_buffer *buff)
1188{
1189 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1190}
1191
1192static void packet_inc_pending(struct packet_ring_buffer *rb)
1193{
1194 this_cpu_inc(*rb->pending_refcnt);
1195}
1196
1197static void packet_dec_pending(struct packet_ring_buffer *rb)
1198{
1199 this_cpu_dec(*rb->pending_refcnt);
1200}
1201
1202static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1203{
1204 unsigned int refcnt = 0;
1205 int cpu;
1206
1207 /* We don't use pending refcount in rx_ring. */
1208 if (rb->pending_refcnt == NULL)
1209 return 0;
1210
1211 for_each_possible_cpu(cpu)
1212 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1213
1214 return refcnt;
1215}
1216
1217static int packet_alloc_pending(struct packet_sock *po)
1218{
1219 po->rx_ring.pending_refcnt = NULL;
1220
1221 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1222 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1223 return -ENOBUFS;
1224
1225 return 0;
1226}
1227
1228static void packet_free_pending(struct packet_sock *po)
1229{
1230 free_percpu(po->tx_ring.pending_refcnt);
1231}
1232
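/* Receive-ring fill levels reported by __packet_rcv_has_room():
 * ROOM_NORMAL - more than 1/(2^ROOM_POW_OFF) of the ring (or rcvbuf) free,
 * ROOM_LOW    - some space left, but less than that,
 * ROOM_NONE   - no free frame/block (or rcvbuf space) available.
 */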
1233#define ROOM_POW_OFF 2
1234#define ROOM_NONE 0x0
1235#define ROOM_LOW 0x1
1236#define ROOM_NORMAL 0x2
1237
1238static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1239{
1240 int idx, len;
1241
1242 len = po->rx_ring.frame_max + 1;
1243 idx = po->rx_ring.head;
1244 if (pow_off)
1245 idx += len >> pow_off;
1246 if (idx >= len)
1247 idx -= len;
1248 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1249}
1250
1251static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1252{
1253 int idx, len;
1254
1255 len = po->rx_ring.prb_bdqc.knum_blocks;
1256 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1257 if (pow_off)
1258 idx += len >> pow_off;
1259 if (idx >= len)
1260 idx -= len;
1261 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1262}
2ccdbaa6 1264static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1265{
1266 struct sock *sk = &po->sk;
1267 int ret = ROOM_NONE;
1268
1269 if (po->prot_hook.func != tpacket_rcv) {
1270 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1271 - (skb ? skb->truesize : 0);
1272 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1273 return ROOM_NORMAL;
1274 else if (avail > 0)
1275 return ROOM_LOW;
1276 else
1277 return ROOM_NONE;
1278 }
1280 if (po->tp_version == TPACKET_V3) {
1281 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1282 ret = ROOM_NORMAL;
1283 else if (__tpacket_v3_has_room(po, 0))
1284 ret = ROOM_LOW;
1285 } else {
1286 if (__tpacket_has_room(po, ROOM_POW_OFF))
1287 ret = ROOM_NORMAL;
1288 else if (__tpacket_has_room(po, 0))
1289 ret = ROOM_LOW;
1290 }
1291
1292 return ret;
1293}
1294
1295static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 int ret;
1298 bool has_room;
1299
1300 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1301 ret = __packet_rcv_has_room(po, skb);
1302 has_room = ret == ROOM_NORMAL;
1303 if (po->pressure == has_room)
1304 po->pressure = !has_room;
1305 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
9954729b 1307 return ret;
1308}
1309
1310static void packet_sock_destruct(struct sock *sk)
1311{
1312 skb_queue_purge(&sk->sk_error_queue);
1313
1314 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1315 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1316
1317 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1318 pr_err("Attempt to release alive packet socket: %p\n", sk);
1319 return;
1320 }
1321
17ab56a2 1322 sk_refcnt_debug_dec(sk);
1323}
1324
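/* Rollover heuristic: remember the rxhash of recent packets and report the
 * current flow as "huge" when more than half of the last ROLLOVER_HLEN
 * packets carried the same hash. Under ROOM_LOW, only such dominant flows
 * are rolled over to another fanout member.
 */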
1325static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1326{
1327 u32 rxhash;
1328 int i, count = 0;
1329
1330 rxhash = skb_get_hash(skb);
1331 for (i = 0; i < ROLLOVER_HLEN; i++)
1332 if (po->rollover->history[i] == rxhash)
1333 count++;
1334
1335 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1336 return count > (ROLLOVER_HLEN >> 1);
1337}
1338
1339static unsigned int fanout_demux_hash(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int num)
dc99f600 1342{
61b905da 1343 return reciprocal_scale(skb_get_hash(skb), num);
1344}
1345
1346static unsigned int fanout_demux_lb(struct packet_fanout *f,
1347 struct sk_buff *skb,
1348 unsigned int num)
dc99f600 1349{
468479e6 1350 unsigned int val = atomic_inc_return(&f->rr_cur);
468479e6 1352 return val % num;
1353}
1354
1355static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int num)
1358{
1359 return smp_processor_id() % num;
1360}
1361
1362static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365{
f337db64 1366 return prandom_u32_max(num);
1367}
1368
1369static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1370 struct sk_buff *skb,
ad377cab 1371 unsigned int idx, bool try_self,
77f65ebd 1372 unsigned int num)
95ec3eb4 1373{
4633c9e0 1374 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1375 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1376
0648ab70 1377 po = pkt_sk(f->arr[idx]);
1378
1379 if (try_self) {
1380 room = packet_rcv_has_room(po, skb);
1381 if (room == ROOM_NORMAL ||
1382 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1383 return idx;
4633c9e0 1384 po_skip = po;
3b3a5b0a 1385 }
ad377cab 1386
0648ab70 1387 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1388 do {
2ccdbaa6 1389 po_next = pkt_sk(f->arr[i]);
4633c9e0 1390 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1391 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1392 if (i != j)
0648ab70 1393 po->rollover->sock = i;
1394 atomic_long_inc(&po->rollover->num);
1395 if (room == ROOM_LOW)
1396 atomic_long_inc(&po->rollover->num_huge);
1397 return i;
1398 }
1400 if (++i == num)
1401 i = 0;
1402 } while (i != j);
1403
a9b63918 1404 atomic_long_inc(&po->rollover->num_failed);
1405 return idx;
1406}
1407
1408static unsigned int fanout_demux_qm(struct packet_fanout *f,
1409 struct sk_buff *skb,
1410 unsigned int num)
1411{
1412 return skb_get_queue_mapping(skb) % num;
1413}
1414
1415static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1416 struct sk_buff *skb,
1417 unsigned int num)
1418{
1419 struct bpf_prog *prog;
1420 unsigned int ret = 0;
1421
1422 rcu_read_lock();
1423 prog = rcu_dereference(f->bpf_prog);
1424 if (prog)
ff936a04 1425 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1426 rcu_read_unlock();
1427
1428 return ret;
1429}
1430
1431static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1432{
1433 return f->flags & (flag >> 8);
1434}
1435
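/* Receive handler shared by every member of a fanout group: select one
 * member socket according to the group's demux mode (hash, lb, cpu, rnd,
 * qm, rollover or BPF) and hand the skb to that socket's prot_hook.
 */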
1436static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1437 struct packet_type *pt, struct net_device *orig_dev)
1438{
1439 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1440 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1441 struct net *net = read_pnet(&f->net);
dc99f600 1442 struct packet_sock *po;
77f65ebd 1443 unsigned int idx;
dc99f600 1444
19bcf9f2 1445 if (!net_eq(dev_net(dev), net) || !num) {
1446 kfree_skb(skb);
1447 return 0;
1448 }
1449
3f34b24a 1450 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1451 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1452 if (!skb)
1453 return 0;
1454 }
1455 switch (f->type) {
1456 case PACKET_FANOUT_HASH:
1457 default:
77f65ebd 1458 idx = fanout_demux_hash(f, skb, num);
1459 break;
1460 case PACKET_FANOUT_LB:
77f65ebd 1461 idx = fanout_demux_lb(f, skb, num);
1462 break;
1463 case PACKET_FANOUT_CPU:
1464 idx = fanout_demux_cpu(f, skb, num);
1465 break;
1466 case PACKET_FANOUT_RND:
1467 idx = fanout_demux_rnd(f, skb, num);
1468 break;
1469 case PACKET_FANOUT_QM:
1470 idx = fanout_demux_qm(f, skb, num);
1471 break;
77f65ebd 1472 case PACKET_FANOUT_ROLLOVER:
ad377cab 1473 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1474 break;
47dceb8e 1475 case PACKET_FANOUT_CBPF:
f2e52095 1476 case PACKET_FANOUT_EBPF:
1477 idx = fanout_demux_bpf(f, skb, num);
1478 break;
1479 }
1480
1481 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1482 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1483
ad377cab 1484 po = pkt_sk(f->arr[idx]);
1485 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1486}
1487
1488DEFINE_MUTEX(fanout_mutex);
1489EXPORT_SYMBOL_GPL(fanout_mutex);
1490static LIST_HEAD(fanout_list);
1491
1492static void __fanout_link(struct sock *sk, struct packet_sock *po)
1493{
1494 struct packet_fanout *f = po->fanout;
1495
1496 spin_lock(&f->lock);
1497 f->arr[f->num_members] = sk;
1498 smp_wmb();
1499 f->num_members++;
1500 spin_unlock(&f->lock);
1501}
1502
1503static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1504{
1505 struct packet_fanout *f = po->fanout;
1506 int i;
1507
1508 spin_lock(&f->lock);
1509 for (i = 0; i < f->num_members; i++) {
1510 if (f->arr[i] == sk)
1511 break;
1512 }
1513 BUG_ON(i >= f->num_members);
1514 f->arr[i] = f->arr[f->num_members - 1];
1515 f->num_members--;
1516 spin_unlock(&f->lock);
1517}
1518
d4dd8aee 1519static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1520{
1521 if (sk->sk_family != PF_PACKET)
1522 return false;
c0de08d0 1523
161642e2 1524 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1525}
1526
1527static void fanout_init_data(struct packet_fanout *f)
1528{
1529 switch (f->type) {
1530 case PACKET_FANOUT_LB:
1531 atomic_set(&f->rr_cur, 0);
1532 break;
1533 case PACKET_FANOUT_CBPF:
f2e52095 1534 case PACKET_FANOUT_EBPF:
1535 RCU_INIT_POINTER(f->bpf_prog, NULL);
1536 break;
1537 }
1538}
1539
1540static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1541{
1542 struct bpf_prog *old;
1543
1544 spin_lock(&f->lock);
1545 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1546 rcu_assign_pointer(f->bpf_prog, new);
1547 spin_unlock(&f->lock);
1548
1549 if (old) {
1550 synchronize_net();
1551 bpf_prog_destroy(old);
1552 }
1553}
1554
1555static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1556 unsigned int len)
1557{
1558 struct bpf_prog *new;
1559 struct sock_fprog fprog;
1560 int ret;
1561
1562 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1563 return -EPERM;
1564 if (len != sizeof(fprog))
1565 return -EINVAL;
1566 if (copy_from_user(&fprog, data, len))
1567 return -EFAULT;
1568
bab18991 1569 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1570 if (ret)
1571 return ret;
1572
1573 __fanout_set_data_bpf(po->fanout, new);
1574 return 0;
1575}
1576
1577static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1578 unsigned int len)
1579{
1580 struct bpf_prog *new;
1581 u32 fd;
1582
1583 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1584 return -EPERM;
1585 if (len != sizeof(fd))
1586 return -EINVAL;
1587 if (copy_from_user(&fd, data, len))
1588 return -EFAULT;
1589
1590 new = bpf_prog_get(fd);
1591 if (IS_ERR(new))
1592 return PTR_ERR(new);
1593 if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
1594 bpf_prog_put(new);
1595 return -EINVAL;
1596 }
1597
1598 __fanout_set_data_bpf(po->fanout, new);
1599 return 0;
1600}
1601
1602static int fanout_set_data(struct packet_sock *po, char __user *data,
1603 unsigned int len)
1604{
1605 switch (po->fanout->type) {
1606 case PACKET_FANOUT_CBPF:
1607 return fanout_set_data_cbpf(po, data, len);
1608 case PACKET_FANOUT_EBPF:
1609 return fanout_set_data_ebpf(po, data, len);
1610 default:
1611 return -EINVAL;
1612 };
1613}
1614
1615static void fanout_release_data(struct packet_fanout *f)
1616{
1617 switch (f->type) {
1618 case PACKET_FANOUT_CBPF:
f2e52095 1619 case PACKET_FANOUT_EBPF:
1620 __fanout_set_data_bpf(f, NULL);
1621 };
1622}
1623
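/* Join (or create) fanout group <id> in this socket's netns; called from
 * setsockopt(PACKET_FANOUT). On success the socket's private prot_hook is
 * removed and the socket is linked into the group via __fanout_link().
 */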
7736d33f 1624static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1625{
1626 struct packet_sock *po = pkt_sk(sk);
1627 struct packet_fanout *f, *match;
7736d33f 1628 u8 type = type_flags & 0xff;
77f65ebd 1629 u8 flags = type_flags >> 8;
1630 int err;
1631
1632 switch (type) {
1633 case PACKET_FANOUT_ROLLOVER:
1634 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1635 return -EINVAL;
1636 case PACKET_FANOUT_HASH:
1637 case PACKET_FANOUT_LB:
95ec3eb4 1638 case PACKET_FANOUT_CPU:
5df0ddfb 1639 case PACKET_FANOUT_RND:
2d36097d 1640 case PACKET_FANOUT_QM:
47dceb8e 1641 case PACKET_FANOUT_CBPF:
f2e52095 1642 case PACKET_FANOUT_EBPF:
1643 break;
1644 default:
1645 return -EINVAL;
1646 }
1647
1648 if (!po->running)
1649 return -EINVAL;
1650
1651 if (po->fanout)
1652 return -EALREADY;
1653
1654 if (type == PACKET_FANOUT_ROLLOVER ||
1655 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1656 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1657 if (!po->rollover)
1658 return -ENOMEM;
1659 atomic_long_set(&po->rollover->num, 0);
1660 atomic_long_set(&po->rollover->num_huge, 0);
1661 atomic_long_set(&po->rollover->num_failed, 0);
1662 }
1663
1664 mutex_lock(&fanout_mutex);
1665 match = NULL;
1666 list_for_each_entry(f, &fanout_list, list) {
1667 if (f->id == id &&
1668 read_pnet(&f->net) == sock_net(sk)) {
1669 match = f;
1670 break;
1671 }
1672 }
afe62c68 1673 err = -EINVAL;
77f65ebd 1674 if (match && match->flags != flags)
afe62c68 1675 goto out;
dc99f600 1676 if (!match) {
afe62c68 1677 err = -ENOMEM;
dc99f600 1678 match = kzalloc(sizeof(*match), GFP_KERNEL);
1679 if (!match)
1680 goto out;
1681 write_pnet(&match->net, sock_net(sk));
1682 match->id = id;
1683 match->type = type;
77f65ebd 1684 match->flags = flags;
1685 INIT_LIST_HEAD(&match->list);
1686 spin_lock_init(&match->lock);
1687 atomic_set(&match->sk_ref, 0);
47dceb8e 1688 fanout_init_data(match);
1689 match->prot_hook.type = po->prot_hook.type;
1690 match->prot_hook.dev = po->prot_hook.dev;
1691 match->prot_hook.func = packet_rcv_fanout;
1692 match->prot_hook.af_packet_priv = match;
c0de08d0 1693 match->prot_hook.id_match = match_fanout_group;
1694 dev_add_pack(&match->prot_hook);
1695 list_add(&match->list, &fanout_list);
dc99f600 1696 }
1697 err = -EINVAL;
1698 if (match->type == type &&
1699 match->prot_hook.type == po->prot_hook.type &&
1700 match->prot_hook.dev == po->prot_hook.dev) {
1701 err = -ENOSPC;
1702 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1703 __dev_remove_pack(&po->prot_hook);
1704 po->fanout = match;
1705 atomic_inc(&match->sk_ref);
1706 __fanout_link(sk, po);
1707 err = 0;
1708 }
1709 }
afe62c68 1710out:
dc99f600 1711 mutex_unlock(&fanout_mutex);
1712 if (err) {
1713 kfree(po->rollover);
1714 po->rollover = NULL;
1715 }
1716 return err;
1717}
1718
1719static void fanout_release(struct sock *sk)
1720{
1721 struct packet_sock *po = pkt_sk(sk);
1722 struct packet_fanout *f;
1723
1724 f = po->fanout;
1725 if (!f)
1726 return;
1727
fff3321d 1728 mutex_lock(&fanout_mutex);
1729 po->fanout = NULL;
1730
1731 if (atomic_dec_and_test(&f->sk_ref)) {
1732 list_del(&f->list);
1733 dev_remove_pack(&f->prot_hook);
47dceb8e 1734 fanout_release_data(f);
1735 kfree(f);
1736 }
1737 mutex_unlock(&fanout_mutex);
1739 if (po->rollover)
1740 kfree_rcu(po->rollover, rcu);
dc99f600 1741}
1743static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1744 struct sk_buff *skb)
1745{
1746 /* Earlier code assumed this would be a VLAN pkt, double-check
1747 * this now that we have the actual packet in hand. We can only
1748 * do this check on Ethernet devices.
1749 */
1750 if (unlikely(dev->type != ARPHRD_ETHER))
1751 return false;
1752
1753 skb_reset_mac_header(skb);
1754 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1755}
1756
90ddc4f0 1757static const struct proto_ops packet_ops;
90ddc4f0 1759static const struct proto_ops packet_ops_spkt;
1761static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1762 struct packet_type *pt, struct net_device *orig_dev)
1763{
1764 struct sock *sk;
1765 struct sockaddr_pkt *spkt;
1766
1767 /*
1768 * When we registered the protocol we saved the socket in the data
1769 * field for just this event.
1770 */
1771
1772 sk = pt->af_packet_priv;
1774 /*
1775 * Yank back the headers [hope the device set this
1776 * right or kerboom...]
1777 *
1778 * Incoming packets have ll header pulled,
1779 * push it back.
1780 *
98e399f8 1781 * For outgoing ones skb->data == skb_mac_header(skb)
1782 * so that this procedure is noop.
1783 */
1784
1785 if (skb->pkt_type == PACKET_LOOPBACK)
1786 goto out;
1787
09ad9bc7 1788 if (!net_eq(dev_net(dev), sock_net(sk)))
1789 goto out;
1790
1791 skb = skb_share_check(skb, GFP_ATOMIC);
1792 if (skb == NULL)
1793 goto oom;
1794
1795 /* drop any routing info */
adf30907 1796 skb_dst_drop(skb);
1798 /* drop conntrack reference */
1799 nf_reset(skb);
1800
ffbc6111 1801 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
98e399f8 1803 skb_push(skb, skb->data - skb_mac_header(skb));
1804
1805 /*
1806 * The SOCK_PACKET socket receives _all_ frames.
1807 */
1808
1809 spkt->spkt_family = dev->type;
1810 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1811 spkt->spkt_protocol = skb->protocol;
1812
1813 /*
1814 * Charge the memory to the socket. This is done specifically
1815 * to prevent sockets using all the memory up.
1816 */
1817
40d4e3df 1818 if (sock_queue_rcv_skb(sk, skb) == 0)
1819 return 0;
1820
1821out:
1822 kfree_skb(skb);
1823oom:
1824 return 0;
1825}
1826
1827
1828/*
1829 * Output a raw packet to a device layer. This bypasses all the other
1830 * protocol layers and you must therefore supply it with a complete frame
1831 */
1833static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1834 size_t len)
1835{
1836 struct sock *sk = sock->sk;
342dfc30 1837 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1838 struct sk_buff *skb = NULL;
1da177e4 1839 struct net_device *dev;
40d4e3df 1840 __be16 proto = 0;
1da177e4 1841 int err;
3bdc0eba 1842 int extra_len = 0;
1ce4f28b 1843
1da177e4 1844 /*
1ce4f28b 1845 * Get and verify the address.
1da177e4
LT
1846 */
1847
40d4e3df 1848 if (saddr) {
1da177e4 1849 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1850 return -EINVAL;
1851 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1852 proto = saddr->spkt_protocol;
1853 } else
1854 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1855
1856 /*
1ce4f28b 1857 * Find the device first to size check it
1da177e4
LT
1858 */
1859
de74e92a 1860 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1861retry:
654d1f8a
ED
1862 rcu_read_lock();
1863 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1864 err = -ENODEV;
1865 if (dev == NULL)
1866 goto out_unlock;
1ce4f28b 1867
d5e76b0a
DM
1868 err = -ENETDOWN;
1869 if (!(dev->flags & IFF_UP))
1870 goto out_unlock;
1871
1da177e4 1872 /*
40d4e3df
ED
1873 * You may not queue a frame bigger than the mtu. This is the lowest level
1874 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1875 */
1ce4f28b 1876
3bdc0eba
BG
1877 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1878 if (!netif_supports_nofcs(dev)) {
1879 err = -EPROTONOSUPPORT;
1880 goto out_unlock;
1881 }
1882 extra_len = 4; /* We're doing our own CRC */
1883 }
1884
1da177e4 1885 err = -EMSGSIZE;
3bdc0eba 1886 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1887 goto out_unlock;
1888
1a35ca80
ED
1889 if (!skb) {
1890 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1891 int tlen = dev->needed_tailroom;
1a35ca80
ED
1892 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1893
1894 rcu_read_unlock();
4ce40912 1895 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1896 if (skb == NULL)
1897 return -ENOBUFS;
1898 /* FIXME: Save some space for broken drivers that write a hard
1899 * header at transmission time by themselves. PPP is the notable
1900 * one here. This should really be fixed at the driver level.
1901 */
1902 skb_reserve(skb, reserved);
1903 skb_reset_network_header(skb);
1904
1905 /* Try to align data part correctly */
1906 if (hhlen) {
1907 skb->data -= hhlen;
1908 skb->tail -= hhlen;
1909 if (len < hhlen)
1910 skb_reset_network_header(skb);
1911 }
6ce8e9ce 1912 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1913 if (err)
1914 goto out_free;
1915 goto retry;
1da177e4
LT
1916 }
1917
9ed988cd
WB
1918 if (!dev_validate_header(dev, skb->data, len)) {
1919 err = -EINVAL;
1920 goto out_unlock;
1921 }
3c70c132
DB
1922 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1923 !packet_extra_vlan_len_allowed(dev, skb)) {
1924 err = -EMSGSIZE;
1925 goto out_unlock;
57f89bfa 1926 }
1a35ca80 1927
1da177e4
LT
1928 skb->protocol = proto;
1929 skb->dev = dev;
1930 skb->priority = sk->sk_priority;
2d37a186 1931 skb->mark = sk->sk_mark;
bf84a010
DB
1932
1933 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1934
3bdc0eba
BG
1935 if (unlikely(extra_len == 4))
1936 skb->no_fcs = 1;
1937
40893fd0 1938 skb_probe_transport_header(skb, 0);
c1aad275 1939
1da177e4 1940 dev_queue_xmit(skb);
654d1f8a 1941 rcu_read_unlock();
40d4e3df 1942 return len;
1da177e4 1943
1da177e4 1944out_unlock:
654d1f8a 1945 rcu_read_unlock();
1a35ca80
ED
1946out_free:
1947 kfree_skb(skb);
1da177e4
LT
1948 return err;
1949}
1da177e4 1950
ff936a04
AS
1951static unsigned int run_filter(struct sk_buff *skb,
1952 const struct sock *sk,
1953 unsigned int res)
1da177e4
LT
1954{
1955 struct sk_filter *filter;
fda9ef5d 1956
80f8f102
ED
1957 rcu_read_lock();
1958 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1959 if (filter != NULL)
ff936a04 1960 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1961 rcu_read_unlock();
1da177e4 1962
dbcb5855 1963 return res;
1da177e4
LT
1964}
1965
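/* Illustrative user-space sketch, not part of this file: the filter run
 * above is whatever was attached with SO_ATTACH_FILTER.  A minimal classic
 * BPF program that accepts every packet but caps the snapshot at 96 bytes:
 *
 *      struct sock_filter code[] = {
 *              { BPF_RET | BPF_K, 0, 0, 96 },
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = code };
 *
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * run_filter() returns the program's return value; the callers use it to
 * cap snaplen, and 0 means the packet is dropped.
 */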
16cc1400
WB
1966static int __packet_rcv_vnet(const struct sk_buff *skb,
1967 struct virtio_net_hdr *vnet_hdr)
1968{
1969 *vnet_hdr = (const struct virtio_net_hdr) { 0 };
1970
1971 if (skb_is_gso(skb)) {
1972 struct skb_shared_info *sinfo = skb_shinfo(skb);
1973
1974 /* This is a hint as to how much should be linear. */
1975 vnet_hdr->hdr_len =
1976 __cpu_to_virtio16(vio_le(), skb_headlen(skb));
1977 vnet_hdr->gso_size =
1978 __cpu_to_virtio16(vio_le(), sinfo->gso_size);
1979
1980 if (sinfo->gso_type & SKB_GSO_TCPV4)
1981 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1982 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1983 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1984 else if (sinfo->gso_type & SKB_GSO_UDP)
1985 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
1986 else if (sinfo->gso_type & SKB_GSO_FCOE)
1987 return -EINVAL;
1988 else
1989 BUG();
1990
1991 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1992 vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1993 } else
1994 vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
1995
1996 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1997 vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1998 vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
1999 skb_checksum_start_offset(skb));
2000 vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
2001 skb->csum_offset);
2002 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2003 vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
2004 } /* else everything is zero */
2005
2006 return 0;
2007}
2008
2009static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2010 size_t *len)
2011{
2012 struct virtio_net_hdr vnet_hdr;
2013
2014 if (*len < sizeof(vnet_hdr))
2015 return -EINVAL;
2016 *len -= sizeof(vnet_hdr);
2017
2018 if (__packet_rcv_vnet(skb, &vnet_hdr))
2019 return -EINVAL;
2020
2021 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2022}
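/* Illustrative user-space sketch, not part of this file: with the
 * PACKET_VNET_HDR option enabled, every received packet is preceded by the
 * header filled in above, so a reader typically splits its buffer:
 *
 *      struct virtio_net_hdr vh;
 *      char buf[65536];
 *      struct iovec iov[2] = {
 *              { .iov_base = &vh, .iov_len = sizeof(vh) },
 *              { .iov_base = buf, .iov_len = sizeof(buf) },
 *      };
 *      struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *      recvmsg(fd, &mh, 0);
 */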
2023
1da177e4 2024/*
62ab0812
ED
2025 * This function makes lazy skb cloning in hope that most of packets
2026 * are discarded by BPF.
2027 *
2028 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2029 * and skb->cb are mangled. It works because (and until) packets
2030 * falling here are owned by current CPU. Output packets are cloned
2031 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2032 * sequentially, so that if we return skb to original state on exit,
2033 * we will not harm anyone.
1da177e4
LT
2034 */
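/* In practice this means packet_rcv() below may temporarily push the
 * link-layer header back onto a shared skb; the drop_n_restore label
 * undoes that (restoring skb->data and skb->len) before the skb is
 * handed back.
 */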
2035
40d4e3df
ED
2036static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2037 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2038{
2039 struct sock *sk;
2040 struct sockaddr_ll *sll;
2041 struct packet_sock *po;
40d4e3df 2042 u8 *skb_head = skb->data;
1da177e4 2043 int skb_len = skb->len;
dbcb5855 2044 unsigned int snaplen, res;
1da177e4
LT
2045
2046 if (skb->pkt_type == PACKET_LOOPBACK)
2047 goto drop;
2048
2049 sk = pt->af_packet_priv;
2050 po = pkt_sk(sk);
2051
09ad9bc7 2052 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2053 goto drop;
2054
1da177e4
LT
2055 skb->dev = dev;
2056
3b04ddde 2057 if (dev->header_ops) {
1da177e4 2058 /* The device has an explicit notion of ll header,
62ab0812
ED
2059 * exported to higher levels.
2060 *
2061 * Otherwise, the device hides details of its frame
2062 * structure, so that corresponding packet head is
2063 * never delivered to user.
1da177e4
LT
2064 */
2065 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2066 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2067 else if (skb->pkt_type == PACKET_OUTGOING) {
2068 /* Special case: outgoing packets have ll header at head */
bbe735e4 2069 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2070 }
2071 }
2072
2073 snaplen = skb->len;
2074
dbcb5855
DM
2075 res = run_filter(skb, sk, snaplen);
2076 if (!res)
fda9ef5d 2077 goto drop_n_restore;
dbcb5855
DM
2078 if (snaplen > res)
2079 snaplen = res;
1da177e4 2080
0fd7bac6 2081 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2082 goto drop_n_acct;
2083
2084 if (skb_shared(skb)) {
2085 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2086 if (nskb == NULL)
2087 goto drop_n_acct;
2088
2089 if (skb_head != skb->data) {
2090 skb->data = skb_head;
2091 skb->len = skb_len;
2092 }
abc4e4fa 2093 consume_skb(skb);
1da177e4
LT
2094 skb = nskb;
2095 }
2096
b4772ef8 2097 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2098
2099 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2100 sll->sll_hatype = dev->type;
1da177e4 2101 sll->sll_pkttype = skb->pkt_type;
8032b464 2102 if (unlikely(po->origdev))
80feaacb
PWJ
2103 sll->sll_ifindex = orig_dev->ifindex;
2104 else
2105 sll->sll_ifindex = dev->ifindex;
1da177e4 2106
b95cce35 2107 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2108
2472d761
EB
2109 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2110 * Use their space for storing the original skb length.
2111 */
2112 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2113
1da177e4
LT
2114 if (pskb_trim(skb, snaplen))
2115 goto drop_n_acct;
2116
2117 skb_set_owner_r(skb, sk);
2118 skb->dev = NULL;
adf30907 2119 skb_dst_drop(skb);
1da177e4 2120
84531c24
PO
2121 /* drop conntrack reference */
2122 nf_reset(skb);
2123
1da177e4 2124 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2125 po->stats.stats1.tp_packets++;
3bc3b96f 2126 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2127 __skb_queue_tail(&sk->sk_receive_queue, skb);
2128 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2129 sk->sk_data_ready(sk);
1da177e4
LT
2130 return 0;
2131
2132drop_n_acct:
7091fbd8 2133 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2134 po->stats.stats1.tp_drops++;
7091fbd8
WB
2135 atomic_inc(&sk->sk_drops);
2136 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2137
2138drop_n_restore:
2139 if (skb_head != skb->data && skb_shared(skb)) {
2140 skb->data = skb_head;
2141 skb->len = skb_len;
2142 }
2143drop:
ead2ceb0 2144 consume_skb(skb);
1da177e4
LT
2145 return 0;
2146}
2147
40d4e3df
ED
2148static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2149 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2150{
2151 struct sock *sk;
2152 struct packet_sock *po;
2153 struct sockaddr_ll *sll;
184f489e 2154 union tpacket_uhdr h;
40d4e3df 2155 u8 *skb_head = skb->data;
1da177e4 2156 int skb_len = skb->len;
dbcb5855 2157 unsigned int snaplen, res;
f6fb8f10 2158 unsigned long status = TP_STATUS_USER;
bbd6ef87 2159 unsigned short macoff, netoff, hdrlen;
1da177e4 2160 struct sk_buff *copy_skb = NULL;
bbd6ef87 2161 struct timespec ts;
b9c32fb2 2162 __u32 ts_status;
1da177e4 2163
51846355
AW
2164 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 2165 * We may add members to them up to the current aligned size without forcing
2166 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2167 */
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2169 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2170
1da177e4
LT
2171 if (skb->pkt_type == PACKET_LOOPBACK)
2172 goto drop;
2173
2174 sk = pt->af_packet_priv;
2175 po = pkt_sk(sk);
2176
09ad9bc7 2177 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2178 goto drop;
2179
3b04ddde 2180 if (dev->header_ops) {
1da177e4 2181 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2182 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2183 else if (skb->pkt_type == PACKET_OUTGOING) {
2184 /* Special case: outgoing packets have ll header at head */
bbe735e4 2185 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2186 }
2187 }
2188
2189 snaplen = skb->len;
2190
dbcb5855
DM
2191 res = run_filter(skb, sk, snaplen);
2192 if (!res)
fda9ef5d 2193 goto drop_n_restore;
68c2e5de
AD
2194
2195 if (skb->ip_summed == CHECKSUM_PARTIAL)
2196 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2197 else if (skb->pkt_type != PACKET_OUTGOING &&
2198 (skb->ip_summed == CHECKSUM_COMPLETE ||
2199 skb_csum_unnecessary(skb)))
2200 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2201
dbcb5855
DM
2202 if (snaplen > res)
2203 snaplen = res;
1da177e4
LT
2204
2205 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2206 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2207 po->tp_reserve;
1da177e4 2208 } else {
95c96174 2209 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2210 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2211 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2212 po->tp_reserve;
2213 if (po->has_vnet_hdr)
2214 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2215 macoff = netoff - maclen;
2216 }
f6fb8f10 2217 if (po->tp_version <= TPACKET_V2) {
2218 if (macoff + snaplen > po->rx_ring.frame_size) {
2219 if (po->copy_thresh &&
0fd7bac6 2220 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2221 if (skb_shared(skb)) {
2222 copy_skb = skb_clone(skb, GFP_ATOMIC);
2223 } else {
2224 copy_skb = skb_get(skb);
2225 skb_head = skb->data;
2226 }
2227 if (copy_skb)
2228 skb_set_owner_r(copy_skb, sk);
1da177e4 2229 }
f6fb8f10 2230 snaplen = po->rx_ring.frame_size - macoff;
2231 if ((int)snaplen < 0)
2232 snaplen = 0;
1da177e4 2233 }
dc808110
ED
2234 } else if (unlikely(macoff + snaplen >
2235 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2236 u32 nval;
2237
2238 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2239 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2240 snaplen, nval, macoff);
2241 snaplen = nval;
2242 if (unlikely((int)snaplen < 0)) {
2243 snaplen = 0;
2244 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2245 }
1da177e4 2246 }
1da177e4 2247 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2248 h.raw = packet_current_rx_frame(po, skb,
2249 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2250 if (!h.raw)
58d19b19 2251 goto drop_n_account;
f6fb8f10 2252 if (po->tp_version <= TPACKET_V2) {
2253 packet_increment_rx_head(po, &po->rx_ring);
2254 /*
 2255 * TP_STATUS_LOSING will be reported until the stats are read,
 2256 * because the drop counter is COR - Clear On Read.
 2257 * Anyway, do this for V1/V2 only, as V3 doesn't need it
 2258 * at the packet level.
2259 */
ee80fbf3 2260 if (po->stats.stats1.tp_drops)
f6fb8f10 2261 status |= TP_STATUS_LOSING;
2262 }
ee80fbf3 2263 po->stats.stats1.tp_packets++;
1da177e4
LT
2264 if (copy_skb) {
2265 status |= TP_STATUS_COPY;
2266 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2267 }
1da177e4
LT
2268 spin_unlock(&sk->sk_receive_queue.lock);
2269
58d19b19
WB
2270 if (po->has_vnet_hdr) {
2271 if (__packet_rcv_vnet(skb, h.raw + macoff -
2272 sizeof(struct virtio_net_hdr))) {
2273 spin_lock(&sk->sk_receive_queue.lock);
2274 goto drop_n_account;
2275 }
2276 }
2277
bbd6ef87 2278 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2279
2280 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2281 getnstimeofday(&ts);
1da177e4 2282
b9c32fb2
DB
2283 status |= ts_status;
2284
bbd6ef87
PM
2285 switch (po->tp_version) {
2286 case TPACKET_V1:
2287 h.h1->tp_len = skb->len;
2288 h.h1->tp_snaplen = snaplen;
2289 h.h1->tp_mac = macoff;
2290 h.h1->tp_net = netoff;
4b457bdf
DB
2291 h.h1->tp_sec = ts.tv_sec;
2292 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2293 hdrlen = sizeof(*h.h1);
2294 break;
2295 case TPACKET_V2:
2296 h.h2->tp_len = skb->len;
2297 h.h2->tp_snaplen = snaplen;
2298 h.h2->tp_mac = macoff;
2299 h.h2->tp_net = netoff;
bbd6ef87
PM
2300 h.h2->tp_sec = ts.tv_sec;
2301 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2302 if (skb_vlan_tag_present(skb)) {
2303 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2304 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2305 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2306 } else {
2307 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2308 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2309 }
e4d26f4b 2310 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2311 hdrlen = sizeof(*h.h2);
2312 break;
f6fb8f10 2313 case TPACKET_V3:
 2314 /* tp_next_offset and the vlan fields are already populated above,
 2315 * so don't clear them here.
 2316 */
2317 h.h3->tp_status |= status;
2318 h.h3->tp_len = skb->len;
2319 h.h3->tp_snaplen = snaplen;
2320 h.h3->tp_mac = macoff;
2321 h.h3->tp_net = netoff;
f6fb8f10 2322 h.h3->tp_sec = ts.tv_sec;
2323 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2324 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2325 hdrlen = sizeof(*h.h3);
2326 break;
bbd6ef87
PM
2327 default:
2328 BUG();
2329 }
1da177e4 2330
bbd6ef87 2331 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2332 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2333 sll->sll_family = AF_PACKET;
2334 sll->sll_hatype = dev->type;
2335 sll->sll_protocol = skb->protocol;
2336 sll->sll_pkttype = skb->pkt_type;
8032b464 2337 if (unlikely(po->origdev))
80feaacb
PWJ
2338 sll->sll_ifindex = orig_dev->ifindex;
2339 else
2340 sll->sll_ifindex = dev->ifindex;
1da177e4 2341
e16aa207 2342 smp_mb();
f0d4eb29 2343
f6dafa95 2344#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2345 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2346 u8 *start, *end;
2347
f0d4eb29
DB
2348 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2349 macoff + snaplen);
2350
2351 for (start = h.raw; start < end; start += PAGE_SIZE)
2352 flush_dcache_page(pgv_to_page(start));
1da177e4 2353 }
f0d4eb29 2354 smp_wmb();
f6dafa95 2355#endif
f0d4eb29 2356
da413eec 2357 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2358 __packet_set_status(po, h.raw, status);
da413eec
DC
2359 sk->sk_data_ready(sk);
2360 } else {
f6fb8f10 2361 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2362 }
1da177e4
LT
2363
2364drop_n_restore:
2365 if (skb_head != skb->data && skb_shared(skb)) {
2366 skb->data = skb_head;
2367 skb->len = skb_len;
2368 }
2369drop:
1ce4f28b 2370 kfree_skb(skb);
1da177e4
LT
2371 return 0;
2372
58d19b19 2373drop_n_account:
ee80fbf3 2374 po->stats.stats1.tp_drops++;
1da177e4
LT
2375 spin_unlock(&sk->sk_receive_queue.lock);
2376
676d2369 2377 sk->sk_data_ready(sk);
acb5d75b 2378 kfree_skb(copy_skb);
1da177e4
LT
2379 goto drop_n_restore;
2380}
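/* Illustrative user-space sketch, not part of this file: tpacket_rcv()
 * fills the frames of a mapped PACKET_RX_RING.  A TPACKET_V2 reader walks
 * the ring roughly like this ("req" and "ring" come from the earlier
 * PACKET_RX_RING setsockopt and mmap(); poll setup and error handling are
 * omitted):
 *
 *      struct tpacket2_hdr *hdr = ring + i * req.tp_frame_size;
 *
 *      while (!(hdr->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);
 *
 *      handle_packet((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *      hdr->tp_status = TP_STATUS_KERNEL;
 *      i = (i + 1) % req.tp_frame_nr;
 *
 * handle_packet() is a placeholder; writing TP_STATUS_KERNEL hands the
 * frame back to the code above.
 */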
2381
69e3c75f
JB
2382static void tpacket_destruct_skb(struct sk_buff *skb)
2383{
2384 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2385
69e3c75f 2386 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2387 void *ph;
b9c32fb2
DB
2388 __u32 ts;
2389
69e3c75f 2390 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2391 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2392
2393 ts = __packet_set_timestamp(po, ph, skb);
2394 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2395 }
2396
2397 sock_wfree(skb);
2398}
2399
c72219b7
DB
2400static void tpacket_set_protocol(const struct net_device *dev,
2401 struct sk_buff *skb)
2402{
2403 if (dev->type == ARPHRD_ETHER) {
2404 skb_reset_mac_header(skb);
2405 skb->protocol = eth_hdr(skb)->h_proto;
2406 }
2407}
2408
16cc1400
WB
2409static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2410{
2411 unsigned short gso_type = 0;
2412
2413 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2414 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2415 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2416 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2417 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2418 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2419 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2420
2421 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2422 return -EINVAL;
2423
2424 if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2425 switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2426 case VIRTIO_NET_HDR_GSO_TCPV4:
2427 gso_type = SKB_GSO_TCPV4;
2428 break;
2429 case VIRTIO_NET_HDR_GSO_TCPV6:
2430 gso_type = SKB_GSO_TCPV6;
2431 break;
2432 case VIRTIO_NET_HDR_GSO_UDP:
2433 gso_type = SKB_GSO_UDP;
2434 break;
2435 default:
2436 return -EINVAL;
2437 }
2438
2439 if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
2440 gso_type |= SKB_GSO_TCP_ECN;
2441
2442 if (vnet_hdr->gso_size == 0)
2443 return -EINVAL;
2444 }
2445
2446 vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
2447 return 0;
2448}
2449
2450static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2451 struct virtio_net_hdr *vnet_hdr)
2452{
2453 int n;
2454
2455 if (*len < sizeof(*vnet_hdr))
2456 return -EINVAL;
2457 *len -= sizeof(*vnet_hdr);
2458
2459 n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
2460 if (n != sizeof(*vnet_hdr))
2461 return -EFAULT;
2462
2463 return __packet_snd_vnet_parse(vnet_hdr, *len);
2464}
2465
2466static int packet_snd_vnet_gso(struct sk_buff *skb,
2467 struct virtio_net_hdr *vnet_hdr)
2468{
2469 if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2470 u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
2471 u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
2472
2473 if (!skb_partial_csum_set(skb, s, o))
2474 return -EINVAL;
2475 }
2476
2477 skb_shinfo(skb)->gso_size =
2478 __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
2479 skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
2480
2481 /* Header must be checked, and gso_segs computed. */
2482 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2483 skb_shinfo(skb)->gso_segs = 0;
2484 return 0;
2485}
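/* Illustrative user-space sketch, not part of this file: a sender that
 * enabled PACKET_VNET_HDR prepends the header parsed above to each packet.
 * For a checksum-offloaded TCP/IPv4 frame (offsets assume plain Ethernet
 * and IPv4 without options) it could look like:
 *
 *      struct virtio_net_hdr vh = {
 *              .flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *              .csum_start  = ETH_HLEN + sizeof(struct iphdr),
 *              .csum_offset = offsetof(struct tcphdr, check),
 *              .gso_type    = VIRTIO_NET_HDR_GSO_NONE,
 *      };
 *
 * followed by the frame itself in the same sendmsg() buffer.
 */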
2486
40d4e3df 2487static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2488 void *frame, struct net_device *dev, void *data, int tp_len,
1d036d25 2489 __be16 proto, unsigned char *addr, int hlen, int copylen)
69e3c75f 2490{
184f489e 2491 union tpacket_uhdr ph;
8d39b4a6 2492 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2493 struct socket *sock = po->sk.sk_socket;
2494 struct page *page;
69e3c75f
JB
2495 int err;
2496
2497 ph.raw = frame;
2498
2499 skb->protocol = proto;
2500 skb->dev = dev;
2501 skb->priority = po->sk.sk_priority;
2d37a186 2502 skb->mark = po->sk.sk_mark;
2e31396f 2503 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2504 skb_shinfo(skb)->destructor_arg = ph.raw;
2505
ae641949 2506 skb_reserve(skb, hlen);
69e3c75f 2507 skb_reset_network_header(skb);
c1aad275 2508
69e3c75f
JB
2509 to_write = tp_len;
2510
2511 if (sock->type == SOCK_DGRAM) {
2512 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2513 NULL, tp_len);
2514 if (unlikely(err < 0))
2515 return -EINVAL;
1d036d25 2516 } else if (copylen) {
9ed988cd
WB
2517 int hdrlen = min_t(int, copylen, tp_len);
2518
69e3c75f 2519 skb_push(skb, dev->hard_header_len);
1d036d25 2520 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2521 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2522 if (unlikely(err))
2523 return err;
9ed988cd
WB
2524 if (!dev_validate_header(dev, skb->data, hdrlen))
2525 return -EINVAL;
c72219b7
DB
2526 if (!skb->protocol)
2527 tpacket_set_protocol(dev, skb);
69e3c75f 2528
9ed988cd
WB
2529 data += hdrlen;
2530 to_write -= hdrlen;
69e3c75f
JB
2531 }
2532
69e3c75f
JB
2533 offset = offset_in_page(data);
2534 len_max = PAGE_SIZE - offset;
2535 len = ((to_write > len_max) ? len_max : to_write);
2536
2537 skb->data_len = to_write;
2538 skb->len += to_write;
2539 skb->truesize += to_write;
2540 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2541
2542 while (likely(to_write)) {
2543 nr_frags = skb_shinfo(skb)->nr_frags;
2544
2545 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2546 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2547 MAX_SKB_FRAGS);
69e3c75f
JB
2548 return -EFAULT;
2549 }
2550
0af55bb5
CG
2551 page = pgv_to_page(data);
2552 data += len;
69e3c75f
JB
2553 flush_dcache_page(page);
2554 get_page(page);
0af55bb5 2555 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2556 to_write -= len;
2557 offset = 0;
2558 len_max = PAGE_SIZE;
2559 len = ((to_write > len_max) ? len_max : to_write);
2560 }
2561
8fd6c80d 2562 skb_probe_transport_header(skb, 0);
efdfa2f7 2563
69e3c75f
JB
2564 return tp_len;
2565}
2566
8d39b4a6
WB
2567static int tpacket_parse_header(struct packet_sock *po, void *frame,
2568 int size_max, void **data)
2569{
2570 union tpacket_uhdr ph;
2571 int tp_len, off;
2572
2573 ph.raw = frame;
2574
2575 switch (po->tp_version) {
2576 case TPACKET_V2:
2577 tp_len = ph.h2->tp_len;
2578 break;
2579 default:
2580 tp_len = ph.h1->tp_len;
2581 break;
2582 }
2583 if (unlikely(tp_len > size_max)) {
2584 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2585 return -EMSGSIZE;
2586 }
2587
2588 if (unlikely(po->tp_tx_has_off)) {
2589 int off_min, off_max;
2590
2591 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2592 off_max = po->tx_ring.frame_size - tp_len;
2593 if (po->sk.sk_type == SOCK_DGRAM) {
2594 switch (po->tp_version) {
2595 case TPACKET_V2:
2596 off = ph.h2->tp_net;
2597 break;
2598 default:
2599 off = ph.h1->tp_net;
2600 break;
2601 }
2602 } else {
2603 switch (po->tp_version) {
2604 case TPACKET_V2:
2605 off = ph.h2->tp_mac;
2606 break;
2607 default:
2608 off = ph.h1->tp_mac;
2609 break;
2610 }
2611 }
2612 if (unlikely((off < off_min) || (off_max < off)))
2613 return -EINVAL;
2614 } else {
2615 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2616 }
2617
2618 *data = frame + off;
2619 return tp_len;
2620}
2621
69e3c75f
JB
2622static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2623{
69e3c75f
JB
2624 struct sk_buff *skb;
2625 struct net_device *dev;
1d036d25 2626 struct virtio_net_hdr *vnet_hdr = NULL;
69e3c75f 2627 __be16 proto;
09effa67 2628 int err, reserve = 0;
40d4e3df 2629 void *ph;
342dfc30 2630 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2631 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2632 int tp_len, size_max;
2633 unsigned char *addr;
8d39b4a6 2634 void *data;
69e3c75f 2635 int len_sum = 0;
9e67030a 2636 int status = TP_STATUS_AVAILABLE;
1d036d25 2637 int hlen, tlen, copylen = 0;
69e3c75f 2638
69e3c75f
JB
2639 mutex_lock(&po->pg_vec_lock);
2640
66e56cd4 2641 if (likely(saddr == NULL)) {
e40526cb 2642 dev = packet_cached_dev_get(po);
69e3c75f
JB
2643 proto = po->num;
2644 addr = NULL;
2645 } else {
2646 err = -EINVAL;
2647 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2648 goto out;
2649 if (msg->msg_namelen < (saddr->sll_halen
2650 + offsetof(struct sockaddr_ll,
2651 sll_addr)))
2652 goto out;
69e3c75f
JB
2653 proto = saddr->sll_protocol;
2654 addr = saddr->sll_addr;
827d9780 2655 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2656 }
2657
69e3c75f
JB
2658 err = -ENXIO;
2659 if (unlikely(dev == NULL))
2660 goto out;
69e3c75f
JB
2661 err = -ENETDOWN;
2662 if (unlikely(!(dev->flags & IFF_UP)))
2663 goto out_put;
2664
5cfb4c8d
DB
2665 if (po->sk.sk_socket->type == SOCK_RAW)
2666 reserve = dev->hard_header_len;
69e3c75f 2667 size_max = po->tx_ring.frame_size
b5dd884e 2668 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2669
1d036d25 2670 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2671 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2672
69e3c75f
JB
2673 do {
2674 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2675 TP_STATUS_SEND_REQUEST);
69e3c75f 2676 if (unlikely(ph == NULL)) {
87a2fd28
DB
2677 if (need_wait && need_resched())
2678 schedule();
69e3c75f
JB
2679 continue;
2680 }
2681
8d39b4a6
WB
2682 skb = NULL;
2683 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2684 if (tp_len < 0)
2685 goto tpacket_error;
2686
69e3c75f 2687 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2688 hlen = LL_RESERVED_SPACE(dev);
2689 tlen = dev->needed_tailroom;
1d036d25
WB
2690 if (po->has_vnet_hdr) {
2691 vnet_hdr = data;
2692 data += sizeof(*vnet_hdr);
2693 tp_len -= sizeof(*vnet_hdr);
2694 if (tp_len < 0 ||
2695 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2696 tp_len = -EINVAL;
2697 goto tpacket_error;
2698 }
2699 copylen = __virtio16_to_cpu(vio_le(),
2700 vnet_hdr->hdr_len);
2701 }
9ed988cd 2702 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2703 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2704 hlen + tlen + sizeof(struct sockaddr_ll) +
2705 (copylen - dev->hard_header_len),
fbf33a28 2706 !need_wait, &err);
69e3c75f 2707
fbf33a28
KM
2708 if (unlikely(skb == NULL)) {
2709 /* we assume the socket was initially writeable ... */
2710 if (likely(len_sum > 0))
2711 err = len_sum;
69e3c75f 2712 goto out_status;
fbf33a28 2713 }
8d39b4a6 2714 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
1d036d25 2715 addr, hlen, copylen);
dbd46ab4 2716 if (likely(tp_len >= 0) &&
5cfb4c8d 2717 tp_len > dev->mtu + reserve &&
1d036d25 2718 !po->has_vnet_hdr &&
3c70c132
DB
2719 !packet_extra_vlan_len_allowed(dev, skb))
2720 tp_len = -EMSGSIZE;
69e3c75f
JB
2721
2722 if (unlikely(tp_len < 0)) {
8d39b4a6 2723tpacket_error:
69e3c75f
JB
2724 if (po->tp_loss) {
2725 __packet_set_status(po, ph,
2726 TP_STATUS_AVAILABLE);
2727 packet_increment_head(&po->tx_ring);
2728 kfree_skb(skb);
2729 continue;
2730 } else {
2731 status = TP_STATUS_WRONG_FORMAT;
2732 err = tp_len;
2733 goto out_status;
2734 }
2735 }
2736
1d036d25
WB
2737 if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
2738 tp_len = -EINVAL;
2739 goto tpacket_error;
2740 }
2741
0fd5d57b
DB
2742 packet_pick_tx_queue(dev, skb);
2743
69e3c75f
JB
2744 skb->destructor = tpacket_destruct_skb;
2745 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2746 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2747
2748 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2749 err = po->xmit(skb);
eb70df13
JP
2750 if (unlikely(err > 0)) {
2751 err = net_xmit_errno(err);
2752 if (err && __packet_get_status(po, ph) ==
2753 TP_STATUS_AVAILABLE) {
2754 /* skb was destructed already */
2755 skb = NULL;
2756 goto out_status;
2757 }
2758 /*
2759 * skb was dropped but not destructed yet;
2760 * let's treat it like congestion or err < 0
2761 */
2762 err = 0;
2763 }
69e3c75f
JB
2764 packet_increment_head(&po->tx_ring);
2765 len_sum += tp_len;
b0138408
DB
2766 } while (likely((ph != NULL) ||
2767 /* Note: packet_read_pending() might be slow if we have
 2768 * to call it, as it's a per-cpu variable, but in the fast path
 2769 * we already short-circuit the loop with the first
 2770 * condition, and luckily don't have to take that path
 2771 * anyway.
2772 */
2773 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2774
2775 err = len_sum;
2776 goto out_put;
2777
69e3c75f
JB
2778out_status:
2779 __packet_set_status(po, ph, status);
2780 kfree_skb(skb);
2781out_put:
e40526cb 2782 dev_put(dev);
69e3c75f
JB
2783out:
2784 mutex_unlock(&po->pg_vec_lock);
2785 return err;
2786}
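/* Illustrative user-space sketch, not part of this file: tpacket_snd()
 * consumes frames that a sender marked in a mapped PACKET_TX_RING
 * (TPACKET_V2, tp_tx_has_off not set; setup and error handling omitted):
 *
 *      struct tpacket2_hdr *hdr = ring + i * req.tp_frame_size;
 *
 *      if (hdr->tp_status == TP_STATUS_AVAILABLE) {
 *              memcpy((char *)hdr + TPACKET2_HDRLEN -
 *                     sizeof(struct sockaddr_ll), frame, frame_len);
 *              hdr->tp_len = frame_len;
 *              hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *              send(fd, NULL, 0, 0);
 *      }
 *
 * The status moves to TP_STATUS_SENDING while the skb is queued and back
 * to TP_STATUS_AVAILABLE in tpacket_destruct_skb() once it is freed.
 */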
69e3c75f 2787
eea49cc9
OJ
2788static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2789 size_t reserve, size_t len,
2790 size_t linear, int noblock,
2791 int *err)
bfd5f4a3
SS
2792{
2793 struct sk_buff *skb;
2794
2795 /* Under a page? Don't bother with paged skb. */
2796 if (prepad + len < PAGE_SIZE || !linear)
2797 linear = len;
2798
2799 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2800 err, 0);
bfd5f4a3
SS
2801 if (!skb)
2802 return NULL;
2803
2804 skb_reserve(skb, reserve);
2805 skb_put(skb, linear);
2806 skb->data_len = len - linear;
2807 skb->len += len - linear;
2808
2809 return skb;
2810}
2811
d346a3fa 2812static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2813{
2814 struct sock *sk = sock->sk;
342dfc30 2815 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2816 struct sk_buff *skb;
2817 struct net_device *dev;
0e11c91e 2818 __be16 proto;
1da177e4 2819 unsigned char *addr;
827d9780 2820 int err, reserve = 0;
c7d39e32 2821 struct sockcm_cookie sockc;
bfd5f4a3
SS
2822 struct virtio_net_hdr vnet_hdr = { 0 };
2823 int offset = 0;
bfd5f4a3 2824 struct packet_sock *po = pkt_sk(sk);
ae641949 2825 int hlen, tlen;
3bdc0eba 2826 int extra_len = 0;
1da177e4
LT
2827
2828 /*
1ce4f28b 2829 * Get and verify the address.
1da177e4 2830 */
1ce4f28b 2831
66e56cd4 2832 if (likely(saddr == NULL)) {
e40526cb 2833 dev = packet_cached_dev_get(po);
1da177e4
LT
2834 proto = po->num;
2835 addr = NULL;
2836 } else {
2837 err = -EINVAL;
2838 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2839 goto out;
0fb375fb
EB
2840 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2841 goto out;
1da177e4
LT
2842 proto = saddr->sll_protocol;
2843 addr = saddr->sll_addr;
827d9780 2844 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2845 }
2846
1da177e4 2847 err = -ENXIO;
e40526cb 2848 if (unlikely(dev == NULL))
1da177e4 2849 goto out_unlock;
d5e76b0a 2850 err = -ENETDOWN;
e40526cb 2851 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2852 goto out_unlock;
2853
c7d39e32
EJ
2854 sockc.mark = sk->sk_mark;
2855 if (msg->msg_controllen) {
2856 err = sock_cmsg_send(sk, msg, &sockc);
2857 if (unlikely(err))
2858 goto out_unlock;
2859 }
2860
e40526cb
DB
2861 if (sock->type == SOCK_RAW)
2862 reserve = dev->hard_header_len;
bfd5f4a3 2863 if (po->has_vnet_hdr) {
16cc1400
WB
2864 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2865 if (err)
bfd5f4a3 2866 goto out_unlock;
bfd5f4a3
SS
2867 }
2868
3bdc0eba
BG
2869 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2870 if (!netif_supports_nofcs(dev)) {
2871 err = -EPROTONOSUPPORT;
2872 goto out_unlock;
2873 }
2874 extra_len = 4; /* We're doing our own CRC */
2875 }
2876
1da177e4 2877 err = -EMSGSIZE;
16cc1400
WB
2878 if (!vnet_hdr.gso_type &&
2879 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2880 goto out_unlock;
2881
bfd5f4a3 2882 err = -ENOBUFS;
ae641949
HX
2883 hlen = LL_RESERVED_SPACE(dev);
2884 tlen = dev->needed_tailroom;
dc9e5153 2885 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
d3869efe 2886 __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
bfd5f4a3 2887 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2888 if (skb == NULL)
1da177e4
LT
2889 goto out_unlock;
2890
bfd5f4a3 2891 skb_set_network_header(skb, reserve);
1da177e4 2892
0c4e8581 2893 err = -EINVAL;
9c707762
WB
2894 if (sock->type == SOCK_DGRAM) {
2895 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2896 if (unlikely(offset < 0))
9c707762 2897 goto out_free;
9c707762 2898 }
1da177e4
LT
2899
2900 /* Returns -EFAULT on error */
c0371da6 2901 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2902 if (err)
2903 goto out_free;
bf84a010 2904
9ed988cd
WB
2905 if (sock->type == SOCK_RAW &&
2906 !dev_validate_header(dev, skb->data, len)) {
2907 err = -EINVAL;
2908 goto out_free;
2909 }
2910
bf84a010 2911 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2912
16cc1400 2913 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2914 !packet_extra_vlan_len_allowed(dev, skb)) {
2915 err = -EMSGSIZE;
2916 goto out_free;
57f89bfa
BG
2917 }
2918
09effa67
DM
2919 skb->protocol = proto;
2920 skb->dev = dev;
1da177e4 2921 skb->priority = sk->sk_priority;
c7d39e32 2922 skb->mark = sockc.mark;
0fd5d57b
DB
2923
2924 packet_pick_tx_queue(dev, skb);
1da177e4 2925
bfd5f4a3 2926 if (po->has_vnet_hdr) {
16cc1400
WB
2927 err = packet_snd_vnet_gso(skb, &vnet_hdr);
2928 if (err)
2929 goto out_free;
2930 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2931 }
2932
8fd6c80d
DB
2933 skb_probe_transport_header(skb, reserve);
2934
3bdc0eba
BG
2935 if (unlikely(extra_len == 4))
2936 skb->no_fcs = 1;
2937
d346a3fa 2938 err = po->xmit(skb);
1da177e4
LT
2939 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2940 goto out_unlock;
2941
e40526cb 2942 dev_put(dev);
1da177e4 2943
40d4e3df 2944 return len;
1da177e4
LT
2945
2946out_free:
2947 kfree_skb(skb);
2948out_unlock:
e40526cb 2949 if (dev)
1da177e4
LT
2950 dev_put(dev);
2951out:
2952 return err;
2953}
2954
1b784140 2955static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2956{
69e3c75f
JB
2957 struct sock *sk = sock->sk;
2958 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2959
69e3c75f
JB
2960 if (po->tx_ring.pg_vec)
2961 return tpacket_snd(po, msg);
2962 else
69e3c75f
JB
2963 return packet_snd(sock, msg, len);
2964}
2965
1da177e4
LT
2966/*
2967 * Close a PACKET socket. This is fairly simple. We immediately go
2968 * to 'closed' state and remove our protocol entry in the device list.
2969 */
2970
2971static int packet_release(struct socket *sock)
2972{
2973 struct sock *sk = sock->sk;
2974 struct packet_sock *po;
d12d01d6 2975 struct net *net;
f6fb8f10 2976 union tpacket_req_u req_u;
1da177e4
LT
2977
2978 if (!sk)
2979 return 0;
2980
3b1e0a65 2981 net = sock_net(sk);
1da177e4
LT
2982 po = pkt_sk(sk);
2983
0fa7fa98 2984 mutex_lock(&net->packet.sklist_lock);
808f5114 2985 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2986 mutex_unlock(&net->packet.sklist_lock);
2987
2988 preempt_disable();
920de804 2989 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2990 preempt_enable();
1da177e4 2991
808f5114 2992 spin_lock(&po->bind_lock);
ce06b03e 2993 unregister_prot_hook(sk, false);
66e56cd4
DB
2994 packet_cached_dev_reset(po);
2995
160ff18a
BG
2996 if (po->prot_hook.dev) {
2997 dev_put(po->prot_hook.dev);
2998 po->prot_hook.dev = NULL;
2999 }
808f5114 3000 spin_unlock(&po->bind_lock);
1da177e4 3001
1da177e4 3002 packet_flush_mclist(sk);
1da177e4 3003
9665d5d6
PS
3004 if (po->rx_ring.pg_vec) {
3005 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3006 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3007 }
69e3c75f 3008
9665d5d6
PS
3009 if (po->tx_ring.pg_vec) {
3010 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3011 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3012 }
1da177e4 3013
dc99f600
DM
3014 fanout_release(sk);
3015
808f5114 3016 synchronize_net();
1da177e4
LT
3017 /*
3018 * Now the socket is dead. No more input will appear.
3019 */
1da177e4
LT
3020 sock_orphan(sk);
3021 sock->sk = NULL;
3022
3023 /* Purge queues */
3024
3025 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3026 packet_free_pending(po);
17ab56a2 3027 sk_refcnt_debug_release(sk);
1da177e4
LT
3028
3029 sock_put(sk);
3030 return 0;
3031}
3032
3033/*
3034 * Attach a packet hook.
3035 */
3036
30f7ea1c
FR
3037static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3038 __be16 proto)
1da177e4
LT
3039{
3040 struct packet_sock *po = pkt_sk(sk);
158cd4af 3041 struct net_device *dev_curr;
902fefb8
DB
3042 __be16 proto_curr;
3043 bool need_rehook;
30f7ea1c
FR
3044 struct net_device *dev = NULL;
3045 int ret = 0;
3046 bool unlisted = false;
dc99f600 3047
30f7ea1c 3048 if (po->fanout)
dc99f600 3049 return -EINVAL;
1da177e4
LT
3050
3051 lock_sock(sk);
1da177e4 3052 spin_lock(&po->bind_lock);
30f7ea1c
FR
3053 rcu_read_lock();
3054
3055 if (name) {
3056 dev = dev_get_by_name_rcu(sock_net(sk), name);
3057 if (!dev) {
3058 ret = -ENODEV;
3059 goto out_unlock;
3060 }
3061 } else if (ifindex) {
3062 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3063 if (!dev) {
3064 ret = -ENODEV;
3065 goto out_unlock;
3066 }
3067 }
3068
3069 if (dev)
3070 dev_hold(dev);
66e56cd4 3071
902fefb8
DB
3072 proto_curr = po->prot_hook.type;
3073 dev_curr = po->prot_hook.dev;
3074
3075 need_rehook = proto_curr != proto || dev_curr != dev;
3076
3077 if (need_rehook) {
30f7ea1c
FR
3078 if (po->running) {
3079 rcu_read_unlock();
3080 __unregister_prot_hook(sk, true);
3081 rcu_read_lock();
3082 dev_curr = po->prot_hook.dev;
3083 if (dev)
3084 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3085 dev->ifindex);
3086 }
1da177e4 3087
902fefb8
DB
3088 po->num = proto;
3089 po->prot_hook.type = proto;
902fefb8 3090
30f7ea1c
FR
3091 if (unlikely(unlisted)) {
3092 dev_put(dev);
3093 po->prot_hook.dev = NULL;
3094 po->ifindex = -1;
3095 packet_cached_dev_reset(po);
3096 } else {
3097 po->prot_hook.dev = dev;
3098 po->ifindex = dev ? dev->ifindex : 0;
3099 packet_cached_dev_assign(po, dev);
3100 }
902fefb8 3101 }
158cd4af
LW
3102 if (dev_curr)
3103 dev_put(dev_curr);
66e56cd4 3104
902fefb8 3105 if (proto == 0 || !need_rehook)
1da177e4
LT
3106 goto out_unlock;
3107
30f7ea1c 3108 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3109 register_prot_hook(sk);
be85d4ad
UT
3110 } else {
3111 sk->sk_err = ENETDOWN;
3112 if (!sock_flag(sk, SOCK_DEAD))
3113 sk->sk_error_report(sk);
1da177e4
LT
3114 }
3115
3116out_unlock:
30f7ea1c 3117 rcu_read_unlock();
1da177e4
LT
3118 spin_unlock(&po->bind_lock);
3119 release_sock(sk);
30f7ea1c 3120 return ret;
1da177e4
LT
3121}
3122
3123/*
3124 * Bind a packet socket to a device
3125 */
3126
40d4e3df
ED
3127static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3128 int addr_len)
1da177e4 3129{
40d4e3df 3130 struct sock *sk = sock->sk;
1da177e4 3131 char name[15];
1ce4f28b 3132
1da177e4
LT
3133 /*
3134 * Check legality
3135 */
1ce4f28b 3136
8ae55f04 3137 if (addr_len != sizeof(struct sockaddr))
1da177e4 3138 return -EINVAL;
40d4e3df 3139 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 3140
30f7ea1c 3141 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3142}
1da177e4
LT
3143
3144static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3145{
40d4e3df
ED
3146 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3147 struct sock *sk = sock->sk;
1da177e4
LT
3148
3149 /*
3150 * Check legality
3151 */
1ce4f28b 3152
1da177e4
LT
3153 if (addr_len < sizeof(struct sockaddr_ll))
3154 return -EINVAL;
3155 if (sll->sll_family != AF_PACKET)
3156 return -EINVAL;
3157
30f7ea1c
FR
3158 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3159 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3160}
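/* Illustrative user-space sketch, not part of this file: binding an
 * AF_PACKET socket to a single interface goes through packet_bind() above
 * ("eth0" is an example name):
 *
 *      struct sockaddr_ll sll = {
 *              .sll_family   = AF_PACKET,
 *              .sll_protocol = htons(ETH_P_ALL),
 *              .sll_ifindex  = if_nametoindex("eth0"),
 *      };
 *
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero sll_protocol keeps the protocol the socket was created with.
 */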
3161
3162static struct proto packet_proto = {
3163 .name = "PACKET",
3164 .owner = THIS_MODULE,
3165 .obj_size = sizeof(struct packet_sock),
3166};
3167
3168/*
1ce4f28b 3169 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3170 */
3171
3f378b68
EP
3172static int packet_create(struct net *net, struct socket *sock, int protocol,
3173 int kern)
1da177e4
LT
3174{
3175 struct sock *sk;
3176 struct packet_sock *po;
0e11c91e 3177 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3178 int err;
3179
df008c91 3180 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3181 return -EPERM;
be02097c
DM
3182 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3183 sock->type != SOCK_PACKET)
1da177e4
LT
3184 return -ESOCKTNOSUPPORT;
3185
3186 sock->state = SS_UNCONNECTED;
3187
3188 err = -ENOBUFS;
11aa9c28 3189 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3190 if (sk == NULL)
3191 goto out;
3192
3193 sock->ops = &packet_ops;
1da177e4
LT
3194 if (sock->type == SOCK_PACKET)
3195 sock->ops = &packet_ops_spkt;
be02097c 3196
1da177e4
LT
3197 sock_init_data(sock, sk);
3198
3199 po = pkt_sk(sk);
3200 sk->sk_family = PF_PACKET;
0e11c91e 3201 po->num = proto;
d346a3fa 3202 po->xmit = dev_queue_xmit;
66e56cd4 3203
b0138408
DB
3204 err = packet_alloc_pending(po);
3205 if (err)
3206 goto out2;
3207
66e56cd4 3208 packet_cached_dev_reset(po);
1da177e4
LT
3209
3210 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3211 sk_refcnt_debug_inc(sk);
1da177e4
LT
3212
3213 /*
3214 * Attach a protocol block
3215 */
3216
3217 spin_lock_init(&po->bind_lock);
905db440 3218 mutex_init(&po->pg_vec_lock);
0648ab70 3219 po->rollover = NULL;
1da177e4 3220 po->prot_hook.func = packet_rcv;
be02097c 3221
1da177e4
LT
3222 if (sock->type == SOCK_PACKET)
3223 po->prot_hook.func = packet_rcv_spkt;
be02097c 3224
1da177e4
LT
3225 po->prot_hook.af_packet_priv = sk;
3226
0e11c91e
AV
3227 if (proto) {
3228 po->prot_hook.type = proto;
ce06b03e 3229 register_prot_hook(sk);
1da177e4
LT
3230 }
3231
0fa7fa98 3232 mutex_lock(&net->packet.sklist_lock);
808f5114 3233 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3234 mutex_unlock(&net->packet.sklist_lock);
3235
3236 preempt_disable();
3680453c 3237 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3238 preempt_enable();
808f5114 3239
40d4e3df 3240 return 0;
b0138408
DB
3241out2:
3242 sk_free(sk);
1da177e4
LT
3243out:
3244 return err;
3245}
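/* Illustrative user-space sketch, not part of this file: packet_create()
 * above is reached via
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 * which requires CAP_NET_RAW.  SOCK_DGRAM delivers cooked frames without
 * the link-layer header, and the legacy SOCK_PACKET type is still accepted.
 */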
3246
3247/*
3248 * Pull a packet from our receive queue and hand it to the user.
3249 * If necessary we block.
3250 */
3251
1b784140
YX
3252static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3253 int flags)
1da177e4
LT
3254{
3255 struct sock *sk = sock->sk;
3256 struct sk_buff *skb;
3257 int copied, err;
bfd5f4a3 3258 int vnet_hdr_len = 0;
2472d761 3259 unsigned int origlen = 0;
1da177e4
LT
3260
3261 err = -EINVAL;
ed85b565 3262 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3263 goto out;
3264
3265#if 0
3266 /* What error should we return now? EUNATTACH? */
3267 if (pkt_sk(sk)->ifindex < 0)
3268 return -ENODEV;
3269#endif
3270
ed85b565 3271 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3272 err = sock_recv_errqueue(sk, msg, len,
3273 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3274 goto out;
3275 }
3276
1da177e4
LT
3277 /*
3278 * Call the generic datagram receiver. This handles all sorts
3279 * of horrible races and re-entrancy so we can forget about it
3280 * in the protocol layers.
3281 *
 3282 * Now it will return ENETDOWN, if the device has just gone down,
3283 * but then it will block.
3284 */
3285
40d4e3df 3286 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3287
3288 /*
1ce4f28b 3289 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3290 * handles the blocking, we don't have to see or worry about blocking
 3291 * retries.
3292 */
3293
8ae55f04 3294 if (skb == NULL)
1da177e4
LT
3295 goto out;
3296
2ccdbaa6
WB
3297 if (pkt_sk(sk)->pressure)
3298 packet_rcv_has_room(pkt_sk(sk), NULL);
3299
bfd5f4a3 3300 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3301 err = packet_rcv_vnet(msg, skb, &len);
3302 if (err)
bfd5f4a3 3303 goto out_free;
16cc1400 3304 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3305 }
3306
f3d33426
HFS
 3307 /* Any data beyond the buffer you gave is lost. If that worries
 3308 * a user program, it can ask the device for its MTU
3309 * anyway.
1da177e4 3310 */
1da177e4 3311 copied = skb->len;
40d4e3df
ED
3312 if (copied > len) {
3313 copied = len;
3314 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3315 }
3316
51f3d02b 3317 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3318 if (err)
3319 goto out_free;
3320
2472d761
EB
3321 if (sock->type != SOCK_PACKET) {
3322 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3323
3324 /* Original length was stored in sockaddr_ll fields */
3325 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3326 sll->sll_family = AF_PACKET;
3327 sll->sll_protocol = skb->protocol;
3328 }
3329
3b885787 3330 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3331
f3d33426
HFS
3332 if (msg->msg_name) {
3333 /* If the address length field is there to be filled
3334 * in, we fill it in now.
3335 */
3336 if (sock->type == SOCK_PACKET) {
342dfc30 3337 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3338 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3339 } else {
3340 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3341
f3d33426
HFS
3342 msg->msg_namelen = sll->sll_halen +
3343 offsetof(struct sockaddr_ll, sll_addr);
3344 }
ffbc6111
HX
3345 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3346 msg->msg_namelen);
f3d33426 3347 }
1da177e4 3348
8dc41944 3349 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3350 struct tpacket_auxdata aux;
3351
3352 aux.tp_status = TP_STATUS_USER;
3353 if (skb->ip_summed == CHECKSUM_PARTIAL)
3354 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3355 else if (skb->pkt_type != PACKET_OUTGOING &&
3356 (skb->ip_summed == CHECKSUM_COMPLETE ||
3357 skb_csum_unnecessary(skb)))
3358 aux.tp_status |= TP_STATUS_CSUM_VALID;
3359
2472d761 3360 aux.tp_len = origlen;
ffbc6111
HX
3361 aux.tp_snaplen = skb->len;
3362 aux.tp_mac = 0;
bbe735e4 3363 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3364 if (skb_vlan_tag_present(skb)) {
3365 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3366 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3367 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3368 } else {
3369 aux.tp_vlan_tci = 0;
a0cdfcf3 3370 aux.tp_vlan_tpid = 0;
a3bcc23e 3371 }
ffbc6111 3372 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3373 }
3374
1da177e4
LT
3375 /*
3376 * Free or return the buffer as appropriate. Again this
3377 * hides all the races and re-entrancy issues from us.
3378 */
bfd5f4a3 3379 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3380
3381out_free:
3382 skb_free_datagram(sk, skb);
3383out:
3384 return err;
3385}
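/* Illustrative user-space sketch, not part of this file: with the
 * PACKET_AUXDATA option set, the metadata filled in above arrives as a
 * control message next to the packet ("mh" is the msghdr passed to
 * recvmsg(), use_aux() is a placeholder):
 *
 *      struct cmsghdr *cmsg;
 *
 *      for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
 *              if (cmsg->cmsg_level == SOL_PACKET &&
 *                  cmsg->cmsg_type == PACKET_AUXDATA) {
 *                      struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *                      use_aux(aux->tp_snaplen, aux->tp_vlan_tci);
 *              }
 *      }
 */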
3386
1da177e4
LT
3387static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3388 int *uaddr_len, int peer)
3389{
3390 struct net_device *dev;
3391 struct sock *sk = sock->sk;
3392
3393 if (peer)
3394 return -EOPNOTSUPP;
3395
3396 uaddr->sa_family = AF_PACKET;
2dc85bf3 3397 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3398 rcu_read_lock();
3399 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3400 if (dev)
2dc85bf3 3401 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3402 rcu_read_unlock();
1da177e4
LT
3403 *uaddr_len = sizeof(*uaddr);
3404
3405 return 0;
3406}
1da177e4
LT
3407
3408static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3409 int *uaddr_len, int peer)
3410{
3411 struct net_device *dev;
3412 struct sock *sk = sock->sk;
3413 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3414 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3415
3416 if (peer)
3417 return -EOPNOTSUPP;
3418
3419 sll->sll_family = AF_PACKET;
3420 sll->sll_ifindex = po->ifindex;
3421 sll->sll_protocol = po->num;
67286640 3422 sll->sll_pkttype = 0;
654d1f8a
ED
3423 rcu_read_lock();
3424 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3425 if (dev) {
3426 sll->sll_hatype = dev->type;
3427 sll->sll_halen = dev->addr_len;
3428 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3429 } else {
3430 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3431 sll->sll_halen = 0;
3432 }
654d1f8a 3433 rcu_read_unlock();
0fb375fb 3434 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3435
3436 return 0;
3437}
3438
2aeb0b88
WC
3439static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3440 int what)
1da177e4
LT
3441{
3442 switch (i->type) {
3443 case PACKET_MR_MULTICAST:
1162563f
JP
3444 if (i->alen != dev->addr_len)
3445 return -EINVAL;
1da177e4 3446 if (what > 0)
22bedad3 3447 return dev_mc_add(dev, i->addr);
1da177e4 3448 else
22bedad3 3449 return dev_mc_del(dev, i->addr);
1da177e4
LT
3450 break;
3451 case PACKET_MR_PROMISC:
2aeb0b88 3452 return dev_set_promiscuity(dev, what);
1da177e4 3453 case PACKET_MR_ALLMULTI:
2aeb0b88 3454 return dev_set_allmulti(dev, what);
d95ed927 3455 case PACKET_MR_UNICAST:
1162563f
JP
3456 if (i->alen != dev->addr_len)
3457 return -EINVAL;
d95ed927 3458 if (what > 0)
a748ee24 3459 return dev_uc_add(dev, i->addr);
d95ed927 3460 else
a748ee24 3461 return dev_uc_del(dev, i->addr);
d95ed927 3462 break;
40d4e3df
ED
3463 default:
3464 break;
1da177e4 3465 }
2aeb0b88 3466 return 0;
1da177e4
LT
3467}
3468
82f17091
FR
3469static void packet_dev_mclist_delete(struct net_device *dev,
3470 struct packet_mclist **mlp)
1da177e4 3471{
82f17091
FR
3472 struct packet_mclist *ml;
3473
3474 while ((ml = *mlp) != NULL) {
3475 if (ml->ifindex == dev->ifindex) {
3476 packet_dev_mc(dev, ml, -1);
3477 *mlp = ml->next;
3478 kfree(ml);
3479 } else
3480 mlp = &ml->next;
1da177e4
LT
3481 }
3482}
3483
0fb375fb 3484static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3485{
3486 struct packet_sock *po = pkt_sk(sk);
3487 struct packet_mclist *ml, *i;
3488 struct net_device *dev;
3489 int err;
3490
3491 rtnl_lock();
3492
3493 err = -ENODEV;
3b1e0a65 3494 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3495 if (!dev)
3496 goto done;
3497
3498 err = -EINVAL;
1162563f 3499 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3500 goto done;
3501
3502 err = -ENOBUFS;
8b3a7005 3503 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3504 if (i == NULL)
3505 goto done;
3506
3507 err = 0;
3508 for (ml = po->mclist; ml; ml = ml->next) {
3509 if (ml->ifindex == mreq->mr_ifindex &&
3510 ml->type == mreq->mr_type &&
3511 ml->alen == mreq->mr_alen &&
3512 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3513 ml->count++;
3514 /* Free the new element ... */
3515 kfree(i);
3516 goto done;
3517 }
3518 }
3519
3520 i->type = mreq->mr_type;
3521 i->ifindex = mreq->mr_ifindex;
3522 i->alen = mreq->mr_alen;
3523 memcpy(i->addr, mreq->mr_address, i->alen);
3524 i->count = 1;
3525 i->next = po->mclist;
3526 po->mclist = i;
2aeb0b88
WC
3527 err = packet_dev_mc(dev, i, 1);
3528 if (err) {
3529 po->mclist = i->next;
3530 kfree(i);
3531 }
1da177e4
LT
3532
3533done:
3534 rtnl_unlock();
3535 return err;
3536}
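/* Illustrative user-space sketch, not part of this file: packet_mc_add()
 * above is reached through the PACKET_ADD_MEMBERSHIP option handled further
 * below, e.g. to put one interface into promiscuous mode ("eth0" is an
 * example name):
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = if_nametoindex("eth0"),
 *              .mr_type    = PACKET_MR_PROMISC,
 *      };
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */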
3537
0fb375fb 3538static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3539{
3540 struct packet_mclist *ml, **mlp;
3541
3542 rtnl_lock();
3543
3544 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3545 if (ml->ifindex == mreq->mr_ifindex &&
3546 ml->type == mreq->mr_type &&
3547 ml->alen == mreq->mr_alen &&
3548 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3549 if (--ml->count == 0) {
3550 struct net_device *dev;
3551 *mlp = ml->next;
ad959e76
ED
3552 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3553 if (dev)
1da177e4 3554 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3555 kfree(ml);
3556 }
82f17091 3557 break;
1da177e4
LT
3558 }
3559 }
3560 rtnl_unlock();
82f17091 3561 return 0;
1da177e4
LT
3562}
3563
3564static void packet_flush_mclist(struct sock *sk)
3565{
3566 struct packet_sock *po = pkt_sk(sk);
3567 struct packet_mclist *ml;
3568
3569 if (!po->mclist)
3570 return;
3571
3572 rtnl_lock();
3573 while ((ml = po->mclist) != NULL) {
3574 struct net_device *dev;
3575
3576 po->mclist = ml->next;
ad959e76
ED
3577 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3578 if (dev != NULL)
1da177e4 3579 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3580 kfree(ml);
3581 }
3582 rtnl_unlock();
3583}
1da177e4
LT
3584
3585static int
b7058842 3586packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3587{
3588 struct sock *sk = sock->sk;
8dc41944 3589 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3590 int ret;
3591
3592 if (level != SOL_PACKET)
3593 return -ENOPROTOOPT;
3594
69e3c75f 3595 switch (optname) {
1ce4f28b 3596 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3597 case PACKET_DROP_MEMBERSHIP:
3598 {
0fb375fb
EB
3599 struct packet_mreq_max mreq;
3600 int len = optlen;
3601 memset(&mreq, 0, sizeof(mreq));
3602 if (len < sizeof(struct packet_mreq))
1da177e4 3603 return -EINVAL;
0fb375fb
EB
3604 if (len > sizeof(mreq))
3605 len = sizeof(mreq);
40d4e3df 3606 if (copy_from_user(&mreq, optval, len))
1da177e4 3607 return -EFAULT;
0fb375fb
EB
3608 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3609 return -EINVAL;
1da177e4
LT
3610 if (optname == PACKET_ADD_MEMBERSHIP)
3611 ret = packet_mc_add(sk, &mreq);
3612 else
3613 ret = packet_mc_drop(sk, &mreq);
3614 return ret;
3615 }
a2efcfa0 3616
1da177e4 3617 case PACKET_RX_RING:
69e3c75f 3618 case PACKET_TX_RING:
1da177e4 3619 {
f6fb8f10 3620 union tpacket_req_u req_u;
3621 int len;
1da177e4 3622
f6fb8f10 3623 switch (po->tp_version) {
3624 case TPACKET_V1:
3625 case TPACKET_V2:
3626 len = sizeof(req_u.req);
3627 break;
3628 case TPACKET_V3:
3629 default:
3630 len = sizeof(req_u.req3);
3631 break;
3632 }
3633 if (optlen < len)
1da177e4 3634 return -EINVAL;
f6fb8f10 3635 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3636 return -EFAULT;
f6fb8f10 3637 return packet_set_ring(sk, &req_u, 0,
3638 optname == PACKET_TX_RING);
1da177e4
LT
3639 }
3640 case PACKET_COPY_THRESH:
3641 {
3642 int val;
3643
40d4e3df 3644 if (optlen != sizeof(val))
1da177e4 3645 return -EINVAL;
40d4e3df 3646 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3647 return -EFAULT;
3648
3649 pkt_sk(sk)->copy_thresh = val;
3650 return 0;
3651 }
bbd6ef87
PM
3652 case PACKET_VERSION:
3653 {
3654 int val;
3655
3656 if (optlen != sizeof(val))
3657 return -EINVAL;
69e3c75f 3658 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3659 return -EBUSY;
3660 if (copy_from_user(&val, optval, sizeof(val)))
3661 return -EFAULT;
3662 switch (val) {
3663 case TPACKET_V1:
3664 case TPACKET_V2:
f6fb8f10 3665 case TPACKET_V3:
bbd6ef87
PM
3666 po->tp_version = val;
3667 return 0;
3668 default:
3669 return -EINVAL;
3670 }
3671 }
8913336a
PM
3672 case PACKET_RESERVE:
3673 {
3674 unsigned int val;
3675
3676 if (optlen != sizeof(val))
3677 return -EINVAL;
69e3c75f 3678 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3679 return -EBUSY;
3680 if (copy_from_user(&val, optval, sizeof(val)))
3681 return -EFAULT;
3682 po->tp_reserve = val;
3683 return 0;
3684 }
69e3c75f
JB
3685 case PACKET_LOSS:
3686 {
3687 unsigned int val;
3688
3689 if (optlen != sizeof(val))
3690 return -EINVAL;
3691 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3692 return -EBUSY;
3693 if (copy_from_user(&val, optval, sizeof(val)))
3694 return -EFAULT;
3695 po->tp_loss = !!val;
3696 return 0;
3697 }
8dc41944
HX
3698 case PACKET_AUXDATA:
3699 {
3700 int val;
3701
3702 if (optlen < sizeof(val))
3703 return -EINVAL;
3704 if (copy_from_user(&val, optval, sizeof(val)))
3705 return -EFAULT;
3706
3707 po->auxdata = !!val;
3708 return 0;
3709 }
80feaacb
PWJ
3710 case PACKET_ORIGDEV:
3711 {
3712 int val;
3713
3714 if (optlen < sizeof(val))
3715 return -EINVAL;
3716 if (copy_from_user(&val, optval, sizeof(val)))
3717 return -EFAULT;
3718
3719 po->origdev = !!val;
3720 return 0;
3721 }
bfd5f4a3
SS
3722 case PACKET_VNET_HDR:
3723 {
3724 int val;
3725
3726 if (sock->type != SOCK_RAW)
3727 return -EINVAL;
3728 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3729 return -EBUSY;
3730 if (optlen < sizeof(val))
3731 return -EINVAL;
3732 if (copy_from_user(&val, optval, sizeof(val)))
3733 return -EFAULT;
3734
3735 po->has_vnet_hdr = !!val;
3736 return 0;
3737 }
614f60fa
SM
3738 case PACKET_TIMESTAMP:
3739 {
3740 int val;
3741
3742 if (optlen != sizeof(val))
3743 return -EINVAL;
3744 if (copy_from_user(&val, optval, sizeof(val)))
3745 return -EFAULT;
3746
3747 po->tp_tstamp = val;
3748 return 0;
3749 }
dc99f600
DM
3750 case PACKET_FANOUT:
3751 {
3752 int val;
3753
3754 if (optlen != sizeof(val))
3755 return -EINVAL;
3756 if (copy_from_user(&val, optval, sizeof(val)))
3757 return -EFAULT;
3758
3759 return fanout_add(sk, val & 0xffff, val >> 16);
3760 }
47dceb8e
WB
3761 case PACKET_FANOUT_DATA:
3762 {
3763 if (!po->fanout)
3764 return -EINVAL;
3765
3766 return fanout_set_data(po, optval, optlen);
3767 }
5920cd3a
PC
3768 case PACKET_TX_HAS_OFF:
3769 {
3770 unsigned int val;
3771
3772 if (optlen != sizeof(val))
3773 return -EINVAL;
3774 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3775 return -EBUSY;
3776 if (copy_from_user(&val, optval, sizeof(val)))
3777 return -EFAULT;
3778 po->tp_tx_has_off = !!val;
3779 return 0;
3780 }
d346a3fa
DB
3781 case PACKET_QDISC_BYPASS:
3782 {
3783 int val;
3784
3785 if (optlen != sizeof(val))
3786 return -EINVAL;
3787 if (copy_from_user(&val, optval, sizeof(val)))
3788 return -EFAULT;
3789
3790 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3791 return 0;
3792 }
1da177e4
LT
3793 default:
3794 return -ENOPROTOOPT;
3795 }
3796}
3797
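/* getsockopt() for SOL_PACKET: most options report a plain int, while
 * PACKET_STATISTICS and PACKET_ROLLOVER_STATS return structs, so 'data'
 * and 'lv' are redirected before the final copy_to_user().
 */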
3798static int packet_getsockopt(struct socket *sock, int level, int optname,
3799 char __user *optval, int __user *optlen)
3800{
3801 int len;
c06fff6e 3802 int val, lv = sizeof(val);
1da177e4
LT
3803 struct sock *sk = sock->sk;
3804 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3805 void *data = &val;
ee80fbf3 3806 union tpacket_stats_u st;
a9b63918 3807 struct tpacket_rollover_stats rstats;
1da177e4
LT
3808
3809 if (level != SOL_PACKET)
3810 return -ENOPROTOOPT;
3811
8ae55f04
KK
3812 if (get_user(len, optlen))
3813 return -EFAULT;
1da177e4
LT
3814
3815 if (len < 0)
3816 return -EINVAL;
1ce4f28b 3817
69e3c75f 3818 switch (optname) {
1da177e4 3819 case PACKET_STATISTICS:
1da177e4 3820 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3821 memcpy(&st, &po->stats, sizeof(st));
3822 memset(&po->stats, 0, sizeof(po->stats));
3823 spin_unlock_bh(&sk->sk_receive_queue.lock);
3824
f6fb8f10 3825 if (po->tp_version == TPACKET_V3) {
c06fff6e 3826 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3827 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3828 data = &st.stats3;
f6fb8f10 3829 } else {
c06fff6e 3830 lv = sizeof(struct tpacket_stats);
8bcdeaff 3831 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3832 data = &st.stats1;
f6fb8f10 3833 }
ee80fbf3 3834
8dc41944
HX
3835 break;
3836 case PACKET_AUXDATA:
8dc41944 3837 val = po->auxdata;
80feaacb
PWJ
3838 break;
3839 case PACKET_ORIGDEV:
80feaacb 3840 val = po->origdev;
bfd5f4a3
SS
3841 break;
3842 case PACKET_VNET_HDR:
bfd5f4a3 3843 val = po->has_vnet_hdr;
1da177e4 3844 break;
bbd6ef87 3845 case PACKET_VERSION:
bbd6ef87 3846 val = po->tp_version;
bbd6ef87
PM
3847 break;
3848 case PACKET_HDRLEN:
3849 if (len > sizeof(int))
3850 len = sizeof(int);
3851 if (copy_from_user(&val, optval, len))
3852 return -EFAULT;
3853 switch (val) {
3854 case TPACKET_V1:
3855 val = sizeof(struct tpacket_hdr);
3856 break;
3857 case TPACKET_V2:
3858 val = sizeof(struct tpacket2_hdr);
3859 break;
f6fb8f10 3860 case TPACKET_V3:
3861 val = sizeof(struct tpacket3_hdr);
3862 break;
bbd6ef87
PM
3863 default:
3864 return -EINVAL;
3865 }
bbd6ef87 3866 break;
8913336a 3867 case PACKET_RESERVE:
8913336a 3868 val = po->tp_reserve;
8913336a 3869 break;
69e3c75f 3870 case PACKET_LOSS:
69e3c75f 3871 val = po->tp_loss;
69e3c75f 3872 break;
614f60fa 3873 case PACKET_TIMESTAMP:
614f60fa 3874 val = po->tp_tstamp;
614f60fa 3875 break;
dc99f600 3876 case PACKET_FANOUT:
dc99f600
DM
3877 val = (po->fanout ?
3878 ((u32)po->fanout->id |
77f65ebd
WB
3879 ((u32)po->fanout->type << 16) |
3880 ((u32)po->fanout->flags << 24)) :
dc99f600 3881 0);
dc99f600 3882 break;
a9b63918
WB
3883 case PACKET_ROLLOVER_STATS:
3884 if (!po->rollover)
3885 return -EINVAL;
3886 rstats.tp_all = atomic_long_read(&po->rollover->num);
3887 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3888 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3889 data = &rstats;
3890 lv = sizeof(rstats);
3891 break;
5920cd3a
PC
3892 case PACKET_TX_HAS_OFF:
3893 val = po->tp_tx_has_off;
3894 break;
d346a3fa
DB
3895 case PACKET_QDISC_BYPASS:
3896 val = packet_use_direct_xmit(po);
3897 break;
1da177e4
LT
3898 default:
3899 return -ENOPROTOOPT;
3900 }
3901
c06fff6e
ED
3902 if (len > lv)
3903 len = lv;
8ae55f04
KK
3904 if (put_user(len, optlen))
3905 return -EFAULT;
8dc41944
HX
3906 if (copy_to_user(optval, data, len))
3907 return -EFAULT;
8ae55f04 3908 return 0;
1da177e4
LT
3909}
3910
3911
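/* Netdevice notifier: NETDEV_UNREGISTER drops the device's mclist
 * entries and the cached prot_hook device, NETDEV_DOWN (including the
 * fallthrough from UNREGISTER) unhooks the protocol handler and raises
 * ENETDOWN, and NETDEV_UP re-registers the hook for bound sockets.
 */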
351638e7
JP
3912static int packet_notifier(struct notifier_block *this,
3913 unsigned long msg, void *ptr)
1da177e4
LT
3914{
3915 struct sock *sk;
351638e7 3916 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3917 struct net *net = dev_net(dev);
1da177e4 3918
808f5114 3919 rcu_read_lock();
b67bfe0d 3920 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3921 struct packet_sock *po = pkt_sk(sk);
3922
3923 switch (msg) {
3924 case NETDEV_UNREGISTER:
1da177e4 3925 if (po->mclist)
82f17091 3926 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3927 /* fallthrough */
3928
1da177e4
LT
3929 case NETDEV_DOWN:
3930 if (dev->ifindex == po->ifindex) {
3931 spin_lock(&po->bind_lock);
3932 if (po->running) {
ce06b03e 3933 __unregister_prot_hook(sk, false);
1da177e4
LT
3934 sk->sk_err = ENETDOWN;
3935 if (!sock_flag(sk, SOCK_DEAD))
3936 sk->sk_error_report(sk);
3937 }
3938 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3939 packet_cached_dev_reset(po);
1da177e4 3940 po->ifindex = -1;
160ff18a
BG
3941 if (po->prot_hook.dev)
3942 dev_put(po->prot_hook.dev);
1da177e4
LT
3943 po->prot_hook.dev = NULL;
3944 }
3945 spin_unlock(&po->bind_lock);
3946 }
3947 break;
3948 case NETDEV_UP:
808f5114 3949 if (dev->ifindex == po->ifindex) {
3950 spin_lock(&po->bind_lock);
ce06b03e
DM
3951 if (po->num)
3952 register_prot_hook(sk);
808f5114 3953 spin_unlock(&po->bind_lock);
1da177e4 3954 }
1da177e4
LT
3955 break;
3956 }
3957 }
808f5114 3958 rcu_read_unlock();
1da177e4
LT
3959 return NOTIFY_DONE;
3960}
3961
3962
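/* ioctl(): SIOCOUTQ/SIOCINQ report queued byte counts, the timestamp
 * ioctls use the generic sock helpers, and the common interface/routing
 * ioctls are forwarded to inet_dgram_ops when CONFIG_INET is set.
 */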
3963static int packet_ioctl(struct socket *sock, unsigned int cmd,
3964 unsigned long arg)
3965{
3966 struct sock *sk = sock->sk;
3967
69e3c75f 3968 switch (cmd) {
40d4e3df
ED
3969 case SIOCOUTQ:
3970 {
3971 int amount = sk_wmem_alloc_get(sk);
31e6d363 3972
40d4e3df
ED
3973 return put_user(amount, (int __user *)arg);
3974 }
3975 case SIOCINQ:
3976 {
3977 struct sk_buff *skb;
3978 int amount = 0;
3979
3980 spin_lock_bh(&sk->sk_receive_queue.lock);
3981 skb = skb_peek(&sk->sk_receive_queue);
3982 if (skb)
3983 amount = skb->len;
3984 spin_unlock_bh(&sk->sk_receive_queue.lock);
3985 return put_user(amount, (int __user *)arg);
3986 }
3987 case SIOCGSTAMP:
3988 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3989 case SIOCGSTAMPNS:
3990 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3991
1da177e4 3992#ifdef CONFIG_INET
40d4e3df
ED
3993 case SIOCADDRT:
3994 case SIOCDELRT:
3995 case SIOCDARP:
3996 case SIOCGARP:
3997 case SIOCSARP:
3998 case SIOCGIFADDR:
3999 case SIOCSIFADDR:
4000 case SIOCGIFBRDADDR:
4001 case SIOCSIFBRDADDR:
4002 case SIOCGIFNETMASK:
4003 case SIOCSIFNETMASK:
4004 case SIOCGIFDSTADDR:
4005 case SIOCSIFDSTADDR:
4006 case SIOCSIFFLAGS:
40d4e3df 4007 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4008#endif
4009
40d4e3df
ED
4010 default:
4011 return -ENOIOCTLCMD;
1da177e4
LT
4012 }
4013 return 0;
4014}
4015
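/* poll(): on top of datagram_poll(), report POLLIN when the RX ring has
 * a frame ready for user space and POLLOUT when the TX ring has a free
 * slot.
 */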
40d4e3df 4016static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4017 poll_table *wait)
4018{
4019 struct sock *sk = sock->sk;
4020 struct packet_sock *po = pkt_sk(sk);
4021 unsigned int mask = datagram_poll(file, sock, wait);
4022
4023 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4024 if (po->rx_ring.pg_vec) {
f6fb8f10 4025 if (!packet_previous_rx_frame(po, &po->rx_ring,
4026 TP_STATUS_KERNEL))
1da177e4
LT
4027 mask |= POLLIN | POLLRDNORM;
4028 }
2ccdbaa6 4029 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4030 po->pressure = 0;
1da177e4 4031 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4032 spin_lock_bh(&sk->sk_write_queue.lock);
4033 if (po->tx_ring.pg_vec) {
4034 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4035 mask |= POLLOUT | POLLWRNORM;
4036 }
4037 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4038 return mask;
4039}
4040
4041
 4042/* Dirty? Well, I still have not found a better way to account
4043 * for user mmaps.
4044 */
4045
4046static void packet_mm_open(struct vm_area_struct *vma)
4047{
4048 struct file *file = vma->vm_file;
40d4e3df 4049 struct socket *sock = file->private_data;
1da177e4 4050 struct sock *sk = sock->sk;
1ce4f28b 4051
1da177e4
LT
4052 if (sk)
4053 atomic_inc(&pkt_sk(sk)->mapped);
4054}
4055
4056static void packet_mm_close(struct vm_area_struct *vma)
4057{
4058 struct file *file = vma->vm_file;
40d4e3df 4059 struct socket *sock = file->private_data;
1da177e4 4060 struct sock *sk = sock->sk;
1ce4f28b 4061
1da177e4
LT
4062 if (sk)
4063 atomic_dec(&pkt_sk(sk)->mapped);
4064}
4065
f0f37e2f 4066static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4067 .open = packet_mm_open,
4068 .close = packet_mm_close,
1da177e4
LT
4069};
4070
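/* Free a ring's block vector: each buffer came from __get_free_pages()
 * or from vmalloc(), so pick the matching release routine per entry
 * before freeing the vector itself.
 */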
0e3125c7
NH
4071static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4072 unsigned int len)
1da177e4
LT
4073{
4074 int i;
4075
4ebf0ae2 4076 for (i = 0; i < len; i++) {
0e3125c7 4077 if (likely(pg_vec[i].buffer)) {
c56b4d90 4078 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4079 vfree(pg_vec[i].buffer);
4080 else
4081 free_pages((unsigned long)pg_vec[i].buffer,
4082 order);
4083 pg_vec[i].buffer = NULL;
4084 }
1da177e4
LT
4085 }
4086 kfree(pg_vec);
4087}
4088
eea49cc9 4089static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4090{
f0d4eb29 4091 char *buffer;
0e3125c7
NH
4092 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4093 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4094
4095 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4096 if (buffer)
4097 return buffer;
4098
f0d4eb29 4099 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4100 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4101 if (buffer)
4102 return buffer;
4103
f0d4eb29 4104 /* vmalloc failed, lets dig into swap here */
0e3125c7 4105 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4106 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4107 if (buffer)
4108 return buffer;
4109
f0d4eb29 4110 /* complete and utter failure */
0e3125c7 4111 return NULL;
4ebf0ae2
DM
4112}
4113
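/* Allocate one buffer of 2^order pages per requested ring block; on any
 * failure release what was already allocated and return NULL.
 */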
0e3125c7 4114static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4115{
4116 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4117 struct pgv *pg_vec;
4ebf0ae2
DM
4118 int i;
4119
0e3125c7 4120 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4121 if (unlikely(!pg_vec))
4122 goto out;
4123
4124 for (i = 0; i < block_nr; i++) {
c56b4d90 4125 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4126 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4127 goto out_free_pgvec;
4128 }
4129
4130out:
4131 return pg_vec;
4132
4133out_free_pgvec:
4134 free_pg_vec(pg_vec, order, block_nr);
4135 pg_vec = NULL;
4136 goto out;
4137}
1da177e4 4138
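/* Set up or tear down an RX/TX ring: build the new page vector first,
 * drop the protocol hook, swap old and new vectors under pg_vec_lock,
 * and restore the hook if the socket was running.
 */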
f6fb8f10 4139static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4140 int closing, int tx_ring)
1da177e4 4141{
0e3125c7 4142 struct pgv *pg_vec = NULL;
1da177e4 4143 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4144 int was_running, order = 0;
69e3c75f
JB
4145 struct packet_ring_buffer *rb;
4146 struct sk_buff_head *rb_queue;
0e11c91e 4147 __be16 num;
f6fb8f10 4148 int err = -EINVAL;
 4149	/* Alias, added to keep code churn minimal */
4150 struct tpacket_req *req = &req_u->req;
4151
4152 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4153 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
4154 WARN(1, "Tx-ring is not supported.\n");
4155 goto out;
4156 }
1ce4f28b 4157
69e3c75f
JB
4158 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4159 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4160
69e3c75f
JB
4161 err = -EBUSY;
4162 if (!closing) {
4163 if (atomic_read(&po->mapped))
4164 goto out;
b0138408 4165 if (packet_read_pending(rb))
69e3c75f
JB
4166 goto out;
4167 }
1da177e4 4168
69e3c75f
JB
4169 if (req->tp_block_nr) {
4170 /* Sanity tests and some calculations */
4171 err = -EBUSY;
4172 if (unlikely(rb->pg_vec))
4173 goto out;
1da177e4 4174
bbd6ef87
PM
4175 switch (po->tp_version) {
4176 case TPACKET_V1:
4177 po->tp_hdrlen = TPACKET_HDRLEN;
4178 break;
4179 case TPACKET_V2:
4180 po->tp_hdrlen = TPACKET2_HDRLEN;
4181 break;
f6fb8f10 4182 case TPACKET_V3:
4183 po->tp_hdrlen = TPACKET3_HDRLEN;
4184 break;
bbd6ef87
PM
4185 }
4186
69e3c75f 4187 err = -EINVAL;
4ebf0ae2 4188 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4189 goto out;
90836b67 4190 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4191 goto out;
dc808110
ED
4192 if (po->tp_version >= TPACKET_V3 &&
4193 (int)(req->tp_block_size -
4194 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
4195 goto out;
8913336a 4196 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4197 po->tp_reserve))
4198 goto out;
4ebf0ae2 4199 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4200 goto out;
1da177e4 4201
4194b491
TK
4202 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4203 if (unlikely(rb->frames_per_block == 0))
69e3c75f
JB
4204 goto out;
4205 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4206 req->tp_frame_nr))
4207 goto out;
1da177e4
LT
4208
4209 err = -ENOMEM;
4ebf0ae2
DM
4210 order = get_order(req->tp_block_size);
4211 pg_vec = alloc_pg_vec(req, order);
4212 if (unlikely(!pg_vec))
1da177e4 4213 goto out;
f6fb8f10 4214 switch (po->tp_version) {
4215 case TPACKET_V3:
4216 /* Transmit path is not supported. We checked
 4217			 * it above, but re-check here just to be paranoid.
4218 */
4219 if (!tx_ring)
e8e85cc5 4220 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4221 break;
f6fb8f10 4222 default:
4223 break;
4224 }
69e3c75f
JB
4225 }
4226 /* Done */
4227 else {
4228 err = -EINVAL;
4ebf0ae2 4229 if (unlikely(req->tp_frame_nr))
69e3c75f 4230 goto out;
1da177e4
LT
4231 }
4232
4233 lock_sock(sk);
4234
4235 /* Detach socket from network */
4236 spin_lock(&po->bind_lock);
4237 was_running = po->running;
4238 num = po->num;
4239 if (was_running) {
1da177e4 4240 po->num = 0;
ce06b03e 4241 __unregister_prot_hook(sk, false);
1da177e4
LT
4242 }
4243 spin_unlock(&po->bind_lock);
1ce4f28b 4244
1da177e4
LT
4245 synchronize_net();
4246
4247 err = -EBUSY;
905db440 4248 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4249 if (closing || atomic_read(&po->mapped) == 0) {
4250 err = 0;
69e3c75f 4251 spin_lock_bh(&rb_queue->lock);
c053fd96 4252 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4253 rb->frame_max = (req->tp_frame_nr - 1);
4254 rb->head = 0;
4255 rb->frame_size = req->tp_frame_size;
4256 spin_unlock_bh(&rb_queue->lock);
4257
c053fd96
CG
4258 swap(rb->pg_vec_order, order);
4259 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4260
4261 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4262 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4263 tpacket_rcv : packet_rcv;
4264 skb_queue_purge(rb_queue);
1da177e4 4265 if (atomic_read(&po->mapped))
40d4e3df
ED
4266 pr_err("packet_mmap: vma is busy: %d\n",
4267 atomic_read(&po->mapped));
1da177e4 4268 }
905db440 4269 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4270
4271 spin_lock(&po->bind_lock);
ce06b03e 4272 if (was_running) {
1da177e4 4273 po->num = num;
ce06b03e 4274 register_prot_hook(sk);
1da177e4
LT
4275 }
4276 spin_unlock(&po->bind_lock);
f6fb8f10 4277 if (closing && (po->tp_version > TPACKET_V2)) {
4278 /* Because we don't support block-based V3 on tx-ring */
4279 if (!tx_ring)
73d0fcf2 4280 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4281 }
1da177e4
LT
4282 release_sock(sk);
4283
1da177e4
LT
4284 if (pg_vec)
4285 free_pg_vec(pg_vec, order, req->tp_block_nr);
4286out:
4287 return err;
4288}
4289
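/* mmap(): the VMA must cover exactly the combined RX and TX ring sizes;
 * every page of both page vectors is inserted with vm_insert_page() and
 * po->mapped is bumped so the rings cannot be torn down while mapped.
 */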
69e3c75f
JB
4290static int packet_mmap(struct file *file, struct socket *sock,
4291 struct vm_area_struct *vma)
1da177e4
LT
4292{
4293 struct sock *sk = sock->sk;
4294 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4295 unsigned long size, expected_size;
4296 struct packet_ring_buffer *rb;
1da177e4
LT
4297 unsigned long start;
4298 int err = -EINVAL;
4299 int i;
4300
4301 if (vma->vm_pgoff)
4302 return -EINVAL;
4303
905db440 4304 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4305
4306 expected_size = 0;
4307 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4308 if (rb->pg_vec) {
4309 expected_size += rb->pg_vec_len
4310 * rb->pg_vec_pages
4311 * PAGE_SIZE;
4312 }
4313 }
4314
4315 if (expected_size == 0)
1da177e4 4316 goto out;
69e3c75f
JB
4317
4318 size = vma->vm_end - vma->vm_start;
4319 if (size != expected_size)
1da177e4
LT
4320 goto out;
4321
1da177e4 4322 start = vma->vm_start;
69e3c75f
JB
4323 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4324 if (rb->pg_vec == NULL)
4325 continue;
4326
4327 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4328 struct page *page;
4329 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4330 int pg_num;
4331
c56b4d90
CG
4332 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4333 page = pgv_to_page(kaddr);
69e3c75f
JB
4334 err = vm_insert_page(vma, start, page);
4335 if (unlikely(err))
4336 goto out;
4337 start += PAGE_SIZE;
0e3125c7 4338 kaddr += PAGE_SIZE;
69e3c75f 4339 }
4ebf0ae2 4340 }
1da177e4 4341 }
69e3c75f 4342
4ebf0ae2 4343 atomic_inc(&po->mapped);
1da177e4
LT
4344 vma->vm_ops = &packet_mmap_ops;
4345 err = 0;
4346
4347out:
905db440 4348 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4349 return err;
4350}
1da177e4 4351
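/* Operations for legacy SOCK_PACKET sockets: no ring support, no socket
 * options, plain datagram poll.
 */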
90ddc4f0 4352static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4353 .family = PF_PACKET,
4354 .owner = THIS_MODULE,
4355 .release = packet_release,
4356 .bind = packet_bind_spkt,
4357 .connect = sock_no_connect,
4358 .socketpair = sock_no_socketpair,
4359 .accept = sock_no_accept,
4360 .getname = packet_getname_spkt,
4361 .poll = datagram_poll,
4362 .ioctl = packet_ioctl,
4363 .listen = sock_no_listen,
4364 .shutdown = sock_no_shutdown,
4365 .setsockopt = sock_no_setsockopt,
4366 .getsockopt = sock_no_getsockopt,
4367 .sendmsg = packet_sendmsg_spkt,
4368 .recvmsg = packet_recvmsg,
4369 .mmap = sock_no_mmap,
4370 .sendpage = sock_no_sendpage,
4371};
1da177e4 4372
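/* Operations for SOCK_RAW/SOCK_DGRAM AF_PACKET sockets: these support
 * socket options, memory-mapped rings and packet_poll().
 */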
90ddc4f0 4373static const struct proto_ops packet_ops = {
1da177e4
LT
4374 .family = PF_PACKET,
4375 .owner = THIS_MODULE,
4376 .release = packet_release,
4377 .bind = packet_bind,
4378 .connect = sock_no_connect,
4379 .socketpair = sock_no_socketpair,
4380 .accept = sock_no_accept,
1ce4f28b 4381 .getname = packet_getname,
1da177e4
LT
4382 .poll = packet_poll,
4383 .ioctl = packet_ioctl,
4384 .listen = sock_no_listen,
4385 .shutdown = sock_no_shutdown,
4386 .setsockopt = packet_setsockopt,
4387 .getsockopt = packet_getsockopt,
4388 .sendmsg = packet_sendmsg,
4389 .recvmsg = packet_recvmsg,
4390 .mmap = packet_mmap,
4391 .sendpage = sock_no_sendpage,
4392};
4393
ec1b4cf7 4394static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4395 .family = PF_PACKET,
4396 .create = packet_create,
4397 .owner = THIS_MODULE,
4398};
4399
4400static struct notifier_block packet_netdev_notifier = {
40d4e3df 4401 .notifier_call = packet_notifier,
1da177e4
LT
4402};
4403
4404#ifdef CONFIG_PROC_FS
1da177e4
LT
4405
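/* /proc/net/packet: iterate the per-namespace socket list under RCU and
 * print one line of state per packet socket.
 */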
4406static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4407 __acquires(RCU)
1da177e4 4408{
e372c414 4409 struct net *net = seq_file_net(seq);
808f5114 4410
4411 rcu_read_lock();
4412 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4413}
4414
4415static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4416{
1bf40954 4417 struct net *net = seq_file_net(seq);
808f5114 4418 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4419}
4420
4421static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4422 __releases(RCU)
1da177e4 4423{
808f5114 4424 rcu_read_unlock();
1da177e4
LT
4425}
4426
1ce4f28b 4427static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4428{
4429 if (v == SEQ_START_TOKEN)
4430 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4431 else {
b7ceabd9 4432 struct sock *s = sk_entry(v);
1da177e4
LT
4433 const struct packet_sock *po = pkt_sk(s);
4434
4435 seq_printf(seq,
71338aa7 4436 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4437 s,
4438 atomic_read(&s->sk_refcnt),
4439 s->sk_type,
4440 ntohs(po->num),
4441 po->ifindex,
4442 po->running,
4443 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4444 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4445 sock_i_ino(s));
1da177e4
LT
4446 }
4447
4448 return 0;
4449}
4450
56b3d975 4451static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4452 .start = packet_seq_start,
4453 .next = packet_seq_next,
4454 .stop = packet_seq_stop,
4455 .show = packet_seq_show,
4456};
4457
4458static int packet_seq_open(struct inode *inode, struct file *file)
4459{
e372c414
DL
4460 return seq_open_net(inode, file, &packet_seq_ops,
4461 sizeof(struct seq_net_private));
1da177e4
LT
4462}
4463
da7071d7 4464static const struct file_operations packet_seq_fops = {
1da177e4
LT
4465 .owner = THIS_MODULE,
4466 .open = packet_seq_open,
4467 .read = seq_read,
4468 .llseek = seq_lseek,
e372c414 4469 .release = seq_release_net,
1da177e4
LT
4470};
4471
4472#endif
4473
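/* Per-netns setup/teardown: initialise the socket list and create (or
 * remove) the /proc/net/packet entry for that namespace.
 */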
2c8c1e72 4474static int __net_init packet_net_init(struct net *net)
d12d01d6 4475{
0fa7fa98 4476 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4477 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4478
d4beaa66 4479 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4480 return -ENOMEM;
4481
4482 return 0;
4483}
4484
2c8c1e72 4485static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4486{
ece31ffd 4487 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4488}
4489
4490static struct pernet_operations packet_net_ops = {
4491 .init = packet_net_init,
4492 .exit = packet_net_exit,
4493};
4494
4495
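/* Module init/exit: register the protocol, the PF_PACKET socket family,
 * the per-netns hooks and the netdevice notifier, and undo them in
 * reverse order on unload.
 */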
1da177e4
LT
4496static void __exit packet_exit(void)
4497{
1da177e4 4498 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4499 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4500 sock_unregister(PF_PACKET);
4501 proto_unregister(&packet_proto);
4502}
4503
4504static int __init packet_init(void)
4505{
4506 int rc = proto_register(&packet_proto, 0);
4507
4508 if (rc != 0)
4509 goto out;
4510
4511 sock_register(&packet_family_ops);
d12d01d6 4512 register_pernet_subsys(&packet_net_ops);
1da177e4 4513 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4514out:
4515 return rc;
4516}
4517
4518module_init(packet_init);
4519module_exit(packet_exit);
4520MODULE_LICENSE("GPL");
4521MODULE_ALIAS_NETPROTO(PF_PACKET);