ipv6: ipcm6_cookie initializer
[linux-block.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
7c0f6ba6 76#include <linux/uaccess.h>
1da177e4
LT
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
719c44d3 96#include <net/compat.h>
1da177e4 97
2787b04b
PE
98#include "internal.h"
99
1da177e4
LT
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are enough clever to reallocate skb, when header
106 will not fit to reserved space (tunnel), another ones are silly
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> data
1da177e4
LT
117
118Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
119 mac_header -> ll header
120 data -> ll header
1da177e4
LT
121
122Incoming, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> UNKNOWN position. It is very likely, that it points to ll
124 header. PPP makes it, that is wrong, because introduce
db0c58f9 125 assymetry between rx and tx paths.
b0e380b1 126 data -> data
1da177e4
LT
127
128Outgoing, dev->hard_header==NULL
b0e380b1
ACM
129 mac_header -> data. ll header is still not built!
130 data -> data
1da177e4
LT
131
132Resume
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
b0e380b1
ACM
140 mac_header -> ll header
141 data -> ll header
1da177e4
LT
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
144 mac_header -> data
145 data -> data
1da177e4
LT
146
147 We should set nh.raw on output to correct posistion,
148 packet classifier depends on it.
149 */
150
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
0fd5d57b 278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 279{
1cbac010 280 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
281}
282
865b03f2 283static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 284{
865b03f2 285 struct net_device *dev = skb->dev;
0fd5d57b
DB
286 const struct net_device_ops *ops = dev->netdev_ops;
287 u16 queue_index;
288
289 if (ops->ndo_select_queue) {
290 queue_index = ops->ndo_select_queue(dev, skb, NULL,
291 __packet_pick_tx_queue);
292 queue_index = netdev_cap_txqueue(dev, queue_index);
293 } else {
294 queue_index = __packet_pick_tx_queue(dev, skb);
295 }
296
865b03f2 297 return queue_index;
0fd5d57b
DB
298}
299
a6361f0c 300/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
301 * or from a context in which asynchronous accesses to the packet
302 * socket is not possible (packet_create()).
303 */
a6361f0c 304static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
305{
306 struct packet_sock *po = pkt_sk(sk);
e40526cb 307
ce06b03e 308 if (!po->running) {
66e56cd4 309 if (po->fanout)
dc99f600 310 __fanout_link(sk, po);
66e56cd4 311 else
dc99f600 312 dev_add_pack(&po->prot_hook);
e40526cb 313
ce06b03e
DM
314 sock_hold(sk);
315 po->running = 1;
316 }
317}
318
a6361f0c
WB
319static void register_prot_hook(struct sock *sk)
320{
321 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
322 __register_prot_hook(sk);
323}
324
325/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
326 * the po->bind_lock and do a synchronize_net to make sure no
327 * asynchronous packet processing paths still refer to the elements
328 * of po->prot_hook. If the sync parameter is false, it is the
329 * callers responsibility to take care of this.
330 */
331static void __unregister_prot_hook(struct sock *sk, bool sync)
332{
333 struct packet_sock *po = pkt_sk(sk);
334
a6361f0c
WB
335 lockdep_assert_held_once(&po->bind_lock);
336
ce06b03e 337 po->running = 0;
66e56cd4
DB
338
339 if (po->fanout)
dc99f600 340 __fanout_unlink(sk, po);
66e56cd4 341 else
dc99f600 342 __dev_remove_pack(&po->prot_hook);
e40526cb 343
ce06b03e
DM
344 __sock_put(sk);
345
346 if (sync) {
347 spin_unlock(&po->bind_lock);
348 synchronize_net();
349 spin_lock(&po->bind_lock);
350 }
351}
352
353static void unregister_prot_hook(struct sock *sk, bool sync)
354{
355 struct packet_sock *po = pkt_sk(sk);
356
357 if (po->running)
358 __unregister_prot_hook(sk, sync);
359}
360
6e58040b 361static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
362{
363 if (is_vmalloc_addr(addr))
364 return vmalloc_to_page(addr);
365 return virt_to_page(addr);
366}
367
69e3c75f 368static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 369{
184f489e 370 union tpacket_uhdr h;
1da177e4 371
69e3c75f 372 h.raw = frame;
bbd6ef87
PM
373 switch (po->tp_version) {
374 case TPACKET_V1:
69e3c75f 375 h.h1->tp_status = status;
0af55bb5 376 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
377 break;
378 case TPACKET_V2:
69e3c75f 379 h.h2->tp_status = status;
0af55bb5 380 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 381 break;
f6fb8f10 382 case TPACKET_V3:
7f953ab2
SV
383 h.h3->tp_status = status;
384 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
385 break;
69e3c75f 386 default:
f6fb8f10 387 WARN(1, "TPACKET version not supported.\n");
69e3c75f 388 BUG();
bbd6ef87 389 }
69e3c75f
JB
390
391 smp_wmb();
bbd6ef87
PM
392}
393
69e3c75f 394static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 395{
184f489e 396 union tpacket_uhdr h;
bbd6ef87 397
69e3c75f
JB
398 smp_rmb();
399
bbd6ef87
PM
400 h.raw = frame;
401 switch (po->tp_version) {
402 case TPACKET_V1:
0af55bb5 403 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 404 return h.h1->tp_status;
bbd6ef87 405 case TPACKET_V2:
0af55bb5 406 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 407 return h.h2->tp_status;
f6fb8f10 408 case TPACKET_V3:
7f953ab2
SV
409 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
410 return h.h3->tp_status;
69e3c75f 411 default:
f6fb8f10 412 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
413 BUG();
414 return 0;
bbd6ef87 415 }
1da177e4 416}
69e3c75f 417
b9c32fb2
DB
418static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
419 unsigned int flags)
7a51384c
DB
420{
421 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
422
68a360e8
WB
423 if (shhwtstamps &&
424 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
425 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
426 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
427
428 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 429 return TP_STATUS_TS_SOFTWARE;
7a51384c 430
b9c32fb2 431 return 0;
7a51384c
DB
432}
433
b9c32fb2
DB
434static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
435 struct sk_buff *skb)
2e31396f
WB
436{
437 union tpacket_uhdr h;
438 struct timespec ts;
b9c32fb2 439 __u32 ts_status;
2e31396f 440
b9c32fb2
DB
441 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
442 return 0;
2e31396f
WB
443
444 h.raw = frame;
445 switch (po->tp_version) {
446 case TPACKET_V1:
447 h.h1->tp_sec = ts.tv_sec;
448 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
449 break;
450 case TPACKET_V2:
451 h.h2->tp_sec = ts.tv_sec;
452 h.h2->tp_nsec = ts.tv_nsec;
453 break;
454 case TPACKET_V3:
57ea884b
DB
455 h.h3->tp_sec = ts.tv_sec;
456 h.h3->tp_nsec = ts.tv_nsec;
457 break;
2e31396f
WB
458 default:
459 WARN(1, "TPACKET version not supported.\n");
460 BUG();
461 }
462
463 /* one flush is safe, as both fields always lie on the same cacheline */
464 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
465 smp_wmb();
b9c32fb2
DB
466
467 return ts_status;
2e31396f
WB
468}
469
69e3c75f
JB
470static void *packet_lookup_frame(struct packet_sock *po,
471 struct packet_ring_buffer *rb,
472 unsigned int position,
473 int status)
474{
475 unsigned int pg_vec_pos, frame_offset;
184f489e 476 union tpacket_uhdr h;
69e3c75f
JB
477
478 pg_vec_pos = position / rb->frames_per_block;
479 frame_offset = position % rb->frames_per_block;
480
0e3125c7
NH
481 h.raw = rb->pg_vec[pg_vec_pos].buffer +
482 (frame_offset * rb->frame_size);
69e3c75f
JB
483
484 if (status != __packet_get_status(po, h.raw))
485 return NULL;
486
487 return h.raw;
488}
489
eea49cc9 490static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
491 struct packet_ring_buffer *rb,
492 int status)
493{
494 return packet_lookup_frame(po, rb, rb->head, status);
495}
496
bc59ba39 497static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 498{
499 del_timer_sync(&pkc->retire_blk_timer);
500}
501
502static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 503 struct sk_buff_head *rb_queue)
504{
bc59ba39 505 struct tpacket_kbdq_core *pkc;
f6fb8f10 506
73d0fcf2 507 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 508
ec6f809f 509 spin_lock_bh(&rb_queue->lock);
f6fb8f10 510 pkc->delete_blk_timer = 1;
ec6f809f 511 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 512
513 prb_del_retire_blk_timer(pkc);
514}
515
e8e85cc5 516static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
e8e85cc5 520 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
521 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
522 0);
523 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 524}
525
526static int prb_calc_retire_blk_tmo(struct packet_sock *po,
527 int blk_size_in_bytes)
528{
529 struct net_device *dev;
530 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 531 struct ethtool_link_ksettings ecmd;
4bc71cb9 532 int err;
f6fb8f10 533
4bc71cb9
JP
534 rtnl_lock();
535 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
536 if (unlikely(!dev)) {
537 rtnl_unlock();
f6fb8f10 538 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 539 }
7cad1bac 540 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
541 rtnl_unlock();
542 if (!err) {
4bc71cb9
JP
543 /*
544 * If the link speed is so slow you don't really
545 * need to worry about perf anyways
546 */
7cad1bac
DD
547 if (ecmd.base.speed < SPEED_1000 ||
548 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 549 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 550 } else {
551 msec = 1;
7cad1bac 552 div = ecmd.base.speed / 1000;
f6fb8f10 553 }
554 }
555
556 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
557
558 if (div)
559 mbits /= div;
560
561 tmo = mbits * msec;
562
563 if (div)
564 return tmo+1;
565 return tmo;
566}
567
bc59ba39 568static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 569 union tpacket_req_u *req_u)
570{
571 p1->feature_req_word = req_u->req3.tp_feature_req_word;
572}
573
574static void init_prb_bdqc(struct packet_sock *po,
575 struct packet_ring_buffer *rb,
576 struct pgv *pg_vec,
e8e85cc5 577 union tpacket_req_u *req_u)
f6fb8f10 578{
22781a5b 579 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 580 struct tpacket_block_desc *pbd;
f6fb8f10 581
582 memset(p1, 0x0, sizeof(*p1));
583
584 p1->knxt_seq_num = 1;
585 p1->pkbdq = pg_vec;
bc59ba39 586 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 587 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 588 p1->kblk_size = req_u->req3.tp_block_size;
589 p1->knum_blocks = req_u->req3.tp_block_nr;
590 p1->hdrlen = po->tp_hdrlen;
591 p1->version = po->tp_version;
592 p1->last_kactive_blk_num = 0;
ee80fbf3 593 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 594 if (req_u->req3.tp_retire_blk_tov)
595 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
596 else
597 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
598 req_u->req3.tp_block_size);
599 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
600 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
601
dc808110 602 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 603 prb_init_ft_ops(p1, req_u);
e8e85cc5 604 prb_setup_retire_blk_timer(po);
f6fb8f10 605 prb_open_block(p1, pbd);
606}
607
608/* Do NOT update the last_blk_num first.
609 * Assumes sk_buff_head lock is held.
610 */
bc59ba39 611static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 612{
613 mod_timer(&pkc->retire_blk_timer,
614 jiffies + pkc->tov_in_jiffies);
615 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
616}
617
618/*
619 * Timer logic:
620 * 1) We refresh the timer only when we open a block.
621 * By doing this we don't waste cycles refreshing the timer
622 * on packet-by-packet basis.
623 *
624 * With a 1MB block-size, on a 1Gbps line, it will take
625 * i) ~8 ms to fill a block + ii) memcpy etc.
626 * In this cut we are not accounting for the memcpy time.
627 *
628 * So, if the user sets the 'tmo' to 10ms then the timer
629 * will never fire while the block is still getting filled
630 * (which is what we want). However, the user could choose
631 * to close a block early and that's fine.
632 *
633 * But when the timer does fire, we check whether or not to refresh it.
634 * Since the tmo granularity is in msecs, it is not too expensive
635 * to refresh the timer, lets say every '8' msecs.
636 * Either the user can set the 'tmo' or we can derive it based on
637 * a) line-speed and b) block-size.
638 * prb_calc_retire_blk_tmo() calculates the tmo.
639 *
640 */
17bfd8c8 641static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 642{
17bfd8c8
KC
643 struct packet_sock *po =
644 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 645 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 646 unsigned int frozen;
bc59ba39 647 struct tpacket_block_desc *pbd;
f6fb8f10 648
649 spin_lock(&po->sk.sk_receive_queue.lock);
650
651 frozen = prb_queue_frozen(pkc);
652 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
653
654 if (unlikely(pkc->delete_blk_timer))
655 goto out;
656
657 /* We only need to plug the race when the block is partially filled.
658 * tpacket_rcv:
659 * lock(); increment BLOCK_NUM_PKTS; unlock()
660 * copy_bits() is in progress ...
661 * timer fires on other cpu:
662 * we can't retire the current block because copy_bits
663 * is in progress.
664 *
665 */
666 if (BLOCK_NUM_PKTS(pbd)) {
667 while (atomic_read(&pkc->blk_fill_in_prog)) {
668 /* Waiting for skb_copy_bits to finish... */
669 cpu_relax();
670 }
671 }
672
673 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!frozen) {
41a50d62
AD
675 if (!BLOCK_NUM_PKTS(pbd)) {
676 /* An empty block. Just refresh the timer. */
677 goto refresh_timer;
678 }
f6fb8f10 679 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
680 if (!prb_dispatch_next_block(pkc, po))
681 goto refresh_timer;
682 else
683 goto out;
684 } else {
685 /* Case 1. Queue was frozen because user-space was
686 * lagging behind.
687 */
878cd3ba 688 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 689 /*
690 * Ok, user-space is still behind.
691 * So just refresh the timer.
692 */
693 goto refresh_timer;
694 } else {
695 /* Case 2. queue was frozen,user-space caught up,
696 * now the link went idle && the timer fired.
697 * We don't have a block to close.So we open this
698 * block and restart the timer.
699 * opening a block thaws the queue,restarts timer
700 * Thawing/timer-refresh is a side effect.
701 */
702 prb_open_block(pkc, pbd);
703 goto out;
704 }
705 }
706 }
707
708refresh_timer:
709 _prb_refresh_rx_retire_blk_timer(pkc);
710
711out:
712 spin_unlock(&po->sk.sk_receive_queue.lock);
713}
714
eea49cc9 715static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 716 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 717{
718 /* Flush everything minus the block header */
719
720#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
721 u8 *start, *end;
722
723 start = (u8 *)pbd1;
724
725 /* Skip the block header(we know header WILL fit in 4K) */
726 start += PAGE_SIZE;
727
728 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
729 for (; start < end; start += PAGE_SIZE)
730 flush_dcache_page(pgv_to_page(start));
731
732 smp_wmb();
733#endif
734
735 /* Now update the block status. */
736
737 BLOCK_STATUS(pbd1) = status;
738
739 /* Flush the block header */
740
741#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 start = (u8 *)pbd1;
743 flush_dcache_page(pgv_to_page(start));
744
745 smp_wmb();
746#endif
747}
748
749/*
750 * Side effect:
751 *
752 * 1) flush the block
753 * 2) Increment active_blk_num
754 *
755 * Note:We DONT refresh the timer on purpose.
756 * Because almost always the next block will be opened.
757 */
bc59ba39 758static void prb_close_block(struct tpacket_kbdq_core *pkc1,
759 struct tpacket_block_desc *pbd1,
f6fb8f10 760 struct packet_sock *po, unsigned int stat)
761{
762 __u32 status = TP_STATUS_USER | stat;
763
764 struct tpacket3_hdr *last_pkt;
bc59ba39 765 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 766 struct sock *sk = &po->sk;
f6fb8f10 767
ee80fbf3 768 if (po->stats.stats3.tp_drops)
f6fb8f10 769 status |= TP_STATUS_LOSING;
770
771 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
772 last_pkt->tp_next_offset = 0;
773
774 /* Get the ts of the last pkt */
775 if (BLOCK_NUM_PKTS(pbd1)) {
776 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
777 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
778 } else {
41a50d62
AD
779 /* Ok, we tmo'd - so get the current time.
780 *
781 * It shouldn't really happen as we don't close empty
782 * blocks. See prb_retire_rx_blk_timer_expired().
783 */
f6fb8f10 784 struct timespec ts;
785 getnstimeofday(&ts);
786 h1->ts_last_pkt.ts_sec = ts.tv_sec;
787 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
788 }
789
790 smp_wmb();
791
792 /* Flush the block */
793 prb_flush_block(pkc1, pbd1, status);
794
da413eec
DC
795 sk->sk_data_ready(sk);
796
f6fb8f10 797 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
798}
799
eea49cc9 800static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 801{
802 pkc->reset_pending_on_curr_blk = 0;
803}
804
805/*
806 * Side effect of opening a block:
807 *
808 * 1) prb_queue is thawed.
809 * 2) retire_blk_timer is refreshed.
810 *
811 */
bc59ba39 812static void prb_open_block(struct tpacket_kbdq_core *pkc1,
813 struct tpacket_block_desc *pbd1)
f6fb8f10 814{
815 struct timespec ts;
bc59ba39 816 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 817
818 smp_rmb();
819
8da3056c
DB
820 /* We could have just memset this but we will lose the
821 * flexibility of making the priv area sticky
822 */
f6fb8f10 823
8da3056c
DB
824 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
825 BLOCK_NUM_PKTS(pbd1) = 0;
826 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 827
8da3056c
DB
828 getnstimeofday(&ts);
829
830 h1->ts_first_pkt.ts_sec = ts.tv_sec;
831 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 832
8da3056c
DB
833 pkc1->pkblk_start = (char *)pbd1;
834 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
835
836 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
837 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
838
839 pbd1->version = pkc1->version;
840 pkc1->prev = pkc1->nxt_offset;
841 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
842
843 prb_thaw_queue(pkc1);
844 _prb_refresh_rx_retire_blk_timer(pkc1);
845
846 smp_wmb();
f6fb8f10 847}
848
849/*
850 * Queue freeze logic:
851 * 1) Assume tp_block_nr = 8 blocks.
852 * 2) At time 't0', user opens Rx ring.
853 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
854 * 4) user-space is either sleeping or processing block '0'.
855 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
856 * it will close block-7,loop around and try to fill block '0'.
857 * call-flow:
858 * __packet_lookup_frame_in_block
859 * prb_retire_current_block()
860 * prb_dispatch_next_block()
861 * |->(BLOCK_STATUS == USER) evaluates to true
862 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
863 * 6) Now there are two cases:
864 * 6.1) Link goes idle right after the queue is frozen.
865 * But remember, the last open_block() refreshed the timer.
866 * When this timer expires,it will refresh itself so that we can
867 * re-open block-0 in near future.
868 * 6.2) Link is busy and keeps on receiving packets. This is a simple
869 * case and __packet_lookup_frame_in_block will check if block-0
870 * is free and can now be re-used.
871 */
eea49cc9 872static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 873 struct packet_sock *po)
874{
875 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 876 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 877}
878
879#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
880
881/*
882 * If the next block is free then we will dispatch it
883 * and return a good offset.
884 * Else, we will freeze the queue.
885 * So, caller must check the return value.
886 */
bc59ba39 887static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 888 struct packet_sock *po)
889{
bc59ba39 890 struct tpacket_block_desc *pbd;
f6fb8f10 891
892 smp_rmb();
893
894 /* 1. Get current block num */
895 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896
897 /* 2. If this block is currently in_use then freeze the queue */
898 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
899 prb_freeze_queue(pkc, po);
900 return NULL;
901 }
902
903 /*
904 * 3.
905 * open this block and return the offset where the first packet
906 * needs to get stored.
907 */
908 prb_open_block(pkc, pbd);
909 return (void *)pkc->nxt_offset;
910}
911
bc59ba39 912static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 913 struct packet_sock *po, unsigned int status)
914{
bc59ba39 915 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 916
917 /* retire/close the current block */
918 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919 /*
920 * Plug the case where copy_bits() is in progress on
921 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
922 * have space to copy the pkt in the current block and
923 * called prb_retire_current_block()
924 *
925 * We don't need to worry about the TMO case because
926 * the timer-handler already handled this case.
927 */
928 if (!(status & TP_STATUS_BLK_TMO)) {
929 while (atomic_read(&pkc->blk_fill_in_prog)) {
930 /* Waiting for skb_copy_bits to finish... */
931 cpu_relax();
932 }
933 }
934 prb_close_block(pkc, pbd, po, status);
935 return;
936 }
f6fb8f10 937}
938
878cd3ba 939static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 940{
941 return TP_STATUS_USER & BLOCK_STATUS(pbd);
942}
943
eea49cc9 944static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 945{
946 return pkc->reset_pending_on_curr_blk;
947}
948
eea49cc9 949static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 950{
bc59ba39 951 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 952 atomic_dec(&pkc->blk_fill_in_prog);
953}
954
eea49cc9 955static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 956 struct tpacket3_hdr *ppd)
957{
3958afa1 958 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 959}
960
eea49cc9 961static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 962 struct tpacket3_hdr *ppd)
963{
964 ppd->hv1.tp_rxhash = 0;
965}
966
eea49cc9 967static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 968 struct tpacket3_hdr *ppd)
969{
df8a39de
JP
970 if (skb_vlan_tag_present(pkc->skb)) {
971 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
972 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
973 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 974 } else {
9e67030a 975 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 976 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 977 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 978 }
979}
980
bc59ba39 981static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 982 struct tpacket3_hdr *ppd)
983{
a0cdfcf3 984 ppd->hv1.tp_padding = 0;
f6fb8f10 985 prb_fill_vlan_info(pkc, ppd);
986
987 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
988 prb_fill_rxhash(pkc, ppd);
989 else
990 prb_clear_rxhash(pkc, ppd);
991}
992
eea49cc9 993static void prb_fill_curr_block(char *curr,
bc59ba39 994 struct tpacket_kbdq_core *pkc,
995 struct tpacket_block_desc *pbd,
f6fb8f10 996 unsigned int len)
997{
998 struct tpacket3_hdr *ppd;
999
1000 ppd = (struct tpacket3_hdr *)curr;
1001 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1002 pkc->prev = curr;
1003 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_NUM_PKTS(pbd) += 1;
1006 atomic_inc(&pkc->blk_fill_in_prog);
1007 prb_run_all_ft_ops(pkc, ppd);
1008}
1009
1010/* Assumes caller has the sk->rx_queue.lock */
1011static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1012 struct sk_buff *skb,
1013 int status,
1014 unsigned int len
1015 )
1016{
bc59ba39 1017 struct tpacket_kbdq_core *pkc;
1018 struct tpacket_block_desc *pbd;
f6fb8f10 1019 char *curr, *end;
1020
e3192690 1021 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1022 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1023
1024 /* Queue is frozen when user space is lagging behind */
1025 if (prb_queue_frozen(pkc)) {
1026 /*
1027 * Check if that last block which caused the queue to freeze,
1028 * is still in_use by user-space.
1029 */
878cd3ba 1030 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1031 /* Can't record this packet */
1032 return NULL;
1033 } else {
1034 /*
1035 * Ok, the block was released by user-space.
1036 * Now let's open that block.
1037 * opening a block also thaws the queue.
1038 * Thawing is a side effect.
1039 */
1040 prb_open_block(pkc, pbd);
1041 }
1042 }
1043
1044 smp_mb();
1045 curr = pkc->nxt_offset;
1046 pkc->skb = skb;
e3192690 1047 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1048
1049 /* first try the current block */
1050 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1051 prb_fill_curr_block(curr, pkc, pbd, len);
1052 return (void *)curr;
1053 }
1054
1055 /* Ok, close the current block */
1056 prb_retire_current_block(pkc, po, 0);
1057
1058 /* Now, try to dispatch the next block */
1059 curr = (char *)prb_dispatch_next_block(pkc, po);
1060 if (curr) {
1061 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1062 prb_fill_curr_block(curr, pkc, pbd, len);
1063 return (void *)curr;
1064 }
1065
1066 /*
1067 * No free blocks are available.user_space hasn't caught up yet.
1068 * Queue was just frozen and now this packet will get dropped.
1069 */
1070 return NULL;
1071}
1072
eea49cc9 1073static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1074 struct sk_buff *skb,
1075 int status, unsigned int len)
1076{
1077 char *curr = NULL;
1078 switch (po->tp_version) {
1079 case TPACKET_V1:
1080 case TPACKET_V2:
1081 curr = packet_lookup_frame(po, &po->rx_ring,
1082 po->rx_ring.head, status);
1083 return curr;
1084 case TPACKET_V3:
1085 return __packet_lookup_frame_in_block(po, skb, status, len);
1086 default:
1087 WARN(1, "TPACKET version not supported\n");
1088 BUG();
99aa3473 1089 return NULL;
f6fb8f10 1090 }
1091}
1092
eea49cc9 1093static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1094 struct packet_ring_buffer *rb,
77f65ebd 1095 unsigned int idx,
f6fb8f10 1096 int status)
1097{
bc59ba39 1098 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1099 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1100
1101 if (status != BLOCK_STATUS(pbd))
1102 return NULL;
1103 return pbd;
1104}
1105
eea49cc9 1106static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1107{
1108 unsigned int prev;
1109 if (rb->prb_bdqc.kactive_blk_num)
1110 prev = rb->prb_bdqc.kactive_blk_num-1;
1111 else
1112 prev = rb->prb_bdqc.knum_blocks-1;
1113 return prev;
1114}
1115
1116/* Assumes caller has held the rx_queue.lock */
eea49cc9 1117static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1118 struct packet_ring_buffer *rb,
1119 int status)
1120{
1121 unsigned int previous = prb_previous_blk_num(rb);
1122 return prb_lookup_block(po, rb, previous, status);
1123}
1124
eea49cc9 1125static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1126 struct packet_ring_buffer *rb,
1127 int status)
1128{
1129 if (po->tp_version <= TPACKET_V2)
1130 return packet_previous_frame(po, rb, status);
1131
1132 return __prb_previous_block(po, rb, status);
1133}
1134
eea49cc9 1135static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1136 struct packet_ring_buffer *rb)
1137{
1138 switch (po->tp_version) {
1139 case TPACKET_V1:
1140 case TPACKET_V2:
1141 return packet_increment_head(rb);
1142 case TPACKET_V3:
1143 default:
1144 WARN(1, "TPACKET version not supported.\n");
1145 BUG();
1146 return;
1147 }
1148}
1149
eea49cc9 1150static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1151 struct packet_ring_buffer *rb,
1152 int status)
1153{
1154 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1155 return packet_lookup_frame(po, rb, previous, status);
1156}
1157
eea49cc9 1158static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1159{
1160 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1161}
1162
b0138408
DB
1163static void packet_inc_pending(struct packet_ring_buffer *rb)
1164{
1165 this_cpu_inc(*rb->pending_refcnt);
1166}
1167
1168static void packet_dec_pending(struct packet_ring_buffer *rb)
1169{
1170 this_cpu_dec(*rb->pending_refcnt);
1171}
1172
1173static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1174{
1175 unsigned int refcnt = 0;
1176 int cpu;
1177
1178 /* We don't use pending refcount in rx_ring. */
1179 if (rb->pending_refcnt == NULL)
1180 return 0;
1181
1182 for_each_possible_cpu(cpu)
1183 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1184
1185 return refcnt;
1186}
1187
1188static int packet_alloc_pending(struct packet_sock *po)
1189{
1190 po->rx_ring.pending_refcnt = NULL;
1191
1192 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1193 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1194 return -ENOBUFS;
1195
1196 return 0;
1197}
1198
1199static void packet_free_pending(struct packet_sock *po)
1200{
1201 free_percpu(po->tx_ring.pending_refcnt);
1202}
1203
9954729b
WB
1204#define ROOM_POW_OFF 2
1205#define ROOM_NONE 0x0
1206#define ROOM_LOW 0x1
1207#define ROOM_NORMAL 0x2
1208
1209static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1210{
9954729b
WB
1211 int idx, len;
1212
1213 len = po->rx_ring.frame_max + 1;
1214 idx = po->rx_ring.head;
1215 if (pow_off)
1216 idx += len >> pow_off;
1217 if (idx >= len)
1218 idx -= len;
1219 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1220}
1221
1222static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1223{
1224 int idx, len;
1225
1226 len = po->rx_ring.prb_bdqc.knum_blocks;
1227 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1228 if (pow_off)
1229 idx += len >> pow_off;
1230 if (idx >= len)
1231 idx -= len;
1232 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1233}
77f65ebd 1234
2ccdbaa6 1235static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1236{
1237 struct sock *sk = &po->sk;
1238 int ret = ROOM_NONE;
1239
1240 if (po->prot_hook.func != tpacket_rcv) {
1241 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1242 - (skb ? skb->truesize : 0);
9954729b
WB
1243 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1244 return ROOM_NORMAL;
1245 else if (avail > 0)
1246 return ROOM_LOW;
1247 else
1248 return ROOM_NONE;
1249 }
77f65ebd 1250
9954729b
WB
1251 if (po->tp_version == TPACKET_V3) {
1252 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1253 ret = ROOM_NORMAL;
1254 else if (__tpacket_v3_has_room(po, 0))
1255 ret = ROOM_LOW;
1256 } else {
1257 if (__tpacket_has_room(po, ROOM_POW_OFF))
1258 ret = ROOM_NORMAL;
1259 else if (__tpacket_has_room(po, 0))
1260 ret = ROOM_LOW;
1261 }
2ccdbaa6
WB
1262
1263 return ret;
1264}
1265
1266static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1267{
1268 int ret;
1269 bool has_room;
1270
54d7c01d
WB
1271 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1272 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1273 has_room = ret == ROOM_NORMAL;
1274 if (po->pressure == has_room)
54d7c01d
WB
1275 po->pressure = !has_room;
1276 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1277
9954729b 1278 return ret;
77f65ebd
WB
1279}
1280
1da177e4
LT
1281static void packet_sock_destruct(struct sock *sk)
1282{
ed85b565
RC
1283 skb_queue_purge(&sk->sk_error_queue);
1284
547b792c 1285 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1286 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1287
1288 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1289 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1290 return;
1291 }
1292
17ab56a2 1293 sk_refcnt_debug_dec(sk);
1da177e4
LT
1294}
1295
3b3a5b0a
WB
1296static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1297{
1298 u32 rxhash;
1299 int i, count = 0;
1300
1301 rxhash = skb_get_hash(skb);
1302 for (i = 0; i < ROLLOVER_HLEN; i++)
1303 if (po->rollover->history[i] == rxhash)
1304 count++;
1305
1306 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1307 return count > (ROLLOVER_HLEN >> 1);
1308}
1309
77f65ebd
WB
1310static unsigned int fanout_demux_hash(struct packet_fanout *f,
1311 struct sk_buff *skb,
1312 unsigned int num)
dc99f600 1313{
eb70db87 1314 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1315}
1316
77f65ebd
WB
1317static unsigned int fanout_demux_lb(struct packet_fanout *f,
1318 struct sk_buff *skb,
1319 unsigned int num)
dc99f600 1320{
468479e6 1321 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1322
468479e6 1323 return val % num;
77f65ebd
WB
1324}
1325
1326static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1327 struct sk_buff *skb,
1328 unsigned int num)
1329{
1330 return smp_processor_id() % num;
dc99f600
DM
1331}
1332
5df0ddfb
DB
1333static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1334 struct sk_buff *skb,
1335 unsigned int num)
1336{
f337db64 1337 return prandom_u32_max(num);
5df0ddfb
DB
1338}
1339
77f65ebd
WB
1340static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1341 struct sk_buff *skb,
ad377cab 1342 unsigned int idx, bool try_self,
77f65ebd 1343 unsigned int num)
95ec3eb4 1344{
4633c9e0 1345 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1346 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1347
0648ab70 1348 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1349
1350 if (try_self) {
1351 room = packet_rcv_has_room(po, skb);
1352 if (room == ROOM_NORMAL ||
1353 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1354 return idx;
4633c9e0 1355 po_skip = po;
3b3a5b0a 1356 }
ad377cab 1357
0648ab70 1358 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1359 do {
2ccdbaa6 1360 po_next = pkt_sk(f->arr[i]);
4633c9e0 1361 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1362 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1363 if (i != j)
0648ab70 1364 po->rollover->sock = i;
a9b63918
WB
1365 atomic_long_inc(&po->rollover->num);
1366 if (room == ROOM_LOW)
1367 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1368 return i;
1369 }
ad377cab 1370
77f65ebd
WB
1371 if (++i == num)
1372 i = 0;
1373 } while (i != j);
1374
a9b63918 1375 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1376 return idx;
1377}
1378
2d36097d
NH
1379static unsigned int fanout_demux_qm(struct packet_fanout *f,
1380 struct sk_buff *skb,
1381 unsigned int num)
1382{
1383 return skb_get_queue_mapping(skb) % num;
1384}
1385
47dceb8e
WB
1386static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1387 struct sk_buff *skb,
1388 unsigned int num)
1389{
1390 struct bpf_prog *prog;
1391 unsigned int ret = 0;
1392
1393 rcu_read_lock();
1394 prog = rcu_dereference(f->bpf_prog);
1395 if (prog)
ff936a04 1396 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1397 rcu_read_unlock();
1398
1399 return ret;
1400}
1401
77f65ebd
WB
1402static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1403{
1404 return f->flags & (flag >> 8);
95ec3eb4
DM
1405}
1406
95ec3eb4
DM
1407static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1408 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1409{
1410 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1411 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1412 struct net *net = read_pnet(&f->net);
dc99f600 1413 struct packet_sock *po;
77f65ebd 1414 unsigned int idx;
dc99f600 1415
19bcf9f2 1416 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1417 kfree_skb(skb);
1418 return 0;
1419 }
1420
3f34b24a 1421 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1422 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1423 if (!skb)
1424 return 0;
1425 }
95ec3eb4
DM
1426 switch (f->type) {
1427 case PACKET_FANOUT_HASH:
1428 default:
77f65ebd 1429 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1430 break;
1431 case PACKET_FANOUT_LB:
77f65ebd 1432 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1433 break;
1434 case PACKET_FANOUT_CPU:
77f65ebd
WB
1435 idx = fanout_demux_cpu(f, skb, num);
1436 break;
5df0ddfb
DB
1437 case PACKET_FANOUT_RND:
1438 idx = fanout_demux_rnd(f, skb, num);
1439 break;
2d36097d
NH
1440 case PACKET_FANOUT_QM:
1441 idx = fanout_demux_qm(f, skb, num);
1442 break;
77f65ebd 1443 case PACKET_FANOUT_ROLLOVER:
ad377cab 1444 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1445 break;
47dceb8e 1446 case PACKET_FANOUT_CBPF:
f2e52095 1447 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1448 idx = fanout_demux_bpf(f, skb, num);
1449 break;
dc99f600
DM
1450 }
1451
ad377cab
WB
1452 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1453 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1454
ad377cab 1455 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1456 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1457}
1458
fff3321d
PE
1459DEFINE_MUTEX(fanout_mutex);
1460EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1461static LIST_HEAD(fanout_list);
4a69a864 1462static u16 fanout_next_id;
dc99f600
DM
1463
1464static void __fanout_link(struct sock *sk, struct packet_sock *po)
1465{
1466 struct packet_fanout *f = po->fanout;
1467
1468 spin_lock(&f->lock);
1469 f->arr[f->num_members] = sk;
1470 smp_wmb();
1471 f->num_members++;
2bd624b4
AS
1472 if (f->num_members == 1)
1473 dev_add_pack(&f->prot_hook);
dc99f600
DM
1474 spin_unlock(&f->lock);
1475}
1476
1477static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1478{
1479 struct packet_fanout *f = po->fanout;
1480 int i;
1481
1482 spin_lock(&f->lock);
1483 for (i = 0; i < f->num_members; i++) {
1484 if (f->arr[i] == sk)
1485 break;
1486 }
1487 BUG_ON(i >= f->num_members);
1488 f->arr[i] = f->arr[f->num_members - 1];
1489 f->num_members--;
2bd624b4
AS
1490 if (f->num_members == 0)
1491 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1492 spin_unlock(&f->lock);
1493}
1494
d4dd8aee 1495static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1496{
161642e2
ED
1497 if (sk->sk_family != PF_PACKET)
1498 return false;
c0de08d0 1499
161642e2 1500 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1501}
1502
47dceb8e
WB
1503static void fanout_init_data(struct packet_fanout *f)
1504{
1505 switch (f->type) {
1506 case PACKET_FANOUT_LB:
1507 atomic_set(&f->rr_cur, 0);
1508 break;
1509 case PACKET_FANOUT_CBPF:
f2e52095 1510 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1511 RCU_INIT_POINTER(f->bpf_prog, NULL);
1512 break;
1513 }
1514}
1515
1516static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1517{
1518 struct bpf_prog *old;
1519
1520 spin_lock(&f->lock);
1521 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1522 rcu_assign_pointer(f->bpf_prog, new);
1523 spin_unlock(&f->lock);
1524
1525 if (old) {
1526 synchronize_net();
1527 bpf_prog_destroy(old);
1528 }
1529}
1530
1531static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1532 unsigned int len)
1533{
1534 struct bpf_prog *new;
1535 struct sock_fprog fprog;
1536 int ret;
1537
1538 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1539 return -EPERM;
1540 if (len != sizeof(fprog))
1541 return -EINVAL;
1542 if (copy_from_user(&fprog, data, len))
1543 return -EFAULT;
1544
bab18991 1545 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1546 if (ret)
1547 return ret;
1548
1549 __fanout_set_data_bpf(po->fanout, new);
1550 return 0;
1551}
1552
f2e52095
WB
1553static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1554 unsigned int len)
1555{
1556 struct bpf_prog *new;
1557 u32 fd;
1558
1559 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1560 return -EPERM;
1561 if (len != sizeof(fd))
1562 return -EINVAL;
1563 if (copy_from_user(&fd, data, len))
1564 return -EFAULT;
1565
113214be 1566 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1567 if (IS_ERR(new))
1568 return PTR_ERR(new);
f2e52095
WB
1569
1570 __fanout_set_data_bpf(po->fanout, new);
1571 return 0;
1572}
1573
47dceb8e
WB
1574static int fanout_set_data(struct packet_sock *po, char __user *data,
1575 unsigned int len)
1576{
1577 switch (po->fanout->type) {
1578 case PACKET_FANOUT_CBPF:
1579 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1580 case PACKET_FANOUT_EBPF:
1581 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1582 default:
1583 return -EINVAL;
1584 };
1585}
1586
1587static void fanout_release_data(struct packet_fanout *f)
1588{
1589 switch (f->type) {
1590 case PACKET_FANOUT_CBPF:
f2e52095 1591 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1592 __fanout_set_data_bpf(f, NULL);
1593 };
1594}
1595
4a69a864
MM
1596static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1597{
1598 struct packet_fanout *f;
1599
1600 list_for_each_entry(f, &fanout_list, list) {
1601 if (f->id == candidate_id &&
1602 read_pnet(&f->net) == sock_net(sk)) {
1603 return false;
1604 }
1605 }
1606 return true;
1607}
1608
1609static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1610{
1611 u16 id = fanout_next_id;
1612
1613 do {
1614 if (__fanout_id_is_free(sk, id)) {
1615 *new_id = id;
1616 fanout_next_id = id + 1;
1617 return true;
1618 }
1619
1620 id++;
1621 } while (id != fanout_next_id);
1622
1623 return false;
1624}
1625
7736d33f 1626static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1627{
d199fab6 1628 struct packet_rollover *rollover = NULL;
dc99f600
DM
1629 struct packet_sock *po = pkt_sk(sk);
1630 struct packet_fanout *f, *match;
7736d33f 1631 u8 type = type_flags & 0xff;
77f65ebd 1632 u8 flags = type_flags >> 8;
dc99f600
DM
1633 int err;
1634
1635 switch (type) {
77f65ebd
WB
1636 case PACKET_FANOUT_ROLLOVER:
1637 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1638 return -EINVAL;
dc99f600
DM
1639 case PACKET_FANOUT_HASH:
1640 case PACKET_FANOUT_LB:
95ec3eb4 1641 case PACKET_FANOUT_CPU:
5df0ddfb 1642 case PACKET_FANOUT_RND:
2d36097d 1643 case PACKET_FANOUT_QM:
47dceb8e 1644 case PACKET_FANOUT_CBPF:
f2e52095 1645 case PACKET_FANOUT_EBPF:
dc99f600
DM
1646 break;
1647 default:
1648 return -EINVAL;
1649 }
1650
d199fab6
ED
1651 mutex_lock(&fanout_mutex);
1652
d199fab6 1653 err = -EALREADY;
dc99f600 1654 if (po->fanout)
d199fab6 1655 goto out;
dc99f600 1656
4633c9e0
WB
1657 if (type == PACKET_FANOUT_ROLLOVER ||
1658 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1659 err = -ENOMEM;
1660 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1661 if (!rollover)
1662 goto out;
1663 atomic_long_set(&rollover->num, 0);
1664 atomic_long_set(&rollover->num_huge, 0);
1665 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1666 }
1667
4a69a864
MM
1668 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1669 if (id != 0) {
1670 err = -EINVAL;
1671 goto out;
1672 }
1673 if (!fanout_find_new_id(sk, &id)) {
1674 err = -ENOMEM;
1675 goto out;
1676 }
1677 /* ephemeral flag for the first socket in the group: drop it */
1678 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1679 }
1680
dc99f600
DM
1681 match = NULL;
1682 list_for_each_entry(f, &fanout_list, list) {
1683 if (f->id == id &&
1684 read_pnet(&f->net) == sock_net(sk)) {
1685 match = f;
1686 break;
1687 }
1688 }
afe62c68 1689 err = -EINVAL;
77f65ebd 1690 if (match && match->flags != flags)
afe62c68 1691 goto out;
dc99f600 1692 if (!match) {
afe62c68 1693 err = -ENOMEM;
dc99f600 1694 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1695 if (!match)
1696 goto out;
1697 write_pnet(&match->net, sock_net(sk));
1698 match->id = id;
1699 match->type = type;
77f65ebd 1700 match->flags = flags;
afe62c68
ED
1701 INIT_LIST_HEAD(&match->list);
1702 spin_lock_init(&match->lock);
fb5c2c17 1703 refcount_set(&match->sk_ref, 0);
47dceb8e 1704 fanout_init_data(match);
afe62c68
ED
1705 match->prot_hook.type = po->prot_hook.type;
1706 match->prot_hook.dev = po->prot_hook.dev;
1707 match->prot_hook.func = packet_rcv_fanout;
1708 match->prot_hook.af_packet_priv = match;
c0de08d0 1709 match->prot_hook.id_match = match_fanout_group;
afe62c68 1710 list_add(&match->list, &fanout_list);
dc99f600 1711 }
afe62c68 1712 err = -EINVAL;
008ba2a1
WB
1713
1714 spin_lock(&po->bind_lock);
1715 if (po->running &&
1716 match->type == type &&
afe62c68
ED
1717 match->prot_hook.type == po->prot_hook.type &&
1718 match->prot_hook.dev == po->prot_hook.dev) {
1719 err = -ENOSPC;
fb5c2c17 1720 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1721 __dev_remove_pack(&po->prot_hook);
1722 po->fanout = match;
57f015f5
MM
1723 po->rollover = rollover;
1724 rollover = NULL;
fb5c2c17 1725 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1726 __fanout_link(sk, po);
1727 err = 0;
dc99f600
DM
1728 }
1729 }
008ba2a1
WB
1730 spin_unlock(&po->bind_lock);
1731
1732 if (err && !refcount_read(&match->sk_ref)) {
1733 list_del(&match->list);
1734 kfree(match);
1735 }
1736
afe62c68 1737out:
57f015f5 1738 kfree(rollover);
d199fab6 1739 mutex_unlock(&fanout_mutex);
dc99f600
DM
1740 return err;
1741}
1742
2bd624b4
AS
1743/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1744 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1745 * It is the responsibility of the caller to call fanout_release_data() and
1746 * free the returned packet_fanout (after synchronize_net())
1747 */
1748static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1749{
1750 struct packet_sock *po = pkt_sk(sk);
1751 struct packet_fanout *f;
1752
fff3321d 1753 mutex_lock(&fanout_mutex);
d199fab6
ED
1754 f = po->fanout;
1755 if (f) {
1756 po->fanout = NULL;
1757
fb5c2c17 1758 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1759 list_del(&f->list);
2bd624b4
AS
1760 else
1761 f = NULL;
dc99f600
DM
1762 }
1763 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1764
1765 return f;
dc99f600 1766}
1da177e4 1767
3c70c132
DB
1768static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1769 struct sk_buff *skb)
1770{
1771 /* Earlier code assumed this would be a VLAN pkt, double-check
1772 * this now that we have the actual packet in hand. We can only
1773 * do this check on Ethernet devices.
1774 */
1775 if (unlikely(dev->type != ARPHRD_ETHER))
1776 return false;
1777
1778 skb_reset_mac_header(skb);
1779 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1780}
1781
90ddc4f0 1782static const struct proto_ops packet_ops;
1da177e4 1783
90ddc4f0 1784static const struct proto_ops packet_ops_spkt;
1da177e4 1785
40d4e3df
ED
1786static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1787 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1788{
1789 struct sock *sk;
1790 struct sockaddr_pkt *spkt;
1791
1792 /*
1793 * When we registered the protocol we saved the socket in the data
1794 * field for just this event.
1795 */
1796
1797 sk = pt->af_packet_priv;
1ce4f28b 1798
1da177e4
LT
1799 /*
1800 * Yank back the headers [hope the device set this
1801 * right or kerboom...]
1802 *
1803 * Incoming packets have ll header pulled,
1804 * push it back.
1805 *
98e399f8 1806 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1807 * so that this procedure is noop.
1808 */
1809
1810 if (skb->pkt_type == PACKET_LOOPBACK)
1811 goto out;
1812
09ad9bc7 1813 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1814 goto out;
1815
40d4e3df
ED
1816 skb = skb_share_check(skb, GFP_ATOMIC);
1817 if (skb == NULL)
1da177e4
LT
1818 goto oom;
1819
1820 /* drop any routing info */
adf30907 1821 skb_dst_drop(skb);
1da177e4 1822
84531c24
PO
1823 /* drop conntrack reference */
1824 nf_reset(skb);
1825
ffbc6111 1826 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1827
98e399f8 1828 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1829
1830 /*
1831 * The SOCK_PACKET socket receives _all_ frames.
1832 */
1833
1834 spkt->spkt_family = dev->type;
1835 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1836 spkt->spkt_protocol = skb->protocol;
1837
1838 /*
1839 * Charge the memory to the socket. This is done specifically
1840 * to prevent sockets using all the memory up.
1841 */
1842
40d4e3df 1843 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1844 return 0;
1845
1846out:
1847 kfree_skb(skb);
1848oom:
1849 return 0;
1850}
1851
1852
1853/*
1854 * Output a raw packet to a device layer. This bypasses all the other
1855 * protocol layers and you must therefore supply it with a complete frame
1856 */
1ce4f28b 1857
1b784140
YX
1858static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1859 size_t len)
1da177e4
LT
1860{
1861 struct sock *sk = sock->sk;
342dfc30 1862 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1863 struct sk_buff *skb = NULL;
1da177e4 1864 struct net_device *dev;
c14ac945 1865 struct sockcm_cookie sockc;
40d4e3df 1866 __be16 proto = 0;
1da177e4 1867 int err;
3bdc0eba 1868 int extra_len = 0;
1ce4f28b 1869
1da177e4 1870 /*
1ce4f28b 1871 * Get and verify the address.
1da177e4
LT
1872 */
1873
40d4e3df 1874 if (saddr) {
1da177e4 1875 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1876 return -EINVAL;
1877 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1878 proto = saddr->spkt_protocol;
1879 } else
1880 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1881
1882 /*
1ce4f28b 1883 * Find the device first to size check it
1da177e4
LT
1884 */
1885
de74e92a 1886 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1887retry:
654d1f8a
ED
1888 rcu_read_lock();
1889 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1890 err = -ENODEV;
1891 if (dev == NULL)
1892 goto out_unlock;
1ce4f28b 1893
d5e76b0a
DM
1894 err = -ENETDOWN;
1895 if (!(dev->flags & IFF_UP))
1896 goto out_unlock;
1897
1da177e4 1898 /*
40d4e3df
ED
1899 * You may not queue a frame bigger than the mtu. This is the lowest level
1900 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1901 */
1ce4f28b 1902
3bdc0eba
BG
1903 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1904 if (!netif_supports_nofcs(dev)) {
1905 err = -EPROTONOSUPPORT;
1906 goto out_unlock;
1907 }
1908 extra_len = 4; /* We're doing our own CRC */
1909 }
1910
1da177e4 1911 err = -EMSGSIZE;
3bdc0eba 1912 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1913 goto out_unlock;
1914
1a35ca80
ED
1915 if (!skb) {
1916 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1917 int tlen = dev->needed_tailroom;
1a35ca80
ED
1918 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1919
1920 rcu_read_unlock();
4ce40912 1921 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1922 if (skb == NULL)
1923 return -ENOBUFS;
1924 /* FIXME: Save some space for broken drivers that write a hard
1925 * header at transmission time by themselves. PPP is the notable
1926 * one here. This should really be fixed at the driver level.
1927 */
1928 skb_reserve(skb, reserved);
1929 skb_reset_network_header(skb);
1930
1931 /* Try to align data part correctly */
1932 if (hhlen) {
1933 skb->data -= hhlen;
1934 skb->tail -= hhlen;
1935 if (len < hhlen)
1936 skb_reset_network_header(skb);
1937 }
6ce8e9ce 1938 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1939 if (err)
1940 goto out_free;
1941 goto retry;
1da177e4
LT
1942 }
1943
9ed988cd
WB
1944 if (!dev_validate_header(dev, skb->data, len)) {
1945 err = -EINVAL;
1946 goto out_unlock;
1947 }
3c70c132
DB
1948 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1949 !packet_extra_vlan_len_allowed(dev, skb)) {
1950 err = -EMSGSIZE;
1951 goto out_unlock;
57f89bfa 1952 }
1a35ca80 1953
3d0ba8c0 1954 sockc.transmit_time = 0;
edbe7746 1955 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1956 if (msg->msg_controllen) {
1957 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1958 if (unlikely(err))
c14ac945 1959 goto out_unlock;
c14ac945
SHY
1960 }
1961
1da177e4
LT
1962 skb->protocol = proto;
1963 skb->dev = dev;
1964 skb->priority = sk->sk_priority;
2d37a186 1965 skb->mark = sk->sk_mark;
3d0ba8c0 1966 skb->tstamp = sockc.transmit_time;
bf84a010 1967
c14ac945 1968 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1969
3bdc0eba
BG
1970 if (unlikely(extra_len == 4))
1971 skb->no_fcs = 1;
1972
40893fd0 1973 skb_probe_transport_header(skb, 0);
c1aad275 1974
1da177e4 1975 dev_queue_xmit(skb);
654d1f8a 1976 rcu_read_unlock();
40d4e3df 1977 return len;
1da177e4 1978
1da177e4 1979out_unlock:
654d1f8a 1980 rcu_read_unlock();
1a35ca80
ED
1981out_free:
1982 kfree_skb(skb);
1da177e4
LT
1983 return err;
1984}
1da177e4 1985
ff936a04
AS
1986static unsigned int run_filter(struct sk_buff *skb,
1987 const struct sock *sk,
1988 unsigned int res)
1da177e4
LT
1989{
1990 struct sk_filter *filter;
fda9ef5d 1991
80f8f102
ED
1992 rcu_read_lock();
1993 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1994 if (filter != NULL)
ff936a04 1995 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1996 rcu_read_unlock();
1da177e4 1997
dbcb5855 1998 return res;
1da177e4
LT
1999}
2000
16cc1400
WB
2001static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2002 size_t *len)
2003{
2004 struct virtio_net_hdr vnet_hdr;
2005
2006 if (*len < sizeof(vnet_hdr))
2007 return -EINVAL;
2008 *len -= sizeof(vnet_hdr);
2009
fd3a8862 2010 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2011 return -EINVAL;
2012
2013 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2014}
2015
1da177e4 2016/*
62ab0812
ED
2017 * This function makes lazy skb cloning in hope that most of packets
2018 * are discarded by BPF.
2019 *
2020 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2021 * and skb->cb are mangled. It works because (and until) packets
2022 * falling here are owned by current CPU. Output packets are cloned
2023 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2024 * sequencially, so that if we return skb to original state on exit,
2025 * we will not harm anyone.
1da177e4
LT
2026 */
2027
40d4e3df
ED
2028static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2029 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2030{
2031 struct sock *sk;
2032 struct sockaddr_ll *sll;
2033 struct packet_sock *po;
40d4e3df 2034 u8 *skb_head = skb->data;
1da177e4 2035 int skb_len = skb->len;
dbcb5855 2036 unsigned int snaplen, res;
da37845f 2037 bool is_drop_n_account = false;
1da177e4
LT
2038
2039 if (skb->pkt_type == PACKET_LOOPBACK)
2040 goto drop;
2041
2042 sk = pt->af_packet_priv;
2043 po = pkt_sk(sk);
2044
09ad9bc7 2045 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2046 goto drop;
2047
1da177e4
LT
2048 skb->dev = dev;
2049
3b04ddde 2050 if (dev->header_ops) {
1da177e4 2051 /* The device has an explicit notion of ll header,
62ab0812
ED
2052 * exported to higher levels.
2053 *
2054 * Otherwise, the device hides details of its frame
2055 * structure, so that corresponding packet head is
2056 * never delivered to user.
1da177e4
LT
2057 */
2058 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2059 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2060 else if (skb->pkt_type == PACKET_OUTGOING) {
2061 /* Special case: outgoing packets have ll header at head */
bbe735e4 2062 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2063 }
2064 }
2065
2066 snaplen = skb->len;
2067
dbcb5855
DM
2068 res = run_filter(skb, sk, snaplen);
2069 if (!res)
fda9ef5d 2070 goto drop_n_restore;
dbcb5855
DM
2071 if (snaplen > res)
2072 snaplen = res;
1da177e4 2073
0fd7bac6 2074 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2075 goto drop_n_acct;
2076
2077 if (skb_shared(skb)) {
2078 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2079 if (nskb == NULL)
2080 goto drop_n_acct;
2081
2082 if (skb_head != skb->data) {
2083 skb->data = skb_head;
2084 skb->len = skb_len;
2085 }
abc4e4fa 2086 consume_skb(skb);
1da177e4
LT
2087 skb = nskb;
2088 }
2089
b4772ef8 2090 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2091
2092 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2093 sll->sll_hatype = dev->type;
1da177e4 2094 sll->sll_pkttype = skb->pkt_type;
8032b464 2095 if (unlikely(po->origdev))
80feaacb
PWJ
2096 sll->sll_ifindex = orig_dev->ifindex;
2097 else
2098 sll->sll_ifindex = dev->ifindex;
1da177e4 2099
b95cce35 2100 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2101
2472d761
EB
2102 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2103 * Use their space for storing the original skb length.
2104 */
2105 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2106
1da177e4
LT
2107 if (pskb_trim(skb, snaplen))
2108 goto drop_n_acct;
2109
2110 skb_set_owner_r(skb, sk);
2111 skb->dev = NULL;
adf30907 2112 skb_dst_drop(skb);
1da177e4 2113
84531c24
PO
2114 /* drop conntrack reference */
2115 nf_reset(skb);
2116
1da177e4 2117 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2118 po->stats.stats1.tp_packets++;
3bc3b96f 2119 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2120 __skb_queue_tail(&sk->sk_receive_queue, skb);
2121 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2122 sk->sk_data_ready(sk);
1da177e4
LT
2123 return 0;
2124
2125drop_n_acct:
da37845f 2126 is_drop_n_account = true;
7091fbd8 2127 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2128 po->stats.stats1.tp_drops++;
7091fbd8
WB
2129 atomic_inc(&sk->sk_drops);
2130 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2131
2132drop_n_restore:
2133 if (skb_head != skb->data && skb_shared(skb)) {
2134 skb->data = skb_head;
2135 skb->len = skb_len;
2136 }
2137drop:
da37845f
WJ
2138 if (!is_drop_n_account)
2139 consume_skb(skb);
2140 else
2141 kfree_skb(skb);
1da177e4
LT
2142 return 0;
2143}
2144
40d4e3df
ED
2145static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2146 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2147{
2148 struct sock *sk;
2149 struct packet_sock *po;
2150 struct sockaddr_ll *sll;
184f489e 2151 union tpacket_uhdr h;
40d4e3df 2152 u8 *skb_head = skb->data;
1da177e4 2153 int skb_len = skb->len;
dbcb5855 2154 unsigned int snaplen, res;
f6fb8f10 2155 unsigned long status = TP_STATUS_USER;
bbd6ef87 2156 unsigned short macoff, netoff, hdrlen;
1da177e4 2157 struct sk_buff *copy_skb = NULL;
bbd6ef87 2158 struct timespec ts;
b9c32fb2 2159 __u32 ts_status;
da37845f 2160 bool is_drop_n_account = false;
edbd58be 2161 bool do_vnet = false;
1da177e4 2162
51846355
AW
2163 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2164 * We may add members to them until current aligned size without forcing
2165 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2166 */
2167 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2169
1da177e4
LT
2170 if (skb->pkt_type == PACKET_LOOPBACK)
2171 goto drop;
2172
2173 sk = pt->af_packet_priv;
2174 po = pkt_sk(sk);
2175
09ad9bc7 2176 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2177 goto drop;
2178
3b04ddde 2179 if (dev->header_ops) {
1da177e4 2180 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2181 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2182 else if (skb->pkt_type == PACKET_OUTGOING) {
2183 /* Special case: outgoing packets have ll header at head */
bbe735e4 2184 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2185 }
2186 }
2187
2188 snaplen = skb->len;
2189
dbcb5855
DM
2190 res = run_filter(skb, sk, snaplen);
2191 if (!res)
fda9ef5d 2192 goto drop_n_restore;
68c2e5de
AD
2193
2194 if (skb->ip_summed == CHECKSUM_PARTIAL)
2195 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2196 else if (skb->pkt_type != PACKET_OUTGOING &&
2197 (skb->ip_summed == CHECKSUM_COMPLETE ||
2198 skb_csum_unnecessary(skb)))
2199 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2200
dbcb5855
DM
2201 if (snaplen > res)
2202 snaplen = res;
1da177e4
LT
2203
2204 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2205 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2206 po->tp_reserve;
1da177e4 2207 } else {
95c96174 2208 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2209 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2210 (maclen < 16 ? 16 : maclen)) +
58d19b19 2211 po->tp_reserve;
edbd58be 2212 if (po->has_vnet_hdr) {
58d19b19 2213 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2214 do_vnet = true;
2215 }
1da177e4
LT
2216 macoff = netoff - maclen;
2217 }
f6fb8f10 2218 if (po->tp_version <= TPACKET_V2) {
2219 if (macoff + snaplen > po->rx_ring.frame_size) {
2220 if (po->copy_thresh &&
0fd7bac6 2221 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2222 if (skb_shared(skb)) {
2223 copy_skb = skb_clone(skb, GFP_ATOMIC);
2224 } else {
2225 copy_skb = skb_get(skb);
2226 skb_head = skb->data;
2227 }
2228 if (copy_skb)
2229 skb_set_owner_r(copy_skb, sk);
1da177e4 2230 }
f6fb8f10 2231 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2232 if ((int)snaplen < 0) {
f6fb8f10 2233 snaplen = 0;
edbd58be
BP
2234 do_vnet = false;
2235 }
1da177e4 2236 }
dc808110
ED
2237 } else if (unlikely(macoff + snaplen >
2238 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2239 u32 nval;
2240
2241 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2242 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2243 snaplen, nval, macoff);
2244 snaplen = nval;
2245 if (unlikely((int)snaplen < 0)) {
2246 snaplen = 0;
2247 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2248 do_vnet = false;
dc808110 2249 }
1da177e4 2250 }
1da177e4 2251 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2252 h.raw = packet_current_rx_frame(po, skb,
2253 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2254 if (!h.raw)
58d19b19 2255 goto drop_n_account;
f6fb8f10 2256 if (po->tp_version <= TPACKET_V2) {
2257 packet_increment_rx_head(po, &po->rx_ring);
2258 /*
2259 * LOSING will be reported till you read the stats,
2260 * because it's COR - Clear On Read.
2261 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2262 * at packet level.
2263 */
ee80fbf3 2264 if (po->stats.stats1.tp_drops)
f6fb8f10 2265 status |= TP_STATUS_LOSING;
2266 }
945d015e
ED
2267
2268 if (do_vnet &&
2269 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2270 sizeof(struct virtio_net_hdr),
2271 vio_le(), true, 0))
2272 goto drop_n_account;
2273
ee80fbf3 2274 po->stats.stats1.tp_packets++;
1da177e4
LT
2275 if (copy_skb) {
2276 status |= TP_STATUS_COPY;
2277 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2278 }
1da177e4
LT
2279 spin_unlock(&sk->sk_receive_queue.lock);
2280
bbd6ef87 2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2284 getnstimeofday(&ts);
1da177e4 2285
b9c32fb2
DB
2286 status |= ts_status;
2287
bbd6ef87
PM
2288 switch (po->tp_version) {
2289 case TPACKET_V1:
2290 h.h1->tp_len = skb->len;
2291 h.h1->tp_snaplen = snaplen;
2292 h.h1->tp_mac = macoff;
2293 h.h1->tp_net = netoff;
4b457bdf
DB
2294 h.h1->tp_sec = ts.tv_sec;
2295 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2296 hdrlen = sizeof(*h.h1);
2297 break;
2298 case TPACKET_V2:
2299 h.h2->tp_len = skb->len;
2300 h.h2->tp_snaplen = snaplen;
2301 h.h2->tp_mac = macoff;
2302 h.h2->tp_net = netoff;
bbd6ef87
PM
2303 h.h2->tp_sec = ts.tv_sec;
2304 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2305 if (skb_vlan_tag_present(skb)) {
2306 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2307 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2308 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2309 } else {
2310 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2311 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2312 }
e4d26f4b 2313 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2314 hdrlen = sizeof(*h.h2);
2315 break;
f6fb8f10 2316 case TPACKET_V3:
2317 /* tp_nxt_offset,vlan are already populated above.
2318 * So DONT clear those fields here
2319 */
2320 h.h3->tp_status |= status;
2321 h.h3->tp_len = skb->len;
2322 h.h3->tp_snaplen = snaplen;
2323 h.h3->tp_mac = macoff;
2324 h.h3->tp_net = netoff;
f6fb8f10 2325 h.h3->tp_sec = ts.tv_sec;
2326 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2327 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2328 hdrlen = sizeof(*h.h3);
2329 break;
bbd6ef87
PM
2330 default:
2331 BUG();
2332 }
1da177e4 2333
bbd6ef87 2334 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2335 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2336 sll->sll_family = AF_PACKET;
2337 sll->sll_hatype = dev->type;
2338 sll->sll_protocol = skb->protocol;
2339 sll->sll_pkttype = skb->pkt_type;
8032b464 2340 if (unlikely(po->origdev))
80feaacb
PWJ
2341 sll->sll_ifindex = orig_dev->ifindex;
2342 else
2343 sll->sll_ifindex = dev->ifindex;
1da177e4 2344
e16aa207 2345 smp_mb();
f0d4eb29 2346
f6dafa95 2347#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2348 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2349 u8 *start, *end;
2350
f0d4eb29
DB
2351 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2352 macoff + snaplen);
2353
2354 for (start = h.raw; start < end; start += PAGE_SIZE)
2355 flush_dcache_page(pgv_to_page(start));
1da177e4 2356 }
f0d4eb29 2357 smp_wmb();
f6dafa95 2358#endif
f0d4eb29 2359
da413eec 2360 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2361 __packet_set_status(po, h.raw, status);
da413eec
DC
2362 sk->sk_data_ready(sk);
2363 } else {
f6fb8f10 2364 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2365 }
1da177e4
LT
2366
2367drop_n_restore:
2368 if (skb_head != skb->data && skb_shared(skb)) {
2369 skb->data = skb_head;
2370 skb->len = skb_len;
2371 }
2372drop:
da37845f
WJ
2373 if (!is_drop_n_account)
2374 consume_skb(skb);
2375 else
2376 kfree_skb(skb);
1da177e4
LT
2377 return 0;
2378
58d19b19 2379drop_n_account:
da37845f 2380 is_drop_n_account = true;
ee80fbf3 2381 po->stats.stats1.tp_drops++;
1da177e4
LT
2382 spin_unlock(&sk->sk_receive_queue.lock);
2383
676d2369 2384 sk->sk_data_ready(sk);
acb5d75b 2385 kfree_skb(copy_skb);
1da177e4
LT
2386 goto drop_n_restore;
2387}
2388
69e3c75f
JB
2389static void tpacket_destruct_skb(struct sk_buff *skb)
2390{
2391 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2392
69e3c75f 2393 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2394 void *ph;
b9c32fb2
DB
2395 __u32 ts;
2396
69e3c75f 2397 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2398 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2399
2400 ts = __packet_set_timestamp(po, ph, skb);
2401 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2402 }
2403
2404 sock_wfree(skb);
2405}
2406
c72219b7
DB
2407static void tpacket_set_protocol(const struct net_device *dev,
2408 struct sk_buff *skb)
2409{
2410 if (dev->type == ARPHRD_ETHER) {
2411 skb_reset_mac_header(skb);
2412 skb->protocol = eth_hdr(skb)->h_proto;
2413 }
2414}
2415
16cc1400
WB
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
16cc1400
WB
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
16cc1400
WB
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
16cc1400
WB
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
cbbd26b8 2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
40d4e3df 2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2446 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
69e3c75f 2449{
184f489e 2450 union tpacket_uhdr ph;
8d39b4a6 2451 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
69e3c75f
JB
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2d37a186 2461 skb->mark = po->sk.sk_mark;
3d0ba8c0 2462 skb->tstamp = sockc->transmit_time;
c14ac945 2463 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2464 skb_shinfo(skb)->destructor_arg = ph.raw;
2465
ae641949 2466 skb_reserve(skb, hlen);
69e3c75f 2467 skb_reset_network_header(skb);
c1aad275 2468
69e3c75f
JB
2469 to_write = tp_len;
2470
2471 if (sock->type == SOCK_DGRAM) {
2472 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 NULL, tp_len);
2474 if (unlikely(err < 0))
2475 return -EINVAL;
1d036d25 2476 } else if (copylen) {
9ed988cd
WB
2477 int hdrlen = min_t(int, copylen, tp_len);
2478
69e3c75f 2479 skb_push(skb, dev->hard_header_len);
1d036d25 2480 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2481 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2482 if (unlikely(err))
2483 return err;
9ed988cd
WB
2484 if (!dev_validate_header(dev, skb->data, hdrlen))
2485 return -EINVAL;
c72219b7
DB
2486 if (!skb->protocol)
2487 tpacket_set_protocol(dev, skb);
69e3c75f 2488
9ed988cd
WB
2489 data += hdrlen;
2490 to_write -= hdrlen;
69e3c75f
JB
2491 }
2492
69e3c75f
JB
2493 offset = offset_in_page(data);
2494 len_max = PAGE_SIZE - offset;
2495 len = ((to_write > len_max) ? len_max : to_write);
2496
2497 skb->data_len = to_write;
2498 skb->len += to_write;
2499 skb->truesize += to_write;
14afee4b 2500 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2501
2502 while (likely(to_write)) {
2503 nr_frags = skb_shinfo(skb)->nr_frags;
2504
2505 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2506 pr_err("Packet exceed the number of skb frags(%lu)\n",
2507 MAX_SKB_FRAGS);
69e3c75f
JB
2508 return -EFAULT;
2509 }
2510
0af55bb5
CG
2511 page = pgv_to_page(data);
2512 data += len;
69e3c75f
JB
2513 flush_dcache_page(page);
2514 get_page(page);
0af55bb5 2515 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2516 to_write -= len;
2517 offset = 0;
2518 len_max = PAGE_SIZE;
2519 len = ((to_write > len_max) ? len_max : to_write);
2520 }
2521
8fd6c80d 2522 skb_probe_transport_header(skb, 0);
efdfa2f7 2523
69e3c75f
JB
2524 return tp_len;
2525}
2526
8d39b4a6
WB
2527static int tpacket_parse_header(struct packet_sock *po, void *frame,
2528 int size_max, void **data)
2529{
2530 union tpacket_uhdr ph;
2531 int tp_len, off;
2532
2533 ph.raw = frame;
2534
2535 switch (po->tp_version) {
7f953ab2
SV
2536 case TPACKET_V3:
2537 if (ph.h3->tp_next_offset != 0) {
2538 pr_warn_once("variable sized slot not supported");
2539 return -EINVAL;
2540 }
2541 tp_len = ph.h3->tp_len;
2542 break;
8d39b4a6
WB
2543 case TPACKET_V2:
2544 tp_len = ph.h2->tp_len;
2545 break;
2546 default:
2547 tp_len = ph.h1->tp_len;
2548 break;
2549 }
2550 if (unlikely(tp_len > size_max)) {
2551 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2552 return -EMSGSIZE;
2553 }
2554
2555 if (unlikely(po->tp_tx_has_off)) {
2556 int off_min, off_max;
2557
2558 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2559 off_max = po->tx_ring.frame_size - tp_len;
2560 if (po->sk.sk_type == SOCK_DGRAM) {
2561 switch (po->tp_version) {
7f953ab2
SV
2562 case TPACKET_V3:
2563 off = ph.h3->tp_net;
2564 break;
8d39b4a6
WB
2565 case TPACKET_V2:
2566 off = ph.h2->tp_net;
2567 break;
2568 default:
2569 off = ph.h1->tp_net;
2570 break;
2571 }
2572 } else {
2573 switch (po->tp_version) {
7f953ab2
SV
2574 case TPACKET_V3:
2575 off = ph.h3->tp_mac;
2576 break;
8d39b4a6
WB
2577 case TPACKET_V2:
2578 off = ph.h2->tp_mac;
2579 break;
2580 default:
2581 off = ph.h1->tp_mac;
2582 break;
2583 }
2584 }
2585 if (unlikely((off < off_min) || (off_max < off)))
2586 return -EINVAL;
2587 } else {
2588 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 }
2590
2591 *data = frame + off;
2592 return tp_len;
2593}
2594
69e3c75f
JB
2595static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2596{
69e3c75f
JB
2597 struct sk_buff *skb;
2598 struct net_device *dev;
1d036d25 2599 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2600 struct sockcm_cookie sockc;
69e3c75f 2601 __be16 proto;
09effa67 2602 int err, reserve = 0;
40d4e3df 2603 void *ph;
342dfc30 2604 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2605 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2606 int tp_len, size_max;
2607 unsigned char *addr;
8d39b4a6 2608 void *data;
69e3c75f 2609 int len_sum = 0;
9e67030a 2610 int status = TP_STATUS_AVAILABLE;
1d036d25 2611 int hlen, tlen, copylen = 0;
69e3c75f 2612
69e3c75f
JB
2613 mutex_lock(&po->pg_vec_lock);
2614
66e56cd4 2615 if (likely(saddr == NULL)) {
e40526cb 2616 dev = packet_cached_dev_get(po);
69e3c75f
JB
2617 proto = po->num;
2618 addr = NULL;
2619 } else {
2620 err = -EINVAL;
2621 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2622 goto out;
2623 if (msg->msg_namelen < (saddr->sll_halen
2624 + offsetof(struct sockaddr_ll,
2625 sll_addr)))
2626 goto out;
69e3c75f
JB
2627 proto = saddr->sll_protocol;
2628 addr = saddr->sll_addr;
827d9780 2629 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2630 }
2631
69e3c75f
JB
2632 err = -ENXIO;
2633 if (unlikely(dev == NULL))
2634 goto out;
69e3c75f
JB
2635 err = -ENETDOWN;
2636 if (unlikely(!(dev->flags & IFF_UP)))
2637 goto out_put;
2638
3d0ba8c0 2639 sockc.transmit_time = 0;
d19b183c
DCS
2640 sockc.tsflags = po->sk.sk_tsflags;
2641 if (msg->msg_controllen) {
2642 err = sock_cmsg_send(&po->sk, msg, &sockc);
2643 if (unlikely(err))
2644 goto out_put;
2645 }
2646
5cfb4c8d
DB
2647 if (po->sk.sk_socket->type == SOCK_RAW)
2648 reserve = dev->hard_header_len;
69e3c75f 2649 size_max = po->tx_ring.frame_size
b5dd884e 2650 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2651
1d036d25 2652 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2653 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2654
69e3c75f
JB
2655 do {
2656 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2657 TP_STATUS_SEND_REQUEST);
69e3c75f 2658 if (unlikely(ph == NULL)) {
87a2fd28
DB
2659 if (need_wait && need_resched())
2660 schedule();
69e3c75f
JB
2661 continue;
2662 }
2663
8d39b4a6
WB
2664 skb = NULL;
2665 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2666 if (tp_len < 0)
2667 goto tpacket_error;
2668
69e3c75f 2669 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2670 hlen = LL_RESERVED_SPACE(dev);
2671 tlen = dev->needed_tailroom;
1d036d25
WB
2672 if (po->has_vnet_hdr) {
2673 vnet_hdr = data;
2674 data += sizeof(*vnet_hdr);
2675 tp_len -= sizeof(*vnet_hdr);
2676 if (tp_len < 0 ||
2677 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2678 tp_len = -EINVAL;
2679 goto tpacket_error;
2680 }
2681 copylen = __virtio16_to_cpu(vio_le(),
2682 vnet_hdr->hdr_len);
2683 }
9ed988cd 2684 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2685 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2686 hlen + tlen + sizeof(struct sockaddr_ll) +
2687 (copylen - dev->hard_header_len),
fbf33a28 2688 !need_wait, &err);
69e3c75f 2689
fbf33a28
KM
2690 if (unlikely(skb == NULL)) {
2691 /* we assume the socket was initially writeable ... */
2692 if (likely(len_sum > 0))
2693 err = len_sum;
69e3c75f 2694 goto out_status;
fbf33a28 2695 }
8d39b4a6 2696 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2697 addr, hlen, copylen, &sockc);
dbd46ab4 2698 if (likely(tp_len >= 0) &&
5cfb4c8d 2699 tp_len > dev->mtu + reserve &&
1d036d25 2700 !po->has_vnet_hdr &&
3c70c132
DB
2701 !packet_extra_vlan_len_allowed(dev, skb))
2702 tp_len = -EMSGSIZE;
69e3c75f
JB
2703
2704 if (unlikely(tp_len < 0)) {
8d39b4a6 2705tpacket_error:
69e3c75f
JB
2706 if (po->tp_loss) {
2707 __packet_set_status(po, ph,
2708 TP_STATUS_AVAILABLE);
2709 packet_increment_head(&po->tx_ring);
2710 kfree_skb(skb);
2711 continue;
2712 } else {
2713 status = TP_STATUS_WRONG_FORMAT;
2714 err = tp_len;
2715 goto out_status;
2716 }
2717 }
2718
db60eb5f
JR
2719 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2720 vio_le())) {
1d036d25
WB
2721 tp_len = -EINVAL;
2722 goto tpacket_error;
2723 }
2724
69e3c75f
JB
2725 skb->destructor = tpacket_destruct_skb;
2726 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2727 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2728
2729 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2730 err = po->xmit(skb);
eb70df13
JP
2731 if (unlikely(err > 0)) {
2732 err = net_xmit_errno(err);
2733 if (err && __packet_get_status(po, ph) ==
2734 TP_STATUS_AVAILABLE) {
2735 /* skb was destructed already */
2736 skb = NULL;
2737 goto out_status;
2738 }
2739 /*
2740 * skb was dropped but not destructed yet;
2741 * let's treat it like congestion or err < 0
2742 */
2743 err = 0;
2744 }
69e3c75f
JB
2745 packet_increment_head(&po->tx_ring);
2746 len_sum += tp_len;
b0138408
DB
2747 } while (likely((ph != NULL) ||
2748 /* Note: packet_read_pending() might be slow if we have
2749 * to call it as it's per_cpu variable, but in fast-path
2750 * we already short-circuit the loop with the first
2751 * condition, and luckily don't have to go that path
2752 * anyway.
2753 */
2754 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2755
2756 err = len_sum;
2757 goto out_put;
2758
69e3c75f
JB
2759out_status:
2760 __packet_set_status(po, ph, status);
2761 kfree_skb(skb);
2762out_put:
e40526cb 2763 dev_put(dev);
69e3c75f
JB
2764out:
2765 mutex_unlock(&po->pg_vec_lock);
2766 return err;
2767}
69e3c75f 2768
eea49cc9
OJ
2769static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2770 size_t reserve, size_t len,
2771 size_t linear, int noblock,
2772 int *err)
bfd5f4a3
SS
2773{
2774 struct sk_buff *skb;
2775
2776 /* Under a page? Don't bother with paged skb. */
2777 if (prepad + len < PAGE_SIZE || !linear)
2778 linear = len;
2779
2780 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2781 err, 0);
bfd5f4a3
SS
2782 if (!skb)
2783 return NULL;
2784
2785 skb_reserve(skb, reserve);
2786 skb_put(skb, linear);
2787 skb->data_len = len - linear;
2788 skb->len += len - linear;
2789
2790 return skb;
2791}
2792
d346a3fa 2793static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2794{
2795 struct sock *sk = sock->sk;
342dfc30 2796 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2797 struct sk_buff *skb;
2798 struct net_device *dev;
0e11c91e 2799 __be16 proto;
1da177e4 2800 unsigned char *addr;
827d9780 2801 int err, reserve = 0;
c7d39e32 2802 struct sockcm_cookie sockc;
bfd5f4a3
SS
2803 struct virtio_net_hdr vnet_hdr = { 0 };
2804 int offset = 0;
bfd5f4a3 2805 struct packet_sock *po = pkt_sk(sk);
da7c9561 2806 bool has_vnet_hdr = false;
57031eb7 2807 int hlen, tlen, linear;
3bdc0eba 2808 int extra_len = 0;
1da177e4
LT
2809
2810 /*
1ce4f28b 2811 * Get and verify the address.
1da177e4 2812 */
1ce4f28b 2813
66e56cd4 2814 if (likely(saddr == NULL)) {
e40526cb 2815 dev = packet_cached_dev_get(po);
1da177e4
LT
2816 proto = po->num;
2817 addr = NULL;
2818 } else {
2819 err = -EINVAL;
2820 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2821 goto out;
0fb375fb
EB
2822 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2823 goto out;
1da177e4
LT
2824 proto = saddr->sll_protocol;
2825 addr = saddr->sll_addr;
827d9780 2826 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2827 }
2828
1da177e4 2829 err = -ENXIO;
e40526cb 2830 if (unlikely(dev == NULL))
1da177e4 2831 goto out_unlock;
d5e76b0a 2832 err = -ENETDOWN;
e40526cb 2833 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2834 goto out_unlock;
2835
3d0ba8c0 2836 sockc.transmit_time = 0;
edbe7746 2837 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2838 sockc.mark = sk->sk_mark;
2839 if (msg->msg_controllen) {
2840 err = sock_cmsg_send(sk, msg, &sockc);
2841 if (unlikely(err))
2842 goto out_unlock;
2843 }
2844
e40526cb
DB
2845 if (sock->type == SOCK_RAW)
2846 reserve = dev->hard_header_len;
bfd5f4a3 2847 if (po->has_vnet_hdr) {
16cc1400
WB
2848 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2849 if (err)
bfd5f4a3 2850 goto out_unlock;
da7c9561 2851 has_vnet_hdr = true;
bfd5f4a3
SS
2852 }
2853
3bdc0eba
BG
2854 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2855 if (!netif_supports_nofcs(dev)) {
2856 err = -EPROTONOSUPPORT;
2857 goto out_unlock;
2858 }
2859 extra_len = 4; /* We're doing our own CRC */
2860 }
2861
1da177e4 2862 err = -EMSGSIZE;
16cc1400
WB
2863 if (!vnet_hdr.gso_type &&
2864 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2865 goto out_unlock;
2866
bfd5f4a3 2867 err = -ENOBUFS;
ae641949
HX
2868 hlen = LL_RESERVED_SPACE(dev);
2869 tlen = dev->needed_tailroom;
57031eb7
WB
2870 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2871 linear = max(linear, min_t(int, len, dev->hard_header_len));
2872 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2873 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2874 if (skb == NULL)
1da177e4
LT
2875 goto out_unlock;
2876
b84bbaf7 2877 skb_reset_network_header(skb);
1da177e4 2878
0c4e8581 2879 err = -EINVAL;
9c707762
WB
2880 if (sock->type == SOCK_DGRAM) {
2881 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2882 if (unlikely(offset < 0))
9c707762 2883 goto out_free;
b84bbaf7 2884 } else if (reserve) {
9aad13b0 2885 skb_reserve(skb, -reserve);
9c707762 2886 }
1da177e4
LT
2887
2888 /* Returns -EFAULT on error */
c0371da6 2889 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2890 if (err)
2891 goto out_free;
bf84a010 2892
9ed988cd
WB
2893 if (sock->type == SOCK_RAW &&
2894 !dev_validate_header(dev, skb->data, len)) {
2895 err = -EINVAL;
2896 goto out_free;
2897 }
2898
c14ac945 2899 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2900
16cc1400 2901 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2902 !packet_extra_vlan_len_allowed(dev, skb)) {
2903 err = -EMSGSIZE;
2904 goto out_free;
57f89bfa
BG
2905 }
2906
09effa67
DM
2907 skb->protocol = proto;
2908 skb->dev = dev;
1da177e4 2909 skb->priority = sk->sk_priority;
c7d39e32 2910 skb->mark = sockc.mark;
3d0ba8c0 2911 skb->tstamp = sockc.transmit_time;
0fd5d57b 2912
da7c9561 2913 if (has_vnet_hdr) {
db60eb5f 2914 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2915 if (err)
2916 goto out_free;
2917 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2918 }
2919
8fd6c80d
DB
2920 skb_probe_transport_header(skb, reserve);
2921
3bdc0eba
BG
2922 if (unlikely(extra_len == 4))
2923 skb->no_fcs = 1;
2924
d346a3fa 2925 err = po->xmit(skb);
1da177e4
LT
2926 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2927 goto out_unlock;
2928
e40526cb 2929 dev_put(dev);
1da177e4 2930
40d4e3df 2931 return len;
1da177e4
LT
2932
2933out_free:
2934 kfree_skb(skb);
2935out_unlock:
e40526cb 2936 if (dev)
1da177e4
LT
2937 dev_put(dev);
2938out:
2939 return err;
2940}
2941
1b784140 2942static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2943{
69e3c75f
JB
2944 struct sock *sk = sock->sk;
2945 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2946
69e3c75f
JB
2947 if (po->tx_ring.pg_vec)
2948 return tpacket_snd(po, msg);
2949 else
69e3c75f
JB
2950 return packet_snd(sock, msg, len);
2951}
2952
1da177e4
LT
2953/*
2954 * Close a PACKET socket. This is fairly simple. We immediately go
2955 * to 'closed' state and remove our protocol entry in the device list.
2956 */
2957
2958static int packet_release(struct socket *sock)
2959{
2960 struct sock *sk = sock->sk;
2961 struct packet_sock *po;
2bd624b4 2962 struct packet_fanout *f;
d12d01d6 2963 struct net *net;
f6fb8f10 2964 union tpacket_req_u req_u;
1da177e4
LT
2965
2966 if (!sk)
2967 return 0;
2968
3b1e0a65 2969 net = sock_net(sk);
1da177e4
LT
2970 po = pkt_sk(sk);
2971
0fa7fa98 2972 mutex_lock(&net->packet.sklist_lock);
808f5114 2973 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2974 mutex_unlock(&net->packet.sklist_lock);
2975
2976 preempt_disable();
920de804 2977 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2978 preempt_enable();
1da177e4 2979
808f5114 2980 spin_lock(&po->bind_lock);
ce06b03e 2981 unregister_prot_hook(sk, false);
66e56cd4
DB
2982 packet_cached_dev_reset(po);
2983
160ff18a
BG
2984 if (po->prot_hook.dev) {
2985 dev_put(po->prot_hook.dev);
2986 po->prot_hook.dev = NULL;
2987 }
808f5114 2988 spin_unlock(&po->bind_lock);
1da177e4 2989
1da177e4 2990 packet_flush_mclist(sk);
1da177e4 2991
5171b37d 2992 lock_sock(sk);
9665d5d6
PS
2993 if (po->rx_ring.pg_vec) {
2994 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2995 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2996 }
69e3c75f 2997
9665d5d6
PS
2998 if (po->tx_ring.pg_vec) {
2999 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3000 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3001 }
5171b37d 3002 release_sock(sk);
1da177e4 3003
2bd624b4 3004 f = fanout_release(sk);
dc99f600 3005
808f5114 3006 synchronize_net();
2bd624b4
AS
3007
3008 if (f) {
57f015f5 3009 kfree(po->rollover);
2bd624b4
AS
3010 fanout_release_data(f);
3011 kfree(f);
3012 }
1da177e4
LT
3013 /*
3014 * Now the socket is dead. No more input will appear.
3015 */
1da177e4
LT
3016 sock_orphan(sk);
3017 sock->sk = NULL;
3018
3019 /* Purge queues */
3020
3021 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3022 packet_free_pending(po);
17ab56a2 3023 sk_refcnt_debug_release(sk);
1da177e4
LT
3024
3025 sock_put(sk);
3026 return 0;
3027}
3028
3029/*
3030 * Attach a packet hook.
3031 */
3032
30f7ea1c
FR
3033static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3034 __be16 proto)
1da177e4
LT
3035{
3036 struct packet_sock *po = pkt_sk(sk);
158cd4af 3037 struct net_device *dev_curr;
902fefb8
DB
3038 __be16 proto_curr;
3039 bool need_rehook;
30f7ea1c
FR
3040 struct net_device *dev = NULL;
3041 int ret = 0;
3042 bool unlisted = false;
dc99f600 3043
1da177e4 3044 lock_sock(sk);
1da177e4 3045 spin_lock(&po->bind_lock);
30f7ea1c
FR
3046 rcu_read_lock();
3047
4971613c
WB
3048 if (po->fanout) {
3049 ret = -EINVAL;
3050 goto out_unlock;
3051 }
3052
30f7ea1c
FR
3053 if (name) {
3054 dev = dev_get_by_name_rcu(sock_net(sk), name);
3055 if (!dev) {
3056 ret = -ENODEV;
3057 goto out_unlock;
3058 }
3059 } else if (ifindex) {
3060 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3061 if (!dev) {
3062 ret = -ENODEV;
3063 goto out_unlock;
3064 }
3065 }
3066
3067 if (dev)
3068 dev_hold(dev);
66e56cd4 3069
902fefb8
DB
3070 proto_curr = po->prot_hook.type;
3071 dev_curr = po->prot_hook.dev;
3072
3073 need_rehook = proto_curr != proto || dev_curr != dev;
3074
3075 if (need_rehook) {
30f7ea1c
FR
3076 if (po->running) {
3077 rcu_read_unlock();
15fe076e
ED
3078 /* prevents packet_notifier() from calling
3079 * register_prot_hook()
3080 */
3081 po->num = 0;
30f7ea1c
FR
3082 __unregister_prot_hook(sk, true);
3083 rcu_read_lock();
3084 dev_curr = po->prot_hook.dev;
3085 if (dev)
3086 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3087 dev->ifindex);
3088 }
1da177e4 3089
15fe076e 3090 BUG_ON(po->running);
902fefb8
DB
3091 po->num = proto;
3092 po->prot_hook.type = proto;
902fefb8 3093
30f7ea1c
FR
3094 if (unlikely(unlisted)) {
3095 dev_put(dev);
3096 po->prot_hook.dev = NULL;
3097 po->ifindex = -1;
3098 packet_cached_dev_reset(po);
3099 } else {
3100 po->prot_hook.dev = dev;
3101 po->ifindex = dev ? dev->ifindex : 0;
3102 packet_cached_dev_assign(po, dev);
3103 }
902fefb8 3104 }
158cd4af
LW
3105 if (dev_curr)
3106 dev_put(dev_curr);
66e56cd4 3107
902fefb8 3108 if (proto == 0 || !need_rehook)
1da177e4
LT
3109 goto out_unlock;
3110
30f7ea1c 3111 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3112 register_prot_hook(sk);
be85d4ad
UT
3113 } else {
3114 sk->sk_err = ENETDOWN;
3115 if (!sock_flag(sk, SOCK_DEAD))
3116 sk->sk_error_report(sk);
1da177e4
LT
3117 }
3118
3119out_unlock:
30f7ea1c 3120 rcu_read_unlock();
1da177e4
LT
3121 spin_unlock(&po->bind_lock);
3122 release_sock(sk);
30f7ea1c 3123 return ret;
1da177e4
LT
3124}
3125
3126/*
3127 * Bind a packet socket to a device
3128 */
3129
40d4e3df
ED
3130static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3131 int addr_len)
1da177e4 3132{
40d4e3df 3133 struct sock *sk = sock->sk;
540e2894 3134 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3135
1da177e4
LT
3136 /*
3137 * Check legality
3138 */
1ce4f28b 3139
8ae55f04 3140 if (addr_len != sizeof(struct sockaddr))
1da177e4 3141 return -EINVAL;
540e2894
AP
3142 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3143 * zero-terminated.
3144 */
3145 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3146 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3147
30f7ea1c 3148 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3149}
1da177e4
LT
3150
3151static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3152{
40d4e3df
ED
3153 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3154 struct sock *sk = sock->sk;
1da177e4
LT
3155
3156 /*
3157 * Check legality
3158 */
1ce4f28b 3159
1da177e4
LT
3160 if (addr_len < sizeof(struct sockaddr_ll))
3161 return -EINVAL;
3162 if (sll->sll_family != AF_PACKET)
3163 return -EINVAL;
3164
30f7ea1c
FR
3165 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3166 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3167}
3168
3169static struct proto packet_proto = {
3170 .name = "PACKET",
3171 .owner = THIS_MODULE,
3172 .obj_size = sizeof(struct packet_sock),
3173};
3174
3175/*
1ce4f28b 3176 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3177 */
3178
3f378b68
EP
3179static int packet_create(struct net *net, struct socket *sock, int protocol,
3180 int kern)
1da177e4
LT
3181{
3182 struct sock *sk;
3183 struct packet_sock *po;
0e11c91e 3184 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3185 int err;
3186
df008c91 3187 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3188 return -EPERM;
be02097c
DM
3189 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3190 sock->type != SOCK_PACKET)
1da177e4
LT
3191 return -ESOCKTNOSUPPORT;
3192
3193 sock->state = SS_UNCONNECTED;
3194
3195 err = -ENOBUFS;
11aa9c28 3196 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3197 if (sk == NULL)
3198 goto out;
3199
3200 sock->ops = &packet_ops;
1da177e4
LT
3201 if (sock->type == SOCK_PACKET)
3202 sock->ops = &packet_ops_spkt;
be02097c 3203
1da177e4
LT
3204 sock_init_data(sock, sk);
3205
3206 po = pkt_sk(sk);
3207 sk->sk_family = PF_PACKET;
0e11c91e 3208 po->num = proto;
d346a3fa 3209 po->xmit = dev_queue_xmit;
66e56cd4 3210
b0138408
DB
3211 err = packet_alloc_pending(po);
3212 if (err)
3213 goto out2;
3214
66e56cd4 3215 packet_cached_dev_reset(po);
1da177e4
LT
3216
3217 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3218 sk_refcnt_debug_inc(sk);
1da177e4
LT
3219
3220 /*
3221 * Attach a protocol block
3222 */
3223
3224 spin_lock_init(&po->bind_lock);
905db440 3225 mutex_init(&po->pg_vec_lock);
0648ab70 3226 po->rollover = NULL;
1da177e4 3227 po->prot_hook.func = packet_rcv;
be02097c 3228
1da177e4
LT
3229 if (sock->type == SOCK_PACKET)
3230 po->prot_hook.func = packet_rcv_spkt;
be02097c 3231
1da177e4
LT
3232 po->prot_hook.af_packet_priv = sk;
3233
0e11c91e
AV
3234 if (proto) {
3235 po->prot_hook.type = proto;
a6361f0c 3236 __register_prot_hook(sk);
1da177e4
LT
3237 }
3238
0fa7fa98 3239 mutex_lock(&net->packet.sklist_lock);
808f5114 3240 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3241 mutex_unlock(&net->packet.sklist_lock);
3242
3243 preempt_disable();
3680453c 3244 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3245 preempt_enable();
808f5114 3246
40d4e3df 3247 return 0;
b0138408
DB
3248out2:
3249 sk_free(sk);
1da177e4
LT
3250out:
3251 return err;
3252}
3253
3254/*
3255 * Pull a packet from our receive queue and hand it to the user.
3256 * If necessary we block.
3257 */
3258
1b784140
YX
3259static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3260 int flags)
1da177e4
LT
3261{
3262 struct sock *sk = sock->sk;
3263 struct sk_buff *skb;
3264 int copied, err;
bfd5f4a3 3265 int vnet_hdr_len = 0;
2472d761 3266 unsigned int origlen = 0;
1da177e4
LT
3267
3268 err = -EINVAL;
ed85b565 3269 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3270 goto out;
3271
3272#if 0
3273 /* What error should we return now? EUNATTACH? */
3274 if (pkt_sk(sk)->ifindex < 0)
3275 return -ENODEV;
3276#endif
3277
ed85b565 3278 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3279 err = sock_recv_errqueue(sk, msg, len,
3280 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3281 goto out;
3282 }
3283
1da177e4
LT
3284 /*
3285 * Call the generic datagram receiver. This handles all sorts
3286 * of horrible races and re-entrancy so we can forget about it
3287 * in the protocol layers.
3288 *
3289 * Now it will return ENETDOWN, if device have just gone down,
3290 * but then it will block.
3291 */
3292
40d4e3df 3293 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3294
3295 /*
1ce4f28b 3296 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3297 * handles the blocking we don't see and worry about blocking
3298 * retries.
3299 */
3300
8ae55f04 3301 if (skb == NULL)
1da177e4
LT
3302 goto out;
3303
2ccdbaa6
WB
3304 if (pkt_sk(sk)->pressure)
3305 packet_rcv_has_room(pkt_sk(sk), NULL);
3306
bfd5f4a3 3307 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3308 err = packet_rcv_vnet(msg, skb, &len);
3309 if (err)
bfd5f4a3 3310 goto out_free;
16cc1400 3311 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3312 }
3313
f3d33426
HFS
3314 /* You lose any data beyond the buffer you gave. If it worries
3315 * a user program they can ask the device for its MTU
3316 * anyway.
1da177e4 3317 */
1da177e4 3318 copied = skb->len;
40d4e3df
ED
3319 if (copied > len) {
3320 copied = len;
3321 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3322 }
3323
51f3d02b 3324 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3325 if (err)
3326 goto out_free;
3327
2472d761
EB
3328 if (sock->type != SOCK_PACKET) {
3329 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3330
3331 /* Original length was stored in sockaddr_ll fields */
3332 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3333 sll->sll_family = AF_PACKET;
3334 sll->sll_protocol = skb->protocol;
3335 }
3336
3b885787 3337 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3338
f3d33426
HFS
3339 if (msg->msg_name) {
3340 /* If the address length field is there to be filled
3341 * in, we fill it in now.
3342 */
3343 if (sock->type == SOCK_PACKET) {
342dfc30 3344 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3345 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3346 } else {
3347 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3348
f3d33426
HFS
3349 msg->msg_namelen = sll->sll_halen +
3350 offsetof(struct sockaddr_ll, sll_addr);
3351 }
ffbc6111
HX
3352 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3353 msg->msg_namelen);
f3d33426 3354 }
1da177e4 3355
8dc41944 3356 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3357 struct tpacket_auxdata aux;
3358
3359 aux.tp_status = TP_STATUS_USER;
3360 if (skb->ip_summed == CHECKSUM_PARTIAL)
3361 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3362 else if (skb->pkt_type != PACKET_OUTGOING &&
3363 (skb->ip_summed == CHECKSUM_COMPLETE ||
3364 skb_csum_unnecessary(skb)))
3365 aux.tp_status |= TP_STATUS_CSUM_VALID;
3366
2472d761 3367 aux.tp_len = origlen;
ffbc6111
HX
3368 aux.tp_snaplen = skb->len;
3369 aux.tp_mac = 0;
bbe735e4 3370 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3371 if (skb_vlan_tag_present(skb)) {
3372 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3373 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3374 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3375 } else {
3376 aux.tp_vlan_tci = 0;
a0cdfcf3 3377 aux.tp_vlan_tpid = 0;
a3bcc23e 3378 }
ffbc6111 3379 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3380 }
3381
1da177e4
LT
3382 /*
3383 * Free or return the buffer as appropriate. Again this
3384 * hides all the races and re-entrancy issues from us.
3385 */
bfd5f4a3 3386 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3387
3388out_free:
3389 skb_free_datagram(sk, skb);
3390out:
3391 return err;
3392}
3393
1da177e4 3394static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3395 int peer)
1da177e4
LT
3396{
3397 struct net_device *dev;
3398 struct sock *sk = sock->sk;
3399
3400 if (peer)
3401 return -EOPNOTSUPP;
3402
3403 uaddr->sa_family = AF_PACKET;
2dc85bf3 3404 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3405 rcu_read_lock();
3406 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3407 if (dev)
2dc85bf3 3408 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3409 rcu_read_unlock();
1da177e4 3410
9b2c45d4 3411 return sizeof(*uaddr);
1da177e4 3412}
1da177e4
LT
3413
3414static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3415 int peer)
1da177e4
LT
3416{
3417 struct net_device *dev;
3418 struct sock *sk = sock->sk;
3419 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3420 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3421
3422 if (peer)
3423 return -EOPNOTSUPP;
3424
3425 sll->sll_family = AF_PACKET;
3426 sll->sll_ifindex = po->ifindex;
3427 sll->sll_protocol = po->num;
67286640 3428 sll->sll_pkttype = 0;
654d1f8a
ED
3429 rcu_read_lock();
3430 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3431 if (dev) {
3432 sll->sll_hatype = dev->type;
3433 sll->sll_halen = dev->addr_len;
3434 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3435 } else {
3436 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3437 sll->sll_halen = 0;
3438 }
654d1f8a 3439 rcu_read_unlock();
1da177e4 3440
9b2c45d4 3441 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3442}
3443
2aeb0b88
WC
3444static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3445 int what)
1da177e4
LT
3446{
3447 switch (i->type) {
3448 case PACKET_MR_MULTICAST:
1162563f
JP
3449 if (i->alen != dev->addr_len)
3450 return -EINVAL;
1da177e4 3451 if (what > 0)
22bedad3 3452 return dev_mc_add(dev, i->addr);
1da177e4 3453 else
22bedad3 3454 return dev_mc_del(dev, i->addr);
1da177e4
LT
3455 break;
3456 case PACKET_MR_PROMISC:
2aeb0b88 3457 return dev_set_promiscuity(dev, what);
1da177e4 3458 case PACKET_MR_ALLMULTI:
2aeb0b88 3459 return dev_set_allmulti(dev, what);
d95ed927 3460 case PACKET_MR_UNICAST:
1162563f
JP
3461 if (i->alen != dev->addr_len)
3462 return -EINVAL;
d95ed927 3463 if (what > 0)
a748ee24 3464 return dev_uc_add(dev, i->addr);
d95ed927 3465 else
a748ee24 3466 return dev_uc_del(dev, i->addr);
d95ed927 3467 break;
40d4e3df
ED
3468 default:
3469 break;
1da177e4 3470 }
2aeb0b88 3471 return 0;
1da177e4
LT
3472}
3473
82f17091
FR
3474static void packet_dev_mclist_delete(struct net_device *dev,
3475 struct packet_mclist **mlp)
1da177e4 3476{
82f17091
FR
3477 struct packet_mclist *ml;
3478
3479 while ((ml = *mlp) != NULL) {
3480 if (ml->ifindex == dev->ifindex) {
3481 packet_dev_mc(dev, ml, -1);
3482 *mlp = ml->next;
3483 kfree(ml);
3484 } else
3485 mlp = &ml->next;
1da177e4
LT
3486 }
3487}
3488
0fb375fb 3489static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3490{
3491 struct packet_sock *po = pkt_sk(sk);
3492 struct packet_mclist *ml, *i;
3493 struct net_device *dev;
3494 int err;
3495
3496 rtnl_lock();
3497
3498 err = -ENODEV;
3b1e0a65 3499 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3500 if (!dev)
3501 goto done;
3502
3503 err = -EINVAL;
1162563f 3504 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3505 goto done;
3506
3507 err = -ENOBUFS;
8b3a7005 3508 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3509 if (i == NULL)
3510 goto done;
3511
3512 err = 0;
3513 for (ml = po->mclist; ml; ml = ml->next) {
3514 if (ml->ifindex == mreq->mr_ifindex &&
3515 ml->type == mreq->mr_type &&
3516 ml->alen == mreq->mr_alen &&
3517 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3518 ml->count++;
3519 /* Free the new element ... */
3520 kfree(i);
3521 goto done;
3522 }
3523 }
3524
3525 i->type = mreq->mr_type;
3526 i->ifindex = mreq->mr_ifindex;
3527 i->alen = mreq->mr_alen;
3528 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3529 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3530 i->count = 1;
3531 i->next = po->mclist;
3532 po->mclist = i;
2aeb0b88
WC
3533 err = packet_dev_mc(dev, i, 1);
3534 if (err) {
3535 po->mclist = i->next;
3536 kfree(i);
3537 }
1da177e4
LT
3538
3539done:
3540 rtnl_unlock();
3541 return err;
3542}
3543
0fb375fb 3544static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3545{
3546 struct packet_mclist *ml, **mlp;
3547
3548 rtnl_lock();
3549
3550 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3551 if (ml->ifindex == mreq->mr_ifindex &&
3552 ml->type == mreq->mr_type &&
3553 ml->alen == mreq->mr_alen &&
3554 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3555 if (--ml->count == 0) {
3556 struct net_device *dev;
3557 *mlp = ml->next;
ad959e76
ED
3558 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3559 if (dev)
1da177e4 3560 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3561 kfree(ml);
3562 }
82f17091 3563 break;
1da177e4
LT
3564 }
3565 }
3566 rtnl_unlock();
82f17091 3567 return 0;
1da177e4
LT
3568}
3569
3570static void packet_flush_mclist(struct sock *sk)
3571{
3572 struct packet_sock *po = pkt_sk(sk);
3573 struct packet_mclist *ml;
3574
3575 if (!po->mclist)
3576 return;
3577
3578 rtnl_lock();
3579 while ((ml = po->mclist) != NULL) {
3580 struct net_device *dev;
3581
3582 po->mclist = ml->next;
ad959e76
ED
3583 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3584 if (dev != NULL)
1da177e4 3585 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3586 kfree(ml);
3587 }
3588 rtnl_unlock();
3589}
1da177e4
LT
3590
3591static int
b7058842 3592packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3593{
3594 struct sock *sk = sock->sk;
8dc41944 3595 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3596 int ret;
3597
3598 if (level != SOL_PACKET)
3599 return -ENOPROTOOPT;
3600
69e3c75f 3601 switch (optname) {
1ce4f28b 3602 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3603 case PACKET_DROP_MEMBERSHIP:
3604 {
0fb375fb
EB
3605 struct packet_mreq_max mreq;
3606 int len = optlen;
3607 memset(&mreq, 0, sizeof(mreq));
3608 if (len < sizeof(struct packet_mreq))
1da177e4 3609 return -EINVAL;
0fb375fb
EB
3610 if (len > sizeof(mreq))
3611 len = sizeof(mreq);
40d4e3df 3612 if (copy_from_user(&mreq, optval, len))
1da177e4 3613 return -EFAULT;
0fb375fb
EB
3614 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3615 return -EINVAL;
1da177e4
LT
3616 if (optname == PACKET_ADD_MEMBERSHIP)
3617 ret = packet_mc_add(sk, &mreq);
3618 else
3619 ret = packet_mc_drop(sk, &mreq);
3620 return ret;
3621 }
a2efcfa0 3622
1da177e4 3623 case PACKET_RX_RING:
69e3c75f 3624 case PACKET_TX_RING:
1da177e4 3625 {
f6fb8f10 3626 union tpacket_req_u req_u;
3627 int len;
1da177e4 3628
5171b37d 3629 lock_sock(sk);
f6fb8f10 3630 switch (po->tp_version) {
3631 case TPACKET_V1:
3632 case TPACKET_V2:
3633 len = sizeof(req_u.req);
3634 break;
3635 case TPACKET_V3:
3636 default:
3637 len = sizeof(req_u.req3);
3638 break;
3639 }
5171b37d
ED
3640 if (optlen < len) {
3641 ret = -EINVAL;
3642 } else {
3643 if (copy_from_user(&req_u.req, optval, len))
3644 ret = -EFAULT;
3645 else
3646 ret = packet_set_ring(sk, &req_u, 0,
3647 optname == PACKET_TX_RING);
3648 }
3649 release_sock(sk);
3650 return ret;
1da177e4
LT
3651 }
3652 case PACKET_COPY_THRESH:
3653 {
3654 int val;
3655
40d4e3df 3656 if (optlen != sizeof(val))
1da177e4 3657 return -EINVAL;
40d4e3df 3658 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3659 return -EFAULT;
3660
3661 pkt_sk(sk)->copy_thresh = val;
3662 return 0;
3663 }
bbd6ef87
PM
3664 case PACKET_VERSION:
3665 {
3666 int val;
3667
3668 if (optlen != sizeof(val))
3669 return -EINVAL;
bbd6ef87
PM
3670 if (copy_from_user(&val, optval, sizeof(val)))
3671 return -EFAULT;
3672 switch (val) {
3673 case TPACKET_V1:
3674 case TPACKET_V2:
f6fb8f10 3675 case TPACKET_V3:
84ac7260 3676 break;
bbd6ef87
PM
3677 default:
3678 return -EINVAL;
3679 }
84ac7260
PP
3680 lock_sock(sk);
3681 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3682 ret = -EBUSY;
3683 } else {
3684 po->tp_version = val;
3685 ret = 0;
3686 }
3687 release_sock(sk);
3688 return ret;
bbd6ef87 3689 }
8913336a
PM
3690 case PACKET_RESERVE:
3691 {
3692 unsigned int val;
3693
3694 if (optlen != sizeof(val))
3695 return -EINVAL;
8913336a
PM
3696 if (copy_from_user(&val, optval, sizeof(val)))
3697 return -EFAULT;
bcc5364b
AK
3698 if (val > INT_MAX)
3699 return -EINVAL;
c27927e3
WB
3700 lock_sock(sk);
3701 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3702 ret = -EBUSY;
3703 } else {
3704 po->tp_reserve = val;
3705 ret = 0;
3706 }
3707 release_sock(sk);
3708 return ret;
8913336a 3709 }
69e3c75f
JB
3710 case PACKET_LOSS:
3711 {
3712 unsigned int val;
3713
3714 if (optlen != sizeof(val))
3715 return -EINVAL;
69e3c75f
JB
3716 if (copy_from_user(&val, optval, sizeof(val)))
3717 return -EFAULT;
a6361f0c
WB
3718
3719 lock_sock(sk);
3720 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3721 ret = -EBUSY;
3722 } else {
3723 po->tp_loss = !!val;
3724 ret = 0;
3725 }
3726 release_sock(sk);
3727 return ret;
69e3c75f 3728 }
8dc41944
HX
3729 case PACKET_AUXDATA:
3730 {
3731 int val;
3732
3733 if (optlen < sizeof(val))
3734 return -EINVAL;
3735 if (copy_from_user(&val, optval, sizeof(val)))
3736 return -EFAULT;
3737
a6361f0c 3738 lock_sock(sk);
8dc41944 3739 po->auxdata = !!val;
a6361f0c 3740 release_sock(sk);
8dc41944
HX
3741 return 0;
3742 }
80feaacb
PWJ
3743 case PACKET_ORIGDEV:
3744 {
3745 int val;
3746
3747 if (optlen < sizeof(val))
3748 return -EINVAL;
3749 if (copy_from_user(&val, optval, sizeof(val)))
3750 return -EFAULT;
3751
a6361f0c 3752 lock_sock(sk);
80feaacb 3753 po->origdev = !!val;
a6361f0c 3754 release_sock(sk);
80feaacb
PWJ
3755 return 0;
3756 }
bfd5f4a3
SS
3757 case PACKET_VNET_HDR:
3758 {
3759 int val;
3760
3761 if (sock->type != SOCK_RAW)
3762 return -EINVAL;
bfd5f4a3
SS
3763 if (optlen < sizeof(val))
3764 return -EINVAL;
3765 if (copy_from_user(&val, optval, sizeof(val)))
3766 return -EFAULT;
3767
a6361f0c
WB
3768 lock_sock(sk);
3769 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3770 ret = -EBUSY;
3771 } else {
3772 po->has_vnet_hdr = !!val;
3773 ret = 0;
3774 }
3775 release_sock(sk);
3776 return ret;
bfd5f4a3 3777 }
614f60fa
SM
3778 case PACKET_TIMESTAMP:
3779 {
3780 int val;
3781
3782 if (optlen != sizeof(val))
3783 return -EINVAL;
3784 if (copy_from_user(&val, optval, sizeof(val)))
3785 return -EFAULT;
3786
3787 po->tp_tstamp = val;
3788 return 0;
3789 }
dc99f600
DM
3790 case PACKET_FANOUT:
3791 {
3792 int val;
3793
3794 if (optlen != sizeof(val))
3795 return -EINVAL;
3796 if (copy_from_user(&val, optval, sizeof(val)))
3797 return -EFAULT;
3798
3799 return fanout_add(sk, val & 0xffff, val >> 16);
3800 }
47dceb8e
WB
3801 case PACKET_FANOUT_DATA:
3802 {
3803 if (!po->fanout)
3804 return -EINVAL;
3805
3806 return fanout_set_data(po, optval, optlen);
3807 }
5920cd3a
PC
3808 case PACKET_TX_HAS_OFF:
3809 {
3810 unsigned int val;
3811
3812 if (optlen != sizeof(val))
3813 return -EINVAL;
5920cd3a
PC
3814 if (copy_from_user(&val, optval, sizeof(val)))
3815 return -EFAULT;
a6361f0c
WB
3816
3817 lock_sock(sk);
3818 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3819 ret = -EBUSY;
3820 } else {
3821 po->tp_tx_has_off = !!val;
3822 ret = 0;
3823 }
3824 release_sock(sk);
5920cd3a
PC
3825 return 0;
3826 }
d346a3fa
DB
3827 case PACKET_QDISC_BYPASS:
3828 {
3829 int val;
3830
3831 if (optlen != sizeof(val))
3832 return -EINVAL;
3833 if (copy_from_user(&val, optval, sizeof(val)))
3834 return -EFAULT;
3835
3836 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3837 return 0;
3838 }
1da177e4
LT
3839 default:
3840 return -ENOPROTOOPT;
3841 }
3842}
3843
3844static int packet_getsockopt(struct socket *sock, int level, int optname,
3845 char __user *optval, int __user *optlen)
3846{
3847 int len;
c06fff6e 3848 int val, lv = sizeof(val);
1da177e4
LT
3849 struct sock *sk = sock->sk;
3850 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3851 void *data = &val;
ee80fbf3 3852 union tpacket_stats_u st;
a9b63918 3853 struct tpacket_rollover_stats rstats;
1da177e4
LT
3854
3855 if (level != SOL_PACKET)
3856 return -ENOPROTOOPT;
3857
8ae55f04
KK
3858 if (get_user(len, optlen))
3859 return -EFAULT;
1da177e4
LT
3860
3861 if (len < 0)
3862 return -EINVAL;
1ce4f28b 3863
69e3c75f 3864 switch (optname) {
1da177e4 3865 case PACKET_STATISTICS:
1da177e4 3866 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3867 memcpy(&st, &po->stats, sizeof(st));
3868 memset(&po->stats, 0, sizeof(po->stats));
3869 spin_unlock_bh(&sk->sk_receive_queue.lock);
3870
f6fb8f10 3871 if (po->tp_version == TPACKET_V3) {
c06fff6e 3872 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3873 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3874 data = &st.stats3;
f6fb8f10 3875 } else {
c06fff6e 3876 lv = sizeof(struct tpacket_stats);
8bcdeaff 3877 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3878 data = &st.stats1;
f6fb8f10 3879 }
ee80fbf3 3880
8dc41944
HX
3881 break;
3882 case PACKET_AUXDATA:
8dc41944 3883 val = po->auxdata;
80feaacb
PWJ
3884 break;
3885 case PACKET_ORIGDEV:
80feaacb 3886 val = po->origdev;
bfd5f4a3
SS
3887 break;
3888 case PACKET_VNET_HDR:
bfd5f4a3 3889 val = po->has_vnet_hdr;
1da177e4 3890 break;
bbd6ef87 3891 case PACKET_VERSION:
bbd6ef87 3892 val = po->tp_version;
bbd6ef87
PM
3893 break;
3894 case PACKET_HDRLEN:
3895 if (len > sizeof(int))
3896 len = sizeof(int);
fd2c83b3
AP
3897 if (len < sizeof(int))
3898 return -EINVAL;
bbd6ef87
PM
3899 if (copy_from_user(&val, optval, len))
3900 return -EFAULT;
3901 switch (val) {
3902 case TPACKET_V1:
3903 val = sizeof(struct tpacket_hdr);
3904 break;
3905 case TPACKET_V2:
3906 val = sizeof(struct tpacket2_hdr);
3907 break;
f6fb8f10 3908 case TPACKET_V3:
3909 val = sizeof(struct tpacket3_hdr);
3910 break;
bbd6ef87
PM
3911 default:
3912 return -EINVAL;
3913 }
bbd6ef87 3914 break;
8913336a 3915 case PACKET_RESERVE:
8913336a 3916 val = po->tp_reserve;
8913336a 3917 break;
69e3c75f 3918 case PACKET_LOSS:
69e3c75f 3919 val = po->tp_loss;
69e3c75f 3920 break;
614f60fa 3921 case PACKET_TIMESTAMP:
614f60fa 3922 val = po->tp_tstamp;
614f60fa 3923 break;
dc99f600 3924 case PACKET_FANOUT:
dc99f600
DM
3925 val = (po->fanout ?
3926 ((u32)po->fanout->id |
77f65ebd
WB
3927 ((u32)po->fanout->type << 16) |
3928 ((u32)po->fanout->flags << 24)) :
dc99f600 3929 0);
dc99f600 3930 break;
a9b63918 3931 case PACKET_ROLLOVER_STATS:
57f015f5 3932 if (!po->rollover)
a9b63918 3933 return -EINVAL;
57f015f5
MM
3934 rstats.tp_all = atomic_long_read(&po->rollover->num);
3935 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3936 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3937 data = &rstats;
3938 lv = sizeof(rstats);
a9b63918 3939 break;
5920cd3a
PC
3940 case PACKET_TX_HAS_OFF:
3941 val = po->tp_tx_has_off;
3942 break;
d346a3fa
DB
3943 case PACKET_QDISC_BYPASS:
3944 val = packet_use_direct_xmit(po);
3945 break;
1da177e4
LT
3946 default:
3947 return -ENOPROTOOPT;
3948 }
3949
c06fff6e
ED
3950 if (len > lv)
3951 len = lv;
8ae55f04
KK
3952 if (put_user(len, optlen))
3953 return -EFAULT;
8dc41944
HX
3954 if (copy_to_user(optval, data, len))
3955 return -EFAULT;
8ae55f04 3956 return 0;
1da177e4
LT
3957}
3958
3959
719c44d3
WB
3960#ifdef CONFIG_COMPAT
3961static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3962 char __user *optval, unsigned int optlen)
3963{
3964 struct packet_sock *po = pkt_sk(sock->sk);
3965
3966 if (level != SOL_PACKET)
3967 return -ENOPROTOOPT;
3968
3969 if (optname == PACKET_FANOUT_DATA &&
3970 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3971 optval = (char __user *)get_compat_bpf_fprog(optval);
3972 if (!optval)
3973 return -EFAULT;
3974 optlen = sizeof(struct sock_fprog);
3975 }
3976
3977 return packet_setsockopt(sock, level, optname, optval, optlen);
3978}
3979#endif
3980
351638e7
JP
3981static int packet_notifier(struct notifier_block *this,
3982 unsigned long msg, void *ptr)
1da177e4
LT
3983{
3984 struct sock *sk;
351638e7 3985 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3986 struct net *net = dev_net(dev);
1da177e4 3987
808f5114 3988 rcu_read_lock();
b67bfe0d 3989 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3990 struct packet_sock *po = pkt_sk(sk);
3991
3992 switch (msg) {
3993 case NETDEV_UNREGISTER:
1da177e4 3994 if (po->mclist)
82f17091 3995 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3996 /* fallthrough */
3997
1da177e4
LT
3998 case NETDEV_DOWN:
3999 if (dev->ifindex == po->ifindex) {
4000 spin_lock(&po->bind_lock);
4001 if (po->running) {
ce06b03e 4002 __unregister_prot_hook(sk, false);
1da177e4
LT
4003 sk->sk_err = ENETDOWN;
4004 if (!sock_flag(sk, SOCK_DEAD))
4005 sk->sk_error_report(sk);
4006 }
4007 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4008 packet_cached_dev_reset(po);
1da177e4 4009 po->ifindex = -1;
160ff18a
BG
4010 if (po->prot_hook.dev)
4011 dev_put(po->prot_hook.dev);
1da177e4
LT
4012 po->prot_hook.dev = NULL;
4013 }
4014 spin_unlock(&po->bind_lock);
4015 }
4016 break;
4017 case NETDEV_UP:
808f5114 4018 if (dev->ifindex == po->ifindex) {
4019 spin_lock(&po->bind_lock);
ce06b03e
DM
4020 if (po->num)
4021 register_prot_hook(sk);
808f5114 4022 spin_unlock(&po->bind_lock);
1da177e4 4023 }
1da177e4
LT
4024 break;
4025 }
4026 }
808f5114 4027 rcu_read_unlock();
1da177e4
LT
4028 return NOTIFY_DONE;
4029}
4030
4031
4032static int packet_ioctl(struct socket *sock, unsigned int cmd,
4033 unsigned long arg)
4034{
4035 struct sock *sk = sock->sk;
4036
69e3c75f 4037 switch (cmd) {
40d4e3df
ED
4038 case SIOCOUTQ:
4039 {
4040 int amount = sk_wmem_alloc_get(sk);
31e6d363 4041
40d4e3df
ED
4042 return put_user(amount, (int __user *)arg);
4043 }
4044 case SIOCINQ:
4045 {
4046 struct sk_buff *skb;
4047 int amount = 0;
4048
4049 spin_lock_bh(&sk->sk_receive_queue.lock);
4050 skb = skb_peek(&sk->sk_receive_queue);
4051 if (skb)
4052 amount = skb->len;
4053 spin_unlock_bh(&sk->sk_receive_queue.lock);
4054 return put_user(amount, (int __user *)arg);
4055 }
4056 case SIOCGSTAMP:
4057 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4058 case SIOCGSTAMPNS:
4059 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4060
1da177e4 4061#ifdef CONFIG_INET
40d4e3df
ED
4062 case SIOCADDRT:
4063 case SIOCDELRT:
4064 case SIOCDARP:
4065 case SIOCGARP:
4066 case SIOCSARP:
4067 case SIOCGIFADDR:
4068 case SIOCSIFADDR:
4069 case SIOCGIFBRDADDR:
4070 case SIOCSIFBRDADDR:
4071 case SIOCGIFNETMASK:
4072 case SIOCSIFNETMASK:
4073 case SIOCGIFDSTADDR:
4074 case SIOCSIFDSTADDR:
4075 case SIOCSIFFLAGS:
40d4e3df 4076 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4077#endif
4078
40d4e3df
ED
4079 default:
4080 return -ENOIOCTLCMD;
1da177e4
LT
4081 }
4082 return 0;
4083}
4084
a11e1d43
LT
4085static __poll_t packet_poll(struct file *file, struct socket *sock,
4086 poll_table *wait)
1da177e4
LT
4087{
4088 struct sock *sk = sock->sk;
4089 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4090 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4091
4092 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4093 if (po->rx_ring.pg_vec) {
f6fb8f10 4094 if (!packet_previous_rx_frame(po, &po->rx_ring,
4095 TP_STATUS_KERNEL))
a9a08845 4096 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4097 }
2ccdbaa6 4098 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4099 po->pressure = 0;
1da177e4 4100 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4101 spin_lock_bh(&sk->sk_write_queue.lock);
4102 if (po->tx_ring.pg_vec) {
4103 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4104 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4105 }
4106 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4107 return mask;
4108}
4109
4110
4111/* Dirty? Well, I still did not learn better way to account
4112 * for user mmaps.
4113 */
4114
4115static void packet_mm_open(struct vm_area_struct *vma)
4116{
4117 struct file *file = vma->vm_file;
40d4e3df 4118 struct socket *sock = file->private_data;
1da177e4 4119 struct sock *sk = sock->sk;
1ce4f28b 4120
1da177e4
LT
4121 if (sk)
4122 atomic_inc(&pkt_sk(sk)->mapped);
4123}
4124
4125static void packet_mm_close(struct vm_area_struct *vma)
4126{
4127 struct file *file = vma->vm_file;
40d4e3df 4128 struct socket *sock = file->private_data;
1da177e4 4129 struct sock *sk = sock->sk;
1ce4f28b 4130
1da177e4
LT
4131 if (sk)
4132 atomic_dec(&pkt_sk(sk)->mapped);
4133}
4134
f0f37e2f 4135static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4136 .open = packet_mm_open,
4137 .close = packet_mm_close,
1da177e4
LT
4138};
4139
0e3125c7
NH
4140static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4141 unsigned int len)
1da177e4
LT
4142{
4143 int i;
4144
4ebf0ae2 4145 for (i = 0; i < len; i++) {
0e3125c7 4146 if (likely(pg_vec[i].buffer)) {
c56b4d90 4147 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4148 vfree(pg_vec[i].buffer);
4149 else
4150 free_pages((unsigned long)pg_vec[i].buffer,
4151 order);
4152 pg_vec[i].buffer = NULL;
4153 }
1da177e4
LT
4154 }
4155 kfree(pg_vec);
4156}
4157
eea49cc9 4158static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4159{
f0d4eb29 4160 char *buffer;
0e3125c7
NH
4161 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4162 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4163
4164 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4165 if (buffer)
4166 return buffer;
4167
f0d4eb29 4168 /* __get_free_pages failed, fall back to vmalloc */
fad953ce 4169 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
0e3125c7
NH
4170 if (buffer)
4171 return buffer;
4172
f0d4eb29 4173 /* vmalloc failed, lets dig into swap here */
0e3125c7 4174 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4175 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4176 if (buffer)
4177 return buffer;
4178
f0d4eb29 4179 /* complete and utter failure */
0e3125c7 4180 return NULL;
4ebf0ae2
DM
4181}
4182
0e3125c7 4183static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4184{
4185 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4186 struct pgv *pg_vec;
4ebf0ae2
DM
4187 int i;
4188
0e3125c7 4189 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4190 if (unlikely(!pg_vec))
4191 goto out;
4192
4193 for (i = 0; i < block_nr; i++) {
c56b4d90 4194 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4195 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4196 goto out_free_pgvec;
4197 }
4198
4199out:
4200 return pg_vec;
4201
4202out_free_pgvec:
4203 free_pg_vec(pg_vec, order, block_nr);
4204 pg_vec = NULL;
4205 goto out;
4206}
1da177e4 4207
f6fb8f10 4208static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4209 int closing, int tx_ring)
1da177e4 4210{
0e3125c7 4211 struct pgv *pg_vec = NULL;
1da177e4 4212 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4213 int was_running, order = 0;
69e3c75f
JB
4214 struct packet_ring_buffer *rb;
4215 struct sk_buff_head *rb_queue;
0e11c91e 4216 __be16 num;
f6fb8f10 4217 int err = -EINVAL;
4218 /* Added to avoid minimal code churn */
4219 struct tpacket_req *req = &req_u->req;
4220
69e3c75f
JB
4221 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4222 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4223
69e3c75f
JB
4224 err = -EBUSY;
4225 if (!closing) {
4226 if (atomic_read(&po->mapped))
4227 goto out;
b0138408 4228 if (packet_read_pending(rb))
69e3c75f
JB
4229 goto out;
4230 }
1da177e4 4231
69e3c75f
JB
4232 if (req->tp_block_nr) {
4233 /* Sanity tests and some calculations */
4234 err = -EBUSY;
4235 if (unlikely(rb->pg_vec))
4236 goto out;
1da177e4 4237
bbd6ef87
PM
4238 switch (po->tp_version) {
4239 case TPACKET_V1:
4240 po->tp_hdrlen = TPACKET_HDRLEN;
4241 break;
4242 case TPACKET_V2:
4243 po->tp_hdrlen = TPACKET2_HDRLEN;
4244 break;
f6fb8f10 4245 case TPACKET_V3:
4246 po->tp_hdrlen = TPACKET3_HDRLEN;
4247 break;
bbd6ef87
PM
4248 }
4249
69e3c75f 4250 err = -EINVAL;
4ebf0ae2 4251 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4252 goto out;
90836b67 4253 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4254 goto out;
dc808110 4255 if (po->tp_version >= TPACKET_V3 &&
2b6867c2 4256 req->tp_block_size <=
eb73190f 4257 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr))
dc808110 4258 goto out;
8913336a 4259 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4260 po->tp_reserve))
4261 goto out;
4ebf0ae2 4262 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4263 goto out;
1da177e4 4264
4194b491
TK
4265 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4266 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4267 goto out;
8f8d28e4
AK
4268 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4269 goto out;
69e3c75f
JB
4270 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4271 req->tp_frame_nr))
4272 goto out;
1da177e4
LT
4273
4274 err = -ENOMEM;
4ebf0ae2
DM
4275 order = get_order(req->tp_block_size);
4276 pg_vec = alloc_pg_vec(req, order);
4277 if (unlikely(!pg_vec))
1da177e4 4278 goto out;
f6fb8f10 4279 switch (po->tp_version) {
4280 case TPACKET_V3:
7f953ab2
SV
4281 /* Block transmit is not supported yet */
4282 if (!tx_ring) {
e8e85cc5 4283 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4284 } else {
4285 struct tpacket_req3 *req3 = &req_u->req3;
4286
4287 if (req3->tp_retire_blk_tov ||
4288 req3->tp_sizeof_priv ||
4289 req3->tp_feature_req_word) {
4290 err = -EINVAL;
4291 goto out;
4292 }
4293 }
d7cf0c34 4294 break;
f6fb8f10 4295 default:
4296 break;
4297 }
69e3c75f
JB
4298 }
4299 /* Done */
4300 else {
4301 err = -EINVAL;
4ebf0ae2 4302 if (unlikely(req->tp_frame_nr))
69e3c75f 4303 goto out;
1da177e4
LT
4304 }
4305
1da177e4
LT
4306
4307 /* Detach socket from network */
4308 spin_lock(&po->bind_lock);
4309 was_running = po->running;
4310 num = po->num;
4311 if (was_running) {
1da177e4 4312 po->num = 0;
ce06b03e 4313 __unregister_prot_hook(sk, false);
1da177e4
LT
4314 }
4315 spin_unlock(&po->bind_lock);
1ce4f28b 4316
1da177e4
LT
4317 synchronize_net();
4318
4319 err = -EBUSY;
905db440 4320 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4321 if (closing || atomic_read(&po->mapped) == 0) {
4322 err = 0;
69e3c75f 4323 spin_lock_bh(&rb_queue->lock);
c053fd96 4324 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4325 rb->frame_max = (req->tp_frame_nr - 1);
4326 rb->head = 0;
4327 rb->frame_size = req->tp_frame_size;
4328 spin_unlock_bh(&rb_queue->lock);
4329
c053fd96
CG
4330 swap(rb->pg_vec_order, order);
4331 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4332
4333 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4334 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4335 tpacket_rcv : packet_rcv;
4336 skb_queue_purge(rb_queue);
1da177e4 4337 if (atomic_read(&po->mapped))
40d4e3df
ED
4338 pr_err("packet_mmap: vma is busy: %d\n",
4339 atomic_read(&po->mapped));
1da177e4 4340 }
905db440 4341 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4342
4343 spin_lock(&po->bind_lock);
ce06b03e 4344 if (was_running) {
1da177e4 4345 po->num = num;
ce06b03e 4346 register_prot_hook(sk);
1da177e4
LT
4347 }
4348 spin_unlock(&po->bind_lock);
c800aaf8 4349 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4350 /* Because we don't support block-based V3 on tx-ring */
4351 if (!tx_ring)
73d0fcf2 4352 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4353 }
1da177e4 4354
1da177e4
LT
4355 if (pg_vec)
4356 free_pg_vec(pg_vec, order, req->tp_block_nr);
4357out:
4358 return err;
4359}
4360
69e3c75f
JB
4361static int packet_mmap(struct file *file, struct socket *sock,
4362 struct vm_area_struct *vma)
1da177e4
LT
4363{
4364 struct sock *sk = sock->sk;
4365 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4366 unsigned long size, expected_size;
4367 struct packet_ring_buffer *rb;
1da177e4
LT
4368 unsigned long start;
4369 int err = -EINVAL;
4370 int i;
4371
4372 if (vma->vm_pgoff)
4373 return -EINVAL;
4374
905db440 4375 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4376
4377 expected_size = 0;
4378 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4379 if (rb->pg_vec) {
4380 expected_size += rb->pg_vec_len
4381 * rb->pg_vec_pages
4382 * PAGE_SIZE;
4383 }
4384 }
4385
4386 if (expected_size == 0)
1da177e4 4387 goto out;
69e3c75f
JB
4388
4389 size = vma->vm_end - vma->vm_start;
4390 if (size != expected_size)
1da177e4
LT
4391 goto out;
4392
1da177e4 4393 start = vma->vm_start;
69e3c75f
JB
4394 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4395 if (rb->pg_vec == NULL)
4396 continue;
4397
4398 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4399 struct page *page;
4400 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4401 int pg_num;
4402
c56b4d90
CG
4403 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4404 page = pgv_to_page(kaddr);
69e3c75f
JB
4405 err = vm_insert_page(vma, start, page);
4406 if (unlikely(err))
4407 goto out;
4408 start += PAGE_SIZE;
0e3125c7 4409 kaddr += PAGE_SIZE;
69e3c75f 4410 }
4ebf0ae2 4411 }
1da177e4 4412 }
69e3c75f 4413
4ebf0ae2 4414 atomic_inc(&po->mapped);
1da177e4
LT
4415 vma->vm_ops = &packet_mmap_ops;
4416 err = 0;
4417
4418out:
905db440 4419 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4420 return err;
4421}
1da177e4 4422
90ddc4f0 4423static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4424 .family = PF_PACKET,
4425 .owner = THIS_MODULE,
4426 .release = packet_release,
4427 .bind = packet_bind_spkt,
4428 .connect = sock_no_connect,
4429 .socketpair = sock_no_socketpair,
4430 .accept = sock_no_accept,
4431 .getname = packet_getname_spkt,
a11e1d43 4432 .poll = datagram_poll,
1da177e4
LT
4433 .ioctl = packet_ioctl,
4434 .listen = sock_no_listen,
4435 .shutdown = sock_no_shutdown,
4436 .setsockopt = sock_no_setsockopt,
4437 .getsockopt = sock_no_getsockopt,
4438 .sendmsg = packet_sendmsg_spkt,
4439 .recvmsg = packet_recvmsg,
4440 .mmap = sock_no_mmap,
4441 .sendpage = sock_no_sendpage,
4442};
1da177e4 4443
90ddc4f0 4444static const struct proto_ops packet_ops = {
1da177e4
LT
4445 .family = PF_PACKET,
4446 .owner = THIS_MODULE,
4447 .release = packet_release,
4448 .bind = packet_bind,
4449 .connect = sock_no_connect,
4450 .socketpair = sock_no_socketpair,
4451 .accept = sock_no_accept,
1ce4f28b 4452 .getname = packet_getname,
a11e1d43 4453 .poll = packet_poll,
1da177e4
LT
4454 .ioctl = packet_ioctl,
4455 .listen = sock_no_listen,
4456 .shutdown = sock_no_shutdown,
4457 .setsockopt = packet_setsockopt,
4458 .getsockopt = packet_getsockopt,
719c44d3
WB
4459#ifdef CONFIG_COMPAT
4460 .compat_setsockopt = compat_packet_setsockopt,
4461#endif
1da177e4
LT
4462 .sendmsg = packet_sendmsg,
4463 .recvmsg = packet_recvmsg,
4464 .mmap = packet_mmap,
4465 .sendpage = sock_no_sendpage,
4466};
4467
ec1b4cf7 4468static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4469 .family = PF_PACKET,
4470 .create = packet_create,
4471 .owner = THIS_MODULE,
4472};
4473
4474static struct notifier_block packet_netdev_notifier = {
40d4e3df 4475 .notifier_call = packet_notifier,
1da177e4
LT
4476};
4477
4478#ifdef CONFIG_PROC_FS
1da177e4
LT
4479
4480static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4481 __acquires(RCU)
1da177e4 4482{
e372c414 4483 struct net *net = seq_file_net(seq);
808f5114 4484
4485 rcu_read_lock();
4486 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4487}
4488
4489static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4490{
1bf40954 4491 struct net *net = seq_file_net(seq);
808f5114 4492 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4493}
4494
4495static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4496 __releases(RCU)
1da177e4 4497{
808f5114 4498 rcu_read_unlock();
1da177e4
LT
4499}
4500
1ce4f28b 4501static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4502{
4503 if (v == SEQ_START_TOKEN)
4504 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4505 else {
b7ceabd9 4506 struct sock *s = sk_entry(v);
1da177e4
LT
4507 const struct packet_sock *po = pkt_sk(s);
4508
4509 seq_printf(seq,
71338aa7 4510 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4511 s,
41c6d650 4512 refcount_read(&s->sk_refcnt),
1da177e4
LT
4513 s->sk_type,
4514 ntohs(po->num),
4515 po->ifindex,
4516 po->running,
4517 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4518 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4519 sock_i_ino(s));
1da177e4
LT
4520 }
4521
4522 return 0;
4523}
4524
56b3d975 4525static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4526 .start = packet_seq_start,
4527 .next = packet_seq_next,
4528 .stop = packet_seq_stop,
4529 .show = packet_seq_show,
4530};
1da177e4
LT
4531#endif
4532
2c8c1e72 4533static int __net_init packet_net_init(struct net *net)
d12d01d6 4534{
0fa7fa98 4535 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4536 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4537
c3506372
CH
4538 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4539 sizeof(struct seq_net_private)))
d12d01d6
DL
4540 return -ENOMEM;
4541
4542 return 0;
4543}
4544
2c8c1e72 4545static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4546{
ece31ffd 4547 remove_proc_entry("packet", net->proc_net);
669f8f1a 4548 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4549}
4550
4551static struct pernet_operations packet_net_ops = {
4552 .init = packet_net_init,
4553 .exit = packet_net_exit,
4554};
4555
4556
1da177e4
LT
4557static void __exit packet_exit(void)
4558{
1da177e4 4559 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4560 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4561 sock_unregister(PF_PACKET);
4562 proto_unregister(&packet_proto);
4563}
4564
4565static int __init packet_init(void)
4566{
4567 int rc = proto_register(&packet_proto, 0);
4568
4569 if (rc != 0)
4570 goto out;
4571
4572 sock_register(&packet_family_ops);
d12d01d6 4573 register_pernet_subsys(&packet_net_ops);
1da177e4 4574 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4575out:
4576 return rc;
4577}
4578
4579module_init(packet_init);
4580module_exit(packet_exit);
4581MODULE_LICENSE("GPL");
4582MODULE_ALIAS_NETPROTO(PF_PACKET);