packet: Fix error path in packet_init
[linux-2.6-block.git] / net / packet / af_packet.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
54
55#include <linux/types.h>
56#include <linux/mm.h>
57#include <linux/capability.h>
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
65#include <linux/kernel.h>
66#include <linux/kmod.h>
67#include <linux/slab.h>
68#include <linux/vmalloc.h>
69#include <net/net_namespace.h>
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
76#include <linux/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
79#include <asm/cacheflush.h>
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
86#include <linux/mutex.h>
87#include <linux/if_vlan.h>
88#include <linux/virtio_net.h>
89#include <linux/errqueue.h>
90#include <linux/net_tstamp.h>
91#include <linux/percpu.h>
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95#include <linux/bpf.h>
96#include <net/compat.h>
97
98#include "internal.h"
99
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are clever enough to reallocate the skb when the header
106 does not fit in the reserved space (tunnels); others are not
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
115 mac_header -> ll header
116 data -> data
117
118Outgoing, dev->hard_header!=NULL
119 mac_header -> ll header
120 data -> ll header
121
122Incoming, dev->hard_header==NULL
123 mac_header -> UNKNOWN position. It very likely points to the ll
124 header. PPP does this, which is wrong, because it introduces
125 asymmetry between the rx and tx paths.
126 data -> data
127
128Outgoing, dev->hard_header==NULL
129 mac_header -> data. ll header is still not built!
130 data -> data
131
132Summary
133 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
140 mac_header -> ll header
141 data -> ll header
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
144 mac_header -> data
145 data -> data
146
147 We should set nh.raw on output to the correct position,
148 since the packet classifier depends on it.
149 */
150
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
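/* A quick worked example of the two macros above: with V3_ALIGNMENT == 8,
 * BLK_PLUS_PRIV(13) evaluates to BLK_HDR_LEN + ALIGN(13, 8), i.e. the
 * block-private area is padded from 13 to 16 bytes. Since BLK_HDR_LEN is
 * itself rounded up to the same alignment, the first frame in a block
 * always starts on an 8-byte boundary.
 */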
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
865b03f2 278static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 279{
865b03f2 280 struct net_device *dev = skb->dev;
0fd5d57b 281 const struct net_device_ops *ops = dev->netdev_ops;
b71b5837 282 int cpu = raw_smp_processor_id();
0fd5d57b
DB
283 u16 queue_index;
284
b71b5837
PA
285#ifdef CONFIG_XPS
286 skb->sender_cpu = cpu + 1;
287#endif
288 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0fd5d57b 289 if (ops->ndo_select_queue) {
a350ecce 290 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0fd5d57b
DB
291 queue_index = netdev_cap_txqueue(dev, queue_index);
292 } else {
b71b5837 293 queue_index = netdev_pick_tx(dev, skb, NULL);
0fd5d57b
DB
294 }
295
865b03f2 296 return queue_index;
0fd5d57b
DB
297}
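/* Summary of the selection above: if the driver implements
 * ndo_select_queue() we honour its choice (clamped to the device's real
 * queue count via netdev_cap_txqueue()); otherwise we fall back to the
 * generic netdev_pick_tx() heuristic.
 */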
298
299/* __register_prot_hook must be invoked through register_prot_hook
300 * or from a context in which asynchronous accesses to the packet
301 * socket are not possible (packet_create()).
302 */
a6361f0c 303static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
304{
305 struct packet_sock *po = pkt_sk(sk);
e40526cb 306
ce06b03e 307 if (!po->running) {
66e56cd4 308 if (po->fanout)
dc99f600 309 __fanout_link(sk, po);
66e56cd4 310 else
dc99f600 311 dev_add_pack(&po->prot_hook);
e40526cb 312
ce06b03e
DM
313 sock_hold(sk);
314 po->running = 1;
315 }
316}
317
a6361f0c
WB
318static void register_prot_hook(struct sock *sk)
319{
320 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
321 __register_prot_hook(sk);
322}
323
324/* If the sync parameter is true, we will temporarily drop
325 * the po->bind_lock and do a synchronize_net to make sure no
326 * asynchronous packet processing paths still refer to the elements
327 * of po->prot_hook. If the sync parameter is false, it is the
328 * caller's responsibility to take care of this.
329 */
330static void __unregister_prot_hook(struct sock *sk, bool sync)
331{
332 struct packet_sock *po = pkt_sk(sk);
333
a6361f0c
WB
334 lockdep_assert_held_once(&po->bind_lock);
335
ce06b03e 336 po->running = 0;
66e56cd4
DB
337
338 if (po->fanout)
dc99f600 339 __fanout_unlink(sk, po);
66e56cd4 340 else
dc99f600 341 __dev_remove_pack(&po->prot_hook);
e40526cb 342
ce06b03e
DM
343 __sock_put(sk);
344
345 if (sync) {
346 spin_unlock(&po->bind_lock);
347 synchronize_net();
348 spin_lock(&po->bind_lock);
349 }
350}
351
352static void unregister_prot_hook(struct sock *sk, bool sync)
353{
354 struct packet_sock *po = pkt_sk(sk);
355
356 if (po->running)
357 __unregister_prot_hook(sk, sync);
358}
359
6e58040b 360static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
361{
362 if (is_vmalloc_addr(addr))
363 return vmalloc_to_page(addr);
364 return virt_to_page(addr);
365}
366
69e3c75f 367static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 368{
184f489e 369 union tpacket_uhdr h;
1da177e4 370
69e3c75f 371 h.raw = frame;
bbd6ef87
PM
372 switch (po->tp_version) {
373 case TPACKET_V1:
69e3c75f 374 h.h1->tp_status = status;
0af55bb5 375 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
376 break;
377 case TPACKET_V2:
69e3c75f 378 h.h2->tp_status = status;
0af55bb5 379 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 380 break;
f6fb8f10 381 case TPACKET_V3:
7f953ab2
SV
382 h.h3->tp_status = status;
383 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
384 break;
69e3c75f 385 default:
f6fb8f10 386 WARN(1, "TPACKET version not supported.\n");
69e3c75f 387 BUG();
bbd6ef87 388 }
69e3c75f
JB
389
390 smp_wmb();
bbd6ef87
PM
391}
392
69e3c75f 393static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 394{
184f489e 395 union tpacket_uhdr h;
bbd6ef87 396
69e3c75f
JB
397 smp_rmb();
398
bbd6ef87
PM
399 h.raw = frame;
400 switch (po->tp_version) {
401 case TPACKET_V1:
0af55bb5 402 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 403 return h.h1->tp_status;
bbd6ef87 404 case TPACKET_V2:
0af55bb5 405 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 406 return h.h2->tp_status;
f6fb8f10 407 case TPACKET_V3:
7f953ab2
SV
408 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
409 return h.h3->tp_status;
69e3c75f 410 default:
f6fb8f10 411 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
412 BUG();
413 return 0;
bbd6ef87 414 }
1da177e4 415}
69e3c75f 416
b9c32fb2
DB
417static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
418 unsigned int flags)
7a51384c
DB
419{
420 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
421
68a360e8
WB
422 if (shhwtstamps &&
423 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
424 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
425 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
426
427 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 428 return TP_STATUS_TS_SOFTWARE;
7a51384c 429
b9c32fb2 430 return 0;
7a51384c
DB
431}
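/* The return value above doubles as a TP_STATUS_TS_* flag for the frame
 * status word: TP_STATUS_TS_RAW_HARDWARE when a hardware timestamp was
 * requested and present, TP_STATUS_TS_SOFTWARE when only skb->tstamp was
 * usable, and 0 when no timestamp could be extracted at all.
 */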
432
b9c32fb2
DB
433static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
434 struct sk_buff *skb)
2e31396f
WB
435{
436 union tpacket_uhdr h;
437 struct timespec ts;
b9c32fb2 438 __u32 ts_status;
2e31396f 439
b9c32fb2
DB
440 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
441 return 0;
2e31396f
WB
442
443 h.raw = frame;
444 switch (po->tp_version) {
445 case TPACKET_V1:
446 h.h1->tp_sec = ts.tv_sec;
447 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
448 break;
449 case TPACKET_V2:
450 h.h2->tp_sec = ts.tv_sec;
451 h.h2->tp_nsec = ts.tv_nsec;
452 break;
453 case TPACKET_V3:
57ea884b
DB
454 h.h3->tp_sec = ts.tv_sec;
455 h.h3->tp_nsec = ts.tv_nsec;
456 break;
2e31396f
WB
457 default:
458 WARN(1, "TPACKET version not supported.\n");
459 BUG();
460 }
461
462 /* one flush is safe, as both fields always lie on the same cacheline */
463 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
464 smp_wmb();
b9c32fb2
DB
465
466 return ts_status;
2e31396f
WB
467}
468
69e3c75f
JB
469static void *packet_lookup_frame(struct packet_sock *po,
470 struct packet_ring_buffer *rb,
471 unsigned int position,
472 int status)
473{
474 unsigned int pg_vec_pos, frame_offset;
184f489e 475 union tpacket_uhdr h;
69e3c75f
JB
476
477 pg_vec_pos = position / rb->frames_per_block;
478 frame_offset = position % rb->frames_per_block;
479
0e3125c7
NH
480 h.raw = rb->pg_vec[pg_vec_pos].buffer +
481 (frame_offset * rb->frame_size);
69e3c75f
JB
482
483 if (status != __packet_get_status(po, h.raw))
484 return NULL;
485
486 return h.raw;
487}
488
eea49cc9 489static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
490 struct packet_ring_buffer *rb,
491 int status)
492{
493 return packet_lookup_frame(po, rb, rb->head, status);
494}
495
bc59ba39 496static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 497{
498 del_timer_sync(&pkc->retire_blk_timer);
499}
500
501static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 502 struct sk_buff_head *rb_queue)
503{
bc59ba39 504 struct tpacket_kbdq_core *pkc;
f6fb8f10 505
73d0fcf2 506 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 507
ec6f809f 508 spin_lock_bh(&rb_queue->lock);
f6fb8f10 509 pkc->delete_blk_timer = 1;
ec6f809f 510 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 511
512 prb_del_retire_blk_timer(pkc);
513}
514
e8e85cc5 515static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 516{
bc59ba39 517 struct tpacket_kbdq_core *pkc;
f6fb8f10 518
e8e85cc5 519 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
520 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
521 0);
522 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 523}
524
525static int prb_calc_retire_blk_tmo(struct packet_sock *po,
526 int blk_size_in_bytes)
527{
528 struct net_device *dev;
529 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 530 struct ethtool_link_ksettings ecmd;
4bc71cb9 531 int err;
f6fb8f10 532
4bc71cb9
JP
533 rtnl_lock();
534 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
535 if (unlikely(!dev)) {
536 rtnl_unlock();
f6fb8f10 537 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 538 }
7cad1bac 539 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
540 rtnl_unlock();
541 if (!err) {
4bc71cb9
JP
542 /*
543 * If the link speed is so slow you don't really
544 * need to worry about perf anyways
545 */
7cad1bac
DD
546 if (ecmd.base.speed < SPEED_1000 ||
547 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 548 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 549 } else {
550 msec = 1;
7cad1bac 551 div = ecmd.base.speed / 1000;
f6fb8f10 552 }
553 }
554
555 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
556
557 if (div)
558 mbits /= div;
559
560 tmo = mbits * msec;
561
562 if (div)
563 return tmo+1;
564 return tmo;
565}
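/* Worked example of the calculation above: a 1 MiB block gives
 * mbits = (1048576 * 8) / (1024 * 1024) = 8. On a 1 Gbit/s link,
 * div = 1000 / 1000 = 1 and msec = 1, so tmo = 8 and the function
 * returns tmo + 1 = 9 ms -- roughly the time it takes to fill one
 * block, matching the "~8 ms" figure in the timer-logic comment below.
 */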
566
bc59ba39 567static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 568 union tpacket_req_u *req_u)
569{
570 p1->feature_req_word = req_u->req3.tp_feature_req_word;
571}
572
573static void init_prb_bdqc(struct packet_sock *po,
574 struct packet_ring_buffer *rb,
575 struct pgv *pg_vec,
e8e85cc5 576 union tpacket_req_u *req_u)
f6fb8f10 577{
22781a5b 578 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 579 struct tpacket_block_desc *pbd;
f6fb8f10 580
581 memset(p1, 0x0, sizeof(*p1));
582
583 p1->knxt_seq_num = 1;
584 p1->pkbdq = pg_vec;
bc59ba39 585 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 586 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 587 p1->kblk_size = req_u->req3.tp_block_size;
588 p1->knum_blocks = req_u->req3.tp_block_nr;
589 p1->hdrlen = po->tp_hdrlen;
590 p1->version = po->tp_version;
591 p1->last_kactive_blk_num = 0;
ee80fbf3 592 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 593 if (req_u->req3.tp_retire_blk_tov)
594 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
595 else
596 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
597 req_u->req3.tp_block_size);
598 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
599 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
600
dc808110 601 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 602 prb_init_ft_ops(p1, req_u);
e8e85cc5 603 prb_setup_retire_blk_timer(po);
f6fb8f10 604 prb_open_block(p1, pbd);
605}
606
607/* Do NOT update the last_blk_num first.
608 * Assumes sk_buff_head lock is held.
609 */
bc59ba39 610static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 611{
612 mod_timer(&pkc->retire_blk_timer,
613 jiffies + pkc->tov_in_jiffies);
614 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
615}
616
617/*
618 * Timer logic:
619 * 1) We refresh the timer only when we open a block.
620 * By doing this we don't waste cycles refreshing the timer
621 * on a packet-by-packet basis.
622 *
623 * With a 1MB block-size, on a 1Gbps line, it will take
624 * i) ~8 ms to fill a block + ii) memcpy etc.
625 * In this cut we are not accounting for the memcpy time.
626 *
627 * So, if the user sets the 'tmo' to 10ms then the timer
628 * will never fire while the block is still getting filled
629 * (which is what we want). However, the user could choose
630 * to close a block early and that's fine.
631 *
632 * But when the timer does fire, we check whether or not to refresh it.
633 * Since the tmo granularity is in msecs, it is not too expensive
634 * to refresh the timer, let's say every '8' msecs.
635 * Either the user can set the 'tmo' or we can derive it based on
636 * a) line-speed and b) block-size.
637 * prb_calc_retire_blk_tmo() calculates the tmo.
638 *
639 */
17bfd8c8 640static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 641{
17bfd8c8
KC
642 struct packet_sock *po =
643 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 644 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 645 unsigned int frozen;
bc59ba39 646 struct tpacket_block_desc *pbd;
f6fb8f10 647
648 spin_lock(&po->sk.sk_receive_queue.lock);
649
650 frozen = prb_queue_frozen(pkc);
651 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
652
653 if (unlikely(pkc->delete_blk_timer))
654 goto out;
655
656 /* We only need to plug the race when the block is partially filled.
657 * tpacket_rcv:
658 * lock(); increment BLOCK_NUM_PKTS; unlock()
659 * copy_bits() is in progress ...
660 * timer fires on other cpu:
661 * we can't retire the current block because copy_bits
662 * is in progress.
663 *
664 */
665 if (BLOCK_NUM_PKTS(pbd)) {
666 while (atomic_read(&pkc->blk_fill_in_prog)) {
667 /* Waiting for skb_copy_bits to finish... */
668 cpu_relax();
669 }
670 }
671
672 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
673 if (!frozen) {
41a50d62
AD
674 if (!BLOCK_NUM_PKTS(pbd)) {
675 /* An empty block. Just refresh the timer. */
676 goto refresh_timer;
677 }
f6fb8f10 678 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
679 if (!prb_dispatch_next_block(pkc, po))
680 goto refresh_timer;
681 else
682 goto out;
683 } else {
684 /* Case 1. Queue was frozen because user-space was
685 * lagging behind.
686 */
878cd3ba 687 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 688 /*
689 * Ok, user-space is still behind.
690 * So just refresh the timer.
691 */
692 goto refresh_timer;
693 } else {
694 /* Case 2. The queue was frozen, user-space caught up,
695 * now the link went idle && the timer fired.
696 * We don't have a block to close. So we open this
697 * block and restart the timer.
698 * Opening a block thaws the queue and restarts the timer.
699 * Thawing/timer-refresh is a side effect.
700 */
701 prb_open_block(pkc, pbd);
702 goto out;
703 }
704 }
705 }
706
707refresh_timer:
708 _prb_refresh_rx_retire_blk_timer(pkc);
709
710out:
711 spin_unlock(&po->sk.sk_receive_queue.lock);
712}
713
eea49cc9 714static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 715 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 716{
717 /* Flush everything minus the block header */
718
719#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
720 u8 *start, *end;
721
722 start = (u8 *)pbd1;
723
724 /* Skip the block header (we know the header WILL fit in 4K) */
725 start += PAGE_SIZE;
726
727 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
728 for (; start < end; start += PAGE_SIZE)
729 flush_dcache_page(pgv_to_page(start));
730
731 smp_wmb();
732#endif
733
734 /* Now update the block status. */
735
736 BLOCK_STATUS(pbd1) = status;
737
738 /* Flush the block header */
739
740#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
741 start = (u8 *)pbd1;
742 flush_dcache_page(pgv_to_page(start));
743
744 smp_wmb();
745#endif
746}
747
748/*
749 * Side effect:
750 *
751 * 1) flush the block
752 * 2) Increment active_blk_num
753 *
754 * Note: We DON'T refresh the timer on purpose,
755 * because almost always the next block will be opened.
756 */
bc59ba39 757static void prb_close_block(struct tpacket_kbdq_core *pkc1,
758 struct tpacket_block_desc *pbd1,
f6fb8f10 759 struct packet_sock *po, unsigned int stat)
760{
761 __u32 status = TP_STATUS_USER | stat;
762
763 struct tpacket3_hdr *last_pkt;
bc59ba39 764 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 765 struct sock *sk = &po->sk;
f6fb8f10 766
ee80fbf3 767 if (po->stats.stats3.tp_drops)
f6fb8f10 768 status |= TP_STATUS_LOSING;
769
770 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
771 last_pkt->tp_next_offset = 0;
772
773 /* Get the ts of the last pkt */
774 if (BLOCK_NUM_PKTS(pbd1)) {
775 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
776 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
777 } else {
41a50d62
AD
778 /* Ok, we tmo'd - so get the current time.
779 *
780 * It shouldn't really happen as we don't close empty
781 * blocks. See prb_retire_rx_blk_timer_expired().
782 */
f6fb8f10 783 struct timespec ts;
784 getnstimeofday(&ts);
785 h1->ts_last_pkt.ts_sec = ts.tv_sec;
786 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
787 }
788
789 smp_wmb();
790
791 /* Flush the block */
792 prb_flush_block(pkc1, pbd1, status);
793
da413eec
DC
794 sk->sk_data_ready(sk);
795
f6fb8f10 796 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
797}
798
eea49cc9 799static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 800{
801 pkc->reset_pending_on_curr_blk = 0;
802}
803
804/*
805 * Side effect of opening a block:
806 *
807 * 1) prb_queue is thawed.
808 * 2) retire_blk_timer is refreshed.
809 *
810 */
bc59ba39 811static void prb_open_block(struct tpacket_kbdq_core *pkc1,
812 struct tpacket_block_desc *pbd1)
f6fb8f10 813{
814 struct timespec ts;
bc59ba39 815 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 816
817 smp_rmb();
818
8da3056c
DB
819 /* We could have just memset this but we will lose the
820 * flexibility of making the priv area sticky
821 */
f6fb8f10 822
8da3056c
DB
823 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
824 BLOCK_NUM_PKTS(pbd1) = 0;
825 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 826
8da3056c
DB
827 getnstimeofday(&ts);
828
829 h1->ts_first_pkt.ts_sec = ts.tv_sec;
830 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 831
8da3056c
DB
832 pkc1->pkblk_start = (char *)pbd1;
833 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
834
835 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
837
838 pbd1->version = pkc1->version;
839 pkc1->prev = pkc1->nxt_offset;
840 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
841
842 prb_thaw_queue(pkc1);
843 _prb_refresh_rx_retire_blk_timer(pkc1);
844
845 smp_wmb();
f6fb8f10 846}
847
848/*
849 * Queue freeze logic:
850 * 1) Assume tp_block_nr = 8 blocks.
851 * 2) At time 't0', user opens Rx ring.
852 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
853 * 4) user-space is either sleeping or processing block '0'.
854 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
855 * it will close block-7, loop around and try to fill block '0'.
856 * call-flow:
857 * __packet_lookup_frame_in_block
858 * prb_retire_current_block()
859 * prb_dispatch_next_block()
860 * |->(BLOCK_STATUS == USER) evaluates to true
861 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
862 * 6) Now there are two cases:
863 * 6.1) Link goes idle right after the queue is frozen.
864 * But remember, the last open_block() refreshed the timer.
865 * When this timer expires, it will refresh itself so that we can
866 * re-open block-0 in the near future.
867 * 6.2) Link is busy and keeps on receiving packets. This is a simple
868 * case and __packet_lookup_frame_in_block will check if block-0
869 * is free and can now be re-used.
870 */
eea49cc9 871static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 872 struct packet_sock *po)
873{
874 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 875 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 876}
877
878#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
879
880/*
881 * If the next block is free then we will dispatch it
882 * and return a good offset.
883 * Else, we will freeze the queue.
884 * So, caller must check the return value.
885 */
bc59ba39 886static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 887 struct packet_sock *po)
888{
bc59ba39 889 struct tpacket_block_desc *pbd;
f6fb8f10 890
891 smp_rmb();
892
893 /* 1. Get current block num */
894 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
895
896 /* 2. If this block is currently in_use then freeze the queue */
897 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
898 prb_freeze_queue(pkc, po);
899 return NULL;
900 }
901
902 /*
903 * 3.
904 * open this block and return the offset where the first packet
905 * needs to get stored.
906 */
907 prb_open_block(pkc, pbd);
908 return (void *)pkc->nxt_offset;
909}
910
bc59ba39 911static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 912 struct packet_sock *po, unsigned int status)
913{
bc59ba39 914 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 915
916 /* retire/close the current block */
917 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
918 /*
919 * Plug the case where copy_bits() is in progress on
920 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
921 * have space to copy the pkt in the current block and
922 * called prb_retire_current_block()
923 *
924 * We don't need to worry about the TMO case because
925 * the timer-handler already handled this case.
926 */
927 if (!(status & TP_STATUS_BLK_TMO)) {
928 while (atomic_read(&pkc->blk_fill_in_prog)) {
929 /* Waiting for skb_copy_bits to finish... */
930 cpu_relax();
931 }
932 }
933 prb_close_block(pkc, pbd, po, status);
934 return;
935 }
f6fb8f10 936}
937
878cd3ba 938static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 939{
940 return TP_STATUS_USER & BLOCK_STATUS(pbd);
941}
942
eea49cc9 943static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 944{
945 return pkc->reset_pending_on_curr_blk;
946}
947
eea49cc9 948static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 949{
bc59ba39 950 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 951 atomic_dec(&pkc->blk_fill_in_prog);
952}
953
eea49cc9 954static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 955 struct tpacket3_hdr *ppd)
956{
3958afa1 957 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 958}
959
eea49cc9 960static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 961 struct tpacket3_hdr *ppd)
962{
963 ppd->hv1.tp_rxhash = 0;
964}
965
eea49cc9 966static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 967 struct tpacket3_hdr *ppd)
968{
df8a39de
JP
969 if (skb_vlan_tag_present(pkc->skb)) {
970 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
971 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
972 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 973 } else {
9e67030a 974 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 975 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 976 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 977 }
978}
979
bc59ba39 980static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 981 struct tpacket3_hdr *ppd)
982{
a0cdfcf3 983 ppd->hv1.tp_padding = 0;
f6fb8f10 984 prb_fill_vlan_info(pkc, ppd);
985
986 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
987 prb_fill_rxhash(pkc, ppd);
988 else
989 prb_clear_rxhash(pkc, ppd);
990}
991
eea49cc9 992static void prb_fill_curr_block(char *curr,
bc59ba39 993 struct tpacket_kbdq_core *pkc,
994 struct tpacket_block_desc *pbd,
f6fb8f10 995 unsigned int len)
996{
997 struct tpacket3_hdr *ppd;
998
999 ppd = (struct tpacket3_hdr *)curr;
1000 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1001 pkc->prev = curr;
1002 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_NUM_PKTS(pbd) += 1;
1005 atomic_inc(&pkc->blk_fill_in_prog);
1006 prb_run_all_ft_ops(pkc, ppd);
1007}
1008
1009/* Assumes caller has the sk->rx_queue.lock */
1010static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1011 struct sk_buff *skb,
1012 int status,
1013 unsigned int len
1014 )
1015{
bc59ba39 1016 struct tpacket_kbdq_core *pkc;
1017 struct tpacket_block_desc *pbd;
f6fb8f10 1018 char *curr, *end;
1019
e3192690 1020 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1021 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1022
1023 /* Queue is frozen when user space is lagging behind */
1024 if (prb_queue_frozen(pkc)) {
1025 /*
1026 * Check if that last block which caused the queue to freeze,
1027 * is still in_use by user-space.
1028 */
878cd3ba 1029 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1030 /* Can't record this packet */
1031 return NULL;
1032 } else {
1033 /*
1034 * Ok, the block was released by user-space.
1035 * Now let's open that block.
1036 * opening a block also thaws the queue.
1037 * Thawing is a side effect.
1038 */
1039 prb_open_block(pkc, pbd);
1040 }
1041 }
1042
1043 smp_mb();
1044 curr = pkc->nxt_offset;
1045 pkc->skb = skb;
e3192690 1046 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1047
1048 /* first try the current block */
1049 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1050 prb_fill_curr_block(curr, pkc, pbd, len);
1051 return (void *)curr;
1052 }
1053
1054 /* Ok, close the current block */
1055 prb_retire_current_block(pkc, po, 0);
1056
1057 /* Now, try to dispatch the next block */
1058 curr = (char *)prb_dispatch_next_block(pkc, po);
1059 if (curr) {
1060 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1061 prb_fill_curr_block(curr, pkc, pbd, len);
1062 return (void *)curr;
1063 }
1064
1065 /*
1066 * No free blocks are available. user_space hasn't caught up yet.
1067 * Queue was just frozen and now this packet will get dropped.
1068 */
1069 return NULL;
1070}
1071
eea49cc9 1072static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1073 struct sk_buff *skb,
1074 int status, unsigned int len)
1075{
1076 char *curr = NULL;
1077 switch (po->tp_version) {
1078 case TPACKET_V1:
1079 case TPACKET_V2:
1080 curr = packet_lookup_frame(po, &po->rx_ring,
1081 po->rx_ring.head, status);
1082 return curr;
1083 case TPACKET_V3:
1084 return __packet_lookup_frame_in_block(po, skb, status, len);
1085 default:
1086 WARN(1, "TPACKET version not supported\n");
1087 BUG();
99aa3473 1088 return NULL;
f6fb8f10 1089 }
1090}
1091
eea49cc9 1092static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1093 struct packet_ring_buffer *rb,
77f65ebd 1094 unsigned int idx,
f6fb8f10 1095 int status)
1096{
bc59ba39 1097 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1098 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1099
1100 if (status != BLOCK_STATUS(pbd))
1101 return NULL;
1102 return pbd;
1103}
1104
eea49cc9 1105static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1106{
1107 unsigned int prev;
1108 if (rb->prb_bdqc.kactive_blk_num)
1109 prev = rb->prb_bdqc.kactive_blk_num-1;
1110 else
1111 prev = rb->prb_bdqc.knum_blocks-1;
1112 return prev;
1113}
1114
1115/* Assumes caller has held the rx_queue.lock */
eea49cc9 1116static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1117 struct packet_ring_buffer *rb,
1118 int status)
1119{
1120 unsigned int previous = prb_previous_blk_num(rb);
1121 return prb_lookup_block(po, rb, previous, status);
1122}
1123
eea49cc9 1124static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1125 struct packet_ring_buffer *rb,
1126 int status)
1127{
1128 if (po->tp_version <= TPACKET_V2)
1129 return packet_previous_frame(po, rb, status);
1130
1131 return __prb_previous_block(po, rb, status);
1132}
1133
eea49cc9 1134static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1135 struct packet_ring_buffer *rb)
1136{
1137 switch (po->tp_version) {
1138 case TPACKET_V1:
1139 case TPACKET_V2:
1140 return packet_increment_head(rb);
1141 case TPACKET_V3:
1142 default:
1143 WARN(1, "TPACKET version not supported.\n");
1144 BUG();
1145 return;
1146 }
1147}
1148
eea49cc9 1149static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1150 struct packet_ring_buffer *rb,
1151 int status)
1152{
1153 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1154 return packet_lookup_frame(po, rb, previous, status);
1155}
1156
eea49cc9 1157static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1158{
1159 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1160}
1161
b0138408
DB
1162static void packet_inc_pending(struct packet_ring_buffer *rb)
1163{
1164 this_cpu_inc(*rb->pending_refcnt);
1165}
1166
1167static void packet_dec_pending(struct packet_ring_buffer *rb)
1168{
1169 this_cpu_dec(*rb->pending_refcnt);
1170}
1171
1172static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1173{
1174 unsigned int refcnt = 0;
1175 int cpu;
1176
1177 /* We don't use pending refcount in rx_ring. */
1178 if (rb->pending_refcnt == NULL)
1179 return 0;
1180
1181 for_each_possible_cpu(cpu)
1182 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1183
1184 return refcnt;
1185}
1186
1187static int packet_alloc_pending(struct packet_sock *po)
1188{
1189 po->rx_ring.pending_refcnt = NULL;
1190
1191 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1192 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1193 return -ENOBUFS;
1194
1195 return 0;
1196}
1197
1198static void packet_free_pending(struct packet_sock *po)
1199{
1200 free_percpu(po->tx_ring.pending_refcnt);
1201}
1202
9954729b
WB
1203#define ROOM_POW_OFF 2
1204#define ROOM_NONE 0x0
1205#define ROOM_LOW 0x1
1206#define ROOM_NORMAL 0x2
1207
1208static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1209{
9954729b
WB
1210 int idx, len;
1211
1212 len = po->rx_ring.frame_max + 1;
1213 idx = po->rx_ring.head;
1214 if (pow_off)
1215 idx += len >> pow_off;
1216 if (idx >= len)
1217 idx -= len;
1218 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1219}
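/* With ROOM_POW_OFF == 2 the probe above looks at the frame a quarter of
 * the ring ahead of the current head; if that frame is still owned by the
 * kernel, roughly 25% or more of the ring is free and the caller reports
 * ROOM_NORMAL. With pow_off == 0 only the slot at the head itself is
 * checked, i.e. whether there is space for at least one more frame
 * (ROOM_LOW).
 */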
1220
1221static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1222{
1223 int idx, len;
1224
1225 len = po->rx_ring.prb_bdqc.knum_blocks;
1226 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1227 if (pow_off)
1228 idx += len >> pow_off;
1229 if (idx >= len)
1230 idx -= len;
1231 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1232}
77f65ebd 1233
2ccdbaa6 1234static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1235{
1236 struct sock *sk = &po->sk;
1237 int ret = ROOM_NONE;
1238
1239 if (po->prot_hook.func != tpacket_rcv) {
1240 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1241 - (skb ? skb->truesize : 0);
9954729b
WB
1242 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1243 return ROOM_NORMAL;
1244 else if (avail > 0)
1245 return ROOM_LOW;
1246 else
1247 return ROOM_NONE;
1248 }
77f65ebd 1249
9954729b
WB
1250 if (po->tp_version == TPACKET_V3) {
1251 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1252 ret = ROOM_NORMAL;
1253 else if (__tpacket_v3_has_room(po, 0))
1254 ret = ROOM_LOW;
1255 } else {
1256 if (__tpacket_has_room(po, ROOM_POW_OFF))
1257 ret = ROOM_NORMAL;
1258 else if (__tpacket_has_room(po, 0))
1259 ret = ROOM_LOW;
1260 }
2ccdbaa6
WB
1261
1262 return ret;
1263}
1264
1265static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1266{
1267 int ret;
1268 bool has_room;
1269
54d7c01d
WB
1270 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1271 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1272 has_room = ret == ROOM_NORMAL;
1273 if (po->pressure == has_room)
54d7c01d
WB
1274 po->pressure = !has_room;
1275 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1276
9954729b 1277 return ret;
77f65ebd
WB
1278}
1279
1da177e4
LT
1280static void packet_sock_destruct(struct sock *sk)
1281{
ed85b565
RC
1282 skb_queue_purge(&sk->sk_error_queue);
1283
547b792c 1284 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1285 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1286
1287 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1288 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1289 return;
1290 }
1291
17ab56a2 1292 sk_refcnt_debug_dec(sk);
1da177e4
LT
1293}
1294
3b3a5b0a
WB
1295static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 u32 rxhash;
1298 int i, count = 0;
1299
1300 rxhash = skb_get_hash(skb);
1301 for (i = 0; i < ROLLOVER_HLEN; i++)
1302 if (po->rollover->history[i] == rxhash)
1303 count++;
1304
1305 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1306 return count > (ROLLOVER_HLEN >> 1);
1307}
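/* The history[] above is a small probabilistic sample: every packet
 * overwrites one random slot with its rxhash, and a flow is treated as
 * "huge" when its hash already fills more than half of the ROLLOVER_HLEN
 * slots, i.e. a single flow dominates this socket's recent traffic.
 */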
1308
77f65ebd
WB
1309static unsigned int fanout_demux_hash(struct packet_fanout *f,
1310 struct sk_buff *skb,
1311 unsigned int num)
dc99f600 1312{
eb70db87 1313 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1314}
1315
77f65ebd
WB
1316static unsigned int fanout_demux_lb(struct packet_fanout *f,
1317 struct sk_buff *skb,
1318 unsigned int num)
dc99f600 1319{
468479e6 1320 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1321
468479e6 1322 return val % num;
77f65ebd
WB
1323}
1324
1325static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1326 struct sk_buff *skb,
1327 unsigned int num)
1328{
1329 return smp_processor_id() % num;
dc99f600
DM
1330}
1331
5df0ddfb
DB
1332static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1333 struct sk_buff *skb,
1334 unsigned int num)
1335{
f337db64 1336 return prandom_u32_max(num);
5df0ddfb
DB
1337}
1338
77f65ebd
WB
1339static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1340 struct sk_buff *skb,
ad377cab 1341 unsigned int idx, bool try_self,
77f65ebd 1342 unsigned int num)
95ec3eb4 1343{
4633c9e0 1344 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1345 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1346
0648ab70 1347 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1348
1349 if (try_self) {
1350 room = packet_rcv_has_room(po, skb);
1351 if (room == ROOM_NORMAL ||
1352 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1353 return idx;
4633c9e0 1354 po_skip = po;
3b3a5b0a 1355 }
ad377cab 1356
0648ab70 1357 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1358 do {
2ccdbaa6 1359 po_next = pkt_sk(f->arr[i]);
4633c9e0 1360 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1361 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1362 if (i != j)
0648ab70 1363 po->rollover->sock = i;
a9b63918
WB
1364 atomic_long_inc(&po->rollover->num);
1365 if (room == ROOM_LOW)
1366 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1367 return i;
1368 }
ad377cab 1369
77f65ebd
WB
1370 if (++i == num)
1371 i = 0;
1372 } while (i != j);
1373
a9b63918 1374 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1375 return idx;
1376}
1377
2d36097d
NH
1378static unsigned int fanout_demux_qm(struct packet_fanout *f,
1379 struct sk_buff *skb,
1380 unsigned int num)
1381{
1382 return skb_get_queue_mapping(skb) % num;
1383}
1384
47dceb8e
WB
1385static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1386 struct sk_buff *skb,
1387 unsigned int num)
1388{
1389 struct bpf_prog *prog;
1390 unsigned int ret = 0;
1391
1392 rcu_read_lock();
1393 prog = rcu_dereference(f->bpf_prog);
1394 if (prog)
ff936a04 1395 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1396 rcu_read_unlock();
1397
1398 return ret;
1399}
1400
77f65ebd
WB
1401static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1402{
1403 return f->flags & (flag >> 8);
95ec3eb4
DM
1404}
1405
95ec3eb4
DM
1406static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1407 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1408{
1409 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1410 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1411 struct net *net = read_pnet(&f->net);
dc99f600 1412 struct packet_sock *po;
77f65ebd 1413 unsigned int idx;
dc99f600 1414
19bcf9f2 1415 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1416 kfree_skb(skb);
1417 return 0;
1418 }
1419
3f34b24a 1420 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1421 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1422 if (!skb)
1423 return 0;
1424 }
95ec3eb4
DM
1425 switch (f->type) {
1426 case PACKET_FANOUT_HASH:
1427 default:
77f65ebd 1428 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1429 break;
1430 case PACKET_FANOUT_LB:
77f65ebd 1431 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1432 break;
1433 case PACKET_FANOUT_CPU:
77f65ebd
WB
1434 idx = fanout_demux_cpu(f, skb, num);
1435 break;
5df0ddfb
DB
1436 case PACKET_FANOUT_RND:
1437 idx = fanout_demux_rnd(f, skb, num);
1438 break;
2d36097d
NH
1439 case PACKET_FANOUT_QM:
1440 idx = fanout_demux_qm(f, skb, num);
1441 break;
77f65ebd 1442 case PACKET_FANOUT_ROLLOVER:
ad377cab 1443 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1444 break;
47dceb8e 1445 case PACKET_FANOUT_CBPF:
f2e52095 1446 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1447 idx = fanout_demux_bpf(f, skb, num);
1448 break;
dc99f600
DM
1449 }
1450
ad377cab
WB
1451 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1452 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1453
ad377cab 1454 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1455 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1456}
1457
fff3321d
PE
1458DEFINE_MUTEX(fanout_mutex);
1459EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1460static LIST_HEAD(fanout_list);
4a69a864 1461static u16 fanout_next_id;
dc99f600
DM
1462
1463static void __fanout_link(struct sock *sk, struct packet_sock *po)
1464{
1465 struct packet_fanout *f = po->fanout;
1466
1467 spin_lock(&f->lock);
1468 f->arr[f->num_members] = sk;
1469 smp_wmb();
1470 f->num_members++;
2bd624b4
AS
1471 if (f->num_members == 1)
1472 dev_add_pack(&f->prot_hook);
dc99f600
DM
1473 spin_unlock(&f->lock);
1474}
1475
1476static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1477{
1478 struct packet_fanout *f = po->fanout;
1479 int i;
1480
1481 spin_lock(&f->lock);
1482 for (i = 0; i < f->num_members; i++) {
1483 if (f->arr[i] == sk)
1484 break;
1485 }
1486 BUG_ON(i >= f->num_members);
1487 f->arr[i] = f->arr[f->num_members - 1];
1488 f->num_members--;
2bd624b4
AS
1489 if (f->num_members == 0)
1490 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1491 spin_unlock(&f->lock);
1492}
1493
d4dd8aee 1494static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1495{
161642e2
ED
1496 if (sk->sk_family != PF_PACKET)
1497 return false;
c0de08d0 1498
161642e2 1499 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1500}
1501
47dceb8e
WB
1502static void fanout_init_data(struct packet_fanout *f)
1503{
1504 switch (f->type) {
1505 case PACKET_FANOUT_LB:
1506 atomic_set(&f->rr_cur, 0);
1507 break;
1508 case PACKET_FANOUT_CBPF:
f2e52095 1509 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1510 RCU_INIT_POINTER(f->bpf_prog, NULL);
1511 break;
1512 }
1513}
1514
1515static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1516{
1517 struct bpf_prog *old;
1518
1519 spin_lock(&f->lock);
1520 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1521 rcu_assign_pointer(f->bpf_prog, new);
1522 spin_unlock(&f->lock);
1523
1524 if (old) {
1525 synchronize_net();
1526 bpf_prog_destroy(old);
1527 }
1528}
1529
1530static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1531 unsigned int len)
1532{
1533 struct bpf_prog *new;
1534 struct sock_fprog fprog;
1535 int ret;
1536
1537 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1538 return -EPERM;
1539 if (len != sizeof(fprog))
1540 return -EINVAL;
1541 if (copy_from_user(&fprog, data, len))
1542 return -EFAULT;
1543
bab18991 1544 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1545 if (ret)
1546 return ret;
1547
1548 __fanout_set_data_bpf(po->fanout, new);
1549 return 0;
1550}
1551
f2e52095
WB
1552static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1553 unsigned int len)
1554{
1555 struct bpf_prog *new;
1556 u32 fd;
1557
1558 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1559 return -EPERM;
1560 if (len != sizeof(fd))
1561 return -EINVAL;
1562 if (copy_from_user(&fd, data, len))
1563 return -EFAULT;
1564
113214be 1565 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1566 if (IS_ERR(new))
1567 return PTR_ERR(new);
f2e52095
WB
1568
1569 __fanout_set_data_bpf(po->fanout, new);
1570 return 0;
1571}
1572
47dceb8e
WB
1573static int fanout_set_data(struct packet_sock *po, char __user *data,
1574 unsigned int len)
1575{
1576 switch (po->fanout->type) {
1577 case PACKET_FANOUT_CBPF:
1578 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1579 case PACKET_FANOUT_EBPF:
1580 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1581 default:
1582 return -EINVAL;
07d53ae4 1583 }
47dceb8e
WB
1584}
1585
1586static void fanout_release_data(struct packet_fanout *f)
1587{
1588 switch (f->type) {
1589 case PACKET_FANOUT_CBPF:
f2e52095 1590 case PACKET_FANOUT_EBPF:
47dceb8e 1591 __fanout_set_data_bpf(f, NULL);
07d53ae4 1592 }
47dceb8e
WB
1593}
1594
4a69a864
MM
1595static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1596{
1597 struct packet_fanout *f;
1598
1599 list_for_each_entry(f, &fanout_list, list) {
1600 if (f->id == candidate_id &&
1601 read_pnet(&f->net) == sock_net(sk)) {
1602 return false;
1603 }
1604 }
1605 return true;
1606}
1607
1608static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1609{
1610 u16 id = fanout_next_id;
1611
1612 do {
1613 if (__fanout_id_is_free(sk, id)) {
1614 *new_id = id;
1615 fanout_next_id = id + 1;
1616 return true;
1617 }
1618
1619 id++;
1620 } while (id != fanout_next_id);
1621
1622 return false;
1623}
1624
7736d33f 1625static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1626{
d199fab6 1627 struct packet_rollover *rollover = NULL;
dc99f600
DM
1628 struct packet_sock *po = pkt_sk(sk);
1629 struct packet_fanout *f, *match;
7736d33f 1630 u8 type = type_flags & 0xff;
77f65ebd 1631 u8 flags = type_flags >> 8;
dc99f600
DM
1632 int err;
1633
1634 switch (type) {
77f65ebd
WB
1635 case PACKET_FANOUT_ROLLOVER:
1636 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1637 return -EINVAL;
dc99f600
DM
1638 case PACKET_FANOUT_HASH:
1639 case PACKET_FANOUT_LB:
95ec3eb4 1640 case PACKET_FANOUT_CPU:
5df0ddfb 1641 case PACKET_FANOUT_RND:
2d36097d 1642 case PACKET_FANOUT_QM:
47dceb8e 1643 case PACKET_FANOUT_CBPF:
f2e52095 1644 case PACKET_FANOUT_EBPF:
dc99f600
DM
1645 break;
1646 default:
1647 return -EINVAL;
1648 }
1649
d199fab6
ED
1650 mutex_lock(&fanout_mutex);
1651
d199fab6 1652 err = -EALREADY;
dc99f600 1653 if (po->fanout)
d199fab6 1654 goto out;
dc99f600 1655
4633c9e0
WB
1656 if (type == PACKET_FANOUT_ROLLOVER ||
1657 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1658 err = -ENOMEM;
1659 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1660 if (!rollover)
1661 goto out;
1662 atomic_long_set(&rollover->num, 0);
1663 atomic_long_set(&rollover->num_huge, 0);
1664 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1665 }
1666
4a69a864
MM
1667 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1668 if (id != 0) {
1669 err = -EINVAL;
1670 goto out;
1671 }
1672 if (!fanout_find_new_id(sk, &id)) {
1673 err = -ENOMEM;
1674 goto out;
1675 }
1676 /* ephemeral flag for the first socket in the group: drop it */
1677 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1678 }
1679
dc99f600
DM
1680 match = NULL;
1681 list_for_each_entry(f, &fanout_list, list) {
1682 if (f->id == id &&
1683 read_pnet(&f->net) == sock_net(sk)) {
1684 match = f;
1685 break;
1686 }
1687 }
afe62c68 1688 err = -EINVAL;
77f65ebd 1689 if (match && match->flags != flags)
afe62c68 1690 goto out;
dc99f600 1691 if (!match) {
afe62c68 1692 err = -ENOMEM;
dc99f600 1693 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1694 if (!match)
1695 goto out;
1696 write_pnet(&match->net, sock_net(sk));
1697 match->id = id;
1698 match->type = type;
77f65ebd 1699 match->flags = flags;
afe62c68
ED
1700 INIT_LIST_HEAD(&match->list);
1701 spin_lock_init(&match->lock);
fb5c2c17 1702 refcount_set(&match->sk_ref, 0);
47dceb8e 1703 fanout_init_data(match);
afe62c68
ED
1704 match->prot_hook.type = po->prot_hook.type;
1705 match->prot_hook.dev = po->prot_hook.dev;
1706 match->prot_hook.func = packet_rcv_fanout;
1707 match->prot_hook.af_packet_priv = match;
c0de08d0 1708 match->prot_hook.id_match = match_fanout_group;
afe62c68 1709 list_add(&match->list, &fanout_list);
dc99f600 1710 }
afe62c68 1711 err = -EINVAL;
008ba2a1
WB
1712
1713 spin_lock(&po->bind_lock);
1714 if (po->running &&
1715 match->type == type &&
afe62c68
ED
1716 match->prot_hook.type == po->prot_hook.type &&
1717 match->prot_hook.dev == po->prot_hook.dev) {
1718 err = -ENOSPC;
fb5c2c17 1719 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1720 __dev_remove_pack(&po->prot_hook);
1721 po->fanout = match;
57f015f5
MM
1722 po->rollover = rollover;
1723 rollover = NULL;
fb5c2c17 1724 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1725 __fanout_link(sk, po);
1726 err = 0;
dc99f600
DM
1727 }
1728 }
008ba2a1
WB
1729 spin_unlock(&po->bind_lock);
1730
1731 if (err && !refcount_read(&match->sk_ref)) {
1732 list_del(&match->list);
1733 kfree(match);
1734 }
1735
afe62c68 1736out:
57f015f5 1737 kfree(rollover);
d199fab6 1738 mutex_unlock(&fanout_mutex);
dc99f600
DM
1739 return err;
1740}
1741
2bd624b4
AS
1742/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1743 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1744 * It is the responsibility of the caller to call fanout_release_data() and
1745 * free the returned packet_fanout (after synchronize_net())
1746 */
1747static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1748{
1749 struct packet_sock *po = pkt_sk(sk);
1750 struct packet_fanout *f;
1751
fff3321d 1752 mutex_lock(&fanout_mutex);
d199fab6
ED
1753 f = po->fanout;
1754 if (f) {
1755 po->fanout = NULL;
1756
fb5c2c17 1757 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1758 list_del(&f->list);
2bd624b4
AS
1759 else
1760 f = NULL;
dc99f600
DM
1761 }
1762 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1763
1764 return f;
dc99f600 1765}
1da177e4 1766
3c70c132
DB
1767static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1768 struct sk_buff *skb)
1769{
1770 /* Earlier code assumed this would be a VLAN pkt, double-check
1771 * this now that we have the actual packet in hand. We can only
1772 * do this check on Ethernet devices.
1773 */
1774 if (unlikely(dev->type != ARPHRD_ETHER))
1775 return false;
1776
1777 skb_reset_mac_header(skb);
1778 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1779}
1780
90ddc4f0 1781static const struct proto_ops packet_ops;
1da177e4 1782
90ddc4f0 1783static const struct proto_ops packet_ops_spkt;
1da177e4 1784
40d4e3df
ED
1785static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1786 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1787{
1788 struct sock *sk;
1789 struct sockaddr_pkt *spkt;
1790
1791 /*
1792 * When we registered the protocol we saved the socket in the data
1793 * field for just this event.
1794 */
1795
1796 sk = pt->af_packet_priv;
1ce4f28b 1797
1da177e4
LT
1798 /*
1799 * Yank back the headers [hope the device set this
1800 * right or kerboom...]
1801 *
1802 * Incoming packets have ll header pulled,
1803 * push it back.
1804 *
98e399f8 1805 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1806 * so that this procedure is a no-op.
1807 */
1808
1809 if (skb->pkt_type == PACKET_LOOPBACK)
1810 goto out;
1811
09ad9bc7 1812 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1813 goto out;
1814
40d4e3df
ED
1815 skb = skb_share_check(skb, GFP_ATOMIC);
1816 if (skb == NULL)
1da177e4
LT
1817 goto oom;
1818
1819 /* drop any routing info */
adf30907 1820 skb_dst_drop(skb);
1da177e4 1821
84531c24
PO
1822 /* drop conntrack reference */
1823 nf_reset(skb);
1824
ffbc6111 1825 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1826
98e399f8 1827 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1828
1829 /*
1830 * The SOCK_PACKET socket receives _all_ frames.
1831 */
1832
1833 spkt->spkt_family = dev->type;
1834 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1835 spkt->spkt_protocol = skb->protocol;
1836
1837 /*
1838 * Charge the memory to the socket. This is done specifically
1839 * to prevent sockets from using up all the memory.
1840 */
1841
40d4e3df 1842 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1843 return 0;
1844
1845out:
1846 kfree_skb(skb);
1847oom:
1848 return 0;
1849}
1850
75c65772
MM
1851static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1852{
18bed891
YK
1853 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1854 sock->type == SOCK_RAW) {
75c65772
MM
1855 skb_reset_mac_header(skb);
1856 skb->protocol = dev_parse_header_protocol(skb);
1857 }
1858
1859 skb_probe_transport_header(skb);
1860}
1da177e4
LT
1861
1862/*
1863 * Output a raw packet to a device layer. This bypasses all the other
1864 * protocol layers and you must therefore supply it with a complete frame
1865 */
1ce4f28b 1866
1b784140
YX
1867static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1868 size_t len)
1da177e4
LT
1869{
1870 struct sock *sk = sock->sk;
342dfc30 1871 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1872 struct sk_buff *skb = NULL;
1da177e4 1873 struct net_device *dev;
c14ac945 1874 struct sockcm_cookie sockc;
40d4e3df 1875 __be16 proto = 0;
1da177e4 1876 int err;
3bdc0eba 1877 int extra_len = 0;
1ce4f28b 1878
1da177e4 1879 /*
1ce4f28b 1880 * Get and verify the address.
1da177e4
LT
1881 */
1882
40d4e3df 1883 if (saddr) {
1da177e4 1884 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1885 return -EINVAL;
1886 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1887 proto = saddr->spkt_protocol;
1888 } else
1889 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1890
1891 /*
1ce4f28b 1892 * Find the device first to size check it
1da177e4
LT
1893 */
1894
de74e92a 1895 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1896retry:
654d1f8a
ED
1897 rcu_read_lock();
1898 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1899 err = -ENODEV;
1900 if (dev == NULL)
1901 goto out_unlock;
1ce4f28b 1902
d5e76b0a
DM
1903 err = -ENETDOWN;
1904 if (!(dev->flags & IFF_UP))
1905 goto out_unlock;
1906
1da177e4 1907 /*
40d4e3df
ED
1908 * You may not queue a frame bigger than the mtu. This is the lowest level
1909 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1910 */
1ce4f28b 1911
3bdc0eba
BG
1912 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1913 if (!netif_supports_nofcs(dev)) {
1914 err = -EPROTONOSUPPORT;
1915 goto out_unlock;
1916 }
1917 extra_len = 4; /* We're doing our own CRC */
1918 }
1919
1da177e4 1920 err = -EMSGSIZE;
3bdc0eba 1921 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1922 goto out_unlock;
1923
1a35ca80
ED
1924 if (!skb) {
1925 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1926 int tlen = dev->needed_tailroom;
1a35ca80
ED
1927 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1928
1929 rcu_read_unlock();
4ce40912 1930 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1931 if (skb == NULL)
1932 return -ENOBUFS;
1933 /* FIXME: Save some space for broken drivers that write a hard
1934 * header at transmission time by themselves. PPP is the notable
1935 * one here. This should really be fixed at the driver level.
1936 */
1937 skb_reserve(skb, reserved);
1938 skb_reset_network_header(skb);
1939
1940 /* Try to align data part correctly */
1941 if (hhlen) {
1942 skb->data -= hhlen;
1943 skb->tail -= hhlen;
1944 if (len < hhlen)
1945 skb_reset_network_header(skb);
1946 }
6ce8e9ce 1947 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1948 if (err)
1949 goto out_free;
1950 goto retry;
1da177e4
LT
1951 }
1952
9ed988cd
WB
1953 if (!dev_validate_header(dev, skb->data, len)) {
1954 err = -EINVAL;
1955 goto out_unlock;
1956 }
3c70c132
DB
1957 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1958 !packet_extra_vlan_len_allowed(dev, skb)) {
1959 err = -EMSGSIZE;
1960 goto out_unlock;
57f89bfa 1961 }
1a35ca80 1962
657a0667 1963 sockcm_init(&sockc, sk);
c14ac945
SHY
1964 if (msg->msg_controllen) {
1965 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1966 if (unlikely(err))
c14ac945 1967 goto out_unlock;
c14ac945
SHY
1968 }
1969
1da177e4
LT
1970 skb->protocol = proto;
1971 skb->dev = dev;
1972 skb->priority = sk->sk_priority;
2d37a186 1973 skb->mark = sk->sk_mark;
3d0ba8c0 1974 skb->tstamp = sockc.transmit_time;
bf84a010 1975
8f932f76 1976 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1977
3bdc0eba
BG
1978 if (unlikely(extra_len == 4))
1979 skb->no_fcs = 1;
1980
75c65772 1981 packet_parse_headers(skb, sock);
c1aad275 1982
1da177e4 1983 dev_queue_xmit(skb);
654d1f8a 1984 rcu_read_unlock();
40d4e3df 1985 return len;
1da177e4 1986
1da177e4 1987out_unlock:
654d1f8a 1988 rcu_read_unlock();
1a35ca80
ED
1989out_free:
1990 kfree_skb(skb);
1da177e4
LT
1991 return err;
1992}
1da177e4 1993
ff936a04
AS
1994static unsigned int run_filter(struct sk_buff *skb,
1995 const struct sock *sk,
1996 unsigned int res)
1da177e4
LT
1997{
1998 struct sk_filter *filter;
fda9ef5d 1999
80f8f102
ED
2000 rcu_read_lock();
2001 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2002 if (filter != NULL)
ff936a04 2003 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2004 rcu_read_unlock();
1da177e4 2005
dbcb5855 2006 return res;
1da177e4
LT
2007}
2008
16cc1400
WB
2009static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2010 size_t *len)
2011{
2012 struct virtio_net_hdr vnet_hdr;
2013
2014 if (*len < sizeof(vnet_hdr))
2015 return -EINVAL;
2016 *len -= sizeof(vnet_hdr);
2017
fd3a8862 2018 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2019 return -EINVAL;
2020
2021 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2022}
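/* Receive-side sketch (assumes PACKET_VNET_HDR was enabled on the socket;
 * "fd" and the buffer sizes are illustrative): the bytes returned by
 * recvmsg() start with the virtio_net_hdr written above, followed by the
 * frame itself, e.g.
 *
 *	struct virtio_net_hdr vnet;
 *	char frame[2048];
 *	struct iovec iov[2] = {
 *		{ &vnet, sizeof(vnet) },
 *		{ frame, sizeof(frame) },
 *	};
 *	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };
 *
 *	recvmsg(fd, &msg, 0);
 */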
2023
1da177e4 2024/*
62ab0812
ED
2025 * This function performs lazy skb cloning in the hope that most packets
2026 * are discarded by BPF.
2027 *
2028 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2029 * and skb->cb are mangled. It works because (and until) packets
2030 * falling here are owned by current CPU. Output packets are cloned
2031 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2032 * sequentially, so that if we return the skb to its original state on exit,
2033 * we will not harm anyone.
1da177e4
LT
2034 */
2035
40d4e3df
ED
2036static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2037 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2038{
2039 struct sock *sk;
2040 struct sockaddr_ll *sll;
2041 struct packet_sock *po;
40d4e3df 2042 u8 *skb_head = skb->data;
1da177e4 2043 int skb_len = skb->len;
dbcb5855 2044 unsigned int snaplen, res;
da37845f 2045 bool is_drop_n_account = false;
1da177e4
LT
2046
2047 if (skb->pkt_type == PACKET_LOOPBACK)
2048 goto drop;
2049
2050 sk = pt->af_packet_priv;
2051 po = pkt_sk(sk);
2052
09ad9bc7 2053 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2054 goto drop;
2055
1da177e4
LT
2056 skb->dev = dev;
2057
3b04ddde 2058 if (dev->header_ops) {
1da177e4 2059 /* The device has an explicit notion of ll header,
62ab0812
ED
2060 * exported to higher levels.
2061 *
2062 * Otherwise, the device hides details of its frame
2063 * structure, so that the corresponding packet head is
2064 * never delivered to the user.
1da177e4
LT
2065 */
2066 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2067 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2068 else if (skb->pkt_type == PACKET_OUTGOING) {
2069 /* Special case: outgoing packets have ll header at head */
bbe735e4 2070 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2071 }
2072 }
2073
2074 snaplen = skb->len;
2075
dbcb5855
DM
2076 res = run_filter(skb, sk, snaplen);
2077 if (!res)
fda9ef5d 2078 goto drop_n_restore;
dbcb5855
DM
2079 if (snaplen > res)
2080 snaplen = res;
1da177e4 2081
0fd7bac6 2082 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2083 goto drop_n_acct;
2084
2085 if (skb_shared(skb)) {
2086 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2087 if (nskb == NULL)
2088 goto drop_n_acct;
2089
2090 if (skb_head != skb->data) {
2091 skb->data = skb_head;
2092 skb->len = skb_len;
2093 }
abc4e4fa 2094 consume_skb(skb);
1da177e4
LT
2095 skb = nskb;
2096 }
2097
b4772ef8 2098 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2099
2100 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2101 sll->sll_hatype = dev->type;
1da177e4 2102 sll->sll_pkttype = skb->pkt_type;
8032b464 2103 if (unlikely(po->origdev))
80feaacb
PWJ
2104 sll->sll_ifindex = orig_dev->ifindex;
2105 else
2106 sll->sll_ifindex = dev->ifindex;
1da177e4 2107
b95cce35 2108 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2109
2472d761
EB
2110 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2111 * Use their space for storing the original skb length.
2112 */
2113 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2114
1da177e4
LT
2115 if (pskb_trim(skb, snaplen))
2116 goto drop_n_acct;
2117
2118 skb_set_owner_r(skb, sk);
2119 skb->dev = NULL;
adf30907 2120 skb_dst_drop(skb);
1da177e4 2121
84531c24
PO
2122 /* drop conntrack reference */
2123 nf_reset(skb);
2124
1da177e4 2125 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2126 po->stats.stats1.tp_packets++;
3bc3b96f 2127 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2128 __skb_queue_tail(&sk->sk_receive_queue, skb);
2129 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2130 sk->sk_data_ready(sk);
1da177e4
LT
2131 return 0;
2132
2133drop_n_acct:
da37845f 2134 is_drop_n_account = true;
7091fbd8 2135 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2136 po->stats.stats1.tp_drops++;
7091fbd8
WB
2137 atomic_inc(&sk->sk_drops);
2138 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2139
2140drop_n_restore:
2141 if (skb_head != skb->data && skb_shared(skb)) {
2142 skb->data = skb_head;
2143 skb->len = skb_len;
2144 }
2145drop:
da37845f
WJ
2146 if (!is_drop_n_account)
2147 consume_skb(skb);
2148 else
2149 kfree_skb(skb);
1da177e4
LT
2150 return 0;
2151}
2152
40d4e3df
ED
2153static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2154 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2155{
2156 struct sock *sk;
2157 struct packet_sock *po;
2158 struct sockaddr_ll *sll;
184f489e 2159 union tpacket_uhdr h;
40d4e3df 2160 u8 *skb_head = skb->data;
1da177e4 2161 int skb_len = skb->len;
dbcb5855 2162 unsigned int snaplen, res;
f6fb8f10 2163 unsigned long status = TP_STATUS_USER;
bbd6ef87 2164 unsigned short macoff, netoff, hdrlen;
1da177e4 2165 struct sk_buff *copy_skb = NULL;
bbd6ef87 2166 struct timespec ts;
b9c32fb2 2167 __u32 ts_status;
da37845f 2168 bool is_drop_n_account = false;
edbd58be 2169 bool do_vnet = false;
1da177e4 2170
51846355
AW
2171 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2172 * We may add members to them up to the current aligned size without forcing
2173 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2174 */
2175 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2176 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2177
1da177e4
LT
2178 if (skb->pkt_type == PACKET_LOOPBACK)
2179 goto drop;
2180
2181 sk = pt->af_packet_priv;
2182 po = pkt_sk(sk);
2183
09ad9bc7 2184 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2185 goto drop;
2186
3b04ddde 2187 if (dev->header_ops) {
1da177e4 2188 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2189 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2190 else if (skb->pkt_type == PACKET_OUTGOING) {
2191 /* Special case: outgoing packets have ll header at head */
bbe735e4 2192 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2193 }
2194 }
2195
2196 snaplen = skb->len;
2197
dbcb5855
DM
2198 res = run_filter(skb, sk, snaplen);
2199 if (!res)
fda9ef5d 2200 goto drop_n_restore;
68c2e5de
AD
2201
2202 if (skb->ip_summed == CHECKSUM_PARTIAL)
2203 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2204 else if (skb->pkt_type != PACKET_OUTGOING &&
2205 (skb->ip_summed == CHECKSUM_COMPLETE ||
2206 skb_csum_unnecessary(skb)))
2207 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2208
dbcb5855
DM
2209 if (snaplen > res)
2210 snaplen = res;
1da177e4
LT
2211
2212 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2213 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2214 po->tp_reserve;
1da177e4 2215 } else {
95c96174 2216 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2217 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2218 (maclen < 16 ? 16 : maclen)) +
58d19b19 2219 po->tp_reserve;
edbd58be 2220 if (po->has_vnet_hdr) {
58d19b19 2221 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2222 do_vnet = true;
2223 }
1da177e4
LT
2224 macoff = netoff - maclen;
2225 }
f6fb8f10 2226 if (po->tp_version <= TPACKET_V2) {
2227 if (macoff + snaplen > po->rx_ring.frame_size) {
2228 if (po->copy_thresh &&
0fd7bac6 2229 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2230 if (skb_shared(skb)) {
2231 copy_skb = skb_clone(skb, GFP_ATOMIC);
2232 } else {
2233 copy_skb = skb_get(skb);
2234 skb_head = skb->data;
2235 }
2236 if (copy_skb)
2237 skb_set_owner_r(copy_skb, sk);
1da177e4 2238 }
f6fb8f10 2239 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2240 if ((int)snaplen < 0) {
f6fb8f10 2241 snaplen = 0;
edbd58be
BP
2242 do_vnet = false;
2243 }
1da177e4 2244 }
dc808110
ED
2245 } else if (unlikely(macoff + snaplen >
2246 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2247 u32 nval;
2248
2249 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2250 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2251 snaplen, nval, macoff);
2252 snaplen = nval;
2253 if (unlikely((int)snaplen < 0)) {
2254 snaplen = 0;
2255 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2256 do_vnet = false;
dc808110 2257 }
1da177e4 2258 }
1da177e4 2259 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2260 h.raw = packet_current_rx_frame(po, skb,
2261 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2262 if (!h.raw)
58d19b19 2263 goto drop_n_account;
f6fb8f10 2264 if (po->tp_version <= TPACKET_V2) {
2265 packet_increment_rx_head(po, &po->rx_ring);
2266 /*
2267 * LOSING will be reported till you read the stats,
2268 * because it's COR - Clear On Read.
2269 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2270 * at the packet level.
2271 */
ee80fbf3 2272 if (po->stats.stats1.tp_drops)
f6fb8f10 2273 status |= TP_STATUS_LOSING;
2274 }
945d015e
ED
2275
2276 if (do_vnet &&
2277 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2278 sizeof(struct virtio_net_hdr),
2279 vio_le(), true, 0))
2280 goto drop_n_account;
2281
ee80fbf3 2282 po->stats.stats1.tp_packets++;
1da177e4
LT
2283 if (copy_skb) {
2284 status |= TP_STATUS_COPY;
2285 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2286 }
1da177e4
LT
2287 spin_unlock(&sk->sk_receive_queue.lock);
2288
bbd6ef87 2289 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2290
2291 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2292 getnstimeofday(&ts);
1da177e4 2293
b9c32fb2
DB
2294 status |= ts_status;
2295
bbd6ef87
PM
2296 switch (po->tp_version) {
2297 case TPACKET_V1:
2298 h.h1->tp_len = skb->len;
2299 h.h1->tp_snaplen = snaplen;
2300 h.h1->tp_mac = macoff;
2301 h.h1->tp_net = netoff;
4b457bdf
DB
2302 h.h1->tp_sec = ts.tv_sec;
2303 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2304 hdrlen = sizeof(*h.h1);
2305 break;
2306 case TPACKET_V2:
2307 h.h2->tp_len = skb->len;
2308 h.h2->tp_snaplen = snaplen;
2309 h.h2->tp_mac = macoff;
2310 h.h2->tp_net = netoff;
bbd6ef87
PM
2311 h.h2->tp_sec = ts.tv_sec;
2312 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2313 if (skb_vlan_tag_present(skb)) {
2314 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2315 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2316 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2317 } else {
2318 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2319 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2320 }
e4d26f4b 2321 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2322 hdrlen = sizeof(*h.h2);
2323 break;
f6fb8f10 2324 case TPACKET_V3:
2325 /* tp_next_offset and the vlan fields are already populated above,
2326 * so DON'T clear those fields here.
2327 */
2328 h.h3->tp_status |= status;
2329 h.h3->tp_len = skb->len;
2330 h.h3->tp_snaplen = snaplen;
2331 h.h3->tp_mac = macoff;
2332 h.h3->tp_net = netoff;
f6fb8f10 2333 h.h3->tp_sec = ts.tv_sec;
2334 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2335 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2336 hdrlen = sizeof(*h.h3);
2337 break;
bbd6ef87
PM
2338 default:
2339 BUG();
2340 }
1da177e4 2341
bbd6ef87 2342 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2343 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2344 sll->sll_family = AF_PACKET;
2345 sll->sll_hatype = dev->type;
2346 sll->sll_protocol = skb->protocol;
2347 sll->sll_pkttype = skb->pkt_type;
8032b464 2348 if (unlikely(po->origdev))
80feaacb
PWJ
2349 sll->sll_ifindex = orig_dev->ifindex;
2350 else
2351 sll->sll_ifindex = dev->ifindex;
1da177e4 2352
e16aa207 2353 smp_mb();
f0d4eb29 2354
f6dafa95 2355#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2356 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2357 u8 *start, *end;
2358
f0d4eb29
DB
2359 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2360 macoff + snaplen);
2361
2362 for (start = h.raw; start < end; start += PAGE_SIZE)
2363 flush_dcache_page(pgv_to_page(start));
1da177e4 2364 }
f0d4eb29 2365 smp_wmb();
f6dafa95 2366#endif
f0d4eb29 2367
da413eec 2368 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2369 __packet_set_status(po, h.raw, status);
da413eec
DC
2370 sk->sk_data_ready(sk);
2371 } else {
f6fb8f10 2372 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2373 }
1da177e4
LT
2374
2375drop_n_restore:
2376 if (skb_head != skb->data && skb_shared(skb)) {
2377 skb->data = skb_head;
2378 skb->len = skb_len;
2379 }
2380drop:
da37845f
WJ
2381 if (!is_drop_n_account)
2382 consume_skb(skb);
2383 else
2384 kfree_skb(skb);
1da177e4
LT
2385 return 0;
2386
58d19b19 2387drop_n_account:
da37845f 2388 is_drop_n_account = true;
ee80fbf3 2389 po->stats.stats1.tp_drops++;
1da177e4
LT
2390 spin_unlock(&sk->sk_receive_queue.lock);
2391
676d2369 2392 sk->sk_data_ready(sk);
acb5d75b 2393 kfree_skb(copy_skb);
1da177e4
LT
2394 goto drop_n_restore;
2395}
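/* RX ring consumer sketch (userspace, TPACKET_V2 assumed, details in the
 * packet_mmap documentation; "frame" and "pfd" are illustrative, with
 * "frame" walking the mmap()ed ring):
 *
 *	struct tpacket2_hdr *hdr = frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// payload is tp_snaplen bytes at frame + hdr->tp_mac
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 *
 * tpacket_rcv() above fills the slot and flips the status the other way.
 */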
2396
69e3c75f
JB
2397static void tpacket_destruct_skb(struct sk_buff *skb)
2398{
2399 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2400
69e3c75f 2401 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2402 void *ph;
b9c32fb2
DB
2403 __u32 ts;
2404
5cd8d46e 2405 ph = skb_zcopy_get_nouarg(skb);
b0138408 2406 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2407
2408 ts = __packet_set_timestamp(po, ph, skb);
2409 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2410 }
2411
2412 sock_wfree(skb);
2413}
2414
16cc1400
WB
2415static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2416{
16cc1400
WB
2417 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2418 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2419 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2421 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2422 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2424
2425 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2426 return -EINVAL;
2427
16cc1400
WB
2428 return 0;
2429}
2430
2431static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2432 struct virtio_net_hdr *vnet_hdr)
2433{
16cc1400
WB
2434 if (*len < sizeof(*vnet_hdr))
2435 return -EINVAL;
2436 *len -= sizeof(*vnet_hdr);
2437
cbbd26b8 2438 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2439 return -EFAULT;
2440
2441 return __packet_snd_vnet_parse(vnet_hdr, *len);
2442}
2443
40d4e3df 2444static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2445 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2446 __be16 proto, unsigned char *addr, int hlen, int copylen,
2447 const struct sockcm_cookie *sockc)
69e3c75f 2448{
184f489e 2449 union tpacket_uhdr ph;
8d39b4a6 2450 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2451 struct socket *sock = po->sk.sk_socket;
2452 struct page *page;
69e3c75f
JB
2453 int err;
2454
2455 ph.raw = frame;
2456
2457 skb->protocol = proto;
2458 skb->dev = dev;
2459 skb->priority = po->sk.sk_priority;
2d37a186 2460 skb->mark = po->sk.sk_mark;
3d0ba8c0 2461 skb->tstamp = sockc->transmit_time;
8f932f76 2462 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2463 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2464
ae641949 2465 skb_reserve(skb, hlen);
69e3c75f 2466 skb_reset_network_header(skb);
c1aad275 2467
69e3c75f
JB
2468 to_write = tp_len;
2469
2470 if (sock->type == SOCK_DGRAM) {
2471 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2472 NULL, tp_len);
2473 if (unlikely(err < 0))
2474 return -EINVAL;
1d036d25 2475 } else if (copylen) {
9ed988cd
WB
2476 int hdrlen = min_t(int, copylen, tp_len);
2477
69e3c75f 2478 skb_push(skb, dev->hard_header_len);
1d036d25 2479 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2480 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2481 if (unlikely(err))
2482 return err;
9ed988cd
WB
2483 if (!dev_validate_header(dev, skb->data, hdrlen))
2484 return -EINVAL;
69e3c75f 2485
9ed988cd
WB
2486 data += hdrlen;
2487 to_write -= hdrlen;
69e3c75f
JB
2488 }
2489
69e3c75f
JB
2490 offset = offset_in_page(data);
2491 len_max = PAGE_SIZE - offset;
2492 len = ((to_write > len_max) ? len_max : to_write);
2493
2494 skb->data_len = to_write;
2495 skb->len += to_write;
2496 skb->truesize += to_write;
14afee4b 2497 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2498
2499 while (likely(to_write)) {
2500 nr_frags = skb_shinfo(skb)->nr_frags;
2501
2502 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2503 pr_err("Packet exceed the number of skb frags(%lu)\n",
2504 MAX_SKB_FRAGS);
69e3c75f
JB
2505 return -EFAULT;
2506 }
2507
0af55bb5
CG
2508 page = pgv_to_page(data);
2509 data += len;
69e3c75f
JB
2510 flush_dcache_page(page);
2511 get_page(page);
0af55bb5 2512 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2513 to_write -= len;
2514 offset = 0;
2515 len_max = PAGE_SIZE;
2516 len = ((to_write > len_max) ? len_max : to_write);
2517 }
2518
75c65772 2519 packet_parse_headers(skb, sock);
efdfa2f7 2520
69e3c75f
JB
2521 return tp_len;
2522}
2523
8d39b4a6
WB
2524static int tpacket_parse_header(struct packet_sock *po, void *frame,
2525 int size_max, void **data)
2526{
2527 union tpacket_uhdr ph;
2528 int tp_len, off;
2529
2530 ph.raw = frame;
2531
2532 switch (po->tp_version) {
7f953ab2
SV
2533 case TPACKET_V3:
2534 if (ph.h3->tp_next_offset != 0) {
2535 pr_warn_once("variable sized slot not supported");
2536 return -EINVAL;
2537 }
2538 tp_len = ph.h3->tp_len;
2539 break;
8d39b4a6
WB
2540 case TPACKET_V2:
2541 tp_len = ph.h2->tp_len;
2542 break;
2543 default:
2544 tp_len = ph.h1->tp_len;
2545 break;
2546 }
2547 if (unlikely(tp_len > size_max)) {
2548 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2549 return -EMSGSIZE;
2550 }
2551
2552 if (unlikely(po->tp_tx_has_off)) {
2553 int off_min, off_max;
2554
2555 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2556 off_max = po->tx_ring.frame_size - tp_len;
2557 if (po->sk.sk_type == SOCK_DGRAM) {
2558 switch (po->tp_version) {
7f953ab2
SV
2559 case TPACKET_V3:
2560 off = ph.h3->tp_net;
2561 break;
8d39b4a6
WB
2562 case TPACKET_V2:
2563 off = ph.h2->tp_net;
2564 break;
2565 default:
2566 off = ph.h1->tp_net;
2567 break;
2568 }
2569 } else {
2570 switch (po->tp_version) {
7f953ab2
SV
2571 case TPACKET_V3:
2572 off = ph.h3->tp_mac;
2573 break;
8d39b4a6
WB
2574 case TPACKET_V2:
2575 off = ph.h2->tp_mac;
2576 break;
2577 default:
2578 off = ph.h1->tp_mac;
2579 break;
2580 }
2581 }
2582 if (unlikely((off < off_min) || (off_max < off)))
2583 return -EINVAL;
2584 } else {
2585 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2586 }
2587
2588 *data = frame + off;
2589 return tp_len;
2590}
2591
69e3c75f
JB
2592static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2593{
69e3c75f
JB
2594 struct sk_buff *skb;
2595 struct net_device *dev;
1d036d25 2596 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2597 struct sockcm_cookie sockc;
69e3c75f 2598 __be16 proto;
09effa67 2599 int err, reserve = 0;
40d4e3df 2600 void *ph;
342dfc30 2601 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2602 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2603 unsigned char *addr = NULL;
69e3c75f 2604 int tp_len, size_max;
8d39b4a6 2605 void *data;
69e3c75f 2606 int len_sum = 0;
9e67030a 2607 int status = TP_STATUS_AVAILABLE;
1d036d25 2608 int hlen, tlen, copylen = 0;
69e3c75f 2609
69e3c75f
JB
2610 mutex_lock(&po->pg_vec_lock);
2611
66e56cd4 2612 if (likely(saddr == NULL)) {
e40526cb 2613 dev = packet_cached_dev_get(po);
69e3c75f 2614 proto = po->num;
69e3c75f
JB
2615 } else {
2616 err = -EINVAL;
2617 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2618 goto out;
2619 if (msg->msg_namelen < (saddr->sll_halen
2620 + offsetof(struct sockaddr_ll,
2621 sll_addr)))
2622 goto out;
69e3c75f 2623 proto = saddr->sll_protocol;
827d9780 2624 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2625 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2626 if (dev && msg->msg_namelen < dev->addr_len +
2627 offsetof(struct sockaddr_ll, sll_addr))
2628 goto out_put;
2629 addr = saddr->sll_addr;
2630 }
69e3c75f
JB
2631 }
2632
69e3c75f
JB
2633 err = -ENXIO;
2634 if (unlikely(dev == NULL))
2635 goto out;
69e3c75f
JB
2636 err = -ENETDOWN;
2637 if (unlikely(!(dev->flags & IFF_UP)))
2638 goto out_put;
2639
657a0667 2640 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2641 if (msg->msg_controllen) {
2642 err = sock_cmsg_send(&po->sk, msg, &sockc);
2643 if (unlikely(err))
2644 goto out_put;
2645 }
2646
5cfb4c8d
DB
2647 if (po->sk.sk_socket->type == SOCK_RAW)
2648 reserve = dev->hard_header_len;
69e3c75f 2649 size_max = po->tx_ring.frame_size
b5dd884e 2650 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2651
1d036d25 2652 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2653 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2654
69e3c75f
JB
2655 do {
2656 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2657 TP_STATUS_SEND_REQUEST);
69e3c75f 2658 if (unlikely(ph == NULL)) {
87a2fd28
DB
2659 if (need_wait && need_resched())
2660 schedule();
69e3c75f
JB
2661 continue;
2662 }
2663
8d39b4a6
WB
2664 skb = NULL;
2665 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2666 if (tp_len < 0)
2667 goto tpacket_error;
2668
69e3c75f 2669 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2670 hlen = LL_RESERVED_SPACE(dev);
2671 tlen = dev->needed_tailroom;
1d036d25
WB
2672 if (po->has_vnet_hdr) {
2673 vnet_hdr = data;
2674 data += sizeof(*vnet_hdr);
2675 tp_len -= sizeof(*vnet_hdr);
2676 if (tp_len < 0 ||
2677 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2678 tp_len = -EINVAL;
2679 goto tpacket_error;
2680 }
2681 copylen = __virtio16_to_cpu(vio_le(),
2682 vnet_hdr->hdr_len);
2683 }
9ed988cd 2684 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2685 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2686 hlen + tlen + sizeof(struct sockaddr_ll) +
2687 (copylen - dev->hard_header_len),
fbf33a28 2688 !need_wait, &err);
69e3c75f 2689
fbf33a28
KM
2690 if (unlikely(skb == NULL)) {
2691 /* we assume the socket was initially writeable ... */
2692 if (likely(len_sum > 0))
2693 err = len_sum;
69e3c75f 2694 goto out_status;
fbf33a28 2695 }
8d39b4a6 2696 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2697 addr, hlen, copylen, &sockc);
dbd46ab4 2698 if (likely(tp_len >= 0) &&
5cfb4c8d 2699 tp_len > dev->mtu + reserve &&
1d036d25 2700 !po->has_vnet_hdr &&
3c70c132
DB
2701 !packet_extra_vlan_len_allowed(dev, skb))
2702 tp_len = -EMSGSIZE;
69e3c75f
JB
2703
2704 if (unlikely(tp_len < 0)) {
8d39b4a6 2705tpacket_error:
69e3c75f
JB
2706 if (po->tp_loss) {
2707 __packet_set_status(po, ph,
2708 TP_STATUS_AVAILABLE);
2709 packet_increment_head(&po->tx_ring);
2710 kfree_skb(skb);
2711 continue;
2712 } else {
2713 status = TP_STATUS_WRONG_FORMAT;
2714 err = tp_len;
2715 goto out_status;
2716 }
2717 }
2718
9d2f67e4
JT
2719 if (po->has_vnet_hdr) {
2720 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2721 tp_len = -EINVAL;
2722 goto tpacket_error;
2723 }
2724 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2725 }
2726
69e3c75f
JB
2727 skb->destructor = tpacket_destruct_skb;
2728 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2729 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2730
2731 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2732 err = po->xmit(skb);
eb70df13
JP
2733 if (unlikely(err > 0)) {
2734 err = net_xmit_errno(err);
2735 if (err && __packet_get_status(po, ph) ==
2736 TP_STATUS_AVAILABLE) {
2737 /* skb was destructed already */
2738 skb = NULL;
2739 goto out_status;
2740 }
2741 /*
2742 * skb was dropped but not destructed yet;
2743 * let's treat it like congestion or err < 0
2744 */
2745 err = 0;
2746 }
69e3c75f
JB
2747 packet_increment_head(&po->tx_ring);
2748 len_sum += tp_len;
b0138408
DB
2749 } while (likely((ph != NULL) ||
2750 /* Note: packet_read_pending() might be slow if we have
2751 * to call it, as it's a per-cpu variable, but in the fast path
2752 * we already short-circuit the loop with the first
2753 * condition, and luckily don't have to go that path
2754 * anyway.
2755 */
2756 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2757
2758 err = len_sum;
2759 goto out_put;
2760
69e3c75f
JB
2761out_status:
2762 __packet_set_status(po, ph, status);
2763 kfree_skb(skb);
2764out_put:
e40526cb 2765 dev_put(dev);
69e3c75f
JB
2766out:
2767 mutex_unlock(&po->pg_vec_lock);
2768 return err;
2769}
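/* TX ring producer sketch (userspace, TPACKET_V2 assumed; "frame", "pkt",
 * "len" and "fd" are illustrative). The data offset used here matches the
 * tp_tx_has_off == 0 case in tpacket_parse_header():
 *
 *	struct tpacket2_hdr *hdr = frame;
 *	void *data = (char *)frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, pkt, len);
 *	hdr->tp_len = len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);		// kick tpacket_snd()
 *
 * Completed frames come back as TP_STATUS_AVAILABLE (possibly OR'ed with a
 * timestamp status) via tpacket_destruct_skb().
 */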
69e3c75f 2770
eea49cc9
OJ
2771static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2772 size_t reserve, size_t len,
2773 size_t linear, int noblock,
2774 int *err)
bfd5f4a3
SS
2775{
2776 struct sk_buff *skb;
2777
2778 /* Under a page? Don't bother with paged skb. */
2779 if (prepad + len < PAGE_SIZE || !linear)
2780 linear = len;
2781
2782 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2783 err, 0);
bfd5f4a3
SS
2784 if (!skb)
2785 return NULL;
2786
2787 skb_reserve(skb, reserve);
2788 skb_put(skb, linear);
2789 skb->data_len = len - linear;
2790 skb->len += len - linear;
2791
2792 return skb;
2793}
2794
d346a3fa 2795static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2796{
2797 struct sock *sk = sock->sk;
342dfc30 2798 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2799 struct sk_buff *skb;
2800 struct net_device *dev;
0e11c91e 2801 __be16 proto;
486efdc8 2802 unsigned char *addr = NULL;
827d9780 2803 int err, reserve = 0;
c7d39e32 2804 struct sockcm_cookie sockc;
bfd5f4a3
SS
2805 struct virtio_net_hdr vnet_hdr = { 0 };
2806 int offset = 0;
bfd5f4a3 2807 struct packet_sock *po = pkt_sk(sk);
da7c9561 2808 bool has_vnet_hdr = false;
57031eb7 2809 int hlen, tlen, linear;
3bdc0eba 2810 int extra_len = 0;
1da177e4
LT
2811
2812 /*
1ce4f28b 2813 * Get and verify the address.
1da177e4 2814 */
1ce4f28b 2815
66e56cd4 2816 if (likely(saddr == NULL)) {
e40526cb 2817 dev = packet_cached_dev_get(po);
1da177e4 2818 proto = po->num;
1da177e4
LT
2819 } else {
2820 err = -EINVAL;
2821 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2822 goto out;
0fb375fb
EB
2823 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2824 goto out;
1da177e4 2825 proto = saddr->sll_protocol;
827d9780 2826 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2827 if (sock->type == SOCK_DGRAM) {
2828 if (dev && msg->msg_namelen < dev->addr_len +
2829 offsetof(struct sockaddr_ll, sll_addr))
2830 goto out_unlock;
2831 addr = saddr->sll_addr;
2832 }
1da177e4
LT
2833 }
2834
1da177e4 2835 err = -ENXIO;
e40526cb 2836 if (unlikely(dev == NULL))
1da177e4 2837 goto out_unlock;
d5e76b0a 2838 err = -ENETDOWN;
e40526cb 2839 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2840 goto out_unlock;
2841
657a0667 2842 sockcm_init(&sockc, sk);
c7d39e32
EJ
2843 sockc.mark = sk->sk_mark;
2844 if (msg->msg_controllen) {
2845 err = sock_cmsg_send(sk, msg, &sockc);
2846 if (unlikely(err))
2847 goto out_unlock;
2848 }
2849
e40526cb
DB
2850 if (sock->type == SOCK_RAW)
2851 reserve = dev->hard_header_len;
bfd5f4a3 2852 if (po->has_vnet_hdr) {
16cc1400
WB
2853 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2854 if (err)
bfd5f4a3 2855 goto out_unlock;
da7c9561 2856 has_vnet_hdr = true;
bfd5f4a3
SS
2857 }
2858
3bdc0eba
BG
2859 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2860 if (!netif_supports_nofcs(dev)) {
2861 err = -EPROTONOSUPPORT;
2862 goto out_unlock;
2863 }
2864 extra_len = 4; /* We're doing our own CRC */
2865 }
2866
1da177e4 2867 err = -EMSGSIZE;
16cc1400
WB
2868 if (!vnet_hdr.gso_type &&
2869 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2870 goto out_unlock;
2871
bfd5f4a3 2872 err = -ENOBUFS;
ae641949
HX
2873 hlen = LL_RESERVED_SPACE(dev);
2874 tlen = dev->needed_tailroom;
57031eb7
WB
2875 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2876 linear = max(linear, min_t(int, len, dev->hard_header_len));
2877 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2878 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2879 if (skb == NULL)
1da177e4
LT
2880 goto out_unlock;
2881
b84bbaf7 2882 skb_reset_network_header(skb);
1da177e4 2883
0c4e8581 2884 err = -EINVAL;
9c707762
WB
2885 if (sock->type == SOCK_DGRAM) {
2886 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2887 if (unlikely(offset < 0))
9c707762 2888 goto out_free;
b84bbaf7 2889 } else if (reserve) {
9aad13b0 2890 skb_reserve(skb, -reserve);
88a8121d
ND
2891 if (len < reserve + sizeof(struct ipv6hdr) &&
2892 dev->min_header_len != dev->hard_header_len)
993675a3 2893 skb_reset_network_header(skb);
9c707762 2894 }
1da177e4
LT
2895
2896 /* Returns -EFAULT on error */
c0371da6 2897 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2898 if (err)
2899 goto out_free;
bf84a010 2900
9ed988cd
WB
2901 if (sock->type == SOCK_RAW &&
2902 !dev_validate_header(dev, skb->data, len)) {
2903 err = -EINVAL;
2904 goto out_free;
2905 }
2906
8f932f76 2907 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2908
16cc1400 2909 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2910 !packet_extra_vlan_len_allowed(dev, skb)) {
2911 err = -EMSGSIZE;
2912 goto out_free;
57f89bfa
BG
2913 }
2914
09effa67
DM
2915 skb->protocol = proto;
2916 skb->dev = dev;
1da177e4 2917 skb->priority = sk->sk_priority;
c7d39e32 2918 skb->mark = sockc.mark;
3d0ba8c0 2919 skb->tstamp = sockc.transmit_time;
0fd5d57b 2920
da7c9561 2921 if (has_vnet_hdr) {
db60eb5f 2922 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2923 if (err)
2924 goto out_free;
2925 len += sizeof(vnet_hdr);
9d2f67e4 2926 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2927 }
2928
75c65772 2929 packet_parse_headers(skb, sock);
8fd6c80d 2930
3bdc0eba
BG
2931 if (unlikely(extra_len == 4))
2932 skb->no_fcs = 1;
2933
d346a3fa 2934 err = po->xmit(skb);
1da177e4
LT
2935 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2936 goto out_unlock;
2937
e40526cb 2938 dev_put(dev);
1da177e4 2939
40d4e3df 2940 return len;
1da177e4
LT
2941
2942out_free:
2943 kfree_skb(skb);
2944out_unlock:
e40526cb 2945 if (dev)
1da177e4
LT
2946 dev_put(dev);
2947out:
2948 return err;
2949}
2950
1b784140 2951static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2952{
69e3c75f
JB
2953 struct sock *sk = sock->sk;
2954 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2955
69e3c75f
JB
2956 if (po->tx_ring.pg_vec)
2957 return tpacket_snd(po, msg);
2958 else
69e3c75f
JB
2959 return packet_snd(sock, msg, len);
2960}
2961
1da177e4
LT
2962/*
2963 * Close a PACKET socket. This is fairly simple. We immediately go
2964 * to 'closed' state and remove our protocol entry in the device list.
2965 */
2966
2967static int packet_release(struct socket *sock)
2968{
2969 struct sock *sk = sock->sk;
2970 struct packet_sock *po;
2bd624b4 2971 struct packet_fanout *f;
d12d01d6 2972 struct net *net;
f6fb8f10 2973 union tpacket_req_u req_u;
1da177e4
LT
2974
2975 if (!sk)
2976 return 0;
2977
3b1e0a65 2978 net = sock_net(sk);
1da177e4
LT
2979 po = pkt_sk(sk);
2980
0fa7fa98 2981 mutex_lock(&net->packet.sklist_lock);
808f5114 2982 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2983 mutex_unlock(&net->packet.sklist_lock);
2984
2985 preempt_disable();
920de804 2986 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2987 preempt_enable();
1da177e4 2988
808f5114 2989 spin_lock(&po->bind_lock);
ce06b03e 2990 unregister_prot_hook(sk, false);
66e56cd4
DB
2991 packet_cached_dev_reset(po);
2992
160ff18a
BG
2993 if (po->prot_hook.dev) {
2994 dev_put(po->prot_hook.dev);
2995 po->prot_hook.dev = NULL;
2996 }
808f5114 2997 spin_unlock(&po->bind_lock);
1da177e4 2998
1da177e4 2999 packet_flush_mclist(sk);
1da177e4 3000
5171b37d 3001 lock_sock(sk);
9665d5d6
PS
3002 if (po->rx_ring.pg_vec) {
3003 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3004 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3005 }
69e3c75f 3006
9665d5d6
PS
3007 if (po->tx_ring.pg_vec) {
3008 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3009 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3010 }
5171b37d 3011 release_sock(sk);
1da177e4 3012
2bd624b4 3013 f = fanout_release(sk);
dc99f600 3014
808f5114 3015 synchronize_net();
2bd624b4
AS
3016
3017 if (f) {
57f015f5 3018 kfree(po->rollover);
2bd624b4
AS
3019 fanout_release_data(f);
3020 kfree(f);
3021 }
1da177e4
LT
3022 /*
3023 * Now the socket is dead. No more input will appear.
3024 */
1da177e4
LT
3025 sock_orphan(sk);
3026 sock->sk = NULL;
3027
3028 /* Purge queues */
3029
3030 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3031 packet_free_pending(po);
17ab56a2 3032 sk_refcnt_debug_release(sk);
1da177e4
LT
3033
3034 sock_put(sk);
3035 return 0;
3036}
3037
3038/*
3039 * Attach a packet hook.
3040 */
3041
30f7ea1c
FR
3042static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3043 __be16 proto)
1da177e4
LT
3044{
3045 struct packet_sock *po = pkt_sk(sk);
158cd4af 3046 struct net_device *dev_curr;
902fefb8
DB
3047 __be16 proto_curr;
3048 bool need_rehook;
30f7ea1c
FR
3049 struct net_device *dev = NULL;
3050 int ret = 0;
3051 bool unlisted = false;
dc99f600 3052
1da177e4 3053 lock_sock(sk);
1da177e4 3054 spin_lock(&po->bind_lock);
30f7ea1c
FR
3055 rcu_read_lock();
3056
4971613c
WB
3057 if (po->fanout) {
3058 ret = -EINVAL;
3059 goto out_unlock;
3060 }
3061
30f7ea1c
FR
3062 if (name) {
3063 dev = dev_get_by_name_rcu(sock_net(sk), name);
3064 if (!dev) {
3065 ret = -ENODEV;
3066 goto out_unlock;
3067 }
3068 } else if (ifindex) {
3069 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3070 if (!dev) {
3071 ret = -ENODEV;
3072 goto out_unlock;
3073 }
3074 }
3075
3076 if (dev)
3077 dev_hold(dev);
66e56cd4 3078
902fefb8
DB
3079 proto_curr = po->prot_hook.type;
3080 dev_curr = po->prot_hook.dev;
3081
3082 need_rehook = proto_curr != proto || dev_curr != dev;
3083
3084 if (need_rehook) {
30f7ea1c
FR
3085 if (po->running) {
3086 rcu_read_unlock();
15fe076e
ED
3087 /* prevents packet_notifier() from calling
3088 * register_prot_hook()
3089 */
3090 po->num = 0;
30f7ea1c
FR
3091 __unregister_prot_hook(sk, true);
3092 rcu_read_lock();
3093 dev_curr = po->prot_hook.dev;
3094 if (dev)
3095 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3096 dev->ifindex);
3097 }
1da177e4 3098
15fe076e 3099 BUG_ON(po->running);
902fefb8
DB
3100 po->num = proto;
3101 po->prot_hook.type = proto;
902fefb8 3102
30f7ea1c
FR
3103 if (unlikely(unlisted)) {
3104 dev_put(dev);
3105 po->prot_hook.dev = NULL;
3106 po->ifindex = -1;
3107 packet_cached_dev_reset(po);
3108 } else {
3109 po->prot_hook.dev = dev;
3110 po->ifindex = dev ? dev->ifindex : 0;
3111 packet_cached_dev_assign(po, dev);
3112 }
902fefb8 3113 }
158cd4af
LW
3114 if (dev_curr)
3115 dev_put(dev_curr);
66e56cd4 3116
902fefb8 3117 if (proto == 0 || !need_rehook)
1da177e4
LT
3118 goto out_unlock;
3119
30f7ea1c 3120 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3121 register_prot_hook(sk);
be85d4ad
UT
3122 } else {
3123 sk->sk_err = ENETDOWN;
3124 if (!sock_flag(sk, SOCK_DEAD))
3125 sk->sk_error_report(sk);
1da177e4
LT
3126 }
3127
3128out_unlock:
30f7ea1c 3129 rcu_read_unlock();
1da177e4
LT
3130 spin_unlock(&po->bind_lock);
3131 release_sock(sk);
30f7ea1c 3132 return ret;
1da177e4
LT
3133}
3134
3135/*
3136 * Bind a packet socket to a device
3137 */
3138
40d4e3df
ED
3139static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3140 int addr_len)
1da177e4 3141{
40d4e3df 3142 struct sock *sk = sock->sk;
540e2894 3143 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3144
1da177e4
LT
3145 /*
3146 * Check legality
3147 */
1ce4f28b 3148
8ae55f04 3149 if (addr_len != sizeof(struct sockaddr))
1da177e4 3150 return -EINVAL;
540e2894
AP
3151 /* uaddr->sa_data comes from userspace; it's not guaranteed to be
3152 * zero-terminated.
3153 */
3154 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3155 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3156
30f7ea1c 3157 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3158}
1da177e4
LT
3159
3160static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3161{
40d4e3df
ED
3162 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3163 struct sock *sk = sock->sk;
1da177e4
LT
3164
3165 /*
3166 * Check legality
3167 */
1ce4f28b 3168
1da177e4
LT
3169 if (addr_len < sizeof(struct sockaddr_ll))
3170 return -EINVAL;
3171 if (sll->sll_family != AF_PACKET)
3172 return -EINVAL;
3173
30f7ea1c
FR
3174 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3175 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3176}
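/* Userspace sketch of binding to a single interface ("eth0" and "fd" are
 * illustrative):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * With sll_protocol left as 0 the protocol chosen at socket() time is kept,
 * as implemented by the "sll->sll_protocol ? :" fallback above.
 */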
3177
3178static struct proto packet_proto = {
3179 .name = "PACKET",
3180 .owner = THIS_MODULE,
3181 .obj_size = sizeof(struct packet_sock),
3182};
3183
3184/*
1ce4f28b 3185 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3186 */
3187
3f378b68
EP
3188static int packet_create(struct net *net, struct socket *sock, int protocol,
3189 int kern)
1da177e4
LT
3190{
3191 struct sock *sk;
3192 struct packet_sock *po;
0e11c91e 3193 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3194 int err;
3195
df008c91 3196 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3197 return -EPERM;
be02097c
DM
3198 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3199 sock->type != SOCK_PACKET)
1da177e4
LT
3200 return -ESOCKTNOSUPPORT;
3201
3202 sock->state = SS_UNCONNECTED;
3203
3204 err = -ENOBUFS;
11aa9c28 3205 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3206 if (sk == NULL)
3207 goto out;
3208
3209 sock->ops = &packet_ops;
1da177e4
LT
3210 if (sock->type == SOCK_PACKET)
3211 sock->ops = &packet_ops_spkt;
be02097c 3212
1da177e4
LT
3213 sock_init_data(sock, sk);
3214
3215 po = pkt_sk(sk);
3216 sk->sk_family = PF_PACKET;
0e11c91e 3217 po->num = proto;
d346a3fa 3218 po->xmit = dev_queue_xmit;
66e56cd4 3219
b0138408
DB
3220 err = packet_alloc_pending(po);
3221 if (err)
3222 goto out2;
3223
66e56cd4 3224 packet_cached_dev_reset(po);
1da177e4
LT
3225
3226 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3227 sk_refcnt_debug_inc(sk);
1da177e4
LT
3228
3229 /*
3230 * Attach a protocol block
3231 */
3232
3233 spin_lock_init(&po->bind_lock);
905db440 3234 mutex_init(&po->pg_vec_lock);
0648ab70 3235 po->rollover = NULL;
1da177e4 3236 po->prot_hook.func = packet_rcv;
be02097c 3237
1da177e4
LT
3238 if (sock->type == SOCK_PACKET)
3239 po->prot_hook.func = packet_rcv_spkt;
be02097c 3240
1da177e4
LT
3241 po->prot_hook.af_packet_priv = sk;
3242
0e11c91e
AV
3243 if (proto) {
3244 po->prot_hook.type = proto;
a6361f0c 3245 __register_prot_hook(sk);
1da177e4
LT
3246 }
3247
0fa7fa98 3248 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3249 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3250 mutex_unlock(&net->packet.sklist_lock);
3251
3252 preempt_disable();
3680453c 3253 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3254 preempt_enable();
808f5114 3255
40d4e3df 3256 return 0;
b0138408
DB
3257out2:
3258 sk_free(sk);
1da177e4
LT
3259out:
3260 return err;
3261}
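/* Userspace sketch of socket creation (needs CAP_NET_RAW, see the
 * ns_capable() check above):
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 * SOCK_RAW delivers frames with the link-layer header, SOCK_DGRAM strips
 * it, and the legacy SOCK_PACKET type is still accepted for old binaries.
 */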
3262
3263/*
3264 * Pull a packet from our receive queue and hand it to the user.
3265 * If necessary we block.
3266 */
3267
1b784140
YX
3268static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3269 int flags)
1da177e4
LT
3270{
3271 struct sock *sk = sock->sk;
3272 struct sk_buff *skb;
3273 int copied, err;
bfd5f4a3 3274 int vnet_hdr_len = 0;
2472d761 3275 unsigned int origlen = 0;
1da177e4
LT
3276
3277 err = -EINVAL;
ed85b565 3278 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3279 goto out;
3280
3281#if 0
3282 /* What error should we return now? EUNATTACH? */
3283 if (pkt_sk(sk)->ifindex < 0)
3284 return -ENODEV;
3285#endif
3286
ed85b565 3287 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3288 err = sock_recv_errqueue(sk, msg, len,
3289 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3290 goto out;
3291 }
3292
1da177e4
LT
3293 /*
3294 * Call the generic datagram receiver. This handles all sorts
3295 * of horrible races and re-entrancy so we can forget about it
3296 * in the protocol layers.
3297 *
3298 * Now it will return ENETDOWN if the device has just gone down,
3299 * but then it will block.
3300 */
3301
40d4e3df 3302 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3303
3304 /*
1ce4f28b 3305 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3306 * handles the blocking, we don't need to see or worry about blocking
3307 * retries.
3308 */
3309
8ae55f04 3310 if (skb == NULL)
1da177e4
LT
3311 goto out;
3312
2ccdbaa6
WB
3313 if (pkt_sk(sk)->pressure)
3314 packet_rcv_has_room(pkt_sk(sk), NULL);
3315
bfd5f4a3 3316 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3317 err = packet_rcv_vnet(msg, skb, &len);
3318 if (err)
bfd5f4a3 3319 goto out_free;
16cc1400 3320 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3321 }
3322
f3d33426
HFS
3323 /* You lose any data beyond the buffer you gave. If this worries
3324 * a user program, it can ask the device for its MTU
3325 * anyway.
1da177e4 3326 */
1da177e4 3327 copied = skb->len;
40d4e3df
ED
3328 if (copied > len) {
3329 copied = len;
3330 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3331 }
3332
51f3d02b 3333 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3334 if (err)
3335 goto out_free;
3336
2472d761
EB
3337 if (sock->type != SOCK_PACKET) {
3338 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3339
3340 /* Original length was stored in sockaddr_ll fields */
3341 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3342 sll->sll_family = AF_PACKET;
3343 sll->sll_protocol = skb->protocol;
3344 }
3345
3b885787 3346 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3347
f3d33426 3348 if (msg->msg_name) {
b2cf86e1
WB
3349 int copy_len;
3350
f3d33426
HFS
3351 /* If the address length field is there to be filled
3352 * in, we fill it in now.
3353 */
3354 if (sock->type == SOCK_PACKET) {
342dfc30 3355 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3356 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3357 copy_len = msg->msg_namelen;
f3d33426
HFS
3358 } else {
3359 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3360
f3d33426
HFS
3361 msg->msg_namelen = sll->sll_halen +
3362 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3363 copy_len = msg->msg_namelen;
3364 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3365 memset(msg->msg_name +
3366 offsetof(struct sockaddr_ll, sll_addr),
3367 0, sizeof(sll->sll_addr));
3368 msg->msg_namelen = sizeof(struct sockaddr_ll);
3369 }
f3d33426 3370 }
b2cf86e1 3371 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3372 }
1da177e4 3373
8dc41944 3374 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3375 struct tpacket_auxdata aux;
3376
3377 aux.tp_status = TP_STATUS_USER;
3378 if (skb->ip_summed == CHECKSUM_PARTIAL)
3379 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3380 else if (skb->pkt_type != PACKET_OUTGOING &&
3381 (skb->ip_summed == CHECKSUM_COMPLETE ||
3382 skb_csum_unnecessary(skb)))
3383 aux.tp_status |= TP_STATUS_CSUM_VALID;
3384
2472d761 3385 aux.tp_len = origlen;
ffbc6111
HX
3386 aux.tp_snaplen = skb->len;
3387 aux.tp_mac = 0;
bbe735e4 3388 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3389 if (skb_vlan_tag_present(skb)) {
3390 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3391 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3392 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3393 } else {
3394 aux.tp_vlan_tci = 0;
a0cdfcf3 3395 aux.tp_vlan_tpid = 0;
a3bcc23e 3396 }
ffbc6111 3397 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3398 }
3399
1da177e4
LT
3400 /*
3401 * Free or return the buffer as appropriate. Again this
3402 * hides all the races and re-entrancy issues from us.
3403 */
bfd5f4a3 3404 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3405
3406out_free:
3407 skb_free_datagram(sk, skb);
3408out:
3409 return err;
3410}
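/* Userspace sketch for the PACKET_AUXDATA control message filled in above
 * ("msg" is the struct msghdr passed to recvmsg(), with msg_control set):
 *
 *	struct cmsghdr *cmsg;
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len, aux->tp_snaplen, aux->tp_vlan_tci, ...
 *		}
 *	}
 */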
3411
1da177e4 3412static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3413 int peer)
1da177e4
LT
3414{
3415 struct net_device *dev;
3416 struct sock *sk = sock->sk;
3417
3418 if (peer)
3419 return -EOPNOTSUPP;
3420
3421 uaddr->sa_family = AF_PACKET;
2dc85bf3 3422 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3423 rcu_read_lock();
3424 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3425 if (dev)
2dc85bf3 3426 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3427 rcu_read_unlock();
1da177e4 3428
9b2c45d4 3429 return sizeof(*uaddr);
1da177e4 3430}
1da177e4
LT
3431
3432static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3433 int peer)
1da177e4
LT
3434{
3435 struct net_device *dev;
3436 struct sock *sk = sock->sk;
3437 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3438 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3439
3440 if (peer)
3441 return -EOPNOTSUPP;
3442
3443 sll->sll_family = AF_PACKET;
3444 sll->sll_ifindex = po->ifindex;
3445 sll->sll_protocol = po->num;
67286640 3446 sll->sll_pkttype = 0;
654d1f8a
ED
3447 rcu_read_lock();
3448 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3449 if (dev) {
3450 sll->sll_hatype = dev->type;
3451 sll->sll_halen = dev->addr_len;
3452 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3453 } else {
3454 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3455 sll->sll_halen = 0;
3456 }
654d1f8a 3457 rcu_read_unlock();
1da177e4 3458
9b2c45d4 3459 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3460}
3461
2aeb0b88
WC
3462static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3463 int what)
1da177e4
LT
3464{
3465 switch (i->type) {
3466 case PACKET_MR_MULTICAST:
1162563f
JP
3467 if (i->alen != dev->addr_len)
3468 return -EINVAL;
1da177e4 3469 if (what > 0)
22bedad3 3470 return dev_mc_add(dev, i->addr);
1da177e4 3471 else
22bedad3 3472 return dev_mc_del(dev, i->addr);
1da177e4
LT
3473 break;
3474 case PACKET_MR_PROMISC:
2aeb0b88 3475 return dev_set_promiscuity(dev, what);
1da177e4 3476 case PACKET_MR_ALLMULTI:
2aeb0b88 3477 return dev_set_allmulti(dev, what);
d95ed927 3478 case PACKET_MR_UNICAST:
1162563f
JP
3479 if (i->alen != dev->addr_len)
3480 return -EINVAL;
d95ed927 3481 if (what > 0)
a748ee24 3482 return dev_uc_add(dev, i->addr);
d95ed927 3483 else
a748ee24 3484 return dev_uc_del(dev, i->addr);
d95ed927 3485 break;
40d4e3df
ED
3486 default:
3487 break;
1da177e4 3488 }
2aeb0b88 3489 return 0;
1da177e4
LT
3490}
3491
82f17091
FR
3492static void packet_dev_mclist_delete(struct net_device *dev,
3493 struct packet_mclist **mlp)
1da177e4 3494{
82f17091
FR
3495 struct packet_mclist *ml;
3496
3497 while ((ml = *mlp) != NULL) {
3498 if (ml->ifindex == dev->ifindex) {
3499 packet_dev_mc(dev, ml, -1);
3500 *mlp = ml->next;
3501 kfree(ml);
3502 } else
3503 mlp = &ml->next;
1da177e4
LT
3504 }
3505}
3506
0fb375fb 3507static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3508{
3509 struct packet_sock *po = pkt_sk(sk);
3510 struct packet_mclist *ml, *i;
3511 struct net_device *dev;
3512 int err;
3513
3514 rtnl_lock();
3515
3516 err = -ENODEV;
3b1e0a65 3517 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3518 if (!dev)
3519 goto done;
3520
3521 err = -EINVAL;
1162563f 3522 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3523 goto done;
3524
3525 err = -ENOBUFS;
8b3a7005 3526 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3527 if (i == NULL)
3528 goto done;
3529
3530 err = 0;
3531 for (ml = po->mclist; ml; ml = ml->next) {
3532 if (ml->ifindex == mreq->mr_ifindex &&
3533 ml->type == mreq->mr_type &&
3534 ml->alen == mreq->mr_alen &&
3535 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3536 ml->count++;
3537 /* Free the new element ... */
3538 kfree(i);
3539 goto done;
3540 }
3541 }
3542
3543 i->type = mreq->mr_type;
3544 i->ifindex = mreq->mr_ifindex;
3545 i->alen = mreq->mr_alen;
3546 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3547 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3548 i->count = 1;
3549 i->next = po->mclist;
3550 po->mclist = i;
2aeb0b88
WC
3551 err = packet_dev_mc(dev, i, 1);
3552 if (err) {
3553 po->mclist = i->next;
3554 kfree(i);
3555 }
1da177e4
LT
3556
3557done:
3558 rtnl_unlock();
3559 return err;
3560}
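/* Userspace sketch reaching packet_mc_add() with PACKET_MR_PROMISC
 * ("fd" and "ifindex" are illustrative):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * Dropping the membership with PACKET_DROP_MEMBERSHIP undoes the
 * dev_set_promiscuity() reference taken in packet_dev_mc().
 */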
3561
0fb375fb 3562static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3563{
3564 struct packet_mclist *ml, **mlp;
3565
3566 rtnl_lock();
3567
3568 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3569 if (ml->ifindex == mreq->mr_ifindex &&
3570 ml->type == mreq->mr_type &&
3571 ml->alen == mreq->mr_alen &&
3572 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3573 if (--ml->count == 0) {
3574 struct net_device *dev;
3575 *mlp = ml->next;
ad959e76
ED
3576 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3577 if (dev)
1da177e4 3578 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3579 kfree(ml);
3580 }
82f17091 3581 break;
1da177e4
LT
3582 }
3583 }
3584 rtnl_unlock();
82f17091 3585 return 0;
1da177e4
LT
3586}
3587
3588static void packet_flush_mclist(struct sock *sk)
3589{
3590 struct packet_sock *po = pkt_sk(sk);
3591 struct packet_mclist *ml;
3592
3593 if (!po->mclist)
3594 return;
3595
3596 rtnl_lock();
3597 while ((ml = po->mclist) != NULL) {
3598 struct net_device *dev;
3599
3600 po->mclist = ml->next;
ad959e76
ED
3601 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3602 if (dev != NULL)
1da177e4 3603 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3604 kfree(ml);
3605 }
3606 rtnl_unlock();
3607}
1da177e4
LT
3608
3609static int
b7058842 3610packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3611{
3612 struct sock *sk = sock->sk;
8dc41944 3613 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3614 int ret;
3615
3616 if (level != SOL_PACKET)
3617 return -ENOPROTOOPT;
3618
69e3c75f 3619 switch (optname) {
1ce4f28b 3620 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3621 case PACKET_DROP_MEMBERSHIP:
3622 {
0fb375fb
EB
3623 struct packet_mreq_max mreq;
3624 int len = optlen;
3625 memset(&mreq, 0, sizeof(mreq));
3626 if (len < sizeof(struct packet_mreq))
1da177e4 3627 return -EINVAL;
0fb375fb
EB
3628 if (len > sizeof(mreq))
3629 len = sizeof(mreq);
40d4e3df 3630 if (copy_from_user(&mreq, optval, len))
1da177e4 3631 return -EFAULT;
0fb375fb
EB
3632 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3633 return -EINVAL;
1da177e4
LT
3634 if (optname == PACKET_ADD_MEMBERSHIP)
3635 ret = packet_mc_add(sk, &mreq);
3636 else
3637 ret = packet_mc_drop(sk, &mreq);
3638 return ret;
3639 }
a2efcfa0 3640
1da177e4 3641 case PACKET_RX_RING:
69e3c75f 3642 case PACKET_TX_RING:
1da177e4 3643 {
f6fb8f10 3644 union tpacket_req_u req_u;
3645 int len;
1da177e4 3646
5171b37d 3647 lock_sock(sk);
f6fb8f10 3648 switch (po->tp_version) {
3649 case TPACKET_V1:
3650 case TPACKET_V2:
3651 len = sizeof(req_u.req);
3652 break;
3653 case TPACKET_V3:
3654 default:
3655 len = sizeof(req_u.req3);
3656 break;
3657 }
5171b37d
ED
3658 if (optlen < len) {
3659 ret = -EINVAL;
3660 } else {
3661 if (copy_from_user(&req_u.req, optval, len))
3662 ret = -EFAULT;
3663 else
3664 ret = packet_set_ring(sk, &req_u, 0,
3665 optname == PACKET_TX_RING);
3666 }
3667 release_sock(sk);
3668 return ret;
1da177e4
LT
3669 }
3670 case PACKET_COPY_THRESH:
3671 {
3672 int val;
3673
40d4e3df 3674 if (optlen != sizeof(val))
1da177e4 3675 return -EINVAL;
40d4e3df 3676 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3677 return -EFAULT;
3678
3679 pkt_sk(sk)->copy_thresh = val;
3680 return 0;
3681 }
bbd6ef87
PM
3682 case PACKET_VERSION:
3683 {
3684 int val;
3685
3686 if (optlen != sizeof(val))
3687 return -EINVAL;
bbd6ef87
PM
3688 if (copy_from_user(&val, optval, sizeof(val)))
3689 return -EFAULT;
3690 switch (val) {
3691 case TPACKET_V1:
3692 case TPACKET_V2:
f6fb8f10 3693 case TPACKET_V3:
84ac7260 3694 break;
bbd6ef87
PM
3695 default:
3696 return -EINVAL;
3697 }
84ac7260
PP
3698 lock_sock(sk);
3699 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3700 ret = -EBUSY;
3701 } else {
3702 po->tp_version = val;
3703 ret = 0;
3704 }
3705 release_sock(sk);
3706 return ret;
bbd6ef87 3707 }
8913336a
PM
3708 case PACKET_RESERVE:
3709 {
3710 unsigned int val;
3711
3712 if (optlen != sizeof(val))
3713 return -EINVAL;
8913336a
PM
3714 if (copy_from_user(&val, optval, sizeof(val)))
3715 return -EFAULT;
bcc5364b
AK
3716 if (val > INT_MAX)
3717 return -EINVAL;
c27927e3
WB
3718 lock_sock(sk);
3719 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3720 ret = -EBUSY;
3721 } else {
3722 po->tp_reserve = val;
3723 ret = 0;
3724 }
3725 release_sock(sk);
3726 return ret;
8913336a 3727 }
69e3c75f
JB
3728 case PACKET_LOSS:
3729 {
3730 unsigned int val;
3731
3732 if (optlen != sizeof(val))
3733 return -EINVAL;
69e3c75f
JB
3734 if (copy_from_user(&val, optval, sizeof(val)))
3735 return -EFAULT;
a6361f0c
WB
3736
3737 lock_sock(sk);
3738 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3739 ret = -EBUSY;
3740 } else {
3741 po->tp_loss = !!val;
3742 ret = 0;
3743 }
3744 release_sock(sk);
3745 return ret;
69e3c75f 3746 }
8dc41944
HX
3747 case PACKET_AUXDATA:
3748 {
3749 int val;
3750
3751 if (optlen < sizeof(val))
3752 return -EINVAL;
3753 if (copy_from_user(&val, optval, sizeof(val)))
3754 return -EFAULT;
3755
a6361f0c 3756 lock_sock(sk);
8dc41944 3757 po->auxdata = !!val;
a6361f0c 3758 release_sock(sk);
8dc41944
HX
3759 return 0;
3760 }
80feaacb
PWJ
3761 case PACKET_ORIGDEV:
3762 {
3763 int val;
3764
3765 if (optlen < sizeof(val))
3766 return -EINVAL;
3767 if (copy_from_user(&val, optval, sizeof(val)))
3768 return -EFAULT;
3769
a6361f0c 3770 lock_sock(sk);
80feaacb 3771 po->origdev = !!val;
a6361f0c 3772 release_sock(sk);
80feaacb
PWJ
3773 return 0;
3774 }
bfd5f4a3
SS
3775 case PACKET_VNET_HDR:
3776 {
3777 int val;
3778
3779 if (sock->type != SOCK_RAW)
3780 return -EINVAL;
bfd5f4a3
SS
3781 if (optlen < sizeof(val))
3782 return -EINVAL;
3783 if (copy_from_user(&val, optval, sizeof(val)))
3784 return -EFAULT;
3785
a6361f0c
WB
3786 lock_sock(sk);
3787 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3788 ret = -EBUSY;
3789 } else {
3790 po->has_vnet_hdr = !!val;
3791 ret = 0;
3792 }
3793 release_sock(sk);
3794 return ret;
bfd5f4a3 3795 }
614f60fa
SM
3796 case PACKET_TIMESTAMP:
3797 {
3798 int val;
3799
3800 if (optlen != sizeof(val))
3801 return -EINVAL;
3802 if (copy_from_user(&val, optval, sizeof(val)))
3803 return -EFAULT;
3804
3805 po->tp_tstamp = val;
3806 return 0;
3807 }
dc99f600
DM
3808 case PACKET_FANOUT:
3809 {
3810 int val;
3811
3812 if (optlen != sizeof(val))
3813 return -EINVAL;
3814 if (copy_from_user(&val, optval, sizeof(val)))
3815 return -EFAULT;
3816
3817 return fanout_add(sk, val & 0xffff, val >> 16);
3818 }
47dceb8e
WB
3819 case PACKET_FANOUT_DATA:
3820 {
3821 if (!po->fanout)
3822 return -EINVAL;
3823
3824 return fanout_set_data(po, optval, optlen);
3825 }
fa788d98
VW
3826 case PACKET_IGNORE_OUTGOING:
3827 {
3828 int val;
3829
3830 if (optlen != sizeof(val))
3831 return -EINVAL;
3832 if (copy_from_user(&val, optval, sizeof(val)))
3833 return -EFAULT;
3834 if (val < 0 || val > 1)
3835 return -EINVAL;
3836
3837 po->prot_hook.ignore_outgoing = !!val;
3838 return 0;
3839 }
5920cd3a
PC
3840 case PACKET_TX_HAS_OFF:
3841 {
3842 unsigned int val;
3843
3844 if (optlen != sizeof(val))
3845 return -EINVAL;
5920cd3a
PC
3846 if (copy_from_user(&val, optval, sizeof(val)))
3847 return -EFAULT;
a6361f0c
WB
3848
3849 lock_sock(sk);
3850 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3851 ret = -EBUSY;
3852 } else {
3853 po->tp_tx_has_off = !!val;
3854 ret = 0;
3855 }
3856 release_sock(sk);
5920cd3a
PC
3857 return ret;
3858 }
d346a3fa
DB
3859 case PACKET_QDISC_BYPASS:
3860 {
3861 int val;
3862
3863 if (optlen != sizeof(val))
3864 return -EINVAL;
3865 if (copy_from_user(&val, optval, sizeof(val)))
3866 return -EFAULT;
3867
3868 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3869 return 0;
3870 }
1da177e4
LT
3871 default:
3872 return -ENOPROTOOPT;
3873 }
3874}
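/*
 * Illustrative userspace sketch (assumes fd is a SOCK_RAW packet socket
 * as in the membership sketch above; the ring geometry is arbitrary but
 * satisfies the checks in packet_set_ring()): select TPACKET_V2, then
 * request an RX ring.  This exercises the PACKET_VERSION and
 * PACKET_RX_RING cases handled above.
 *
 *	int version = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,		// page aligned
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,		// multiple of TPACKET_ALIGNMENT
 *		.tp_frame_nr   = 128,		// (4096 / 2048) * 64
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */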
3875
3876static int packet_getsockopt(struct socket *sock, int level, int optname,
3877 char __user *optval, int __user *optlen)
3878{
3879 int len;
c06fff6e 3880 int val, lv = sizeof(val);
1da177e4
LT
3881 struct sock *sk = sock->sk;
3882 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3883 void *data = &val;
ee80fbf3 3884 union tpacket_stats_u st;
a9b63918 3885 struct tpacket_rollover_stats rstats;
1da177e4
LT
3886
3887 if (level != SOL_PACKET)
3888 return -ENOPROTOOPT;
3889
8ae55f04
KK
3890 if (get_user(len, optlen))
3891 return -EFAULT;
1da177e4
LT
3892
3893 if (len < 0)
3894 return -EINVAL;
1ce4f28b 3895
69e3c75f 3896 switch (optname) {
1da177e4 3897 case PACKET_STATISTICS:
1da177e4 3898 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3899 memcpy(&st, &po->stats, sizeof(st));
3900 memset(&po->stats, 0, sizeof(po->stats));
3901 spin_unlock_bh(&sk->sk_receive_queue.lock);
3902
f6fb8f10 3903 if (po->tp_version == TPACKET_V3) {
c06fff6e 3904 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3905 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3906 data = &st.stats3;
f6fb8f10 3907 } else {
c06fff6e 3908 lv = sizeof(struct tpacket_stats);
8bcdeaff 3909 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3910 data = &st.stats1;
f6fb8f10 3911 }
ee80fbf3 3912
8dc41944
HX
3913 break;
3914 case PACKET_AUXDATA:
8dc41944 3915 val = po->auxdata;
80feaacb
PWJ
3916 break;
3917 case PACKET_ORIGDEV:
80feaacb 3918 val = po->origdev;
bfd5f4a3
SS
3919 break;
3920 case PACKET_VNET_HDR:
bfd5f4a3 3921 val = po->has_vnet_hdr;
1da177e4 3922 break;
bbd6ef87 3923 case PACKET_VERSION:
bbd6ef87 3924 val = po->tp_version;
bbd6ef87
PM
3925 break;
3926 case PACKET_HDRLEN:
3927 if (len > sizeof(int))
3928 len = sizeof(int);
fd2c83b3
AP
3929 if (len < sizeof(int))
3930 return -EINVAL;
bbd6ef87
PM
3931 if (copy_from_user(&val, optval, len))
3932 return -EFAULT;
3933 switch (val) {
3934 case TPACKET_V1:
3935 val = sizeof(struct tpacket_hdr);
3936 break;
3937 case TPACKET_V2:
3938 val = sizeof(struct tpacket2_hdr);
3939 break;
f6fb8f10 3940 case TPACKET_V3:
3941 val = sizeof(struct tpacket3_hdr);
3942 break;
bbd6ef87
PM
3943 default:
3944 return -EINVAL;
3945 }
bbd6ef87 3946 break;
8913336a 3947 case PACKET_RESERVE:
8913336a 3948 val = po->tp_reserve;
8913336a 3949 break;
69e3c75f 3950 case PACKET_LOSS:
69e3c75f 3951 val = po->tp_loss;
69e3c75f 3952 break;
614f60fa 3953 case PACKET_TIMESTAMP:
614f60fa 3954 val = po->tp_tstamp;
614f60fa 3955 break;
dc99f600 3956 case PACKET_FANOUT:
dc99f600
DM
3957 val = (po->fanout ?
3958 ((u32)po->fanout->id |
77f65ebd
WB
3959 ((u32)po->fanout->type << 16) |
3960 ((u32)po->fanout->flags << 24)) :
dc99f600 3961 0);
dc99f600 3962 break;
fa788d98
VW
3963 case PACKET_IGNORE_OUTGOING:
3964 val = po->prot_hook.ignore_outgoing;
3965 break;
a9b63918 3966 case PACKET_ROLLOVER_STATS:
57f015f5 3967 if (!po->rollover)
a9b63918 3968 return -EINVAL;
57f015f5
MM
3969 rstats.tp_all = atomic_long_read(&po->rollover->num);
3970 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3971 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3972 data = &rstats;
3973 lv = sizeof(rstats);
a9b63918 3974 break;
5920cd3a
PC
3975 case PACKET_TX_HAS_OFF:
3976 val = po->tp_tx_has_off;
3977 break;
d346a3fa
DB
3978 case PACKET_QDISC_BYPASS:
3979 val = packet_use_direct_xmit(po);
3980 break;
1da177e4
LT
3981 default:
3982 return -ENOPROTOOPT;
3983 }
3984
c06fff6e
ED
3985 if (len > lv)
3986 len = lv;
8ae55f04
KK
3987 if (put_user(len, optlen))
3988 return -EFAULT;
8dc41944
HX
3989 if (copy_to_user(optval, data, len))
3990 return -EFAULT;
8ae55f04 3991 return 0;
1da177e4
LT
3992}
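/*
 * Illustrative userspace sketch: reading (and thereby resetting) the
 * receive counters via PACKET_STATISTICS, as handled above.  The
 * TPACKET_V1/V2 layout is assumed; a TPACKET_V3 socket returns a
 * struct tpacket_stats_v3 instead.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("packets %u, drops %u\n", st.tp_packets, st.tp_drops);
 */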
3993
3994
719c44d3
WB
3995#ifdef CONFIG_COMPAT
3996static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3997 char __user *optval, unsigned int optlen)
3998{
3999 struct packet_sock *po = pkt_sk(sock->sk);
4000
4001 if (level != SOL_PACKET)
4002 return -ENOPROTOOPT;
4003
4004 if (optname == PACKET_FANOUT_DATA &&
4005 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4006 optval = (char __user *)get_compat_bpf_fprog(optval);
4007 if (!optval)
4008 return -EFAULT;
4009 optlen = sizeof(struct sock_fprog);
4010 }
4011
4012 return packet_setsockopt(sock, level, optname, optval, optlen);
4013}
4014#endif
4015
351638e7
JP
4016static int packet_notifier(struct notifier_block *this,
4017 unsigned long msg, void *ptr)
1da177e4
LT
4018{
4019 struct sock *sk;
351638e7 4020 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4021 struct net *net = dev_net(dev);
1da177e4 4022
808f5114 4023 rcu_read_lock();
b67bfe0d 4024 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4025 struct packet_sock *po = pkt_sk(sk);
4026
4027 switch (msg) {
4028 case NETDEV_UNREGISTER:
1da177e4 4029 if (po->mclist)
82f17091 4030 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4031 /* fallthrough */
4032
1da177e4
LT
4033 case NETDEV_DOWN:
4034 if (dev->ifindex == po->ifindex) {
4035 spin_lock(&po->bind_lock);
4036 if (po->running) {
ce06b03e 4037 __unregister_prot_hook(sk, false);
1da177e4
LT
4038 sk->sk_err = ENETDOWN;
4039 if (!sock_flag(sk, SOCK_DEAD))
4040 sk->sk_error_report(sk);
4041 }
4042 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4043 packet_cached_dev_reset(po);
1da177e4 4044 po->ifindex = -1;
160ff18a
BG
4045 if (po->prot_hook.dev)
4046 dev_put(po->prot_hook.dev);
1da177e4
LT
4047 po->prot_hook.dev = NULL;
4048 }
4049 spin_unlock(&po->bind_lock);
4050 }
4051 break;
4052 case NETDEV_UP:
808f5114 4053 if (dev->ifindex == po->ifindex) {
4054 spin_lock(&po->bind_lock);
ce06b03e
DM
4055 if (po->num)
4056 register_prot_hook(sk);
808f5114 4057 spin_unlock(&po->bind_lock);
1da177e4 4058 }
1da177e4
LT
4059 break;
4060 }
4061 }
808f5114 4062 rcu_read_unlock();
1da177e4
LT
4063 return NOTIFY_DONE;
4064}
4065
4066
4067static int packet_ioctl(struct socket *sock, unsigned int cmd,
4068 unsigned long arg)
4069{
4070 struct sock *sk = sock->sk;
4071
69e3c75f 4072 switch (cmd) {
40d4e3df
ED
4073 case SIOCOUTQ:
4074 {
4075 int amount = sk_wmem_alloc_get(sk);
31e6d363 4076
40d4e3df
ED
4077 return put_user(amount, (int __user *)arg);
4078 }
4079 case SIOCINQ:
4080 {
4081 struct sk_buff *skb;
4082 int amount = 0;
4083
4084 spin_lock_bh(&sk->sk_receive_queue.lock);
4085 skb = skb_peek(&sk->sk_receive_queue);
4086 if (skb)
4087 amount = skb->len;
4088 spin_unlock_bh(&sk->sk_receive_queue.lock);
4089 return put_user(amount, (int __user *)arg);
4090 }
1da177e4 4091#ifdef CONFIG_INET
40d4e3df
ED
4092 case SIOCADDRT:
4093 case SIOCDELRT:
4094 case SIOCDARP:
4095 case SIOCGARP:
4096 case SIOCSARP:
4097 case SIOCGIFADDR:
4098 case SIOCSIFADDR:
4099 case SIOCGIFBRDADDR:
4100 case SIOCSIFBRDADDR:
4101 case SIOCGIFNETMASK:
4102 case SIOCSIFNETMASK:
4103 case SIOCGIFDSTADDR:
4104 case SIOCSIFDSTADDR:
4105 case SIOCSIFFLAGS:
40d4e3df 4106 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4107#endif
4108
40d4e3df
ED
4109 default:
4110 return -ENOIOCTLCMD;
1da177e4
LT
4111 }
4112 return 0;
4113}
4114
a11e1d43
LT
4115static __poll_t packet_poll(struct file *file, struct socket *sock,
4116 poll_table *wait)
1da177e4
LT
4117{
4118 struct sock *sk = sock->sk;
4119 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4120 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4121
4122 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4123 if (po->rx_ring.pg_vec) {
f6fb8f10 4124 if (!packet_previous_rx_frame(po, &po->rx_ring,
4125 TP_STATUS_KERNEL))
a9a08845 4126 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4127 }
2ccdbaa6 4128 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4129 po->pressure = 0;
1da177e4 4130 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4131 spin_lock_bh(&sk->sk_write_queue.lock);
4132 if (po->tx_ring.pg_vec) {
4133 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4134 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4135 }
4136 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4137 return mask;
4138}
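/*
 * Illustrative userspace sketch (assumes "frame" points into an
 * mmap'ed TPACKET_V2 RX ring, set up as in the sketches below): wait
 * for readability via poll(), consume a frame once the kernel has
 * flipped its status word, then hand it back with TP_STATUS_KERNEL.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)frame;
 *
 *	poll(&pfd, 1, -1);
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		// packet data: hdr->tp_len bytes at frame + hdr->tp_mac
 *		hdr->tp_status = TP_STATUS_KERNEL;	// give frame back
 *	}
 */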
4139
4140
4141/* Dirty? Well, I still have not found a better way to account
4142 * for user mmaps.
4143 */
4144
4145static void packet_mm_open(struct vm_area_struct *vma)
4146{
4147 struct file *file = vma->vm_file;
40d4e3df 4148 struct socket *sock = file->private_data;
1da177e4 4149 struct sock *sk = sock->sk;
1ce4f28b 4150
1da177e4
LT
4151 if (sk)
4152 atomic_inc(&pkt_sk(sk)->mapped);
4153}
4154
4155static void packet_mm_close(struct vm_area_struct *vma)
4156{
4157 struct file *file = vma->vm_file;
40d4e3df 4158 struct socket *sock = file->private_data;
1da177e4 4159 struct sock *sk = sock->sk;
1ce4f28b 4160
1da177e4
LT
4161 if (sk)
4162 atomic_dec(&pkt_sk(sk)->mapped);
4163}
4164
f0f37e2f 4165static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4166 .open = packet_mm_open,
4167 .close = packet_mm_close,
1da177e4
LT
4168};
4169
3a7ad063
ED
4170static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4171 unsigned int len)
1da177e4
LT
4172{
4173 int i;
4174
4ebf0ae2 4175 for (i = 0; i < len; i++) {
0e3125c7 4176 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4177 if (is_vmalloc_addr(pg_vec[i].buffer))
4178 vfree(pg_vec[i].buffer);
4179 else
4180 free_pages((unsigned long)pg_vec[i].buffer,
4181 order);
0e3125c7
NH
4182 pg_vec[i].buffer = NULL;
4183 }
1da177e4
LT
4184 }
4185 kfree(pg_vec);
4186}
4187
3a7ad063 4188static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4189{
f0d4eb29 4190 char *buffer;
3a7ad063
ED
4191 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4192 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4193
3a7ad063 4194 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4195 if (buffer)
4196 return buffer;
4197
3a7ad063
ED
4198 /* __get_free_pages failed, fall back to vmalloc */
4199 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4200 if (buffer)
4201 return buffer;
0e3125c7 4202
3a7ad063
ED
4203 /* vmalloc failed, let's dig into swap here */
4204 gfp_flags &= ~__GFP_NORETRY;
4205 buffer = (char *) __get_free_pages(gfp_flags, order);
4206 if (buffer)
4207 return buffer;
4208
4209 /* complete and utter failure */
4210 return NULL;
4ebf0ae2
DM
4211}
4212
3a7ad063 4213static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4214{
4215 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4216 struct pgv *pg_vec;
4ebf0ae2
DM
4217 int i;
4218
398f0132 4219 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4220 if (unlikely(!pg_vec))
4221 goto out;
4222
4223 for (i = 0; i < block_nr; i++) {
3a7ad063 4224 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4225 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4226 goto out_free_pgvec;
4227 }
4228
4229out:
4230 return pg_vec;
4231
4232out_free_pgvec:
3a7ad063 4233 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4234 pg_vec = NULL;
4235 goto out;
4236}
1da177e4 4237
f6fb8f10 4238static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4239 int closing, int tx_ring)
1da177e4 4240{
0e3125c7 4241 struct pgv *pg_vec = NULL;
1da177e4 4242 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4243 int was_running, order = 0;
69e3c75f
JB
4244 struct packet_ring_buffer *rb;
4245 struct sk_buff_head *rb_queue;
0e11c91e 4246 __be16 num;
f6fb8f10 4247 int err = -EINVAL;
4248 /* Alias for req_u->req, kept to minimize code churn below */
4249 struct tpacket_req *req = &req_u->req;
4250
69e3c75f
JB
4251 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4252 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4253
69e3c75f
JB
4254 err = -EBUSY;
4255 if (!closing) {
4256 if (atomic_read(&po->mapped))
4257 goto out;
b0138408 4258 if (packet_read_pending(rb))
69e3c75f
JB
4259 goto out;
4260 }
1da177e4 4261
69e3c75f 4262 if (req->tp_block_nr) {
4576cd46
WB
4263 unsigned int min_frame_size;
4264
69e3c75f
JB
4265 /* Sanity tests and some calculations */
4266 err = -EBUSY;
4267 if (unlikely(rb->pg_vec))
4268 goto out;
1da177e4 4269
bbd6ef87
PM
4270 switch (po->tp_version) {
4271 case TPACKET_V1:
4272 po->tp_hdrlen = TPACKET_HDRLEN;
4273 break;
4274 case TPACKET_V2:
4275 po->tp_hdrlen = TPACKET2_HDRLEN;
4276 break;
f6fb8f10 4277 case TPACKET_V3:
4278 po->tp_hdrlen = TPACKET3_HDRLEN;
4279 break;
bbd6ef87
PM
4280 }
4281
69e3c75f 4282 err = -EINVAL;
4ebf0ae2 4283 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4284 goto out;
90836b67 4285 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4286 goto out;
4576cd46 4287 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4288 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4289 req->tp_block_size <
4290 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4291 goto out;
4576cd46 4292 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4293 goto out;
4ebf0ae2 4294 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4295 goto out;
1da177e4 4296
4194b491
TK
4297 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4298 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4299 goto out;
fc62814d 4300 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4301 goto out;
69e3c75f
JB
4302 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4303 req->tp_frame_nr))
4304 goto out;
1da177e4
LT
4305
4306 err = -ENOMEM;
3a7ad063
ED
4307 order = get_order(req->tp_block_size);
4308 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4309 if (unlikely(!pg_vec))
1da177e4 4310 goto out;
f6fb8f10 4311 switch (po->tp_version) {
4312 case TPACKET_V3:
7f953ab2
SV
4313 /* Block transmit is not supported yet */
4314 if (!tx_ring) {
e8e85cc5 4315 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4316 } else {
4317 struct tpacket_req3 *req3 = &req_u->req3;
4318
4319 if (req3->tp_retire_blk_tov ||
4320 req3->tp_sizeof_priv ||
4321 req3->tp_feature_req_word) {
4322 err = -EINVAL;
4323 goto out;
4324 }
4325 }
d7cf0c34 4326 break;
f6fb8f10 4327 default:
4328 break;
4329 }
69e3c75f
JB
4330 }
4331 /* Done */
4332 else {
4333 err = -EINVAL;
4ebf0ae2 4334 if (unlikely(req->tp_frame_nr))
69e3c75f 4335 goto out;
1da177e4
LT
4336 }
4337
1da177e4
LT
4338
4339 /* Detach socket from network */
4340 spin_lock(&po->bind_lock);
4341 was_running = po->running;
4342 num = po->num;
4343 if (was_running) {
1da177e4 4344 po->num = 0;
ce06b03e 4345 __unregister_prot_hook(sk, false);
1da177e4
LT
4346 }
4347 spin_unlock(&po->bind_lock);
1ce4f28b 4348
1da177e4
LT
4349 synchronize_net();
4350
4351 err = -EBUSY;
905db440 4352 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4353 if (closing || atomic_read(&po->mapped) == 0) {
4354 err = 0;
69e3c75f 4355 spin_lock_bh(&rb_queue->lock);
c053fd96 4356 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4357 rb->frame_max = (req->tp_frame_nr - 1);
4358 rb->head = 0;
4359 rb->frame_size = req->tp_frame_size;
4360 spin_unlock_bh(&rb_queue->lock);
4361
3a7ad063 4362 swap(rb->pg_vec_order, order);
c053fd96 4363 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4364
4365 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4366 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4367 tpacket_rcv : packet_rcv;
4368 skb_queue_purge(rb_queue);
1da177e4 4369 if (atomic_read(&po->mapped))
40d4e3df
ED
4370 pr_err("packet_mmap: vma is busy: %d\n",
4371 atomic_read(&po->mapped));
1da177e4 4372 }
905db440 4373 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4374
4375 spin_lock(&po->bind_lock);
ce06b03e 4376 if (was_running) {
1da177e4 4377 po->num = num;
ce06b03e 4378 register_prot_hook(sk);
1da177e4
LT
4379 }
4380 spin_unlock(&po->bind_lock);
c800aaf8 4381 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4382 /* Because we don't support block-based V3 on tx-ring */
4383 if (!tx_ring)
73d0fcf2 4384 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4385 }
1da177e4 4386
1da177e4 4387 if (pg_vec)
3a7ad063 4388 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4389out:
4390 return err;
4391}
4392
69e3c75f
JB
4393static int packet_mmap(struct file *file, struct socket *sock,
4394 struct vm_area_struct *vma)
1da177e4
LT
4395{
4396 struct sock *sk = sock->sk;
4397 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4398 unsigned long size, expected_size;
4399 struct packet_ring_buffer *rb;
1da177e4
LT
4400 unsigned long start;
4401 int err = -EINVAL;
4402 int i;
4403
4404 if (vma->vm_pgoff)
4405 return -EINVAL;
4406
905db440 4407 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4408
4409 expected_size = 0;
4410 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4411 if (rb->pg_vec) {
4412 expected_size += rb->pg_vec_len
4413 * rb->pg_vec_pages
4414 * PAGE_SIZE;
4415 }
4416 }
4417
4418 if (expected_size == 0)
1da177e4 4419 goto out;
69e3c75f
JB
4420
4421 size = vma->vm_end - vma->vm_start;
4422 if (size != expected_size)
1da177e4
LT
4423 goto out;
4424
1da177e4 4425 start = vma->vm_start;
69e3c75f
JB
4426 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4427 if (rb->pg_vec == NULL)
4428 continue;
4429
4430 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4431 struct page *page;
4432 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4433 int pg_num;
4434
c56b4d90
CG
4435 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4436 page = pgv_to_page(kaddr);
69e3c75f
JB
4437 err = vm_insert_page(vma, start, page);
4438 if (unlikely(err))
4439 goto out;
4440 start += PAGE_SIZE;
0e3125c7 4441 kaddr += PAGE_SIZE;
69e3c75f 4442 }
4ebf0ae2 4443 }
1da177e4 4444 }
69e3c75f 4445
4ebf0ae2 4446 atomic_inc(&po->mapped);
1da177e4
LT
4447 vma->vm_ops = &packet_mmap_ops;
4448 err = 0;
4449
4450out:
905db440 4451 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4452 return err;
4453}
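/*
 * Illustrative userspace sketch (continuing the RX-ring sketch after
 * packet_setsockopt() above): once PACKET_RX_RING succeeded, the whole
 * ring is mapped with a single mmap() at offset 0, and the length must
 * equal the total ring size, as enforced by packet_mmap() above.
 * Needs <sys/mman.h>.
 *
 *	size_t ring_len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */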
1da177e4 4454
90ddc4f0 4455static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4456 .family = PF_PACKET,
4457 .owner = THIS_MODULE,
4458 .release = packet_release,
4459 .bind = packet_bind_spkt,
4460 .connect = sock_no_connect,
4461 .socketpair = sock_no_socketpair,
4462 .accept = sock_no_accept,
4463 .getname = packet_getname_spkt,
a11e1d43 4464 .poll = datagram_poll,
1da177e4 4465 .ioctl = packet_ioctl,
c7cbdbf2 4466 .gettstamp = sock_gettstamp,
1da177e4
LT
4467 .listen = sock_no_listen,
4468 .shutdown = sock_no_shutdown,
4469 .setsockopt = sock_no_setsockopt,
4470 .getsockopt = sock_no_getsockopt,
4471 .sendmsg = packet_sendmsg_spkt,
4472 .recvmsg = packet_recvmsg,
4473 .mmap = sock_no_mmap,
4474 .sendpage = sock_no_sendpage,
4475};
1da177e4 4476
90ddc4f0 4477static const struct proto_ops packet_ops = {
1da177e4
LT
4478 .family = PF_PACKET,
4479 .owner = THIS_MODULE,
4480 .release = packet_release,
4481 .bind = packet_bind,
4482 .connect = sock_no_connect,
4483 .socketpair = sock_no_socketpair,
4484 .accept = sock_no_accept,
1ce4f28b 4485 .getname = packet_getname,
a11e1d43 4486 .poll = packet_poll,
1da177e4 4487 .ioctl = packet_ioctl,
c7cbdbf2 4488 .gettstamp = sock_gettstamp,
1da177e4
LT
4489 .listen = sock_no_listen,
4490 .shutdown = sock_no_shutdown,
4491 .setsockopt = packet_setsockopt,
4492 .getsockopt = packet_getsockopt,
719c44d3
WB
4493#ifdef CONFIG_COMPAT
4494 .compat_setsockopt = compat_packet_setsockopt,
4495#endif
1da177e4
LT
4496 .sendmsg = packet_sendmsg,
4497 .recvmsg = packet_recvmsg,
4498 .mmap = packet_mmap,
4499 .sendpage = sock_no_sendpage,
4500};
4501
ec1b4cf7 4502static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4503 .family = PF_PACKET,
4504 .create = packet_create,
4505 .owner = THIS_MODULE,
4506};
4507
4508static struct notifier_block packet_netdev_notifier = {
40d4e3df 4509 .notifier_call = packet_notifier,
1da177e4
LT
4510};
4511
4512#ifdef CONFIG_PROC_FS
1da177e4
LT
4513
4514static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4515 __acquires(RCU)
1da177e4 4516{
e372c414 4517 struct net *net = seq_file_net(seq);
808f5114 4518
4519 rcu_read_lock();
4520 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4521}
4522
4523static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4524{
1bf40954 4525 struct net *net = seq_file_net(seq);
808f5114 4526 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4527}
4528
4529static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4530 __releases(RCU)
1da177e4 4531{
808f5114 4532 rcu_read_unlock();
1da177e4
LT
4533}
4534
1ce4f28b 4535static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4536{
4537 if (v == SEQ_START_TOKEN)
4538 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4539 else {
b7ceabd9 4540 struct sock *s = sk_entry(v);
1da177e4
LT
4541 const struct packet_sock *po = pkt_sk(s);
4542
4543 seq_printf(seq,
71338aa7 4544 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4545 s,
41c6d650 4546 refcount_read(&s->sk_refcnt),
1da177e4
LT
4547 s->sk_type,
4548 ntohs(po->num),
4549 po->ifindex,
4550 po->running,
4551 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4552 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4553 sock_i_ino(s));
1da177e4
LT
4554 }
4555
4556 return 0;
4557}
4558
56b3d975 4559static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4560 .start = packet_seq_start,
4561 .next = packet_seq_next,
4562 .stop = packet_seq_stop,
4563 .show = packet_seq_show,
4564};
1da177e4
LT
4565#endif
4566
2c8c1e72 4567static int __net_init packet_net_init(struct net *net)
d12d01d6 4568{
0fa7fa98 4569 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4570 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4571
c3506372
CH
4572 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4573 sizeof(struct seq_net_private)))
d12d01d6
DL
4574 return -ENOMEM;
4575
4576 return 0;
4577}
4578
2c8c1e72 4579static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4580{
ece31ffd 4581 remove_proc_entry("packet", net->proc_net);
669f8f1a 4582 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4583}
4584
4585static struct pernet_operations packet_net_ops = {
4586 .init = packet_net_init,
4587 .exit = packet_net_exit,
4588};
4589
4590
1da177e4
LT
4591static void __exit packet_exit(void)
4592{
1da177e4 4593 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4594 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4595 sock_unregister(PF_PACKET);
4596 proto_unregister(&packet_proto);
4597}
4598
4599static int __init packet_init(void)
4600{
36096f2f 4601 int rc;
1da177e4 4602
36096f2f
Y
4603 rc = proto_register(&packet_proto, 0);
4604 if (rc)
1da177e4 4605 goto out;
36096f2f
Y
4606 rc = sock_register(&packet_family_ops);
4607 if (rc)
4608 goto out_proto;
4609 rc = register_pernet_subsys(&packet_net_ops);
4610 if (rc)
4611 goto out_sock;
4612 rc = register_netdevice_notifier(&packet_netdev_notifier);
4613 if (rc)
4614 goto out_pernet;
1da177e4 4615
36096f2f
Y
4616 return 0;
4617
4618out_pernet:
4619 unregister_pernet_subsys(&packet_net_ops);
4620out_sock:
4621 sock_unregister(PF_PACKET);
4622out_proto:
4623 proto_unregister(&packet_proto);
1da177e4
LT
4624out:
4625 return rc;
4626}
4627
4628module_init(packet_init);
4629module_exit(packet_exit);
4630MODULE_LICENSE("GPL");
4631MODULE_ALIAS_NETPROTO(PF_PACKET);