2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <linux/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/percpu.h>
93 #include <net/inet_common.h>
95 #include <linux/bpf.h>
96 #include <net/compat.h>
102 - if a device has no dev->hard_header routine, it adds and removes the ll header
103 inside itself. In this case the ll header is invisible outside of the device,
104 but higher levels should still reserve dev->hard_header_len.
105 Some devices are clever enough to reallocate the skb when the header
106 will not fit into the reserved space (tunnels); others are not so clever
108 - a packet socket receives packets with the ll header already pulled,
109 so SOCK_RAW should push it back.
114 Incoming, dev->hard_header!=NULL
115 mac_header -> ll header
118 Outgoing, dev->hard_header!=NULL
119 mac_header -> ll header
122 Incoming, dev->hard_header==NULL
123 mac_header -> UNKNOWN position. It is very likely that it points to the ll
124 header. PPP does this, which is wrong, because it introduces
125 asymmetry between the rx and tx paths.
128 Outgoing, dev->hard_header==NULL
129 mac_header -> data. ll header is still not built!
133 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
139 dev->hard_header != NULL
140 mac_header -> ll header
143 dev->hard_header == NULL (ll header is added by device, we cannot control it)
147 We should set nh.raw on output to the correct position;
148 the packet classifier depends on it.
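/*
 * A minimal user-space sketch of the distinction described above (it is
 * illustrative only and not compiled as part of this file): with SOCK_RAW
 * the caller sees, and on transmit must build, the link-layer header,
 * while with SOCK_DGRAM the header is added/removed by the device and
 * only the payload plus a sockaddr_ll are exchanged.
 */
#if 0
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

static int open_packet_socket(int type)
{
	/* type is SOCK_RAW (ll header visible) or SOCK_DGRAM (ll header hidden) */
	return socket(AF_PACKET, type, htons(ETH_P_ALL));
}
#endif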
151 /* Private packet socket structures. */
153 /* identical to struct packet_mreq except it has
154 * a longer address field.
156 struct packet_mreq_max {
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
170 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
171 int closing, int tx_ring);
173 #define V3_ALIGNMENT (8)
175 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
177 #define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
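/*
 * Worked example of the layout arithmetic encoded above (the numbers are
 * arbitrary, just for illustration): a requested tp_sizeof_priv of 13
 * bytes is rounded up to V3_ALIGNMENT, so
 *
 *	BLK_PLUS_PRIV(13) == BLK_HDR_LEN + ALIGN(13, 8)
 *	                  == BLK_HDR_LEN + 16
 *
 * and prb_open_block() uses exactly this value for BLOCK_O2FP() (offset
 * of the first frame), the initial BLOCK_LEN() and nxt_offset.
 */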
189 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
192 static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
195 static void packet_increment_head(struct packet_ring_buffer *buff);
196 static int prb_curr_blk_in_use(struct tpacket_block_desc *);
197 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
198 struct packet_sock *);
199 static void prb_retire_current_block(struct tpacket_kbdq_core *,
200 struct packet_sock *, unsigned int status);
201 static int prb_queue_frozen(struct tpacket_kbdq_core *);
202 static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
204 static void prb_retire_rx_blk_timer_expired(struct timer_list *);
205 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
211 static void packet_flush_mclist(struct sock *sk);
212 static u16 packet_pick_tx_queue(struct sk_buff *skb);
214 struct packet_skb_cb {
216 struct sockaddr_pkt pkt;
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.sll_protocol in order
222 unsigned int origlen;
223 struct sockaddr_ll ll;
228 #define vio_le() virtio_legacy_is_little_endian()
230 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
232 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
233 #define GET_PBLOCK_DESC(x, bid) \
234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
235 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
237 #define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
241 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242 static void __fanout_link(struct sock *sk, struct packet_sock *po);
244 static int packet_direct_xmit(struct sk_buff *skb)
246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
249 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
251 struct net_device *dev;
254 dev = rcu_dereference(po->cached_dev);
262 static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
265 rcu_assign_pointer(po->cached_dev, dev);
268 static void packet_cached_dev_reset(struct packet_sock *po)
270 RCU_INIT_POINTER(po->cached_dev, NULL);
273 static bool packet_use_direct_xmit(const struct packet_sock *po)
275 return po->xmit == packet_direct_xmit;
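/*
 * po->xmit points at packet_direct_xmit() only when the user has opted in
 * via the PACKET_QDISC_BYPASS socket option (handled in packet_setsockopt(),
 * outside this excerpt); frames then go straight to the driver through
 * dev_direct_xmit() and skip the qdisc layer.  A minimal user-space sketch,
 * illustrative only and not compiled here:
 */
#if 0
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_qdisc_bypass(int fd)
{
	int one = 1;

	/* after this, transmissions on fd bypass the qdisc */
	return setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS,
			  &one, sizeof(one));
}
#endif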
278 static u16 packet_pick_tx_queue(struct sk_buff *skb)
280 struct net_device *dev = skb->dev;
281 const struct net_device_ops *ops = dev->netdev_ops;
282 int cpu = raw_smp_processor_id();
286 skb->sender_cpu = cpu + 1;
288 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
289 if (ops->ndo_select_queue) {
290 queue_index = ops->ndo_select_queue(dev, skb, NULL);
291 queue_index = netdev_cap_txqueue(dev, queue_index);
293 queue_index = netdev_pick_tx(dev, skb, NULL);
299 /* __register_prot_hook must be invoked through register_prot_hook
300 * or from a context in which asynchronous accesses to the packet
301 * socket are not possible (packet_create()).
303 static void __register_prot_hook(struct sock *sk)
305 struct packet_sock *po = pkt_sk(sk);
309 __fanout_link(sk, po);
311 dev_add_pack(&po->prot_hook);
318 static void register_prot_hook(struct sock *sk)
320 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
321 __register_prot_hook(sk);
324 /* If the sync parameter is true, we will temporarily drop
325 * the po->bind_lock and do a synchronize_net to make sure no
326 * asynchronous packet processing paths still refer to the elements
327 * of po->prot_hook. If the sync parameter is false, it is the
328 * caller's responsibility to take care of this.
330 static void __unregister_prot_hook(struct sock *sk, bool sync)
332 struct packet_sock *po = pkt_sk(sk);
334 lockdep_assert_held_once(&po->bind_lock);
339 __fanout_unlink(sk, po);
341 __dev_remove_pack(&po->prot_hook);
346 spin_unlock(&po->bind_lock);
348 spin_lock(&po->bind_lock);
352 static void unregister_prot_hook(struct sock *sk, bool sync)
354 struct packet_sock *po = pkt_sk(sk);
357 __unregister_prot_hook(sk, sync);
360 static inline struct page * __pure pgv_to_page(void *addr)
362 if (is_vmalloc_addr(addr))
363 return vmalloc_to_page(addr);
364 return virt_to_page(addr);
367 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
369 union tpacket_uhdr h;
372 switch (po->tp_version) {
374 h.h1->tp_status = status;
375 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
378 h.h2->tp_status = status;
379 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
382 h.h3->tp_status = status;
383 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386 WARN(1, "TPACKET version not supported.\n");
393 static int __packet_get_status(struct packet_sock *po, void *frame)
395 union tpacket_uhdr h;
400 switch (po->tp_version) {
402 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
403 return h.h1->tp_status;
405 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
406 return h.h2->tp_status;
408 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
409 return h.h3->tp_status;
411 WARN(1, "TPACKET version not supported.\n");
417 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
420 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
424 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
425 return TP_STATUS_TS_RAW_HARDWARE;
427 if (ktime_to_timespec_cond(skb->tstamp, ts))
428 return TP_STATUS_TS_SOFTWARE;
433 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
436 union tpacket_uhdr h;
440 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
444 switch (po->tp_version) {
446 h.h1->tp_sec = ts.tv_sec;
447 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
450 h.h2->tp_sec = ts.tv_sec;
451 h.h2->tp_nsec = ts.tv_nsec;
454 h.h3->tp_sec = ts.tv_sec;
455 h.h3->tp_nsec = ts.tv_nsec;
458 WARN(1, "TPACKET version not supported.\n");
462 /* one flush is safe, as both fields always lie on the same cacheline */
463 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
469 static void *packet_lookup_frame(struct packet_sock *po,
470 struct packet_ring_buffer *rb,
471 unsigned int position,
474 unsigned int pg_vec_pos, frame_offset;
475 union tpacket_uhdr h;
477 pg_vec_pos = position / rb->frames_per_block;
478 frame_offset = position % rb->frames_per_block;
480 h.raw = rb->pg_vec[pg_vec_pos].buffer +
481 (frame_offset * rb->frame_size);
483 if (status != __packet_get_status(po, h.raw))
489 static void *packet_current_frame(struct packet_sock *po,
490 struct packet_ring_buffer *rb,
493 return packet_lookup_frame(po, rb, rb->head, status);
496 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
498 del_timer_sync(&pkc->retire_blk_timer);
501 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
502 struct sk_buff_head *rb_queue)
504 struct tpacket_kbdq_core *pkc;
506 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
508 spin_lock_bh(&rb_queue->lock);
509 pkc->delete_blk_timer = 1;
510 spin_unlock_bh(&rb_queue->lock);
512 prb_del_retire_blk_timer(pkc);
515 static void prb_setup_retire_blk_timer(struct packet_sock *po)
517 struct tpacket_kbdq_core *pkc;
519 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
520 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
522 pkc->retire_blk_timer.expires = jiffies;
525 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
526 int blk_size_in_bytes)
528 struct net_device *dev;
529 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
530 struct ethtool_link_ksettings ecmd;
534 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
535 if (unlikely(!dev)) {
537 return DEFAULT_PRB_RETIRE_TOV;
539 err = __ethtool_get_link_ksettings(dev, &ecmd);
543 * If the link speed is so slow that you don't really
544 * need to worry about perf anyway
546 if (ecmd.base.speed < SPEED_1000 ||
547 ecmd.base.speed == SPEED_UNKNOWN) {
548 return DEFAULT_PRB_RETIRE_TOV;
551 div = ecmd.base.speed / 1000;
555 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
567 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
568 union tpacket_req_u *req_u)
570 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573 static void init_prb_bdqc(struct packet_sock *po,
574 struct packet_ring_buffer *rb,
576 union tpacket_req_u *req_u)
578 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
579 struct tpacket_block_desc *pbd;
581 memset(p1, 0x0, sizeof(*p1));
583 p1->knxt_seq_num = 1;
585 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
586 p1->pkblk_start = pg_vec[0].buffer;
587 p1->kblk_size = req_u->req3.tp_block_size;
588 p1->knum_blocks = req_u->req3.tp_block_nr;
589 p1->hdrlen = po->tp_hdrlen;
590 p1->version = po->tp_version;
591 p1->last_kactive_blk_num = 0;
592 po->stats.stats3.tp_freeze_q_cnt = 0;
593 if (req_u->req3.tp_retire_blk_tov)
594 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
596 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
597 req_u->req3.tp_block_size);
598 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
599 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
601 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
602 prb_init_ft_ops(p1, req_u);
603 prb_setup_retire_blk_timer(po);
604 prb_open_block(p1, pbd);
607 /* Do NOT update the last_blk_num first.
608 * Assumes sk_buff_head lock is held.
610 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
612 mod_timer(&pkc->retire_blk_timer,
613 jiffies + pkc->tov_in_jiffies);
614 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
619 * 1) We refresh the timer only when we open a block.
620 * By doing this we don't waste cycles refreshing the timer
621 * on a packet-by-packet basis.
623 * With a 1MB block-size, on a 1Gbps line, it will take
624 * i) ~8 ms to fill a block + ii) memcpy etc.
625 * In this cut we are not accounting for the memcpy time.
627 * So, if the user sets the 'tmo' to 10ms then the timer
628 * will never fire while the block is still getting filled
629 * (which is what we want). However, the user could choose
630 * to close a block early and that's fine.
632 * But when the timer does fire, we check whether or not to refresh it.
633 * Since the tmo granularity is in msecs, it is not too expensive
634 * to refresh the timer, let's say every 8 msecs.
635 * Either the user can set the 'tmo' or we can derive it based on
636 * a) line-speed and b) block-size.
637 * prb_calc_retire_blk_tmo() calculates the tmo.
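/*
 * Worked example of the tmo derivation (a sketch based on the visible
 * div/mbits setup in prb_calc_retire_blk_tmo(); the exact tail of that
 * function is not shown here): on a 1 Gbps link div = 1000/1000 = 1, and
 * with a 1 MB tp_block_size mbits = (1048576 * 8) / (1024 * 1024) = 8.
 * Scaling by the link speed therefore yields a timeout on the order of
 * 8 msecs, matching the "~8 ms to fill a block" figure quoted above.
 */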
640 static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
642 struct packet_sock *po =
643 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
644 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
646 struct tpacket_block_desc *pbd;
648 spin_lock(&po->sk.sk_receive_queue.lock);
650 frozen = prb_queue_frozen(pkc);
651 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
653 if (unlikely(pkc->delete_blk_timer))
656 /* We only need to plug the race when the block is partially filled.
658 * lock(); increment BLOCK_NUM_PKTS; unlock()
659 * copy_bits() is in progress ...
660 * timer fires on other cpu:
661 * we can't retire the current block because copy_bits
665 if (BLOCK_NUM_PKTS(pbd)) {
666 while (atomic_read(&pkc->blk_fill_in_prog)) {
667 /* Waiting for skb_copy_bits to finish... */
672 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!BLOCK_NUM_PKTS(pbd)) {
675 /* An empty block. Just refresh the timer. */
678 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
679 if (!prb_dispatch_next_block(pkc, po))
684 /* Case 1. Queue was frozen because user-space was
687 if (prb_curr_blk_in_use(pbd)) {
689 * Ok, user-space is still behind.
690 * So just refresh the timer.
694 /* Case 2. Queue was frozen, user-space caught up,
695 * now the link went idle && the timer fired.
696 * We don't have a block to close. So we open this
697 * block and restart the timer.
698 * Opening a block thaws the queue and restarts the timer;
699 * thawing/timer-refresh is a side effect.
701 prb_open_block(pkc, pbd);
708 _prb_refresh_rx_retire_blk_timer(pkc);
711 spin_unlock(&po->sk.sk_receive_queue.lock);
714 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
715 struct tpacket_block_desc *pbd1, __u32 status)
717 /* Flush everything minus the block header */
719 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
724 /* Skip the block header (we know the header WILL fit in 4K) */
727 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
728 for (; start < end; start += PAGE_SIZE)
729 flush_dcache_page(pgv_to_page(start));
734 /* Now update the block status. */
736 BLOCK_STATUS(pbd1) = status;
738 /* Flush the block header */
740 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 flush_dcache_page(pgv_to_page(start));
752 * 2) Increment active_blk_num
754 * Note: We DON'T refresh the timer on purpose,
755 * because almost always the next block will be opened.
757 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
758 struct tpacket_block_desc *pbd1,
759 struct packet_sock *po, unsigned int stat)
761 __u32 status = TP_STATUS_USER | stat;
763 struct tpacket3_hdr *last_pkt;
764 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
765 struct sock *sk = &po->sk;
767 if (po->stats.stats3.tp_drops)
768 status |= TP_STATUS_LOSING;
770 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
771 last_pkt->tp_next_offset = 0;
773 /* Get the ts of the last pkt */
774 if (BLOCK_NUM_PKTS(pbd1)) {
775 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
776 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
778 /* Ok, we tmo'd - so get the current time.
780 * It shouldn't really happen as we don't close empty
781 * blocks. See prb_retire_rx_blk_timer_expired().
785 h1->ts_last_pkt.ts_sec = ts.tv_sec;
786 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
791 /* Flush the block */
792 prb_flush_block(pkc1, pbd1, status);
794 sk->sk_data_ready(sk);
796 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
801 pkc->reset_pending_on_curr_blk = 0;
805 * Side effect of opening a block:
807 * 1) prb_queue is thawed.
808 * 2) retire_blk_timer is refreshed.
811 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
812 struct tpacket_block_desc *pbd1)
815 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
819 /* We could have just memset this but we will lose the
820 * flexibility of making the priv area sticky
823 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
824 BLOCK_NUM_PKTS(pbd1) = 0;
825 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829 h1->ts_first_pkt.ts_sec = ts.tv_sec;
830 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
832 pkc1->pkblk_start = (char *)pbd1;
833 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
835 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
838 pbd1->version = pkc1->version;
839 pkc1->prev = pkc1->nxt_offset;
840 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
842 prb_thaw_queue(pkc1);
843 _prb_refresh_rx_retire_blk_timer(pkc1);
849 * Queue freeze logic:
850 * 1) Assume tp_block_nr = 8 blocks.
851 * 2) At time 't0', user opens Rx ring.
852 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
853 * 4) user-space is either sleeping or processing block '0'.
854 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
855 * it will close block-7, loop around and try to fill block '0'.
857 * __packet_lookup_frame_in_block
858 * prb_retire_current_block()
859 * prb_dispatch_next_block()
860 * |->(BLOCK_STATUS == USER) evaluates to true
861 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
862 * 6) Now there are two cases:
863 * 6.1) Link goes idle right after the queue is frozen.
864 * But remember, the last open_block() refreshed the timer.
865 * When this timer expires, it will refresh itself so that we can
866 * re-open block-0 in near future.
867 * 6.2) Link is busy and keeps on receiving packets. This is a simple
868 * case and __packet_lookup_frame_in_block will check if block-0
869 * is free and can now be re-used.
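/*
 * Minimal user-space sketch of the V3 ring this freeze/thaw logic serves
 * (sizes and names are arbitrary examples, not part of this file): the
 * reader waits for blocks whose status has TP_STATUS_USER set, consumes
 * them, and hands them back with TP_STATUS_KERNEL so the kernel can
 * re-open them.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void *setup_v3_rx_ring(int fd, struct tpacket_req3 *req)
{
	int ver = TPACKET_V3;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 1 << 20;	/* 1 MB blocks */
	req->tp_block_nr = 8;
	req->tp_frame_size = 2048;	/* V3 packets are variable-sized, but the field is still validated */
	req->tp_frame_nr = (req->tp_block_size / req->tp_frame_size) * req->tp_block_nr;
	req->tp_retire_blk_tov = 60;	/* msecs, see the retire timer logic above */

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) ||
	    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)))
		return MAP_FAILED;

	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

static void release_block(struct tpacket_block_desc *pbd)
{
	/* hand the block back so tpacket_rcv() can dispatch/re-open it */
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
}
#endif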
871 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
872 struct packet_sock *po)
874 pkc->reset_pending_on_curr_blk = 1;
875 po->stats.stats3.tp_freeze_q_cnt++;
878 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
881 * If the next block is free then we will dispatch it
882 * and return a good offset.
883 * Else, we will freeze the queue.
884 * So, the caller must check the return value.
886 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
887 struct packet_sock *po)
889 struct tpacket_block_desc *pbd;
893 /* 1. Get current block num */
894 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896 /* 2. If this block is currently in_use then freeze the queue */
897 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
898 prb_freeze_queue(pkc, po);
904 * open this block and return the offset where the first packet
905 * needs to get stored.
907 prb_open_block(pkc, pbd);
908 return (void *)pkc->nxt_offset;
911 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
912 struct packet_sock *po, unsigned int status)
914 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
916 /* retire/close the current block */
917 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919 * Plug the case where copy_bits() is in progress on
920 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
921 * have space to copy the pkt in the current block and
922 * called prb_retire_current_block()
924 * We don't need to worry about the TMO case because
925 * the timer-handler already handled this case.
927 if (!(status & TP_STATUS_BLK_TMO)) {
928 while (atomic_read(&pkc->blk_fill_in_prog)) {
929 /* Waiting for skb_copy_bits to finish... */
933 prb_close_block(pkc, pbd, po, status);
938 static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
940 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
945 return pkc->reset_pending_on_curr_blk;
948 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
950 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
951 atomic_dec(&pkc->blk_fill_in_prog);
954 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
955 struct tpacket3_hdr *ppd)
957 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
960 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
961 struct tpacket3_hdr *ppd)
963 ppd->hv1.tp_rxhash = 0;
966 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
967 struct tpacket3_hdr *ppd)
969 if (skb_vlan_tag_present(pkc->skb)) {
970 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
971 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
972 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
974 ppd->hv1.tp_vlan_tci = 0;
975 ppd->hv1.tp_vlan_tpid = 0;
976 ppd->tp_status = TP_STATUS_AVAILABLE;
980 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
981 struct tpacket3_hdr *ppd)
983 ppd->hv1.tp_padding = 0;
984 prb_fill_vlan_info(pkc, ppd);
986 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
987 prb_fill_rxhash(pkc, ppd);
989 prb_clear_rxhash(pkc, ppd);
992 static void prb_fill_curr_block(char *curr,
993 struct tpacket_kbdq_core *pkc,
994 struct tpacket_block_desc *pbd,
997 struct tpacket3_hdr *ppd;
999 ppd = (struct tpacket3_hdr *)curr;
1000 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1002 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_NUM_PKTS(pbd) += 1;
1005 atomic_inc(&pkc->blk_fill_in_prog);
1006 prb_run_all_ft_ops(pkc, ppd);
1009 /* Assumes caller has the sk->rx_queue.lock */
1010 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1011 struct sk_buff *skb,
1016 struct tpacket_kbdq_core *pkc;
1017 struct tpacket_block_desc *pbd;
1020 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1021 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1023 /* Queue is frozen when user space is lagging behind */
1024 if (prb_queue_frozen(pkc)) {
1026 * Check if the last block, which caused the queue to freeze,
1027 * is still in_use by user-space.
1029 if (prb_curr_blk_in_use(pbd)) {
1030 /* Can't record this packet */
1034 * Ok, the block was released by user-space.
1035 * Now let's open that block.
1036 * opening a block also thaws the queue.
1037 * Thawing is a side effect.
1039 prb_open_block(pkc, pbd);
1044 curr = pkc->nxt_offset;
1046 end = (char *)pbd + pkc->kblk_size;
1048 /* first try the current block */
1049 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1050 prb_fill_curr_block(curr, pkc, pbd, len);
1051 return (void *)curr;
1054 /* Ok, close the current block */
1055 prb_retire_current_block(pkc, po, 0);
1057 /* Now, try to dispatch the next block */
1058 curr = (char *)prb_dispatch_next_block(pkc, po);
1060 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1061 prb_fill_curr_block(curr, pkc, pbd, len);
1062 return (void *)curr;
1066 * No free blocks are available. User-space hasn't caught up yet.
1067 * The queue was just frozen and now this packet will get dropped.
1072 static void *packet_current_rx_frame(struct packet_sock *po,
1073 struct sk_buff *skb,
1074 int status, unsigned int len)
1077 switch (po->tp_version) {
1080 curr = packet_lookup_frame(po, &po->rx_ring,
1081 po->rx_ring.head, status);
1084 return __packet_lookup_frame_in_block(po, skb, status, len);
1086 WARN(1, "TPACKET version not supported\n");
1092 static void *prb_lookup_block(struct packet_sock *po,
1093 struct packet_ring_buffer *rb,
1097 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1098 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1100 if (status != BLOCK_STATUS(pbd))
1105 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1108 if (rb->prb_bdqc.kactive_blk_num)
1109 prev = rb->prb_bdqc.kactive_blk_num-1;
1111 prev = rb->prb_bdqc.knum_blocks-1;
1115 /* Assumes caller has held the rx_queue.lock */
1116 static void *__prb_previous_block(struct packet_sock *po,
1117 struct packet_ring_buffer *rb,
1120 unsigned int previous = prb_previous_blk_num(rb);
1121 return prb_lookup_block(po, rb, previous, status);
1124 static void *packet_previous_rx_frame(struct packet_sock *po,
1125 struct packet_ring_buffer *rb,
1128 if (po->tp_version <= TPACKET_V2)
1129 return packet_previous_frame(po, rb, status);
1131 return __prb_previous_block(po, rb, status);
1134 static void packet_increment_rx_head(struct packet_sock *po,
1135 struct packet_ring_buffer *rb)
1137 switch (po->tp_version) {
1140 return packet_increment_head(rb);
1143 WARN(1, "TPACKET version not supported.\n");
1149 static void *packet_previous_frame(struct packet_sock *po,
1150 struct packet_ring_buffer *rb,
1153 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1154 return packet_lookup_frame(po, rb, previous, status);
1157 static void packet_increment_head(struct packet_ring_buffer *buff)
1159 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162 static void packet_inc_pending(struct packet_ring_buffer *rb)
1164 this_cpu_inc(*rb->pending_refcnt);
1167 static void packet_dec_pending(struct packet_ring_buffer *rb)
1169 this_cpu_dec(*rb->pending_refcnt);
1172 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1174 unsigned int refcnt = 0;
1177 /* We don't use pending refcount in rx_ring. */
1178 if (rb->pending_refcnt == NULL)
1181 for_each_possible_cpu(cpu)
1182 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1187 static int packet_alloc_pending(struct packet_sock *po)
1189 po->rx_ring.pending_refcnt = NULL;
1191 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1192 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1198 static void packet_free_pending(struct packet_sock *po)
1200 free_percpu(po->tx_ring.pending_refcnt);
1203 #define ROOM_POW_OFF 2
1204 #define ROOM_NONE 0x0
1205 #define ROOM_LOW 0x1
1206 #define ROOM_NORMAL 0x2
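/*
 * How the ROOM_* levels are derived (see __packet_rcv_has_room() below):
 * with ROOM_POW_OFF == 2 a socket reports ROOM_NORMAL while more than a
 * quarter of its receive budget is still free (for a ring, the frame
 * len >> 2 slots ahead of 'head' must still be TP_STATUS_KERNEL; for a
 * plain socket, more than sk_rcvbuf >> 2 bytes must be unused), ROOM_LOW
 * while at least some space is left, and ROOM_NONE otherwise.  Fanout
 * rollover uses this to prefer sockets that are not under pressure.
 */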
1208 static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1212 len = po->rx_ring.frame_max + 1;
1213 idx = po->rx_ring.head;
1215 idx += len >> pow_off;
1218 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221 static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1225 len = po->rx_ring.prb_bdqc.knum_blocks;
1226 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1228 idx += len >> pow_off;
1231 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234 static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1236 struct sock *sk = &po->sk;
1237 int ret = ROOM_NONE;
1239 if (po->prot_hook.func != tpacket_rcv) {
1240 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1241 - (skb ? skb->truesize : 0);
1242 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1250 if (po->tp_version == TPACKET_V3) {
1251 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1253 else if (__tpacket_v3_has_room(po, 0))
1256 if (__tpacket_has_room(po, ROOM_POW_OFF))
1258 else if (__tpacket_has_room(po, 0))
1265 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1270 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1271 ret = __packet_rcv_has_room(po, skb);
1272 has_room = ret == ROOM_NORMAL;
1273 if (po->pressure == has_room)
1274 po->pressure = !has_room;
1275 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1280 static void packet_sock_destruct(struct sock *sk)
1282 skb_queue_purge(&sk->sk_error_queue);
1284 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1285 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1287 if (!sock_flag(sk, SOCK_DEAD)) {
1288 pr_err("Attempt to release alive packet socket: %p\n", sk);
1292 sk_refcnt_debug_dec(sk);
1295 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1300 rxhash = skb_get_hash(skb);
1301 for (i = 0; i < ROLLOVER_HLEN; i++)
1302 if (po->rollover->history[i] == rxhash)
1305 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1306 return count > (ROLLOVER_HLEN >> 1);
1309 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1310 struct sk_buff *skb,
1313 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1316 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1317 struct sk_buff *skb,
1320 unsigned int val = atomic_inc_return(&f->rr_cur);
1325 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1326 struct sk_buff *skb,
1329 return smp_processor_id() % num;
1332 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1333 struct sk_buff *skb,
1336 return prandom_u32_max(num);
1339 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int idx, bool try_self,
1344 struct packet_sock *po, *po_next, *po_skip = NULL;
1345 unsigned int i, j, room = ROOM_NONE;
1347 po = pkt_sk(f->arr[idx]);
1350 room = packet_rcv_has_room(po, skb);
1351 if (room == ROOM_NORMAL ||
1352 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1357 i = j = min_t(int, po->rollover->sock, num - 1);
1359 po_next = pkt_sk(f->arr[i]);
1360 if (po_next != po_skip && !po_next->pressure &&
1361 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1363 po->rollover->sock = i;
1364 atomic_long_inc(&po->rollover->num);
1365 if (room == ROOM_LOW)
1366 atomic_long_inc(&po->rollover->num_huge);
1374 atomic_long_inc(&po->rollover->num_failed);
1378 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1379 struct sk_buff *skb,
1382 return skb_get_queue_mapping(skb) % num;
1385 static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1386 struct sk_buff *skb,
1389 struct bpf_prog *prog;
1390 unsigned int ret = 0;
1393 prog = rcu_dereference(f->bpf_prog);
1395 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1401 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1403 return f->flags & (flag >> 8);
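/*
 * f->flags stores the high byte of the user-supplied type_flags word (see
 * fanout_add(), which does 'flags = type_flags >> 8'), so the
 * PACKET_FANOUT_FLAG_* constants are shifted down the same way before
 * being tested here.  For example, PACKET_FANOUT_FLAG_DEFRAG (0x8000)
 * is tested as bit 0x80 of f->flags.
 */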
1406 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1407 struct packet_type *pt, struct net_device *orig_dev)
1409 struct packet_fanout *f = pt->af_packet_priv;
1410 unsigned int num = READ_ONCE(f->num_members);
1411 struct net *net = read_pnet(&f->net);
1412 struct packet_sock *po;
1415 if (!net_eq(dev_net(dev), net) || !num) {
1420 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1421 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1426 case PACKET_FANOUT_HASH:
1428 idx = fanout_demux_hash(f, skb, num);
1430 case PACKET_FANOUT_LB:
1431 idx = fanout_demux_lb(f, skb, num);
1433 case PACKET_FANOUT_CPU:
1434 idx = fanout_demux_cpu(f, skb, num);
1436 case PACKET_FANOUT_RND:
1437 idx = fanout_demux_rnd(f, skb, num);
1439 case PACKET_FANOUT_QM:
1440 idx = fanout_demux_qm(f, skb, num);
1442 case PACKET_FANOUT_ROLLOVER:
1443 idx = fanout_demux_rollover(f, skb, 0, false, num);
1445 case PACKET_FANOUT_CBPF:
1446 case PACKET_FANOUT_EBPF:
1447 idx = fanout_demux_bpf(f, skb, num);
1451 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1452 idx = fanout_demux_rollover(f, skb, idx, true, num);
1454 po = pkt_sk(f->arr[idx]);
1455 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
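/*
 * Minimal user-space sketch of joining a fanout group served by the demux
 * modes above (the group id and mode are arbitrary examples): the low 16
 * bits of the option value carry the group id, the upper 16 bits carry the
 * PACKET_FANOUT_* type plus any PACKET_FANOUT_FLAG_* bits.  Illustrative
 * only, not compiled here.
 */
#if 0
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout_group(int fd, unsigned short group_id)
{
	int fanout_arg = group_id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			  &fanout_arg, sizeof(fanout_arg));
}
#endif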
1458 DEFINE_MUTEX(fanout_mutex);
1459 EXPORT_SYMBOL_GPL(fanout_mutex);
1460 static LIST_HEAD(fanout_list);
1461 static u16 fanout_next_id;
1463 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1465 struct packet_fanout *f = po->fanout;
1467 spin_lock(&f->lock);
1468 f->arr[f->num_members] = sk;
1471 if (f->num_members == 1)
1472 dev_add_pack(&f->prot_hook);
1473 spin_unlock(&f->lock);
1476 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1478 struct packet_fanout *f = po->fanout;
1481 spin_lock(&f->lock);
1482 for (i = 0; i < f->num_members; i++) {
1483 if (f->arr[i] == sk)
1486 BUG_ON(i >= f->num_members);
1487 f->arr[i] = f->arr[f->num_members - 1];
1489 if (f->num_members == 0)
1490 __dev_remove_pack(&f->prot_hook);
1491 spin_unlock(&f->lock);
1494 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1496 if (sk->sk_family != PF_PACKET)
1499 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1502 static void fanout_init_data(struct packet_fanout *f)
1505 case PACKET_FANOUT_LB:
1506 atomic_set(&f->rr_cur, 0);
1508 case PACKET_FANOUT_CBPF:
1509 case PACKET_FANOUT_EBPF:
1510 RCU_INIT_POINTER(f->bpf_prog, NULL);
1515 static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1517 struct bpf_prog *old;
1519 spin_lock(&f->lock);
1520 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1521 rcu_assign_pointer(f->bpf_prog, new);
1522 spin_unlock(&f->lock);
1526 bpf_prog_destroy(old);
1530 static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 struct bpf_prog *new;
1534 struct sock_fprog fprog;
1537 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1539 if (len != sizeof(fprog))
1541 if (copy_from_user(&fprog, data, len))
1544 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1548 __fanout_set_data_bpf(po->fanout, new);
1552 static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 struct bpf_prog *new;
1558 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1560 if (len != sizeof(fd))
1562 if (copy_from_user(&fd, data, len))
1565 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1567 return PTR_ERR(new);
1569 __fanout_set_data_bpf(po->fanout, new);
1573 static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 switch (po->fanout->type) {
1577 case PACKET_FANOUT_CBPF:
1578 return fanout_set_data_cbpf(po, data, len);
1579 case PACKET_FANOUT_EBPF:
1580 return fanout_set_data_ebpf(po, data, len);
1586 static void fanout_release_data(struct packet_fanout *f)
1589 case PACKET_FANOUT_CBPF:
1590 case PACKET_FANOUT_EBPF:
1591 __fanout_set_data_bpf(f, NULL);
1595 static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1597 struct packet_fanout *f;
1599 list_for_each_entry(f, &fanout_list, list) {
1600 if (f->id == candidate_id &&
1601 read_pnet(&f->net) == sock_net(sk)) {
1608 static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1610 u16 id = fanout_next_id;
1613 if (__fanout_id_is_free(sk, id)) {
1615 fanout_next_id = id + 1;
1620 } while (id != fanout_next_id);
1625 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1627 struct packet_rollover *rollover = NULL;
1628 struct packet_sock *po = pkt_sk(sk);
1629 struct packet_fanout *f, *match;
1630 u8 type = type_flags & 0xff;
1631 u8 flags = type_flags >> 8;
1635 case PACKET_FANOUT_ROLLOVER:
1636 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1638 case PACKET_FANOUT_HASH:
1639 case PACKET_FANOUT_LB:
1640 case PACKET_FANOUT_CPU:
1641 case PACKET_FANOUT_RND:
1642 case PACKET_FANOUT_QM:
1643 case PACKET_FANOUT_CBPF:
1644 case PACKET_FANOUT_EBPF:
1650 mutex_lock(&fanout_mutex);
1656 if (type == PACKET_FANOUT_ROLLOVER ||
1657 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1659 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1662 atomic_long_set(&rollover->num, 0);
1663 atomic_long_set(&rollover->num_huge, 0);
1664 atomic_long_set(&rollover->num_failed, 0);
1667 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1672 if (!fanout_find_new_id(sk, &id)) {
1676 /* ephemeral flag for the first socket in the group: drop it */
1677 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1681 list_for_each_entry(f, &fanout_list, list) {
1683 read_pnet(&f->net) == sock_net(sk)) {
1689 if (match && match->flags != flags)
1693 match = kzalloc(sizeof(*match), GFP_KERNEL);
1696 write_pnet(&match->net, sock_net(sk));
1699 match->flags = flags;
1700 INIT_LIST_HEAD(&match->list);
1701 spin_lock_init(&match->lock);
1702 refcount_set(&match->sk_ref, 0);
1703 fanout_init_data(match);
1704 match->prot_hook.type = po->prot_hook.type;
1705 match->prot_hook.dev = po->prot_hook.dev;
1706 match->prot_hook.func = packet_rcv_fanout;
1707 match->prot_hook.af_packet_priv = match;
1708 match->prot_hook.id_match = match_fanout_group;
1709 list_add(&match->list, &fanout_list);
1713 spin_lock(&po->bind_lock);
1715 match->type == type &&
1716 match->prot_hook.type == po->prot_hook.type &&
1717 match->prot_hook.dev == po->prot_hook.dev) {
1719 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1720 __dev_remove_pack(&po->prot_hook);
1722 po->rollover = rollover;
1724 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1725 __fanout_link(sk, po);
1729 spin_unlock(&po->bind_lock);
1731 if (err && !refcount_read(&match->sk_ref)) {
1732 list_del(&match->list);
1738 mutex_unlock(&fanout_mutex);
1742 /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1743 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1744 * It is the responsibility of the caller to call fanout_release_data() and
1745 * free the returned packet_fanout (after synchronize_net())
1747 static struct packet_fanout *fanout_release(struct sock *sk)
1749 struct packet_sock *po = pkt_sk(sk);
1750 struct packet_fanout *f;
1752 mutex_lock(&fanout_mutex);
1757 if (refcount_dec_and_test(&f->sk_ref))
1762 mutex_unlock(&fanout_mutex);
1767 static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1768 struct sk_buff *skb)
1770 /* Earlier code assumed this would be a VLAN pkt, double-check
1771 * this now that we have the actual packet in hand. We can only
1772 * do this check on Ethernet devices.
1774 if (unlikely(dev->type != ARPHRD_ETHER))
1777 skb_reset_mac_header(skb);
1778 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1781 static const struct proto_ops packet_ops;
1783 static const struct proto_ops packet_ops_spkt;
1785 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1786 struct packet_type *pt, struct net_device *orig_dev)
1789 struct sockaddr_pkt *spkt;
1792 * When we registered the protocol we saved the socket in the data
1793 * field for just this event.
1796 sk = pt->af_packet_priv;
1799 * Yank back the headers [hope the device set this
1800 * right or kerboom...]
1802 * Incoming packets have ll header pulled,
1805 * For outgoing ones skb->data == skb_mac_header(skb)
1806 * so that this procedure is noop.
1809 if (skb->pkt_type == PACKET_LOOPBACK)
1812 if (!net_eq(dev_net(dev), sock_net(sk)))
1815 skb = skb_share_check(skb, GFP_ATOMIC);
1819 /* drop any routing info */
1822 /* drop conntrack reference */
1825 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1827 skb_push(skb, skb->data - skb_mac_header(skb));
1830 * The SOCK_PACKET socket receives _all_ frames.
1833 spkt->spkt_family = dev->type;
1834 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1835 spkt->spkt_protocol = skb->protocol;
1838 * Charge the memory to the socket. This is done specifically
1839 * to prevent sockets from using up all the memory.
1842 if (sock_queue_rcv_skb(sk, skb) == 0)
1851 static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1853 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1854 sock->type == SOCK_RAW) {
1855 skb_reset_mac_header(skb);
1856 skb->protocol = dev_parse_header_protocol(skb);
1859 skb_probe_transport_header(skb);
1863 * Output a raw packet to a device layer. This bypasses all the other
1864 * protocol layers and you must therefore supply it with a complete frame
1867 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1870 struct sock *sk = sock->sk;
1871 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1872 struct sk_buff *skb = NULL;
1873 struct net_device *dev;
1874 struct sockcm_cookie sockc;
1880 * Get and verify the address.
1884 if (msg->msg_namelen < sizeof(struct sockaddr))
1886 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1887 proto = saddr->spkt_protocol;
1889 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1892 * Find the device first to size check it
1895 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1898 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1904 if (!(dev->flags & IFF_UP))
1908 * You may not queue a frame bigger than the mtu. This is the lowest level
1909 * raw protocol and you must do your own fragmentation at this level.
1912 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1913 if (!netif_supports_nofcs(dev)) {
1914 err = -EPROTONOSUPPORT;
1917 extra_len = 4; /* We're doing our own CRC */
1921 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1925 size_t reserved = LL_RESERVED_SPACE(dev);
1926 int tlen = dev->needed_tailroom;
1927 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1930 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1933 /* FIXME: Save some space for broken drivers that write a hard
1934 * header at transmission time by themselves. PPP is the notable
1935 * one here. This should really be fixed at the driver level.
1937 skb_reserve(skb, reserved);
1938 skb_reset_network_header(skb);
1940 /* Try to align data part correctly */
1945 skb_reset_network_header(skb);
1947 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1953 if (!dev_validate_header(dev, skb->data, len)) {
1957 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1958 !packet_extra_vlan_len_allowed(dev, skb)) {
1963 sockcm_init(&sockc, sk);
1964 if (msg->msg_controllen) {
1965 err = sock_cmsg_send(sk, msg, &sockc);
1970 skb->protocol = proto;
1972 skb->priority = sk->sk_priority;
1973 skb->mark = sk->sk_mark;
1974 skb->tstamp = sockc.transmit_time;
1976 skb_setup_tx_timestamp(skb, sockc.tsflags);
1978 if (unlikely(extra_len == 4))
1981 packet_parse_headers(skb, sock);
1983 dev_queue_xmit(skb);
1994 static unsigned int run_filter(struct sk_buff *skb,
1995 const struct sock *sk,
1998 struct sk_filter *filter;
2001 filter = rcu_dereference(sk->sk_filter);
2003 res = bpf_prog_run_clear_cb(filter->prog, skb);
2009 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2012 struct virtio_net_hdr vnet_hdr;
2014 if (*len < sizeof(vnet_hdr))
2016 *len -= sizeof(vnet_hdr);
2018 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2021 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
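/*
 * packet_rcv_vnet() is only used when the user has enabled the
 * PACKET_VNET_HDR socket option (po->has_vnet_hdr), in which case every
 * message read from the socket is prefixed with a struct virtio_net_hdr
 * describing checksum/GSO state.  A minimal user-space sketch,
 * illustrative only and not compiled here:
 */
#if 0
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_vnet_hdr(int fd)
{
	int one = 1;

	return setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
}
#endif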
2025 * This function does lazy skb cloning in the hope that most packets
2026 * are discarded by BPF.
2028 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
2029 * and skb->cb are mangled. It works because (and until) packets
2030 * falling here are owned by the current CPU. Output packets are cloned
2031 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2032 * sequentially, so that if we return the skb to its original state on exit,
2033 * we will not harm anyone.
2036 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2037 struct packet_type *pt, struct net_device *orig_dev)
2040 struct sockaddr_ll *sll;
2041 struct packet_sock *po;
2042 u8 *skb_head = skb->data;
2043 int skb_len = skb->len;
2044 unsigned int snaplen, res;
2045 bool is_drop_n_account = false;
2047 if (skb->pkt_type == PACKET_LOOPBACK)
2050 sk = pt->af_packet_priv;
2053 if (!net_eq(dev_net(dev), sock_net(sk)))
2058 if (dev->header_ops) {
2059 /* The device has an explicit notion of ll header,
2060 * exported to higher levels.
2062 * Otherwise, the device hides details of its frame
2063 * structure, so that the corresponding packet head is
2064 * never delivered to the user.
2066 if (sk->sk_type != SOCK_DGRAM)
2067 skb_push(skb, skb->data - skb_mac_header(skb));
2068 else if (skb->pkt_type == PACKET_OUTGOING) {
2069 /* Special case: outgoing packets have ll header at head */
2070 skb_pull(skb, skb_network_offset(skb));
2076 res = run_filter(skb, sk, snaplen);
2078 goto drop_n_restore;
2082 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2085 if (skb_shared(skb)) {
2086 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2090 if (skb_head != skb->data) {
2091 skb->data = skb_head;
2098 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2100 sll = &PACKET_SKB_CB(skb)->sa.ll;
2101 sll->sll_hatype = dev->type;
2102 sll->sll_pkttype = skb->pkt_type;
2103 if (unlikely(po->origdev))
2104 sll->sll_ifindex = orig_dev->ifindex;
2106 sll->sll_ifindex = dev->ifindex;
2108 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2110 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2111 * Use their space for storing the original skb length.
2113 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2115 if (pskb_trim(skb, snaplen))
2118 skb_set_owner_r(skb, sk);
2122 /* drop conntrack reference */
2125 spin_lock(&sk->sk_receive_queue.lock);
2126 po->stats.stats1.tp_packets++;
2127 sock_skb_set_dropcount(sk, skb);
2128 __skb_queue_tail(&sk->sk_receive_queue, skb);
2129 spin_unlock(&sk->sk_receive_queue.lock);
2130 sk->sk_data_ready(sk);
2134 is_drop_n_account = true;
2135 spin_lock(&sk->sk_receive_queue.lock);
2136 po->stats.stats1.tp_drops++;
2137 atomic_inc(&sk->sk_drops);
2138 spin_unlock(&sk->sk_receive_queue.lock);
2141 if (skb_head != skb->data && skb_shared(skb)) {
2142 skb->data = skb_head;
2146 if (!is_drop_n_account)
2153 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2154 struct packet_type *pt, struct net_device *orig_dev)
2157 struct packet_sock *po;
2158 struct sockaddr_ll *sll;
2159 union tpacket_uhdr h;
2160 u8 *skb_head = skb->data;
2161 int skb_len = skb->len;
2162 unsigned int snaplen, res;
2163 unsigned long status = TP_STATUS_USER;
2164 unsigned short macoff, netoff, hdrlen;
2165 struct sk_buff *copy_skb = NULL;
2168 bool is_drop_n_account = false;
2169 bool do_vnet = false;
2171 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2172 * We may add members to them until current aligned size without forcing
2173 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2175 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2176 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2178 if (skb->pkt_type == PACKET_LOOPBACK)
2181 sk = pt->af_packet_priv;
2184 if (!net_eq(dev_net(dev), sock_net(sk)))
2187 if (dev->header_ops) {
2188 if (sk->sk_type != SOCK_DGRAM)
2189 skb_push(skb, skb->data - skb_mac_header(skb));
2190 else if (skb->pkt_type == PACKET_OUTGOING) {
2191 /* Special case: outgoing packets have ll header at head */
2192 skb_pull(skb, skb_network_offset(skb));
2198 res = run_filter(skb, sk, snaplen);
2200 goto drop_n_restore;
2202 if (skb->ip_summed == CHECKSUM_PARTIAL)
2203 status |= TP_STATUS_CSUMNOTREADY;
2204 else if (skb->pkt_type != PACKET_OUTGOING &&
2205 (skb->ip_summed == CHECKSUM_COMPLETE ||
2206 skb_csum_unnecessary(skb)))
2207 status |= TP_STATUS_CSUM_VALID;
2212 if (sk->sk_type == SOCK_DGRAM) {
2213 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2216 unsigned int maclen = skb_network_offset(skb);
2217 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2218 (maclen < 16 ? 16 : maclen)) +
2220 if (po->has_vnet_hdr) {
2221 netoff += sizeof(struct virtio_net_hdr);
2224 macoff = netoff - maclen;
2226 if (po->tp_version <= TPACKET_V2) {
2227 if (macoff + snaplen > po->rx_ring.frame_size) {
2228 if (po->copy_thresh &&
2229 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2230 if (skb_shared(skb)) {
2231 copy_skb = skb_clone(skb, GFP_ATOMIC);
2233 copy_skb = skb_get(skb);
2234 skb_head = skb->data;
2237 skb_set_owner_r(copy_skb, sk);
2239 snaplen = po->rx_ring.frame_size - macoff;
2240 if ((int)snaplen < 0) {
2245 } else if (unlikely(macoff + snaplen >
2246 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2249 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2250 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2251 snaplen, nval, macoff);
2253 if (unlikely((int)snaplen < 0)) {
2255 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2259 spin_lock(&sk->sk_receive_queue.lock);
2260 h.raw = packet_current_rx_frame(po, skb,
2261 TP_STATUS_KERNEL, (macoff+snaplen));
2263 goto drop_n_account;
2264 if (po->tp_version <= TPACKET_V2) {
2265 packet_increment_rx_head(po, &po->rx_ring);
2267 * LOSING will be reported till you read the stats,
2268 * because it's COR - Clear On Read.
2269 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2272 if (po->stats.stats1.tp_drops)
2273 status |= TP_STATUS_LOSING;
2277 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2278 sizeof(struct virtio_net_hdr),
2280 goto drop_n_account;
2282 po->stats.stats1.tp_packets++;
2284 status |= TP_STATUS_COPY;
2285 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2287 spin_unlock(&sk->sk_receive_queue.lock);
2289 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2291 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2292 getnstimeofday(&ts);
2294 status |= ts_status;
2296 switch (po->tp_version) {
2298 h.h1->tp_len = skb->len;
2299 h.h1->tp_snaplen = snaplen;
2300 h.h1->tp_mac = macoff;
2301 h.h1->tp_net = netoff;
2302 h.h1->tp_sec = ts.tv_sec;
2303 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2304 hdrlen = sizeof(*h.h1);
2307 h.h2->tp_len = skb->len;
2308 h.h2->tp_snaplen = snaplen;
2309 h.h2->tp_mac = macoff;
2310 h.h2->tp_net = netoff;
2311 h.h2->tp_sec = ts.tv_sec;
2312 h.h2->tp_nsec = ts.tv_nsec;
2313 if (skb_vlan_tag_present(skb)) {
2314 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2315 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2316 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2318 h.h2->tp_vlan_tci = 0;
2319 h.h2->tp_vlan_tpid = 0;
2321 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2322 hdrlen = sizeof(*h.h2);
2325 /* tp_next_offset and vlan are already populated above,
2326 * so DON'T clear those fields here
2328 h.h3->tp_status |= status;
2329 h.h3->tp_len = skb->len;
2330 h.h3->tp_snaplen = snaplen;
2331 h.h3->tp_mac = macoff;
2332 h.h3->tp_net = netoff;
2333 h.h3->tp_sec = ts.tv_sec;
2334 h.h3->tp_nsec = ts.tv_nsec;
2335 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2336 hdrlen = sizeof(*h.h3);
2342 sll = h.raw + TPACKET_ALIGN(hdrlen);
2343 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2344 sll->sll_family = AF_PACKET;
2345 sll->sll_hatype = dev->type;
2346 sll->sll_protocol = skb->protocol;
2347 sll->sll_pkttype = skb->pkt_type;
2348 if (unlikely(po->origdev))
2349 sll->sll_ifindex = orig_dev->ifindex;
2351 sll->sll_ifindex = dev->ifindex;
2355 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2356 if (po->tp_version <= TPACKET_V2) {
2359 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2362 for (start = h.raw; start < end; start += PAGE_SIZE)
2363 flush_dcache_page(pgv_to_page(start));
2368 if (po->tp_version <= TPACKET_V2) {
2369 __packet_set_status(po, h.raw, status);
2370 sk->sk_data_ready(sk);
2372 prb_clear_blk_fill_status(&po->rx_ring);
2376 if (skb_head != skb->data && skb_shared(skb)) {
2377 skb->data = skb_head;
2381 if (!is_drop_n_account)
2388 is_drop_n_account = true;
2389 po->stats.stats1.tp_drops++;
2390 spin_unlock(&sk->sk_receive_queue.lock);
2392 sk->sk_data_ready(sk);
2393 kfree_skb(copy_skb);
2394 goto drop_n_restore;
2397 static void tpacket_destruct_skb(struct sk_buff *skb)
2399 struct packet_sock *po = pkt_sk(skb->sk);
2401 if (likely(po->tx_ring.pg_vec)) {
2405 ph = skb_zcopy_get_nouarg(skb);
2406 packet_dec_pending(&po->tx_ring);
2408 ts = __packet_set_timestamp(po, ph, skb);
2409 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2415 static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2418 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2419 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2421 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2422 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2431 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2432 struct virtio_net_hdr *vnet_hdr)
2434 if (*len < sizeof(*vnet_hdr))
2436 *len -= sizeof(*vnet_hdr);
2438 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2441 return __packet_snd_vnet_parse(vnet_hdr, *len);
2444 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2445 void *frame, struct net_device *dev, void *data, int tp_len,
2446 __be16 proto, unsigned char *addr, int hlen, int copylen,
2447 const struct sockcm_cookie *sockc)
2449 union tpacket_uhdr ph;
2450 int to_write, offset, len, nr_frags, len_max;
2451 struct socket *sock = po->sk.sk_socket;
2457 skb->protocol = proto;
2459 skb->priority = po->sk.sk_priority;
2460 skb->mark = po->sk.sk_mark;
2461 skb->tstamp = sockc->transmit_time;
2462 skb_setup_tx_timestamp(skb, sockc->tsflags);
2463 skb_zcopy_set_nouarg(skb, ph.raw);
2465 skb_reserve(skb, hlen);
2466 skb_reset_network_header(skb);
2470 if (sock->type == SOCK_DGRAM) {
2471 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 if (unlikely(err < 0))
2475 } else if (copylen) {
2476 int hdrlen = min_t(int, copylen, tp_len);
2478 skb_push(skb, dev->hard_header_len);
2479 skb_put(skb, copylen - dev->hard_header_len);
2480 err = skb_store_bits(skb, 0, data, hdrlen);
2483 if (!dev_validate_header(dev, skb->data, hdrlen))
2490 offset = offset_in_page(data);
2491 len_max = PAGE_SIZE - offset;
2492 len = ((to_write > len_max) ? len_max : to_write);
2494 skb->data_len = to_write;
2495 skb->len += to_write;
2496 skb->truesize += to_write;
2497 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2499 while (likely(to_write)) {
2500 nr_frags = skb_shinfo(skb)->nr_frags;
2502 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2503 pr_err("Packet exceed the number of skb frags(%lu)\n",
2508 page = pgv_to_page(data);
2510 flush_dcache_page(page);
2512 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2515 len_max = PAGE_SIZE;
2516 len = ((to_write > len_max) ? len_max : to_write);
2519 packet_parse_headers(skb, sock);
2524 static int tpacket_parse_header(struct packet_sock *po, void *frame,
2525 int size_max, void **data)
2527 union tpacket_uhdr ph;
2532 switch (po->tp_version) {
2534 if (ph.h3->tp_next_offset != 0) {
2535 pr_warn_once("variable sized slot not supported");
2538 tp_len = ph.h3->tp_len;
2541 tp_len = ph.h2->tp_len;
2544 tp_len = ph.h1->tp_len;
2547 if (unlikely(tp_len > size_max)) {
2548 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2552 if (unlikely(po->tp_tx_has_off)) {
2553 int off_min, off_max;
2555 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2556 off_max = po->tx_ring.frame_size - tp_len;
2557 if (po->sk.sk_type == SOCK_DGRAM) {
2558 switch (po->tp_version) {
2560 off = ph.h3->tp_net;
2563 off = ph.h2->tp_net;
2566 off = ph.h1->tp_net;
2570 switch (po->tp_version) {
2572 off = ph.h3->tp_mac;
2575 off = ph.h2->tp_mac;
2578 off = ph.h1->tp_mac;
2582 if (unlikely((off < off_min) || (off_max < off)))
2585 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2588 *data = frame + off;
2592 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2594 struct sk_buff *skb;
2595 struct net_device *dev;
2596 struct virtio_net_hdr *vnet_hdr = NULL;
2597 struct sockcm_cookie sockc;
2599 int err, reserve = 0;
2601 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2602 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2603 unsigned char *addr = NULL;
2604 int tp_len, size_max;
2607 int status = TP_STATUS_AVAILABLE;
2608 int hlen, tlen, copylen = 0;
2610 mutex_lock(&po->pg_vec_lock);
2612 if (likely(saddr == NULL)) {
2613 dev = packet_cached_dev_get(po);
2617 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2619 if (msg->msg_namelen < (saddr->sll_halen
2620 + offsetof(struct sockaddr_ll,
2623 proto = saddr->sll_protocol;
2624 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2625 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2626 if (dev && msg->msg_namelen < dev->addr_len +
2627 offsetof(struct sockaddr_ll, sll_addr))
2629 addr = saddr->sll_addr;
2634 if (unlikely(dev == NULL))
2637 if (unlikely(!(dev->flags & IFF_UP)))
2640 sockcm_init(&sockc, &po->sk);
2641 if (msg->msg_controllen) {
2642 err = sock_cmsg_send(&po->sk, msg, &sockc);
2647 if (po->sk.sk_socket->type == SOCK_RAW)
2648 reserve = dev->hard_header_len;
2649 size_max = po->tx_ring.frame_size
2650 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2652 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2653 size_max = dev->mtu + reserve + VLAN_HLEN;
2656 ph = packet_current_frame(po, &po->tx_ring,
2657 TP_STATUS_SEND_REQUEST);
2658 if (unlikely(ph == NULL)) {
2659 if (need_wait && need_resched())
2665 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2669 status = TP_STATUS_SEND_REQUEST;
2670 hlen = LL_RESERVED_SPACE(dev);
2671 tlen = dev->needed_tailroom;
2672 if (po->has_vnet_hdr) {
2674 data += sizeof(*vnet_hdr);
2675 tp_len -= sizeof(*vnet_hdr);
2677 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2681 copylen = __virtio16_to_cpu(vio_le(),
2684 copylen = max_t(int, copylen, dev->hard_header_len);
2685 skb = sock_alloc_send_skb(&po->sk,
2686 hlen + tlen + sizeof(struct sockaddr_ll) +
2687 (copylen - dev->hard_header_len),
2690 if (unlikely(skb == NULL)) {
2691 /* we assume the socket was initially writeable ... */
2692 if (likely(len_sum > 0))
2696 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2697 addr, hlen, copylen, &sockc);
2698 if (likely(tp_len >= 0) &&
2699 tp_len > dev->mtu + reserve &&
2700 !po->has_vnet_hdr &&
2701 !packet_extra_vlan_len_allowed(dev, skb))
2704 if (unlikely(tp_len < 0)) {
2707 __packet_set_status(po, ph,
2708 TP_STATUS_AVAILABLE);
2709 packet_increment_head(&po->tx_ring);
2713 status = TP_STATUS_WRONG_FORMAT;
2719 if (po->has_vnet_hdr) {
2720 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2724 virtio_net_hdr_set_proto(skb, vnet_hdr);
2727 skb->destructor = tpacket_destruct_skb;
2728 __packet_set_status(po, ph, TP_STATUS_SENDING);
2729 packet_inc_pending(&po->tx_ring);
2731 status = TP_STATUS_SEND_REQUEST;
2732 err = po->xmit(skb);
2733 if (unlikely(err > 0)) {
2734 err = net_xmit_errno(err);
2735 if (err && __packet_get_status(po, ph) ==
2736 TP_STATUS_AVAILABLE) {
2737 /* skb was destructed already */
2742 * skb was dropped but not destructed yet;
2743 * let's treat it like congestion or err < 0
2747 packet_increment_head(&po->tx_ring);
2749 } while (likely((ph != NULL) ||
2750 /* Note: packet_read_pending() might be slow if we have
2751 * to call it, as it's a per_cpu variable, but in the fast path
2752 * we already short-circuit the loop with the first
2753 * condition, and luckily don't have to go down that path
2754 * either.
2755 */
2756 (need_wait && packet_read_pending(&po->tx_ring))));
2762 __packet_set_status(po, ph, status);
2767 mutex_unlock(&po->pg_vec_lock);
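/*
 * Userspace drives tpacket_snd() roughly as follows (illustrative sketch,
 * TPACKET_V2 frame layout, single frame, no error handling; "ring" is the
 * mmap()ed TX area and "frame_size" comes from the struct tpacket_req used
 * when the ring was configured):
 *
 *	struct tpacket2_hdr *hdr = ring + i * frame_size;
 *	void *data = (void *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, frame, frame_len);			// complete L2 frame
 *	hdr->tp_len = frame_len;
 *	__sync_synchronize();				// publish data before status
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);				// kick the kernel
 */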
2771 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2772 size_t reserve, size_t len,
2773 size_t linear, int noblock,
2776 struct sk_buff *skb;
2778 /* Under a page? Don't bother with paged skb. */
2779 if (prepad + len < PAGE_SIZE || !linear)
2782 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2787 skb_reserve(skb, reserve);
2788 skb_put(skb, linear);
2789 skb->data_len = len - linear;
2790 skb->len += len - linear;
2795 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2797 struct sock *sk = sock->sk;
2798 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2799 struct sk_buff *skb;
2800 struct net_device *dev;
2802 unsigned char *addr = NULL;
2803 int err, reserve = 0;
2804 struct sockcm_cookie sockc;
2805 struct virtio_net_hdr vnet_hdr = { 0 };
2807 struct packet_sock *po = pkt_sk(sk);
2808 bool has_vnet_hdr = false;
2809 int hlen, tlen, linear;
2813 * Get and verify the address.
2816 if (likely(saddr == NULL)) {
2817 dev = packet_cached_dev_get(po);
2821 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2823 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2825 proto = saddr->sll_protocol;
2826 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2827 if (sock->type == SOCK_DGRAM) {
2828 if (dev && msg->msg_namelen < dev->addr_len +
2829 offsetof(struct sockaddr_ll, sll_addr))
2831 addr = saddr->sll_addr;
2836 if (unlikely(dev == NULL))
2839 if (unlikely(!(dev->flags & IFF_UP)))
2842 sockcm_init(&sockc, sk);
2843 sockc.mark = sk->sk_mark;
2844 if (msg->msg_controllen) {
2845 err = sock_cmsg_send(sk, msg, &sockc);
2850 if (sock->type == SOCK_RAW)
2851 reserve = dev->hard_header_len;
2852 if (po->has_vnet_hdr) {
2853 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2856 has_vnet_hdr = true;
2859 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2860 if (!netif_supports_nofcs(dev)) {
2861 err = -EPROTONOSUPPORT;
2864 extra_len = 4; /* We're doing our own CRC */
2868 if (!vnet_hdr.gso_type &&
2869 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2873 hlen = LL_RESERVED_SPACE(dev);
2874 tlen = dev->needed_tailroom;
2875 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2876 linear = max(linear, min_t(int, len, dev->hard_header_len));
2877 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2878 msg->msg_flags & MSG_DONTWAIT, &err);
2882 skb_reset_network_header(skb);
2885 if (sock->type == SOCK_DGRAM) {
2886 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2887 if (unlikely(offset < 0))
2889 } else if (reserve) {
2890 skb_reserve(skb, -reserve);
2891 if (len < reserve + sizeof(struct ipv6hdr) &&
2892 dev->min_header_len != dev->hard_header_len)
2893 skb_reset_network_header(skb);
2896 /* Returns -EFAULT on error */
2897 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2901 if (sock->type == SOCK_RAW &&
2902 !dev_validate_header(dev, skb->data, len)) {
2907 skb_setup_tx_timestamp(skb, sockc.tsflags);
2909 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2910 !packet_extra_vlan_len_allowed(dev, skb)) {
2915 skb->protocol = proto;
2917 skb->priority = sk->sk_priority;
2918 skb->mark = sockc.mark;
2919 skb->tstamp = sockc.transmit_time;
2922 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
2925 len += sizeof(vnet_hdr);
2926 virtio_net_hdr_set_proto(skb, &vnet_hdr);
2929 packet_parse_headers(skb, sock);
2931 if (unlikely(extra_len == 4))
2934 err = po->xmit(skb);
2935 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2951 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2953 struct sock *sk = sock->sk;
2954 struct packet_sock *po = pkt_sk(sk);
2956 if (po->tx_ring.pg_vec)
2957 return tpacket_snd(po, msg);
2959 return packet_snd(sock, msg, len);
2963 * Close a PACKET socket. This is fairly simple. We immediately go
2964 * to 'closed' state and remove our protocol entry in the device list.
2967 static int packet_release(struct socket *sock)
2969 struct sock *sk = sock->sk;
2970 struct packet_sock *po;
2971 struct packet_fanout *f;
2973 union tpacket_req_u req_u;
2981 mutex_lock(&net->packet.sklist_lock);
2982 sk_del_node_init_rcu(sk);
2983 mutex_unlock(&net->packet.sklist_lock);
2986 sock_prot_inuse_add(net, sk->sk_prot, -1);
2989 spin_lock(&po->bind_lock);
2990 unregister_prot_hook(sk, false);
2991 packet_cached_dev_reset(po);
2993 if (po->prot_hook.dev) {
2994 dev_put(po->prot_hook.dev);
2995 po->prot_hook.dev = NULL;
2997 spin_unlock(&po->bind_lock);
2999 packet_flush_mclist(sk);
3002 if (po->rx_ring.pg_vec) {
3003 memset(&req_u, 0, sizeof(req_u));
3004 packet_set_ring(sk, &req_u, 1, 0);
3007 if (po->tx_ring.pg_vec) {
3008 memset(&req_u, 0, sizeof(req_u));
3009 packet_set_ring(sk, &req_u, 1, 1);
3013 f = fanout_release(sk);
3018 kfree(po->rollover);
3019 fanout_release_data(f);
3023 * Now the socket is dead. No more input will appear.
3030 skb_queue_purge(&sk->sk_receive_queue);
3031 packet_free_pending(po);
3032 sk_refcnt_debug_release(sk);
3039 * Attach a packet hook.
3042 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3045 struct packet_sock *po = pkt_sk(sk);
3046 struct net_device *dev_curr;
3049 struct net_device *dev = NULL;
3051 bool unlisted = false;
3054 spin_lock(&po->bind_lock);
3063 dev = dev_get_by_name_rcu(sock_net(sk), name);
3068 } else if (ifindex) {
3069 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3079 proto_curr = po->prot_hook.type;
3080 dev_curr = po->prot_hook.dev;
3082 need_rehook = proto_curr != proto || dev_curr != dev;
3087 /* prevents packet_notifier() from calling
3088 * register_prot_hook()
3091 __unregister_prot_hook(sk, true);
3093 dev_curr = po->prot_hook.dev;
3095 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3099 BUG_ON(po->running);
3101 po->prot_hook.type = proto;
3103 if (unlikely(unlisted)) {
3105 po->prot_hook.dev = NULL;
3107 packet_cached_dev_reset(po);
3109 po->prot_hook.dev = dev;
3110 po->ifindex = dev ? dev->ifindex : 0;
3111 packet_cached_dev_assign(po, dev);
3117 if (proto == 0 || !need_rehook)
3120 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3121 register_prot_hook(sk);
3123 sk->sk_err = ENETDOWN;
3124 if (!sock_flag(sk, SOCK_DEAD))
3125 sk->sk_error_report(sk);
3130 spin_unlock(&po->bind_lock);
3136 * Bind a packet socket to a device
3139 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3142 struct sock *sk = sock->sk;
3143 char name[sizeof(uaddr->sa_data) + 1];
3149 if (addr_len != sizeof(struct sockaddr))
3151 /* uaddr->sa_data comes from userspace, it's not guaranteed to be
3152 * zero-terminated.
3153 */
3154 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3155 name[sizeof(uaddr->sa_data)] = 0;
3157 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3160 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3162 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3163 struct sock *sk = sock->sk;
3169 if (addr_len < sizeof(struct sockaddr_ll))
3171 if (sll->sll_family != AF_PACKET)
3174 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3175 sll->sll_protocol ? : pkt_sk(sk)->num);
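/*
 * The matching userspace bind() fills a struct sockaddr_ll with just the
 * protocol and interface index (illustrative sketch; error handling and
 * header includes omitted, "eth0" is an example name):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */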
3178 static struct proto packet_proto = {
3180 .owner = THIS_MODULE,
3181 .obj_size = sizeof(struct packet_sock),
3185 * Create a packet socket (SOCK_RAW, SOCK_DGRAM or the legacy SOCK_PACKET).
3188 static int packet_create(struct net *net, struct socket *sock, int protocol,
3192 struct packet_sock *po;
3193 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3196 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3198 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3199 sock->type != SOCK_PACKET)
3200 return -ESOCKTNOSUPPORT;
3202 sock->state = SS_UNCONNECTED;
3205 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3209 sock->ops = &packet_ops;
3210 if (sock->type == SOCK_PACKET)
3211 sock->ops = &packet_ops_spkt;
3213 sock_init_data(sock, sk);
3216 sk->sk_family = PF_PACKET;
3218 po->xmit = dev_queue_xmit;
3220 err = packet_alloc_pending(po);
3224 packet_cached_dev_reset(po);
3226 sk->sk_destruct = packet_sock_destruct;
3227 sk_refcnt_debug_inc(sk);
3230 * Attach a protocol block
3233 spin_lock_init(&po->bind_lock);
3234 mutex_init(&po->pg_vec_lock);
3235 po->rollover = NULL;
3236 po->prot_hook.func = packet_rcv;
3238 if (sock->type == SOCK_PACKET)
3239 po->prot_hook.func = packet_rcv_spkt;
3241 po->prot_hook.af_packet_priv = sk;
3244 po->prot_hook.type = proto;
3245 __register_prot_hook(sk);
3248 mutex_lock(&net->packet.sklist_lock);
3249 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3250 mutex_unlock(&net->packet.sklist_lock);
3253 sock_prot_inuse_add(net, &packet_proto, 1);
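/*
 * Seen from userspace, the three socket types accepted above map to
 * (illustrative; CAP_NET_RAW is required, as checked at the top of
 * packet_create()):
 *
 *	int raw    = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL)); // L2 header visible
 *	int cooked = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));  // L2 header handled by the kernel
 *	// SOCK_PACKET is the obsolete pre-2.2 interface, kept for compatibility.
 */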
3264 * Pull a packet from our receive queue and hand it to the user.
3265 * If necessary we block.
3268 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3271 struct sock *sk = sock->sk;
3272 struct sk_buff *skb;
3274 int vnet_hdr_len = 0;
3275 unsigned int origlen = 0;
3278 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3282 /* What error should we return now? EUNATTACH? */
3283 if (pkt_sk(sk)->ifindex < 0)
3287 if (flags & MSG_ERRQUEUE) {
3288 err = sock_recv_errqueue(sk, msg, len,
3289 SOL_PACKET, PACKET_TX_TIMESTAMP);
3294 * Call the generic datagram receiver. This handles all sorts
3295 * of horrible races and re-entrancy so we can forget about it
3296 * in the protocol layers.
3298 * Now it will return ENETDOWN, if the device has just gone down,
3299 * but then it will block.
3302 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3305 * An error occurred so return it. Because skb_recv_datagram()
3306 * handles the blocking we don't see and worry about blocking
3307 * retries.
3308 */
3313 if (pkt_sk(sk)->pressure)
3314 packet_rcv_has_room(pkt_sk(sk), NULL);
3316 if (pkt_sk(sk)->has_vnet_hdr) {
3317 err = packet_rcv_vnet(msg, skb, &len);
3320 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3323 /* You lose any data beyond the buffer you gave. If it worries
3324 * a user program, it can ask the device for its MTU
3325 * anyway.
3326 */
3330 msg->msg_flags |= MSG_TRUNC;
3333 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3337 if (sock->type != SOCK_PACKET) {
3338 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3340 /* Original length was stored in sockaddr_ll fields */
3341 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3342 sll->sll_family = AF_PACKET;
3343 sll->sll_protocol = skb->protocol;
3346 sock_recv_ts_and_drops(msg, sk, skb);
3348 if (msg->msg_name) {
3351 /* If the address length field is there to be filled
3352 * in, we fill it in now.
3354 if (sock->type == SOCK_PACKET) {
3355 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3356 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3357 copy_len = msg->msg_namelen;
3359 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3361 msg->msg_namelen = sll->sll_halen +
3362 offsetof(struct sockaddr_ll, sll_addr);
3363 copy_len = msg->msg_namelen;
3364 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3365 memset(msg->msg_name +
3366 offsetof(struct sockaddr_ll, sll_addr),
3367 0, sizeof(sll->sll_addr));
3368 msg->msg_namelen = sizeof(struct sockaddr_ll);
3371 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3374 if (pkt_sk(sk)->auxdata) {
3375 struct tpacket_auxdata aux;
3377 aux.tp_status = TP_STATUS_USER;
3378 if (skb->ip_summed == CHECKSUM_PARTIAL)
3379 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3380 else if (skb->pkt_type != PACKET_OUTGOING &&
3381 (skb->ip_summed == CHECKSUM_COMPLETE ||
3382 skb_csum_unnecessary(skb)))
3383 aux.tp_status |= TP_STATUS_CSUM_VALID;
3385 aux.tp_len = origlen;
3386 aux.tp_snaplen = skb->len;
3388 aux.tp_net = skb_network_offset(skb);
3389 if (skb_vlan_tag_present(skb)) {
3390 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3391 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3392 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3394 aux.tp_vlan_tci = 0;
3395 aux.tp_vlan_tpid = 0;
3397 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3401 * Free or return the buffer as appropriate. Again this
3402 * hides all the races and re-entrancy issues from us.
3404 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3407 skb_free_datagram(sk, skb);
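/*
 * The PACKET_AUXDATA control message filled in above is read like any other
 * cmsg (illustrative sketch; "msg" is the struct msghdr passed to recvmsg()
 * with a large enough msg_control buffer):
 *
 *	struct cmsghdr *cmsg;
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_status, aux->tp_len, aux->tp_vlan_tci, ...
 *		}
 *	}
 */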
3412 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3415 struct net_device *dev;
3416 struct sock *sk = sock->sk;
3421 uaddr->sa_family = AF_PACKET;
3422 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3424 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3426 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3429 return sizeof(*uaddr);
3432 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3435 struct net_device *dev;
3436 struct sock *sk = sock->sk;
3437 struct packet_sock *po = pkt_sk(sk);
3438 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3443 sll->sll_family = AF_PACKET;
3444 sll->sll_ifindex = po->ifindex;
3445 sll->sll_protocol = po->num;
3446 sll->sll_pkttype = 0;
3448 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3450 sll->sll_hatype = dev->type;
3451 sll->sll_halen = dev->addr_len;
3452 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3454 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3459 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3462 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3466 case PACKET_MR_MULTICAST:
3467 if (i->alen != dev->addr_len)
3470 return dev_mc_add(dev, i->addr);
3472 return dev_mc_del(dev, i->addr);
3474 case PACKET_MR_PROMISC:
3475 return dev_set_promiscuity(dev, what);
3476 case PACKET_MR_ALLMULTI:
3477 return dev_set_allmulti(dev, what);
3478 case PACKET_MR_UNICAST:
3479 if (i->alen != dev->addr_len)
3482 return dev_uc_add(dev, i->addr);
3484 return dev_uc_del(dev, i->addr);
3492 static void packet_dev_mclist_delete(struct net_device *dev,
3493 struct packet_mclist **mlp)
3495 struct packet_mclist *ml;
3497 while ((ml = *mlp) != NULL) {
3498 if (ml->ifindex == dev->ifindex) {
3499 packet_dev_mc(dev, ml, -1);
3507 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3509 struct packet_sock *po = pkt_sk(sk);
3510 struct packet_mclist *ml, *i;
3511 struct net_device *dev;
3517 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3522 if (mreq->mr_alen > dev->addr_len)
3526 i = kmalloc(sizeof(*i), GFP_KERNEL);
3531 for (ml = po->mclist; ml; ml = ml->next) {
3532 if (ml->ifindex == mreq->mr_ifindex &&
3533 ml->type == mreq->mr_type &&
3534 ml->alen == mreq->mr_alen &&
3535 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3537 /* Free the new element ... */
3543 i->type = mreq->mr_type;
3544 i->ifindex = mreq->mr_ifindex;
3545 i->alen = mreq->mr_alen;
3546 memcpy(i->addr, mreq->mr_address, i->alen);
3547 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3549 i->next = po->mclist;
3551 err = packet_dev_mc(dev, i, 1);
3553 po->mclist = i->next;
3562 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3564 struct packet_mclist *ml, **mlp;
3568 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3569 if (ml->ifindex == mreq->mr_ifindex &&
3570 ml->type == mreq->mr_type &&
3571 ml->alen == mreq->mr_alen &&
3572 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3573 if (--ml->count == 0) {
3574 struct net_device *dev;
3576 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3578 packet_dev_mc(dev, ml, -1);
3588 static void packet_flush_mclist(struct sock *sk)
3590 struct packet_sock *po = pkt_sk(sk);
3591 struct packet_mclist *ml;
3597 while ((ml = po->mclist) != NULL) {
3598 struct net_device *dev;
3600 po->mclist = ml->next;
3601 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3603 packet_dev_mc(dev, ml, -1);
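/*
 * The multicast list above is driven by PACKET_ADD_MEMBERSHIP /
 * PACKET_DROP_MEMBERSHIP. For example, to keep the interface promiscuous
 * for the lifetime of the socket (illustrative sketch):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */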
3610 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3612 struct sock *sk = sock->sk;
3613 struct packet_sock *po = pkt_sk(sk);
3616 if (level != SOL_PACKET)
3617 return -ENOPROTOOPT;
3620 case PACKET_ADD_MEMBERSHIP:
3621 case PACKET_DROP_MEMBERSHIP:
3623 struct packet_mreq_max mreq;
3625 memset(&mreq, 0, sizeof(mreq));
3626 if (len < sizeof(struct packet_mreq))
3628 if (len > sizeof(mreq))
3630 if (copy_from_user(&mreq, optval, len))
3632 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3634 if (optname == PACKET_ADD_MEMBERSHIP)
3635 ret = packet_mc_add(sk, &mreq);
3637 ret = packet_mc_drop(sk, &mreq);
3641 case PACKET_RX_RING:
3642 case PACKET_TX_RING:
3644 union tpacket_req_u req_u;
3648 switch (po->tp_version) {
3651 len = sizeof(req_u.req);
3655 len = sizeof(req_u.req3);
3661 if (copy_from_user(&req_u.req, optval, len))
3664 ret = packet_set_ring(sk, &req_u, 0,
3665 optname == PACKET_TX_RING);
3670 case PACKET_COPY_THRESH:
3674 if (optlen != sizeof(val))
3676 if (copy_from_user(&val, optval, sizeof(val)))
3679 pkt_sk(sk)->copy_thresh = val;
3682 case PACKET_VERSION:
3686 if (optlen != sizeof(val))
3688 if (copy_from_user(&val, optval, sizeof(val)))
3699 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3702 po->tp_version = val;
3708 case PACKET_RESERVE:
3712 if (optlen != sizeof(val))
3714 if (copy_from_user(&val, optval, sizeof(val)))
3719 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3722 po->tp_reserve = val;
3732 if (optlen != sizeof(val))
3734 if (copy_from_user(&val, optval, sizeof(val)))
3738 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3741 po->tp_loss = !!val;
3747 case PACKET_AUXDATA:
3751 if (optlen < sizeof(val))
3753 if (copy_from_user(&val, optval, sizeof(val)))
3757 po->auxdata = !!val;
3761 case PACKET_ORIGDEV:
3765 if (optlen < sizeof(val))
3767 if (copy_from_user(&val, optval, sizeof(val)))
3771 po->origdev = !!val;
3775 case PACKET_VNET_HDR:
3779 if (sock->type != SOCK_RAW)
3781 if (optlen < sizeof(val))
3783 if (copy_from_user(&val, optval, sizeof(val)))
3787 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3790 po->has_vnet_hdr = !!val;
3796 case PACKET_TIMESTAMP:
3800 if (optlen != sizeof(val))
3802 if (copy_from_user(&val, optval, sizeof(val)))
3805 po->tp_tstamp = val;
3812 if (optlen != sizeof(val))
3814 if (copy_from_user(&val, optval, sizeof(val)))
3817 return fanout_add(sk, val & 0xffff, val >> 16);
3819 case PACKET_FANOUT_DATA:
3824 return fanout_set_data(po, optval, optlen);
3826 case PACKET_IGNORE_OUTGOING:
3830 if (optlen != sizeof(val))
3832 if (copy_from_user(&val, optval, sizeof(val)))
3834 if (val < 0 || val > 1)
3837 po->prot_hook.ignore_outgoing = !!val;
3840 case PACKET_TX_HAS_OFF:
3844 if (optlen != sizeof(val))
3846 if (copy_from_user(&val, optval, sizeof(val)))
3850 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3853 po->tp_tx_has_off = !!val;
3859 case PACKET_QDISC_BYPASS:
3863 if (optlen != sizeof(val))
3865 if (copy_from_user(&val, optval, sizeof(val)))
3868 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3872 return -ENOPROTOOPT;
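/*
 * A typical ring setup goes through packet_setsockopt() twice before the
 * mmap() further down: once for PACKET_VERSION, once for PACKET_RX_RING
 * (illustrative sketch, TPACKET_V3 block mode; the sizes are examples only
 * and must satisfy the checks in packet_set_ring()):
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 22,
 *		.tp_block_nr	   = 64,
 *		.tp_frame_size	   = 1 << 11,
 *		.tp_frame_nr	   = ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,	// block retire timeout, ms
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */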
3876 static int packet_getsockopt(struct socket *sock, int level, int optname,
3877 char __user *optval, int __user *optlen)
3880 int val, lv = sizeof(val);
3881 struct sock *sk = sock->sk;
3882 struct packet_sock *po = pkt_sk(sk);
3884 union tpacket_stats_u st;
3885 struct tpacket_rollover_stats rstats;
3887 if (level != SOL_PACKET)
3888 return -ENOPROTOOPT;
3890 if (get_user(len, optlen))
3897 case PACKET_STATISTICS:
3898 spin_lock_bh(&sk->sk_receive_queue.lock);
3899 memcpy(&st, &po->stats, sizeof(st));
3900 memset(&po->stats, 0, sizeof(po->stats));
3901 spin_unlock_bh(&sk->sk_receive_queue.lock);
3903 if (po->tp_version == TPACKET_V3) {
3904 lv = sizeof(struct tpacket_stats_v3);
3905 st.stats3.tp_packets += st.stats3.tp_drops;
3908 lv = sizeof(struct tpacket_stats);
3909 st.stats1.tp_packets += st.stats1.tp_drops;
3914 case PACKET_AUXDATA:
3917 case PACKET_ORIGDEV:
3920 case PACKET_VNET_HDR:
3921 val = po->has_vnet_hdr;
3923 case PACKET_VERSION:
3924 val = po->tp_version;
3927 if (len > sizeof(int))
3929 if (len < sizeof(int))
3931 if (copy_from_user(&val, optval, len))
3935 val = sizeof(struct tpacket_hdr);
3938 val = sizeof(struct tpacket2_hdr);
3941 val = sizeof(struct tpacket3_hdr);
3947 case PACKET_RESERVE:
3948 val = po->tp_reserve;
3953 case PACKET_TIMESTAMP:
3954 val = po->tp_tstamp;
3958 ((u32)po->fanout->id |
3959 ((u32)po->fanout->type << 16) |
3960 ((u32)po->fanout->flags << 24)) :
3963 case PACKET_IGNORE_OUTGOING:
3964 val = po->prot_hook.ignore_outgoing;
3966 case PACKET_ROLLOVER_STATS:
3969 rstats.tp_all = atomic_long_read(&po->rollover->num);
3970 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3971 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3973 lv = sizeof(rstats);
3975 case PACKET_TX_HAS_OFF:
3976 val = po->tp_tx_has_off;
3978 case PACKET_QDISC_BYPASS:
3979 val = packet_use_direct_xmit(po);
3982 return -ENOPROTOOPT;
3987 if (put_user(len, optlen))
3989 if (copy_to_user(optval, data, len))
3995 #ifdef CONFIG_COMPAT
3996 static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3997 char __user *optval, unsigned int optlen)
3999 struct packet_sock *po = pkt_sk(sock->sk);
4001 if (level != SOL_PACKET)
4002 return -ENOPROTOOPT;
4004 if (optname == PACKET_FANOUT_DATA &&
4005 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4006 optval = (char __user *)get_compat_bpf_fprog(optval);
4009 optlen = sizeof(struct sock_fprog);
4012 return packet_setsockopt(sock, level, optname, optval, optlen);
4016 static int packet_notifier(struct notifier_block *this,
4017 unsigned long msg, void *ptr)
4020 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4021 struct net *net = dev_net(dev);
4024 sk_for_each_rcu(sk, &net->packet.sklist) {
4025 struct packet_sock *po = pkt_sk(sk);
4028 case NETDEV_UNREGISTER:
4030 packet_dev_mclist_delete(dev, &po->mclist);
4034 if (dev->ifindex == po->ifindex) {
4035 spin_lock(&po->bind_lock);
4037 __unregister_prot_hook(sk, false);
4038 sk->sk_err = ENETDOWN;
4039 if (!sock_flag(sk, SOCK_DEAD))
4040 sk->sk_error_report(sk);
4042 if (msg == NETDEV_UNREGISTER) {
4043 packet_cached_dev_reset(po);
4045 if (po->prot_hook.dev)
4046 dev_put(po->prot_hook.dev);
4047 po->prot_hook.dev = NULL;
4049 spin_unlock(&po->bind_lock);
4053 if (dev->ifindex == po->ifindex) {
4054 spin_lock(&po->bind_lock);
4056 register_prot_hook(sk);
4057 spin_unlock(&po->bind_lock);
4067 static int packet_ioctl(struct socket *sock, unsigned int cmd,
4070 struct sock *sk = sock->sk;
4075 int amount = sk_wmem_alloc_get(sk);
4077 return put_user(amount, (int __user *)arg);
4081 struct sk_buff *skb;
4084 spin_lock_bh(&sk->sk_receive_queue.lock);
4085 skb = skb_peek(&sk->sk_receive_queue);
4088 spin_unlock_bh(&sk->sk_receive_queue.lock);
4089 return put_user(amount, (int __user *)arg);
4099 case SIOCGIFBRDADDR:
4100 case SIOCSIFBRDADDR:
4101 case SIOCGIFNETMASK:
4102 case SIOCSIFNETMASK:
4103 case SIOCGIFDSTADDR:
4104 case SIOCSIFDSTADDR:
4106 return inet_dgram_ops.ioctl(sock, cmd, arg);
4110 return -ENOIOCTLCMD;
4115 static __poll_t packet_poll(struct file *file, struct socket *sock,
4118 struct sock *sk = sock->sk;
4119 struct packet_sock *po = pkt_sk(sk);
4120 __poll_t mask = datagram_poll(file, sock, wait);
4122 spin_lock_bh(&sk->sk_receive_queue.lock);
4123 if (po->rx_ring.pg_vec) {
4124 if (!packet_previous_rx_frame(po, &po->rx_ring,
4126 mask |= EPOLLIN | EPOLLRDNORM;
4128 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4130 spin_unlock_bh(&sk->sk_receive_queue.lock);
4131 spin_lock_bh(&sk->sk_write_queue.lock);
4132 if (po->tx_ring.pg_vec) {
4133 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4134 mask |= EPOLLOUT | EPOLLWRNORM;
4136 spin_unlock_bh(&sk->sk_write_queue.lock);
4141 /* Dirty? Well, I still did not learn a better way to account
4142 * for user mmaps.
4143 */
4145 static void packet_mm_open(struct vm_area_struct *vma)
4147 struct file *file = vma->vm_file;
4148 struct socket *sock = file->private_data;
4149 struct sock *sk = sock->sk;
4152 atomic_inc(&pkt_sk(sk)->mapped);
4155 static void packet_mm_close(struct vm_area_struct *vma)
4157 struct file *file = vma->vm_file;
4158 struct socket *sock = file->private_data;
4159 struct sock *sk = sock->sk;
4162 atomic_dec(&pkt_sk(sk)->mapped);
4165 static const struct vm_operations_struct packet_mmap_ops = {
4166 .open = packet_mm_open,
4167 .close = packet_mm_close,
4170 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4175 for (i = 0; i < len; i++) {
4176 if (likely(pg_vec[i].buffer)) {
4177 if (is_vmalloc_addr(pg_vec[i].buffer))
4178 vfree(pg_vec[i].buffer);
4180 free_pages((unsigned long)pg_vec[i].buffer,
4182 pg_vec[i].buffer = NULL;
4188 static char *alloc_one_pg_vec_page(unsigned long order)
4191 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4192 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4194 buffer = (char *) __get_free_pages(gfp_flags, order);
4198 /* __get_free_pages failed, fall back to vmalloc */
4199 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4203 /* vmalloc failed, let's dig into swap here */
4204 gfp_flags &= ~__GFP_NORETRY;
4205 buffer = (char *) __get_free_pages(gfp_flags, order);
4209 /* complete and utter failure */
4213 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4215 unsigned int block_nr = req->tp_block_nr;
4219 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4220 if (unlikely(!pg_vec))
4223 for (i = 0; i < block_nr; i++) {
4224 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
4225 if (unlikely(!pg_vec[i].buffer))
4226 goto out_free_pgvec;
4233 free_pg_vec(pg_vec, order, block_nr);
4238 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
4239 int closing, int tx_ring)
4241 struct pgv *pg_vec = NULL;
4242 struct packet_sock *po = pkt_sk(sk);
4243 int was_running, order = 0;
4244 struct packet_ring_buffer *rb;
4245 struct sk_buff_head *rb_queue;
4248 /* Alias added to keep code churn minimal */
4249 struct tpacket_req *req = &req_u->req;
4251 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4252 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
4256 if (atomic_read(&po->mapped))
4258 if (packet_read_pending(rb))
4262 if (req->tp_block_nr) {
4263 unsigned int min_frame_size;
4265 /* Sanity tests and some calculations */
4267 if (unlikely(rb->pg_vec))
4270 switch (po->tp_version) {
4272 po->tp_hdrlen = TPACKET_HDRLEN;
4275 po->tp_hdrlen = TPACKET2_HDRLEN;
4278 po->tp_hdrlen = TPACKET3_HDRLEN;
4283 if (unlikely((int)req->tp_block_size <= 0))
4285 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
4287 min_frame_size = po->tp_hdrlen + po->tp_reserve;
4288 if (po->tp_version >= TPACKET_V3 &&
4289 req->tp_block_size <
4290 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
4292 if (unlikely(req->tp_frame_size < min_frame_size))
4294 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
4297 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4298 if (unlikely(rb->frames_per_block == 0))
4300 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
4302 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4307 order = get_order(req->tp_block_size);
4308 pg_vec = alloc_pg_vec(req, order);
4309 if (unlikely(!pg_vec))
4311 switch (po->tp_version) {
4313 /* Block transmit is not supported yet */
4315 init_prb_bdqc(po, rb, pg_vec, req_u);
4317 struct tpacket_req3 *req3 = &req_u->req3;
4319 if (req3->tp_retire_blk_tov ||
4320 req3->tp_sizeof_priv ||
4321 req3->tp_feature_req_word) {
4334 if (unlikely(req->tp_frame_nr))
4339 /* Detach socket from network */
4340 spin_lock(&po->bind_lock);
4341 was_running = po->running;
4345 __unregister_prot_hook(sk, false);
4347 spin_unlock(&po->bind_lock);
4352 mutex_lock(&po->pg_vec_lock);
4353 if (closing || atomic_read(&po->mapped) == 0) {
4355 spin_lock_bh(&rb_queue->lock);
4356 swap(rb->pg_vec, pg_vec);
4357 rb->frame_max = (req->tp_frame_nr - 1);
4359 rb->frame_size = req->tp_frame_size;
4360 spin_unlock_bh(&rb_queue->lock);
4362 swap(rb->pg_vec_order, order);
4363 swap(rb->pg_vec_len, req->tp_block_nr);
4365 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4366 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4367 tpacket_rcv : packet_rcv;
4368 skb_queue_purge(rb_queue);
4369 if (atomic_read(&po->mapped))
4370 pr_err("packet_mmap: vma is busy: %d\n",
4371 atomic_read(&po->mapped));
4373 mutex_unlock(&po->pg_vec_lock);
4375 spin_lock(&po->bind_lock);
4378 register_prot_hook(sk);
4380 spin_unlock(&po->bind_lock);
4381 if (pg_vec && (po->tp_version > TPACKET_V2)) {
4382 /* Because we don't support block-based V3 on tx-ring */
4384 prb_shutdown_retire_blk_timer(po, rb_queue);
4388 free_pg_vec(pg_vec, order, req->tp_block_nr);
4393 static int packet_mmap(struct file *file, struct socket *sock,
4394 struct vm_area_struct *vma)
4396 struct sock *sk = sock->sk;
4397 struct packet_sock *po = pkt_sk(sk);
4398 unsigned long size, expected_size;
4399 struct packet_ring_buffer *rb;
4400 unsigned long start;
4407 mutex_lock(&po->pg_vec_lock);
4410 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4412 expected_size += rb->pg_vec_len
4418 if (expected_size == 0)
4421 size = vma->vm_end - vma->vm_start;
4422 if (size != expected_size)
4425 start = vma->vm_start;
4426 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4427 if (rb->pg_vec == NULL)
4430 for (i = 0; i < rb->pg_vec_len; i++) {
4432 void *kaddr = rb->pg_vec[i].buffer;
4435 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4436 page = pgv_to_page(kaddr);
4437 err = vm_insert_page(vma, start, page);
4446 atomic_inc(&po->mapped);
4447 vma->vm_ops = &packet_mmap_ops;
4451 mutex_unlock(&po->pg_vec_lock);
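/*
 * Once packet_set_ring() has succeeded, userspace maps the ring(s) in a
 * single call (rx first, then tx, in the order the loop above walks them)
 * and waits for frames with poll() (illustrative sketch for a TPACKET_V2
 * ring, RX only, no wrap-around handling; a TPACKET_V3 ring instead hands
 * out whole blocks described by struct tpacket_block_desc):
 *
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct tpacket2_hdr *hdr = ring + i * req.tp_frame_size;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// packet data: (void *)hdr + hdr->tp_mac, hdr->tp_snaplen bytes
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back to the kernel
 */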
4455 static const struct proto_ops packet_ops_spkt = {
4456 .family = PF_PACKET,
4457 .owner = THIS_MODULE,
4458 .release = packet_release,
4459 .bind = packet_bind_spkt,
4460 .connect = sock_no_connect,
4461 .socketpair = sock_no_socketpair,
4462 .accept = sock_no_accept,
4463 .getname = packet_getname_spkt,
4464 .poll = datagram_poll,
4465 .ioctl = packet_ioctl,
4466 .gettstamp = sock_gettstamp,
4467 .listen = sock_no_listen,
4468 .shutdown = sock_no_shutdown,
4469 .setsockopt = sock_no_setsockopt,
4470 .getsockopt = sock_no_getsockopt,
4471 .sendmsg = packet_sendmsg_spkt,
4472 .recvmsg = packet_recvmsg,
4473 .mmap = sock_no_mmap,
4474 .sendpage = sock_no_sendpage,
4477 static const struct proto_ops packet_ops = {
4478 .family = PF_PACKET,
4479 .owner = THIS_MODULE,
4480 .release = packet_release,
4481 .bind = packet_bind,
4482 .connect = sock_no_connect,
4483 .socketpair = sock_no_socketpair,
4484 .accept = sock_no_accept,
4485 .getname = packet_getname,
4486 .poll = packet_poll,
4487 .ioctl = packet_ioctl,
4488 .gettstamp = sock_gettstamp,
4489 .listen = sock_no_listen,
4490 .shutdown = sock_no_shutdown,
4491 .setsockopt = packet_setsockopt,
4492 .getsockopt = packet_getsockopt,
4493 #ifdef CONFIG_COMPAT
4494 .compat_setsockopt = compat_packet_setsockopt,
4496 .sendmsg = packet_sendmsg,
4497 .recvmsg = packet_recvmsg,
4498 .mmap = packet_mmap,
4499 .sendpage = sock_no_sendpage,
4502 static const struct net_proto_family packet_family_ops = {
4503 .family = PF_PACKET,
4504 .create = packet_create,
4505 .owner = THIS_MODULE,
4508 static struct notifier_block packet_netdev_notifier = {
4509 .notifier_call = packet_notifier,
4512 #ifdef CONFIG_PROC_FS
4514 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4517 struct net *net = seq_file_net(seq);
4520 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4523 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4525 struct net *net = seq_file_net(seq);
4526 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4529 static void packet_seq_stop(struct seq_file *seq, void *v)
4535 static int packet_seq_show(struct seq_file *seq, void *v)
4537 if (v == SEQ_START_TOKEN)
4538 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4540 struct sock *s = sk_entry(v);
4541 const struct packet_sock *po = pkt_sk(s);
4544 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4546 refcount_read(&s->sk_refcnt),
4551 atomic_read(&s->sk_rmem_alloc),
4552 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4559 static const struct seq_operations packet_seq_ops = {
4560 .start = packet_seq_start,
4561 .next = packet_seq_next,
4562 .stop = packet_seq_stop,
4563 .show = packet_seq_show,
4567 static int __net_init packet_net_init(struct net *net)
4569 mutex_init(&net->packet.sklist_lock);
4570 INIT_HLIST_HEAD(&net->packet.sklist);
4572 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4573 sizeof(struct seq_net_private)))
4579 static void __net_exit packet_net_exit(struct net *net)
4581 remove_proc_entry("packet", net->proc_net);
4582 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4585 static struct pernet_operations packet_net_ops = {
4586 .init = packet_net_init,
4587 .exit = packet_net_exit,
4591 static void __exit packet_exit(void)
4593 unregister_netdevice_notifier(&packet_netdev_notifier);
4594 unregister_pernet_subsys(&packet_net_ops);
4595 sock_unregister(PF_PACKET);
4596 proto_unregister(&packet_proto);
4599 static int __init packet_init(void)
4603 rc = proto_register(&packet_proto, 0);
4606 rc = sock_register(&packet_family_ops);
4609 rc = register_pernet_subsys(&packet_net_ops);
4612 rc = register_netdevice_notifier(&packet_netdev_notifier);
4619 unregister_pernet_subsys(&packet_net_ops);
4621 sock_unregister(PF_PACKET);
4623 proto_unregister(&packet_proto);
4628 module_init(packet_init);
4629 module_exit(packet_exit);
4630 MODULE_LICENSE("GPL");
4631 MODULE_ALIAS_NETPROTO(PF_PACKET);