/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in all user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap over
 * the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
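
/*
 * Illustrative usage sketch (not part of the original file): a transport
 * that must keep making progress under memory pressure, e.g. a
 * swap-over-network socket, would opt in to the PFMEMALLOC reserves with
 *
 *	sk_set_memalloc(sk);	// before the socket starts carrying swap I/O
 *	...
 *	sk_clear_memalloc(sk);	// once the last swapfile using it goes away
 *
 * The static branch toggled above keeps the fast path free of memalloc
 * checks while no such socket exists.
 */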

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}
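
/*
 * Worked example (illustrative, not part of the original file): with
 * HZ == 1000, a user timeval of { .tv_sec = 1, .tv_usec = 500000 }
 * becomes 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) == 1500
 * jiffies in *timeo_p; rounding up means a non-zero user timeout never
 * truncates to zero jiffies.
 */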

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
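
/*
 * Note (added for clarity, not in the original): when the socket is owned
 * by a process (lock_sock() held), __sk_receive_skb() queues the skb on
 * sk->sk_backlog instead of processing it; release_sock() later drains
 * the backlog through sk_backlog_rcv(). The mutex_acquire()/mutex_release()
 * pair above only models this trylock-style ownership for lockdep.
 */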

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (val != sk->sk_mark) {
			sk->sk_mark = val;
			sk_dst_reset(sk);
		}
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
		} else if (copy_from_user(&sk_txtime, optval,
			   sizeof(struct sock_txtime))) {
			ret = -EFAULT;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
		} else {
			sock_valbool_flag(sk, SOCK_TXTIME, true);
			sk->sk_clockid = sk_txtime.clockid;
			sk->sk_txtime_deadline_mode =
				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
			sk->sk_txtime_report_errors =
				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
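
/*
 * Illustrative userspace counterpart (an assumption for clarity, not part
 * of this file): the SO_RCVBUF case above is what ultimately services
 *
 *	int val = 131072;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *
 * after which getsockopt(SO_RCVBUF) reports the doubled value the kernel
 * actually reserved, per the comment at set_rcvbuf.
 */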

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* 32bit version */
		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
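
/*
 * Illustrative userspace counterpart (an assumption, not part of this
 * file): the SO_PEERCRED case above services, on a connected AF_UNIX
 * socket,
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *
 * with pid/uid/gid translated into the caller's namespaces by
 * cred_to_ucred().
 */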

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err	   = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
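
/*
 * Illustrative caller pattern (an assumption, not from this file), in the
 * style of the TCP accept path:
 *
 *	newsk = sk_clone_lock(listener, GFP_ATOMIC);
 *	if (newsk) {
 *		// ... protocol specific initialisation ...
 *		bh_unlock_sock(newsk);	// caller must unlock, even on error
 *	}
 */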

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
1798
1da177e4
LT
1799/*
1800 * Simple resource managers for sockets.
1801 */
1802
1803
4ec93edb
YH
1804/*
1805 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1806 */
1807void sock_wfree(struct sk_buff *skb)
1808{
1809 struct sock *sk = skb->sk;
d99927f4 1810 unsigned int len = skb->truesize;
1da177e4 1811
d99927f4
ED
1812 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813 /*
1814 * Keep a reference on sk_wmem_alloc, this will be released
1815 * after sk_write_space() call
1816 */
14afee4b 1817 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1818 sk->sk_write_space(sk);
d99927f4
ED
1819 len = 1;
1820 }
2b85a34e 1821 /*
d99927f4
ED
1822 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1823 * could not do because of in-flight packets
2b85a34e 1824 */
14afee4b 1825 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1826 __sk_free(sk);
1da177e4 1827}
2a91525c 1828EXPORT_SYMBOL(sock_wfree);
1da177e4 1829
1d2077ac
ED
1830/* This variant of sock_wfree() is used by TCP,
1831 * since it sets SOCK_USE_WRITE_QUEUE.
1832 */
1833void __sock_wfree(struct sk_buff *skb)
1834{
1835 struct sock *sk = skb->sk;
1836
14afee4b 1837 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1838 __sk_free(sk);
1839}
1840
9e17f8a4
ED
1841void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842{
1843 skb_orphan(skb);
1844 skb->sk = sk;
1845#ifdef CONFIG_INET
1846 if (unlikely(!sk_fullsock(sk))) {
1847 skb->destructor = sock_edemux;
1848 sock_hold(sk);
1849 return;
1850 }
1851#endif
1852 skb->destructor = sock_wfree;
1853 skb_set_hash_from_sk(skb, sk);
1854 /*
1855 * We used to take a refcount on sk, but the following operation
1856 * is enough to guarantee that sk_free() won't free this sock until
1857 * all in-flight packets have completed
1858 */
14afee4b 1859 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1860}
1861EXPORT_SYMBOL(skb_set_owner_w);
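/*
 * Example (hypothetical caller, not part of this file): how a protocol
 * might pair alloc_skb() with skb_set_owner_w() so the packet is charged
 * to the sending socket.  Once skb_set_owner_w() has run, kfree_skb() on
 * any path invokes sock_wfree(), which returns the truesize to
 * sk_wmem_alloc and may wake the writer.
 */
static struct sk_buff *example_build_tx_skb(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (!skb)
		return NULL;
	skb_set_owner_w(skb, sk);	/* destructor is now sock_wfree() */
	return skb;
}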
1862
1d2077ac
ED
1863/* This helper is used by netem, as it can hold packets in its
1864 * delay queue. We want to allow the owner socket to send more
1865 * packets, as if they were already TX completed by a typical driver.
1866 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1867 * rely on it (sch_fq for example).
1d2077ac 1868 */
f2f872f9
ED
1869void skb_orphan_partial(struct sk_buff *skb)
1870{
f6ba8d33 1871 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1872 return;
1873
f2f872f9
ED
1874 if (skb->destructor == sock_wfree
1875#ifdef CONFIG_INET
1876 || skb->destructor == tcp_wfree
1877#endif
1878 ) {
f6ba8d33
ED
1879 struct sock *sk = skb->sk;
1880
41c6d650 1881 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1882 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1883 skb->destructor = sock_efree;
1884 }
f2f872f9
ED
1885 } else {
1886 skb_orphan(skb);
1887 }
1888}
1889EXPORT_SYMBOL(skb_orphan_partial);
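/*
 * Example (hypothetical qdisc-style enqueue, not part of this file):
 * a scheduler that may hold packets for a long time can call
 * skb_orphan_partial() so the owner socket gets its write budget back
 * immediately, while skb->sk stays valid for flow classification
 * (as sch_fq relies on).
 */
static void example_delay_enqueue(struct sk_buff *skb, struct sk_buff_head *q)
{
	skb_orphan_partial(skb);	/* release wmem, keep skb->sk */
	__skb_queue_tail(q, skb);
}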
1890
4ec93edb
YH
1891/*
1892 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1893 */
1894void sock_rfree(struct sk_buff *skb)
1895{
1896 struct sock *sk = skb->sk;
d361fd59 1897 unsigned int len = skb->truesize;
1da177e4 1898
d361fd59
ED
1899 atomic_sub(len, &sk->sk_rmem_alloc);
1900 sk_mem_uncharge(sk, len);
1da177e4 1901}
2a91525c 1902EXPORT_SYMBOL(sock_rfree);
1da177e4 1903
7768eed8
OH
1904/*
1905 * Buffer destructor for skbs that are not used directly in read or write
1906 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1907 */
62bccb8c
AD
1908void sock_efree(struct sk_buff *skb)
1909{
1910 sock_put(skb->sk);
1911}
1912EXPORT_SYMBOL(sock_efree);
1913
976d0201 1914kuid_t sock_i_uid(struct sock *sk)
1da177e4 1915{
976d0201 1916 kuid_t uid;
1da177e4 1917
f064af1e 1918 read_lock_bh(&sk->sk_callback_lock);
976d0201 1919 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 1920 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1921 return uid;
1922}
2a91525c 1923EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
1924
1925unsigned long sock_i_ino(struct sock *sk)
1926{
1927 unsigned long ino;
1928
f064af1e 1929 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1930 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 1931 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1932 return ino;
1933}
2a91525c 1934EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
1935
1936/*
1937 * Allocate a skb from the socket's send buffer.
1938 */
86a76caf 1939struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1940 gfp_t priority)
1da177e4 1941{
14afee4b 1942 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 1943 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
1944 if (skb) {
1945 skb_set_owner_w(skb, sk);
1946 return skb;
1947 }
1948 }
1949 return NULL;
1950}
2a91525c 1951EXPORT_SYMBOL(sock_wmalloc);
1da177e4 1952
98ba0bd5
WB
1953static void sock_ofree(struct sk_buff *skb)
1954{
1955 struct sock *sk = skb->sk;
1956
1957 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1958}
1959
1960struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1961 gfp_t priority)
1962{
1963 struct sk_buff *skb;
1964
1965 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1966 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1967 sysctl_optmem_max)
1968 return NULL;
1969
1970 skb = alloc_skb(size, priority);
1971 if (!skb)
1972 return NULL;
1973
1974 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1975 skb->sk = sk;
1976 skb->destructor = sock_ofree;
1977 return skb;
1978}
1979
4ec93edb 1980/*
1da177e4 1981 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 1982 */
dd0fc66f 1983void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 1984{
95c96174 1985 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
1986 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1987 void *mem;
1988 /* Do the add first, to avoid the race in case kmalloc
4ec93edb 1989 * might sleep.
1da177e4
LT
1990 */
1991 atomic_add(size, &sk->sk_omem_alloc);
1992 mem = kmalloc(size, priority);
1993 if (mem)
1994 return mem;
1995 atomic_sub(size, &sk->sk_omem_alloc);
1996 }
1997 return NULL;
1998}
2a91525c 1999EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2000
79e88659
DB
2001/* Free an option memory block. Note, we actually want the inline
2002 * here as this allows gcc to detect the nullify and fold away the
2003 * condition entirely.
1da177e4 2004 */
79e88659
DB
2005static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2006 const bool nullify)
1da177e4 2007{
e53da5fb
DM
2008 if (WARN_ON_ONCE(!mem))
2009 return;
79e88659
DB
2010 if (nullify)
2011 kzfree(mem);
2012 else
2013 kfree(mem);
1da177e4
LT
2014 atomic_sub(size, &sk->sk_omem_alloc);
2015}
79e88659
DB
2016
2017void sock_kfree_s(struct sock *sk, void *mem, int size)
2018{
2019 __sock_kfree_s(sk, mem, size, false);
2020}
2a91525c 2021EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2022
79e88659
DB
2023void sock_kzfree_s(struct sock *sk, void *mem, int size)
2024{
2025 __sock_kfree_s(sk, mem, size, true);
2026}
2027EXPORT_SYMBOL(sock_kzfree_s);
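/*
 * Example (hypothetical option handler, not part of this file): option
 * state is charged against sk_omem_alloc via sock_kmalloc() and must be
 * returned with sock_kfree_s()/sock_kzfree_s(), passing the same size so
 * the accounting balances.  The kzfree variant suits key material.
 */
static int example_store_secret(struct sock *sk, const u8 *key, int len)
{
	u8 *copy = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!copy)
		return -ENOBUFS;
	memcpy(copy, key, len);
	/* ... use the key ... */
	sock_kzfree_s(sk, copy, len);	/* zero before free, uncharge omem */
	return 0;
}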
2028
1da177e4
LT
2029/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2030 I think these locks should be removed for datagram sockets.
2031 */
2a91525c 2032static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2033{
2034 DEFINE_WAIT(wait);
2035
9cd3e072 2036 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2037 for (;;) {
2038 if (!timeo)
2039 break;
2040 if (signal_pending(current))
2041 break;
2042 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2043 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2044 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2045 break;
2046 if (sk->sk_shutdown & SEND_SHUTDOWN)
2047 break;
2048 if (sk->sk_err)
2049 break;
2050 timeo = schedule_timeout(timeo);
2051 }
aa395145 2052 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2053 return timeo;
2054}
2055
2056
2057/*
2058 * Generic send/receive buffer handlers
2059 */
2060
4cc7f68d
HX
2061struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2062 unsigned long data_len, int noblock,
28d64271 2063 int *errcode, int max_page_order)
1da177e4 2064{
2e4e4410 2065 struct sk_buff *skb;
1da177e4
LT
2066 long timeo;
2067 int err;
2068
1da177e4 2069 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2070 for (;;) {
1da177e4
LT
2071 err = sock_error(sk);
2072 if (err != 0)
2073 goto failure;
2074
2075 err = -EPIPE;
2076 if (sk->sk_shutdown & SEND_SHUTDOWN)
2077 goto failure;
2078
2e4e4410
ED
2079 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2080 break;
28d64271 2081
9cd3e072 2082 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2083 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2084 err = -EAGAIN;
2085 if (!timeo)
1da177e4 2086 goto failure;
2e4e4410
ED
2087 if (signal_pending(current))
2088 goto interrupted;
2089 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2090 }
2e4e4410
ED
2091 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2092 errcode, sk->sk_allocation);
2093 if (skb)
2094 skb_set_owner_w(skb, sk);
1da177e4
LT
2095 return skb;
2096
2097interrupted:
2098 err = sock_intr_errno(timeo);
2099failure:
2100 *errcode = err;
2101 return NULL;
2102}
4cc7f68d 2103EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2104
4ec93edb 2105struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2106 int noblock, int *errcode)
2107{
28d64271 2108 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2109}
2a91525c 2110EXPORT_SYMBOL(sock_alloc_send_skb);
1da177e4 2111
39771b12
WB
2112int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2113 struct sockcm_cookie *sockc)
2114{
3dd17e63
SHY
2115 u32 tsflags;
2116
39771b12
WB
2117 switch (cmsg->cmsg_type) {
2118 case SO_MARK:
2119 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2120 return -EPERM;
2121 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122 return -EINVAL;
2123 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2124 break;
3dd17e63
SHY
2125 case SO_TIMESTAMPING:
2126 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2127 return -EINVAL;
2128
2129 tsflags = *(u32 *)CMSG_DATA(cmsg);
2130 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2131 return -EINVAL;
2132
2133 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2134 sockc->tsflags |= tsflags;
2135 break;
80b14dee
RC
2136 case SCM_TXTIME:
2137 if (!sock_flag(sk, SOCK_TXTIME))
2138 return -EINVAL;
2139 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2140 return -EINVAL;
2141 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2142 break;
779f1ede
SHY
2143 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2144 case SCM_RIGHTS:
2145 case SCM_CREDENTIALS:
2146 break;
39771b12
WB
2147 default:
2148 return -EINVAL;
2149 }
2150 return 0;
2151}
2152EXPORT_SYMBOL(__sock_cmsg_send);
2153
f28ea365
EJ
2154int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2155 struct sockcm_cookie *sockc)
2156{
2157 struct cmsghdr *cmsg;
39771b12 2158 int ret;
f28ea365
EJ
2159
2160 for_each_cmsghdr(cmsg, msg) {
2161 if (!CMSG_OK(msg, cmsg))
2162 return -EINVAL;
2163 if (cmsg->cmsg_level != SOL_SOCKET)
2164 continue;
39771b12
WB
2165 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2166 if (ret)
2167 return ret;
f28ea365
EJ
2168 }
2169 return 0;
2170}
2171EXPORT_SYMBOL(sock_cmsg_send);
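/*
 * Example (hypothetical sendmsg path, not part of this file): a sketch of
 * how a protocol seeds a sockcm_cookie from socket defaults and then lets
 * sock_cmsg_send() override it from SOL_SOCKET control messages (SO_MARK,
 * SO_TIMESTAMPING, SCM_TXTIME) before building the packet.
 */
static int example_sendmsg_cookie(struct sock *sk, struct msghdr *msg,
				  struct sockcm_cookie *sockc)
{
	sockc->tsflags = sk->sk_tsflags;
	sockc->mark = sk->sk_mark;
	sockc->transmit_time = 0;

	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}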
2172
06044751
ED
2173static void sk_enter_memory_pressure(struct sock *sk)
2174{
2175 if (!sk->sk_prot->enter_memory_pressure)
2176 return;
2177
2178 sk->sk_prot->enter_memory_pressure(sk);
2179}
2180
2181static void sk_leave_memory_pressure(struct sock *sk)
2182{
2183 if (sk->sk_prot->leave_memory_pressure) {
2184 sk->sk_prot->leave_memory_pressure(sk);
2185 } else {
2186 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2187
2188 if (memory_pressure && *memory_pressure)
2189 *memory_pressure = 0;
2190 }
2191}
2192
5640f768
ED
2193/* On 32bit arches, an skb frag is limited to 2^15 */
2194#define SKB_FRAG_PAGE_ORDER get_order(32768)
2195
400dfd3a
ED
2196/**
2197 * skb_page_frag_refill - check that a page_frag contains enough room
2198 * @sz: minimum size of the fragment we want to get
2199 * @pfrag: pointer to page_frag
82d5e2b8 2200 * @gfp: priority for memory allocation
400dfd3a
ED
2201 *
2202 * Note: While this allocator tries to use high order pages, there is
2203 * no guarantee that allocations succeed. Therefore, @sz MUST be
2204 * less than or equal to PAGE_SIZE.
2205 */
d9b2938a 2206bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2207{
5640f768 2208 if (pfrag->page) {
fe896d18 2209 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2210 pfrag->offset = 0;
2211 return true;
2212 }
400dfd3a 2213 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2214 return true;
2215 put_page(pfrag->page);
2216 }
2217
d9b2938a
ED
2218 pfrag->offset = 0;
2219 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2220 /* Avoid direct reclaim but allow kswapd to wake */
2221 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2222 __GFP_COMP | __GFP_NOWARN |
2223 __GFP_NORETRY,
d9b2938a 2224 SKB_FRAG_PAGE_ORDER);
5640f768 2225 if (likely(pfrag->page)) {
d9b2938a 2226 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2227 return true;
2228 }
d9b2938a
ED
2229 }
2230 pfrag->page = alloc_page(gfp);
2231 if (likely(pfrag->page)) {
2232 pfrag->size = PAGE_SIZE;
2233 return true;
2234 }
400dfd3a
ED
2235 return false;
2236}
2237EXPORT_SYMBOL(skb_page_frag_refill);
2238
2239bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2240{
2241 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2242 return true;
2243
5640f768
ED
2244 sk_enter_memory_pressure(sk);
2245 sk_stream_moderate_sndbuf(sk);
2246 return false;
2247}
2248EXPORT_SYMBOL(sk_page_frag_refill);
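/*
 * Example (hypothetical send path, not part of this file): a caller
 * refills the per-socket page fragment and copies user data into it.
 * On failure, sk_page_frag_refill() has already entered memory pressure
 * and moderated sndbuf, so returning -ENOBUFS (or waiting) is all that is
 * left to do.  Assumes the frag page is not highmem (GFP_KERNEL-style
 * sk_allocation), so page_address() is valid.
 */
static int example_copy_to_frag(struct sock *sk, struct msghdr *msg, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOBUFS;
	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
			   len, &msg->msg_iter) != len)
		return -EFAULT;
	pfrag->offset += len;
	return len;
}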
2249
1da177e4 2250static void __lock_sock(struct sock *sk)
f39234d6
NK
2251 __releases(&sk->sk_lock.slock)
2252 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2253{
2254 DEFINE_WAIT(wait);
2255
e71a4783 2256 for (;;) {
1da177e4
LT
2257 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2258 TASK_UNINTERRUPTIBLE);
2259 spin_unlock_bh(&sk->sk_lock.slock);
2260 schedule();
2261 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2262 if (!sock_owned_by_user(sk))
1da177e4
LT
2263 break;
2264 }
2265 finish_wait(&sk->sk_lock.wq, &wait);
2266}
2267
8873c064 2268void __release_sock(struct sock *sk)
f39234d6
NK
2269 __releases(&sk->sk_lock.slock)
2270 __acquires(&sk->sk_lock.slock)
1da177e4 2271{
5413d1ba 2272 struct sk_buff *skb, *next;
1da177e4 2273
5413d1ba 2274 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2275 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2276
5413d1ba 2277 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2278
5413d1ba
ED
2279 do {
2280 next = skb->next;
e4cbb02a 2281 prefetch(next);
7fee226a 2282 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2283 skb_mark_not_on_list(skb);
c57943a1 2284 sk_backlog_rcv(sk, skb);
1da177e4 2285
5413d1ba 2286 cond_resched();
1da177e4
LT
2287
2288 skb = next;
2289 } while (skb != NULL);
2290
5413d1ba
ED
2291 spin_lock_bh(&sk->sk_lock.slock);
2292 }
8eae939f
ZY
2293
2294 /*
2295 * Doing the zeroing here guarantees we cannot loop forever
2296 * while a wild producer attempts to flood us.
2297 */
2298 sk->sk_backlog.len = 0;
1da177e4
LT
2299}
2300
d41a69f1
ED
2301void __sk_flush_backlog(struct sock *sk)
2302{
2303 spin_lock_bh(&sk->sk_lock.slock);
2304 __release_sock(sk);
2305 spin_unlock_bh(&sk->sk_lock.slock);
2306}
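/*
 * Example (hypothetical receive path, not part of this file): the pattern
 * that feeds the backlog drained above.  While the socket is owned by a
 * process, softirq context cannot run the protocol receive handler
 * directly and queues the skb instead; release_sock()/__release_sock()
 * replays it later.
 */
static int example_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		ret = sk_backlog_rcv(sk, skb);	/* process directly */
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		kfree_skb(skb);			/* backlog full, drop */
		ret = -ENOBUFS;
	}
	bh_unlock_sock(sk);
	return ret;
}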
2307
1da177e4
LT
2308/**
2309 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2310 * @sk: sock to wait on
2311 * @timeo: for how long
dfbafc99 2312 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2313 *
2314 * Now socket state including sk->sk_err is changed only under the lock,
2315 * hence we may omit checks after joining the wait queue.
2316 * We check the receive queue before schedule() only as an optimization;
2317 * it is very likely that release_sock() added new data.
2318 */
dfbafc99 2319int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2320{
d9dc8b0f 2321 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2322 int rc;
1da177e4 2323
d9dc8b0f 2324 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2325 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2326 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2327 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2328 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2329 return rc;
2330}
1da177e4
LT
2331EXPORT_SYMBOL(sk_wait_data);
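/*
 * Example (hypothetical datagram recvmsg, not part of this file): the
 * canonical sk_wait_data() loop, called with the socket lock held (the
 * helper drops and reacquires it around schedule()).  Passing NULL as the
 * last-seen skb means "wait until the receive queue tail changes".
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo || !sk_wait_data(sk, &timeo, NULL))
			return NULL;	/* timeout, signal or no progress */
	}
	return skb;
}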
2332
3ab224be 2333/**
f8c3bf00 2334 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2335 * @sk: socket
2336 * @size: memory size to allocate
f8c3bf00 2337 * @amt: pages to allocate
3ab224be
HA
2338 * @kind: allocation type
2339 *
f8c3bf00 2340 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2341 */
f8c3bf00 2342int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2343{
2344 struct proto *prot = sk->sk_prot;
f8c3bf00 2345 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2346 bool charged = true;
e805605c 2347
baac50bb 2348 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2349 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2350 goto suppress_allocation;
3ab224be
HA
2351
2352 /* Under limit. */
e805605c 2353 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2354 sk_leave_memory_pressure(sk);
3ab224be
HA
2355 return 1;
2356 }
2357
e805605c
JW
2358 /* Under pressure. */
2359 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2360 sk_enter_memory_pressure(sk);
3ab224be 2361
e805605c
JW
2362 /* Over hard limit. */
2363 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2364 goto suppress_allocation;
2365
2366 /* guarantee minimum buffer size under pressure */
2367 if (kind == SK_MEM_RECV) {
a3dcaf17 2368 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2369 return 1;
180d8cd9 2370
3ab224be 2371 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2372 int wmem0 = sk_get_wmem0(sk, prot);
2373
3ab224be 2374 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2375 if (sk->sk_wmem_queued < wmem0)
3ab224be 2376 return 1;
a3dcaf17 2377 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2378 return 1;
a3dcaf17 2379 }
3ab224be
HA
2380 }
2381
180d8cd9 2382 if (sk_has_memory_pressure(sk)) {
1748376b
ED
2383 int alloc;
2384
180d8cd9 2385 if (!sk_under_memory_pressure(sk))
1748376b 2386 return 1;
180d8cd9
GC
2387 alloc = sk_sockets_allocated_read_positive(sk);
2388 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2389 sk_mem_pages(sk->sk_wmem_queued +
2390 atomic_read(&sk->sk_rmem_alloc) +
2391 sk->sk_forward_alloc))
2392 return 1;
2393 }
2394
2395suppress_allocation:
2396
2397 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2398 sk_stream_moderate_sndbuf(sk);
2399
2400 /* Fail only if socket is _under_ its sndbuf.
2401 * In this case we cannot block, so we have to fail.
2402 */
2403 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2404 return 1;
2405 }
2406
d6f19938
YS
2407 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2408 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2409
0e90b31f 2410 sk_memory_allocated_sub(sk, amt);
180d8cd9 2411
baac50bb
JW
2412 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2413 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2414
3ab224be
HA
2415 return 0;
2416}
f8c3bf00
PA
2417EXPORT_SYMBOL(__sk_mem_raise_allocated);
2418
2419/**
2420 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2421 * @sk: socket
2422 * @size: memory size to allocate
2423 * @kind: allocation type
2424 *
2425 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2426 * rmem allocation. This function assumes that protocols which have
2427 * memory_pressure use sk_wmem_queued as write buffer accounting.
2428 */
2429int __sk_mem_schedule(struct sock *sk, int size, int kind)
2430{
2431 int ret, amt = sk_mem_pages(size);
2432
2433 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2434 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2435 if (!ret)
2436 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2437 return ret;
2438}
3ab224be
HA
2439EXPORT_SYMBOL(__sk_mem_schedule);
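/*
 * Example (hypothetical rmem charge, not part of this file): how a
 * protocol typically consumes this API through sk_rmem_schedule() before
 * queueing an skb.  skb_set_owner_r() then charges sk_rmem_alloc and
 * installs sock_rfree(), which returns the quanta via sk_mem_uncharge()
 * when the skb is freed.  Caller holds the socket lock.
 */
static int example_queue_rx(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over memory limits */

	skb_set_owner_r(skb, sk);	/* charge rmem, destructor = sock_rfree */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}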
2440
2441/**
f8c3bf00 2442 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2443 * @sk: socket
f8c3bf00
PA
2444 * @amount: number of quanta
2445 *
2446 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2447 */
f8c3bf00 2448void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2449{
1a24e04e 2450 sk_memory_allocated_sub(sk, amount);
3ab224be 2451
baac50bb
JW
2452 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2453 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2454
180d8cd9
GC
2455 if (sk_under_memory_pressure(sk) &&
2456 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2457 sk_leave_memory_pressure(sk);
3ab224be 2458}
f8c3bf00
PA
2459EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2460
2461/**
2462 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2463 * @sk: socket
2464 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2465 */
2466void __sk_mem_reclaim(struct sock *sk, int amount)
2467{
2468 amount >>= SK_MEM_QUANTUM_SHIFT;
2469 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2470 __sk_mem_reduce_allocated(sk, amount);
2471}
3ab224be
HA
2472EXPORT_SYMBOL(__sk_mem_reclaim);
2473
627d2d6b 2474int sk_set_peek_off(struct sock *sk, int val)
2475{
627d2d6b 2476 sk->sk_peek_off = val;
2477 return 0;
2478}
2479EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2480
1da177e4
LT
2481/*
2482 * Set of default routines for initialising struct proto_ops when
2483 * the protocol does not support a particular function. In certain
2484 * cases where it makes no sense for a protocol to have a "do nothing"
2485 * function, some default processing is provided.
2486 */
2487
2488int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2489{
2490 return -EOPNOTSUPP;
2491}
2a91525c 2492EXPORT_SYMBOL(sock_no_bind);
1da177e4 2493
4ec93edb 2494int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2495 int len, int flags)
2496{
2497 return -EOPNOTSUPP;
2498}
2a91525c 2499EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2500
2501int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2502{
2503 return -EOPNOTSUPP;
2504}
2a91525c 2505EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2506
cdfbabfb
DH
2507int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2508 bool kern)
1da177e4
LT
2509{
2510 return -EOPNOTSUPP;
2511}
2a91525c 2512EXPORT_SYMBOL(sock_no_accept);
1da177e4 2513
4ec93edb 2514int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2515 int peer)
1da177e4
LT
2516{
2517 return -EOPNOTSUPP;
2518}
2a91525c 2519EXPORT_SYMBOL(sock_no_getname);
1da177e4 2520
1da177e4
LT
2521int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2522{
2523 return -EOPNOTSUPP;
2524}
2a91525c 2525EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2526
2527int sock_no_listen(struct socket *sock, int backlog)
2528{
2529 return -EOPNOTSUPP;
2530}
2a91525c 2531EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2532
2533int sock_no_shutdown(struct socket *sock, int how)
2534{
2535 return -EOPNOTSUPP;
2536}
2a91525c 2537EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2538
2539int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2540 char __user *optval, unsigned int optlen)
1da177e4
LT
2541{
2542 return -EOPNOTSUPP;
2543}
2a91525c 2544EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2545
2546int sock_no_getsockopt(struct socket *sock, int level, int optname,
2547 char __user *optval, int __user *optlen)
2548{
2549 return -EOPNOTSUPP;
2550}
2a91525c 2551EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2552
1b784140 2553int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2554{
2555 return -EOPNOTSUPP;
2556}
2a91525c 2557EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2558
306b13eb
TH
2559int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2560{
2561 return -EOPNOTSUPP;
2562}
2563EXPORT_SYMBOL(sock_no_sendmsg_locked);
2564
1b784140
YX
2565int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2566 int flags)
1da177e4
LT
2567{
2568 return -EOPNOTSUPP;
2569}
2a91525c 2570EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2571
2572int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2573{
2574 /* Mirror missing mmap method error code */
2575 return -ENODEV;
2576}
2a91525c 2577EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2578
2579ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2580{
2581 ssize_t res;
2582 struct msghdr msg = {.msg_flags = flags};
2583 struct kvec iov;
2584 char *kaddr = kmap(page);
2585 iov.iov_base = kaddr + offset;
2586 iov.iov_len = size;
2587 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2588 kunmap(page);
2589 return res;
2590}
2a91525c 2591EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2592
306b13eb
TH
2593ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2594 int offset, size_t size, int flags)
2595{
2596 ssize_t res;
2597 struct msghdr msg = {.msg_flags = flags};
2598 struct kvec iov;
2599 char *kaddr = kmap(page);
2600
2601 iov.iov_base = kaddr + offset;
2602 iov.iov_len = size;
2603 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2604 kunmap(page);
2605 return res;
2606}
2607EXPORT_SYMBOL(sock_no_sendpage_locked);
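/*
 * Example (hypothetical address family, not part of this file): a sketch
 * of a proto_ops table that plugs the sock_no_*() stubs into every
 * operation the protocol does not implement, so userspace gets
 * -EOPNOTSUPP instead of a NULL dereference.  .release and .poll are
 * omitted here because they would be protocol-specific.
 */
static const struct proto_ops example_stub_ops = {
	.family		= AF_UNSPEC,		/* placeholder family */
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};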
2608
1da177e4
LT
2609/*
2610 * Default Socket Callbacks
2611 */
2612
2613static void sock_def_wakeup(struct sock *sk)
2614{
43815482
ED
2615 struct socket_wq *wq;
2616
2617 rcu_read_lock();
2618 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2619 if (skwq_has_sleeper(wq))
43815482
ED
2620 wake_up_interruptible_all(&wq->wait);
2621 rcu_read_unlock();
1da177e4
LT
2622}
2623
2624static void sock_def_error_report(struct sock *sk)
2625{
43815482
ED
2626 struct socket_wq *wq;
2627
2628 rcu_read_lock();
2629 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2630 if (skwq_has_sleeper(wq))
a9a08845 2631 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2632 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2633 rcu_read_unlock();
1da177e4
LT
2634}
2635
676d2369 2636static void sock_def_readable(struct sock *sk)
1da177e4 2637{
43815482
ED
2638 struct socket_wq *wq;
2639
2640 rcu_read_lock();
2641 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2642 if (skwq_has_sleeper(wq))
a9a08845
LT
2643 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2644 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2645 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2646 rcu_read_unlock();
1da177e4
LT
2647}
2648
2649static void sock_def_write_space(struct sock *sk)
2650{
43815482
ED
2651 struct socket_wq *wq;
2652
2653 rcu_read_lock();
1da177e4
LT
2654
2655 /* Do not wake up a writer until he can make "significant"
2656 * progress. --DaveM
2657 */
14afee4b 2658 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2659 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2660 if (skwq_has_sleeper(wq))
a9a08845
LT
2661 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2662 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2663
2664 /* Should agree with poll, otherwise some programs break */
2665 if (sock_writeable(sk))
8d8ad9d7 2666 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2667 }
2668
43815482 2669 rcu_read_unlock();
1da177e4
LT
2670}
2671
2672static void sock_def_destruct(struct sock *sk)
2673{
1da177e4
LT
2674}
2675
2676void sk_send_sigurg(struct sock *sk)
2677{
2678 if (sk->sk_socket && sk->sk_socket->file)
2679 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2680 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2681}
2a91525c 2682EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2683
2684void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2685 unsigned long expires)
2686{
2687 if (!mod_timer(timer, expires))
2688 sock_hold(sk);
2689}
1da177e4
LT
2690EXPORT_SYMBOL(sk_reset_timer);
2691
2692void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2693{
25cc4ae9 2694 if (del_timer(timer))
1da177e4
LT
2695 __sock_put(sk);
2696}
1da177e4
LT
2697EXPORT_SYMBOL(sk_stop_timer);
2698
2699void sock_init_data(struct socket *sock, struct sock *sk)
2700{
581319c5 2701 sk_init_common(sk);
1da177e4
LT
2702 sk->sk_send_head = NULL;
2703
99767f27 2704 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2705
1da177e4
LT
2706 sk->sk_allocation = GFP_KERNEL;
2707 sk->sk_rcvbuf = sysctl_rmem_default;
2708 sk->sk_sndbuf = sysctl_wmem_default;
2709 sk->sk_state = TCP_CLOSE;
972692e0 2710 sk_set_socket(sk, sock);
1da177e4
LT
2711
2712 sock_set_flag(sk, SOCK_ZAPPED);
2713
e71a4783 2714 if (sock) {
1da177e4 2715 sk->sk_type = sock->type;
43815482 2716 sk->sk_wq = sock->wq;
1da177e4 2717 sock->sk = sk;
86741ec2
LC
2718 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2719 } else {
43815482 2720 sk->sk_wq = NULL;
86741ec2
LC
2721 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2722 }
1da177e4 2723
1da177e4 2724 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2725 if (sk->sk_kern_sock)
2726 lockdep_set_class_and_name(
2727 &sk->sk_callback_lock,
2728 af_kern_callback_keys + sk->sk_family,
2729 af_family_kern_clock_key_strings[sk->sk_family]);
2730 else
2731 lockdep_set_class_and_name(
2732 &sk->sk_callback_lock,
443aef0e
PZ
2733 af_callback_keys + sk->sk_family,
2734 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2735
2736 sk->sk_state_change = sock_def_wakeup;
2737 sk->sk_data_ready = sock_def_readable;
2738 sk->sk_write_space = sock_def_write_space;
2739 sk->sk_error_report = sock_def_error_report;
2740 sk->sk_destruct = sock_def_destruct;
2741
5640f768
ED
2742 sk->sk_frag.page = NULL;
2743 sk->sk_frag.offset = 0;
ef64a54f 2744 sk->sk_peek_off = -1;
1da177e4 2745
109f6e39
EB
2746 sk->sk_peer_pid = NULL;
2747 sk->sk_peer_cred = NULL;
1da177e4
LT
2748 sk->sk_write_pending = 0;
2749 sk->sk_rcvlowat = 1;
2750 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2751 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2752
6c7c98ba 2753 sk->sk_stamp = SK_DEFAULT_STAMP;
52267790 2754 atomic_set(&sk->sk_zckey, 0);
1da177e4 2755
e0d1095a 2756#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2757 sk->sk_napi_id = 0;
64b0dc51 2758 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2759#endif
2760
76a9ebe8
ED
2761 sk->sk_max_pacing_rate = ~0UL;
2762 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2763 sk->sk_pacing_shift = 10;
70da268b 2764 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2765
2766 sk_rx_queue_clear(sk);
4dc6dc71
ED
2767 /*
2768 * Before updating sk_refcnt, we must commit prior changes to memory
2769 * (Documentation/RCU/rculist_nulls.txt for details)
2770 */
2771 smp_wmb();
41c6d650 2772 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2773 atomic_set(&sk->sk_drops, 0);
1da177e4 2774}
2a91525c 2775EXPORT_SYMBOL(sock_init_data);
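/*
 * Example (hypothetical protocol init, not part of this file): after
 * sock_init_data() installs the sock_def_*() callbacks above, a protocol
 * usually overrides just the ones it cares about, chaining to the saved
 * default.  The single global saved pointer is a sketch simplification;
 * real code would stash it per socket.
 */
static void (*example_orig_data_ready)(struct sock *sk);

static void example_data_ready(struct sock *sk)
{
	/* protocol-specific notification would go here, e.g. queue_work() */
	example_orig_data_ready(sk);	/* then the default wakeup */
}

static void example_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);
	example_orig_data_ready = sk->sk_data_ready;
	sk->sk_data_ready = example_data_ready;
}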
1da177e4 2776
b5606c2d 2777void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2778{
2779 might_sleep();
a5b5bb9a 2780 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2781 if (sk->sk_lock.owned)
1da177e4 2782 __lock_sock(sk);
d2e9117c 2783 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2784 spin_unlock(&sk->sk_lock.slock);
2785 /*
2786 * The sk_lock has mutex_lock() semantics here:
2787 */
fcc70d5f 2788 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2789 local_bh_enable();
1da177e4 2790}
fcc70d5f 2791EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2792
b5606c2d 2793void release_sock(struct sock *sk)
1da177e4 2794{
a5b5bb9a 2795 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2796 if (sk->sk_backlog.tail)
2797 __release_sock(sk);
46d3ceab 2798
c3f9b018
ED
2799 /* Warning : release_cb() might need to release sk ownership,
2800 * ie call sock_release_ownership(sk) before us.
2801 */
46d3ceab
ED
2802 if (sk->sk_prot->release_cb)
2803 sk->sk_prot->release_cb(sk);
2804
c3f9b018 2805 sock_release_ownership(sk);
a5b5bb9a
IM
2806 if (waitqueue_active(&sk->sk_lock.wq))
2807 wake_up(&sk->sk_lock.wq);
2808 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2809}
2810EXPORT_SYMBOL(release_sock);
2811
8a74ad60
ED
2812/**
2813 * lock_sock_fast - fast version of lock_sock
2814 * @sk: socket
2815 *
2816 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2817 * Returns false if the fast path is taken:
2818 *
8a74ad60 2819 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2820 *
2821 * Returns true if the slow path is taken:
2822 *
8a74ad60
ED
2823 * sk_lock.slock unlocked, owned = 1, BH enabled
2824 */
2825bool lock_sock_fast(struct sock *sk)
2826{
2827 might_sleep();
2828 spin_lock_bh(&sk->sk_lock.slock);
2829
2830 if (!sk->sk_lock.owned)
2831 /*
2832 * Note: We must disable BH
2833 */
2834 return false;
2835
2836 __lock_sock(sk);
2837 sk->sk_lock.owned = 1;
2838 spin_unlock(&sk->sk_lock.slock);
2839 /*
2840 * The sk_lock has mutex_lock() semantics here:
2841 */
2842 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2843 local_bh_enable();
2844 return true;
2845}
2846EXPORT_SYMBOL(lock_sock_fast);
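/*
 * Example (hypothetical caller, not part of this file): lock_sock_fast()
 * must always be paired with unlock_sock_fast(), forwarding the returned
 * slow-path flag so the matching unlock (spin_unlock_bh() vs
 * release_sock()) runs.
 */
static int example_peek_len(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
	int len = skb ? skb->len : 0;

	unlock_sock_fast(sk, slow);
	return len;
}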
2847
1da177e4 2848int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2849{
b7aa0bf7 2850 struct timeval tv;
9dae3497
YS
2851
2852 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
b7aa0bf7
ED
2853 tv = ktime_to_timeval(sk->sk_stamp);
2854 if (tv.tv_sec == -1)
1da177e4 2855 return -ENOENT;
b7aa0bf7
ED
2856 if (tv.tv_sec == 0) {
2857 sk->sk_stamp = ktime_get_real();
2858 tv = ktime_to_timeval(sk->sk_stamp);
2859 }
2860 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2861}
1da177e4
LT
2862EXPORT_SYMBOL(sock_get_timestamp);
2863
ae40eb1e
ED
2864int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2865{
2866 struct timespec ts;
9dae3497
YS
2867
2868 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ae40eb1e
ED
2869 ts = ktime_to_timespec(sk->sk_stamp);
2870 if (ts.tv_sec == -1)
2871 return -ENOENT;
2872 if (ts.tv_sec == 0) {
2873 sk->sk_stamp = ktime_get_real();
2874 ts = ktime_to_timespec(sk->sk_stamp);
2875 }
2876 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2877}
2878EXPORT_SYMBOL(sock_get_timestampns);
2879
20d49473 2880void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2881{
20d49473 2882 if (!sock_flag(sk, flag)) {
08e29af3
ED
2883 unsigned long previous_flags = sk->sk_flags;
2884
20d49473
PO
2885 sock_set_flag(sk, flag);
2886 /*
2887 * We just set one of the two flags which require net
2888 * time stamping, but time stamping might already have been
2889 * on because of the other one.
2890 */
080a270f
HFS
2891 if (sock_needs_netstamp(sk) &&
2892 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2893 net_enable_timestamp();
1da177e4
LT
2894 }
2895}
1da177e4 2896
cb820f8e
RC
2897int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2898 int level, int type)
2899{
2900 struct sock_exterr_skb *serr;
364a9e93 2901 struct sk_buff *skb;
cb820f8e
RC
2902 int copied, err;
2903
2904 err = -EAGAIN;
364a9e93 2905 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
2906 if (skb == NULL)
2907 goto out;
2908
2909 copied = skb->len;
2910 if (copied > len) {
2911 msg->msg_flags |= MSG_TRUNC;
2912 copied = len;
2913 }
51f3d02b 2914 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
2915 if (err)
2916 goto out_free_skb;
2917
2918 sock_recv_timestamp(msg, sk, skb);
2919
2920 serr = SKB_EXT_ERR(skb);
2921 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2922
2923 msg->msg_flags |= MSG_ERRQUEUE;
2924 err = copied;
2925
cb820f8e
RC
2926out_free_skb:
2927 kfree_skb(skb);
2928out:
2929 return err;
2930}
2931EXPORT_SYMBOL(sock_recv_errqueue);
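/*
 * Example (hypothetical recvmsg front end, not part of this file):
 * protocols without their own error-queue handling can branch straight
 * to sock_recv_errqueue() when MSG_ERRQUEUE is set.  The level/type pair
 * below is a placeholder; a real protocol passes its own cmsg namespace.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_SOCKET,
					  SCM_TIMESTAMP);
	/* ... normal receive path ... */
	return -EOPNOTSUPP;
}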
2932
1da177e4
LT
2933/*
2934 * Get a socket option on a socket.
2935 *
2936 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2937 * asynchronous errors should be reported by getsockopt. We assume
2938 * this means if you specify SO_ERROR (otherwise what's the point of it).
2939 */
2940int sock_common_getsockopt(struct socket *sock, int level, int optname,
2941 char __user *optval, int __user *optlen)
2942{
2943 struct sock *sk = sock->sk;
2944
2945 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2946}
1da177e4
LT
2947EXPORT_SYMBOL(sock_common_getsockopt);
2948
3fdadf7d 2949#ifdef CONFIG_COMPAT
543d9cfe
ACM
2950int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2951 char __user *optval, int __user *optlen)
3fdadf7d
DM
2952{
2953 struct sock *sk = sock->sk;
2954
1e51f951 2955 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
2956 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2957 optval, optlen);
3fdadf7d
DM
2958 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2959}
2960EXPORT_SYMBOL(compat_sock_common_getsockopt);
2961#endif
2962
1b784140
YX
2963int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2964 int flags)
1da177e4
LT
2965{
2966 struct sock *sk = sock->sk;
2967 int addr_len = 0;
2968 int err;
2969
1b784140 2970 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
2971 flags & ~MSG_DONTWAIT, &addr_len);
2972 if (err >= 0)
2973 msg->msg_namelen = addr_len;
2974 return err;
2975}
1da177e4
LT
2976EXPORT_SYMBOL(sock_common_recvmsg);
2977
2978/*
2979 * Set socket options on an inet socket.
2980 */
2981int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2982 char __user *optval, unsigned int optlen)
1da177e4
LT
2983{
2984 struct sock *sk = sock->sk;
2985
2986 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2987}
1da177e4
LT
2988EXPORT_SYMBOL(sock_common_setsockopt);
2989
3fdadf7d 2990#ifdef CONFIG_COMPAT
543d9cfe 2991int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2992 char __user *optval, unsigned int optlen)
3fdadf7d
DM
2993{
2994 struct sock *sk = sock->sk;
2995
543d9cfe
ACM
2996 if (sk->sk_prot->compat_setsockopt != NULL)
2997 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2998 optval, optlen);
3fdadf7d
DM
2999 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3000}
3001EXPORT_SYMBOL(compat_sock_common_setsockopt);
3002#endif
3003
1da177e4
LT
3004void sk_common_release(struct sock *sk)
3005{
3006 if (sk->sk_prot->destroy)
3007 sk->sk_prot->destroy(sk);
3008
3009 /*
3010 * Observation: when sock_common_release is called, processes have
3011 * no access to the socket, but the network stack still does.
3012 * Step one, detach it from networking:
3013 *
3014 * A. Remove from hash tables.
3015 */
3016
3017 sk->sk_prot->unhash(sk);
3018
3019 /*
3020 * At this point the socket cannot receive new packets, but it is possible
3021 * that some packets are in flight because some CPU is running the receiver
3022 * and did a hash table lookup before we unhashed the socket. They will
3023 * reach the receive queue and be purged by the socket destructor.
3024 *
3025 * Also we still have packets pending on the receive queue and probably
3026 * our own packets waiting in device queues. sock_destroy will drain the
3027 * receive queue, but transmitted packets will delay socket destruction
3028 * until the last reference is released.
3029 */
3030
3031 sock_orphan(sk);
3032
3033 xfrm_sk_free_policy(sk);
3034
e6848976 3035 sk_refcnt_debug_release(sk);
5640f768 3036
1da177e4
LT
3037 sock_put(sk);
3038}
1da177e4
LT
3039EXPORT_SYMBOL(sk_common_release);
3040
a2d133b1
JH
3041void sk_get_meminfo(const struct sock *sk, u32 *mem)
3042{
3043 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3044
3045 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3046 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3047 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3048 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3049 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3050 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3051 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3052 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3053 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3054}
3055
13ff3d6f
PE
3056#ifdef CONFIG_PROC_FS
3057#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3058struct prot_inuse {
3059 int val[PROTO_INUSE_NR];
3060};
13ff3d6f
PE
3061
3062static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3063
70ee1159
PE
3064void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3065{
08fc7f81 3066 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3067}
3068EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3069
3070int sock_prot_inuse_get(struct net *net, struct proto *prot)
3071{
3072 int cpu, idx = prot->inuse_idx;
3073 int res = 0;
3074
3075 for_each_possible_cpu(cpu)
08fc7f81 3076 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3077
3078 return res >= 0 ? res : 0;
3079}
3080EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3081
648845ab
TZ
3082static void sock_inuse_add(struct net *net, int val)
3083{
3084 this_cpu_add(*net->core.sock_inuse, val);
3085}
3086
3087int sock_inuse_get(struct net *net)
3088{
3089 int cpu, res = 0;
3090
3091 for_each_possible_cpu(cpu)
3092 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3093
3094 return res;
3095}
3096
3097EXPORT_SYMBOL_GPL(sock_inuse_get);
3098
2c8c1e72 3099static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3100{
08fc7f81 3101 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3102 if (net->core.prot_inuse == NULL)
3103 return -ENOMEM;
3104
3105 net->core.sock_inuse = alloc_percpu(int);
3106 if (net->core.sock_inuse == NULL)
3107 goto out;
3108
3109 return 0;
3110
3111out:
3112 free_percpu(net->core.prot_inuse);
3113 return -ENOMEM;
70ee1159
PE
3114}
3115
2c8c1e72 3116static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3117{
08fc7f81 3118 free_percpu(net->core.prot_inuse);
648845ab 3119 free_percpu(net->core.sock_inuse);
70ee1159
PE
3120}
3121
3122static struct pernet_operations net_inuse_ops = {
3123 .init = sock_inuse_init_net,
3124 .exit = sock_inuse_exit_net,
3125};
3126
3127static __init int net_inuse_init(void)
3128{
3129 if (register_pernet_subsys(&net_inuse_ops))
3130 panic("Cannot initialize net inuse counters");
3131
3132 return 0;
3133}
3134
3135core_initcall(net_inuse_init);
13ff3d6f
PE
3136
3137static void assign_proto_idx(struct proto *prot)
3138{
3139 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3140
3141 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3142 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3143 return;
3144 }
3145
3146 set_bit(prot->inuse_idx, proto_inuse_idx);
3147}
3148
3149static void release_proto_idx(struct proto *prot)
3150{
3151 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3152 clear_bit(prot->inuse_idx, proto_inuse_idx);
3153}
3154#else
3155static inline void assign_proto_idx(struct proto *prot)
3156{
3157}
3158
3159static inline void release_proto_idx(struct proto *prot)
3160{
3161}
648845ab
TZ
3162
3163static void sock_inuse_add(struct net *net, int val)
3164{
3165}
13ff3d6f
PE
3166#endif
3167
0159dfd3
ED
3168static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3169{
3170 if (!rsk_prot)
3171 return;
3172 kfree(rsk_prot->slab_name);
3173 rsk_prot->slab_name = NULL;
adf78eda
JL
3174 kmem_cache_destroy(rsk_prot->slab);
3175 rsk_prot->slab = NULL;
0159dfd3
ED
3176}
3177
3178static int req_prot_init(const struct proto *prot)
3179{
3180 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3181
3182 if (!rsk_prot)
3183 return 0;
3184
3185 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3186 prot->name);
3187 if (!rsk_prot->slab_name)
3188 return -ENOMEM;
3189
3190 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3191 rsk_prot->obj_size, 0,
e699e2c6
SB
3192 SLAB_ACCOUNT | prot->slab_flags,
3193 NULL);
0159dfd3
ED
3194
3195 if (!rsk_prot->slab) {
3196 pr_crit("%s: Can't create request sock SLAB cache!\n",
3197 prot->name);
3198 return -ENOMEM;
3199 }
3200 return 0;
3201}
3202
b733c007
PE
3203int proto_register(struct proto *prot, int alloc_slab)
3204{
1da177e4 3205 if (alloc_slab) {
30c2c9f1
DW
3206 prot->slab = kmem_cache_create_usercopy(prot->name,
3207 prot->obj_size, 0,
e699e2c6
SB
3208 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3209 prot->slab_flags,
289a4860 3210 prot->useroffset, prot->usersize,
271b72c7 3211 NULL);
1da177e4
LT
3212
3213 if (prot->slab == NULL) {
e005d193
JP
3214 pr_crit("%s: Can't create sock SLAB cache!\n",
3215 prot->name);
60e7663d 3216 goto out;
1da177e4 3217 }
2e6599cb 3218
0159dfd3
ED
3219 if (req_prot_init(prot))
3220 goto out_free_request_sock_slab;
8feaf0c0 3221
6d6ee43e 3222 if (prot->twsk_prot != NULL) {
faf23422 3223 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3224
7e56b5d6 3225 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3226 goto out_free_request_sock_slab;
3227
6d6ee43e 3228 prot->twsk_prot->twsk_slab =
7e56b5d6 3229 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3230 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3231 0,
e699e2c6 3232 SLAB_ACCOUNT |
52db70dc 3233 prot->slab_flags,
20c2df83 3234 NULL);
6d6ee43e 3235 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3236 goto out_free_timewait_sock_slab_name;
3237 }
1da177e4
LT
3238 }
3239
36b77a52 3240 mutex_lock(&proto_list_mutex);
1da177e4 3241 list_add(&prot->node, &proto_list);
13ff3d6f 3242 assign_proto_idx(prot);
36b77a52 3243 mutex_unlock(&proto_list_mutex);
b733c007
PE
3244 return 0;
3245
8feaf0c0 3246out_free_timewait_sock_slab_name:
7e56b5d6 3247 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3248out_free_request_sock_slab:
0159dfd3
ED
3249 req_prot_cleanup(prot->rsk_prot);
3250
2e6599cb
ACM
3251 kmem_cache_destroy(prot->slab);
3252 prot->slab = NULL;
b733c007
PE
3253out:
3254 return -ENOBUFS;
1da177e4 3255}
1da177e4
LT
3256EXPORT_SYMBOL(proto_register);
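/*
 * Example (hypothetical module, not part of this file): the minimal
 * proto_register() round trip.  With alloc_slab == 1 the core creates a
 * kmem cache sized by .obj_size (optionally user-copy whitelisted via
 * .useroffset/.usersize); proto_unregister() tears it down on exit.
 */
struct example_sock {
	struct sock sk;		/* must be first */
	u32 example_cookie;	/* protocol-private state */
};

static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}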
3257
3258void proto_unregister(struct proto *prot)
3259{
36b77a52 3260 mutex_lock(&proto_list_mutex);
13ff3d6f 3261 release_proto_idx(prot);
0a3f4358 3262 list_del(&prot->node);
36b77a52 3263 mutex_unlock(&proto_list_mutex);
1da177e4 3264
adf78eda
JL
3265 kmem_cache_destroy(prot->slab);
3266 prot->slab = NULL;
1da177e4 3267
0159dfd3 3268 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3269
6d6ee43e 3270 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3271 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3272 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3273 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3274 }
1da177e4 3275}
1da177e4
LT
3276EXPORT_SYMBOL(proto_unregister);
3277
bf2ae2e4
XL
3278int sock_load_diag_module(int family, int protocol)
3279{
3280 if (!protocol) {
3281 if (!sock_is_registered(family))
3282 return -ENOENT;
3283
3284 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3285 NETLINK_SOCK_DIAG, family);
3286 }
3287
3288#ifdef CONFIG_INET
3289 if (family == AF_INET &&
c34c1287 3290 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3291 !rcu_access_pointer(inet_protos[protocol]))
3292 return -ENOENT;
3293#endif
3294
3295 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3296 NETLINK_SOCK_DIAG, family, protocol);
3297}
3298EXPORT_SYMBOL(sock_load_diag_module);
3299
1da177e4 3300#ifdef CONFIG_PROC_FS
1da177e4 3301static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3302 __acquires(proto_list_mutex)
1da177e4 3303{
36b77a52 3304 mutex_lock(&proto_list_mutex);
60f0438a 3305 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3306}
3307
3308static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3309{
60f0438a 3310 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3311}
3312
3313static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3314 __releases(proto_list_mutex)
1da177e4 3315{
36b77a52 3316 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3317}
3318
3319static char proto_method_implemented(const void *method)
3320{
3321 return method == NULL ? 'n' : 'y';
3322}
180d8cd9
GC
3323static long sock_prot_memory_allocated(struct proto *proto)
3324{
cb75a36c 3325 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3326}
3327
3328static char *sock_prot_memory_pressure(struct proto *proto)
3329{
3330 return proto->memory_pressure != NULL ?
3331 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3332}
1da177e4
LT
3333
3334static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3335{
180d8cd9 3336
8d987e5c 3337 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3338 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3339 proto->name,
3340 proto->obj_size,
14e943db 3341 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3342 sock_prot_memory_allocated(proto),
3343 sock_prot_memory_pressure(proto),
1da177e4
LT
3344 proto->max_header,
3345 proto->slab == NULL ? "no" : "yes",
3346 module_name(proto->owner),
3347 proto_method_implemented(proto->close),
3348 proto_method_implemented(proto->connect),
3349 proto_method_implemented(proto->disconnect),
3350 proto_method_implemented(proto->accept),
3351 proto_method_implemented(proto->ioctl),
3352 proto_method_implemented(proto->init),
3353 proto_method_implemented(proto->destroy),
3354 proto_method_implemented(proto->shutdown),
3355 proto_method_implemented(proto->setsockopt),
3356 proto_method_implemented(proto->getsockopt),
3357 proto_method_implemented(proto->sendmsg),
3358 proto_method_implemented(proto->recvmsg),
3359 proto_method_implemented(proto->sendpage),
3360 proto_method_implemented(proto->bind),
3361 proto_method_implemented(proto->backlog_rcv),
3362 proto_method_implemented(proto->hash),
3363 proto_method_implemented(proto->unhash),
3364 proto_method_implemented(proto->get_port),
3365 proto_method_implemented(proto->enter_memory_pressure));
3366}
3367
3368static int proto_seq_show(struct seq_file *seq, void *v)
3369{
60f0438a 3370 if (v == &proto_list)
1da177e4
LT
3371 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3372 "protocol",
3373 "size",
3374 "sockets",
3375 "memory",
3376 "press",
3377 "maxhdr",
3378 "slab",
3379 "module",
3380 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3381 else
60f0438a 3382 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3383 return 0;
3384}
3385
f690808e 3386static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3387 .start = proto_seq_start,
3388 .next = proto_seq_next,
3389 .stop = proto_seq_stop,
3390 .show = proto_seq_show,
3391};
3392
14e943db
ED
3393static __net_init int proto_init_net(struct net *net)
3394{
c3506372
CH
3395 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3396 sizeof(struct seq_net_private)))
14e943db
ED
3397 return -ENOMEM;
3398
3399 return 0;
3400}
3401
3402static __net_exit void proto_exit_net(struct net *net)
3403{
ece31ffd 3404 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3405}
3406
3407
3408static __net_initdata struct pernet_operations proto_net_ops = {
3409 .init = proto_init_net,
3410 .exit = proto_exit_net,
1da177e4
LT
3411};
3412
3413static int __init proto_init(void)
3414{
14e943db 3415 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3416}
3417
3418subsys_initcall(proto_init);
3419
3420#endif /* PROC_FS */
7db6b048
SS
3421
3422#ifdef CONFIG_NET_RX_BUSY_POLL
3423bool sk_busy_loop_end(void *p, unsigned long start_time)
3424{
3425 struct sock *sk = p;
3426
3427 return !skb_queue_empty(&sk->sk_receive_queue) ||
3428 sk_busy_loop_timeout(sk, start_time);
3429}
3430EXPORT_SYMBOL(sk_busy_loop_end);
3431#endif /* CONFIG_NET_RX_BUSY_POLL */