/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x) \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

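/*
 * Usage sketch (an assumed caller pattern, e.g. a swap-over-NFS
 * transport; nothing in this file mandates it):
 *
 *	sk_set_memalloc(sk);	<- socket may dip into emergency reserves
 *	(issue the I/O that memory reclaim is waiting on)
 *	sk_clear_memalloc(sk);	<- give reserves back, obey rmem limits
 */
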
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int size;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		size = sizeof(old_tv);
	} else {
		*(struct __kernel_sock_timeval *)optval = tv;
		size = sizeof(tv);
	}

	return size;
}

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_user(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_user(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

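/*
 * Worked example for the conversion above (assuming HZ == 1000): a
 * user timeout of { .tv_sec = 2, .tv_usec = 500000 } yields
 *	2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies,
 * while { 0, 0 } selects MAX_SCHEDULE_TIMEOUT, i.e. "wait forever".
 */
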
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

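/*
 * Caller sketch (assumed; real protocol .rcv handlers differ in
 * detail): a datagram protocol typically finishes delivery with
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *
 * so filter rejections and rcvbuf pressure show up as drops counted
 * in sk_drops rather than as backpressure on the softirq path.
 */
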
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

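/*
 * Locking recap of the function above (a restatement, not new
 * policy): softirq context takes the bh half of the socket lock; if a
 * process currently owns the socket (sock_owned_by_user()), the skb
 * is parked on sk_backlog and replayed by release_sock() in process
 * context, so sk_backlog_rcv() never races with a syscall holding
 * the lock.
 */
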
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	ret = sock_setbindtodevice_locked(sk, index);
	release_sock(sk);

out:
#endif

	return ret;
}

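/*
 * Userspace sketch (assumed usage; the kernel entry point is the
 * setsockopt(2) syscall):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *
 * This requires CAP_NET_RAW in the socket's network namespace;
 * passing "" (or optlen 0) unbinds the socket again.
 */
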
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		if (valbool) {
			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
				sock_set_flag(sk, SOCK_TSTAMP_NEW);
			else
				sock_reset_flag(sk, SOCK_TSTAMP_NEW);

			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			sock_reset_flag(sk, SOCK_TSTAMP_NEW);
		}
		break;

	case SO_TIMESTAMPING_NEW:
		sock_set_flag(sk, SOCK_TSTAMP_NEW);
		/* fall through */
	case SO_TIMESTAMPING_OLD:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else {
			if (optname == SO_TIMESTAMPING_NEW)
				sock_reset_flag(sk, SOCK_TSTAMP_NEW);

			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		}
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (val != sk->sk_mark) {
			sk->sk_mark = val;
			sk_dst_reset(sk);
		}
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
		} else if (copy_from_user(&sk_txtime, optval,
					  sizeof(struct sock_txtime))) {
			ret = -EFAULT;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
		} else {
			sock_valbool_flag(sk, SOCK_TXTIME, true);
			sk->sk_clockid = sk_txtime.clockid;
			sk->sk_txtime_deadline_mode =
				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
			sk->sk_txtime_report_errors =
				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		}
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_setbindtodevice_locked(sk, val);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);

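/*
 * Userspace sketch (assumed caller; option values reach this function
 * through the setsockopt(2) syscall):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * Note the SO_SNDBUF/SO_RCVBUF doubling above: requesting 65536
 * stores 131072 in sk_sndbuf/sk_rcvbuf, and getsockopt() reports the
 * doubled value, by design.
 */
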
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* 32bit version */
		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

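/*
 * Note on the reply staging above (a restatement of the code): every
 * fixed-size answer is built in the union 'v' with its true length in
 * 'lv', then clamped to min(len, lv) on copy-out. A short user buffer
 * therefore gets a truncated-but-consistent prefix, and *optlen is
 * always updated to the number of bytes actually copied.
 */
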
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

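/*
 * Layout sketch implied by the two memcpy() calls above:
 *
 *	[0 .................. sk_dontcopy_begin)  copied from osk
 *	[sk_dontcopy_begin ... sk_dontcopy_end)   preserved in nsk
 *	[sk_dontcopy_end ............ obj_size)   copied from osk
 *
 * The preserved window holds sk_node, sk_refcnt and other fields that
 * concurrent RCU lookups may still be reading.
 */
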
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

1da177e4 1675
a4298e45
ED
1676/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1677 * grace period. This is the case for UDP sockets and TCP listeners.
1678 */
1679static void __sk_destruct(struct rcu_head *head)
1da177e4 1680{
a4298e45 1681 struct sock *sk = container_of(head, struct sock, sk_rcu);
1da177e4 1682 struct sk_filter *filter;
1da177e4
LT
1683
1684 if (sk->sk_destruct)
1685 sk->sk_destruct(sk);
1686
a898def2 1687 filter = rcu_dereference_check(sk->sk_filter,
14afee4b 1688 refcount_read(&sk->sk_wmem_alloc) == 0);
1da177e4 1689 if (filter) {
309dd5fc 1690 sk_filter_uncharge(sk, filter);
a9b3cd7f 1691 RCU_INIT_POINTER(sk->sk_filter, NULL);
1da177e4 1692 }
538950a1
CG
1693 if (rcu_access_pointer(sk->sk_reuseport_cb))
1694 reuseport_detach_sock(sk);
1da177e4 1695
08e29af3 1696 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1da177e4
LT
1697
1698 if (atomic_read(&sk->sk_omem_alloc))
e005d193
JP
1699 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1700 __func__, atomic_read(&sk->sk_omem_alloc));
1da177e4 1701
22a0e18e
ED
1702 if (sk->sk_frag.page) {
1703 put_page(sk->sk_frag.page);
1704 sk->sk_frag.page = NULL;
1705 }
1706
109f6e39
EB
1707 if (sk->sk_peer_cred)
1708 put_cred(sk->sk_peer_cred);
1709 put_pid(sk->sk_peer_pid);
26abe143
EB
1710 if (likely(sk->sk_net_refcnt))
1711 put_net(sock_net(sk));
c308c1b2 1712 sk_prot_free(sk->sk_prot_creator, sk);
1da177e4 1713}
2b85a34e 1714
a4298e45
ED
1715void sk_destruct(struct sock *sk)
1716{
1717 if (sock_flag(sk, SOCK_RCU_FREE))
1718 call_rcu(&sk->sk_rcu, __sk_destruct);
1719 else
1720 __sk_destruct(&sk->sk_rcu);
1721}
1722
eb4cb008
CG
1723static void __sk_free(struct sock *sk)
1724{
648845ab
TZ
1725 if (likely(sk->sk_net_refcnt))
1726 sock_inuse_add(sock_net(sk), -1);
1727
9709020c 1728 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
eb4cb008
CG
1729 sock_diag_broadcast_destroy(sk);
1730 else
1731 sk_destruct(sk);
1732}
1733
2b85a34e
ED
1734void sk_free(struct sock *sk)
1735{
1736 /*
25985edc 1737 * We subtract one from sk_wmem_alloc and can know if
2b85a34e
ED
1738 * some packets are still in some tx queue.
1739 * If not null, sock_wfree() will call __sk_free(sk) later
1740 */
14afee4b 1741 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2b85a34e
ED
1742 __sk_free(sk);
1743}
2a91525c 1744EXPORT_SYMBOL(sk_free);
1da177e4 1745
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
				   af_rlock_keys + sk->sk_family,
				   af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
				   af_wlock_keys + sk->sk_family,
				   af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
				   af_elock_keys + sk->sk_family,
				   af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);
}

e56c57d0
ED
1767/**
1768 * sk_clone_lock - clone a socket, and lock its clone
1769 * @sk: the socket to clone
1770 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1771 *
1772 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1773 */
1774struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 1775{
8fd1d178 1776 struct sock *newsk;
278571ba 1777 bool is_charged = true;
87d11ceb 1778
8fd1d178 1779 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
87d11ceb
ACM
1780 if (newsk != NULL) {
1781 struct sk_filter *filter;
1782
892c141e 1783 sock_copy(newsk, sk);
87d11ceb 1784
9d538fa6
CP
1785 newsk->sk_prot_creator = sk->sk_prot;
1786
87d11ceb 1787 /* SANITY */
8a681736
SV
1788 if (likely(newsk->sk_net_refcnt))
1789 get_net(sock_net(newsk));
87d11ceb
ACM
1790 sk_node_init(&newsk->sk_node);
1791 sock_lock_init(newsk);
1792 bh_lock_sock(newsk);
fa438ccf 1793 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
8eae939f 1794 newsk->sk_backlog.len = 0;
87d11ceb
ACM
1795
1796 atomic_set(&newsk->sk_rmem_alloc, 0);
2b85a34e
ED
1797 /*
1798 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1799 */
14afee4b 1800 refcount_set(&newsk->sk_wmem_alloc, 1);
87d11ceb 1801 atomic_set(&newsk->sk_omem_alloc, 0);
581319c5 1802 sk_init_common(newsk);
87d11ceb
ACM
1803
1804 newsk->sk_dst_cache = NULL;
9b8805a3 1805 newsk->sk_dst_pending_confirm = 0;
87d11ceb
ACM
1806 newsk->sk_wmem_queued = 0;
1807 newsk->sk_forward_alloc = 0;
9caad864 1808 atomic_set(&newsk->sk_drops, 0);
87d11ceb 1809 newsk->sk_send_head = NULL;
87d11ceb 1810 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
52267790 1811 atomic_set(&newsk->sk_zckey, 0);
87d11ceb
ACM
1812
1813 sock_reset_flag(newsk, SOCK_DONE);
edbe69ef 1814 mem_cgroup_sk_alloc(newsk);
c0576e39 1815 cgroup_sk_alloc(&newsk->sk_cgrp_data);
87d11ceb 1816
eefca20e
ED
1817 rcu_read_lock();
1818 filter = rcu_dereference(sk->sk_filter);
87d11ceb 1819 if (filter != NULL)
278571ba
AS
1820 /* though it's an empty new sock, the charging may fail
1821 * if sysctl_optmem_max was changed between creation of
1822 * original socket and cloning
1823 */
1824 is_charged = sk_filter_charge(newsk, filter);
eefca20e
ED
1825 RCU_INIT_POINTER(newsk->sk_filter, filter);
1826 rcu_read_unlock();
87d11ceb 1827
d188ba86 1828 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
a97e50cc
DB
1829 /* We need to make sure that we don't uncharge the new
1830 * socket if we couldn't charge it in the first place
1831 * as otherwise we uncharge the parent's filter.
1832 */
1833 if (!is_charged)
1834 RCU_INIT_POINTER(newsk->sk_filter, NULL);
94352d45 1835 sk_free_unlock_clone(newsk);
87d11ceb
ACM
1836 newsk = NULL;
1837 goto out;
1838 }
fa463497 1839 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
87d11ceb
ACM
1840
1841 newsk->sk_err = 0;
e551c32d 1842 newsk->sk_err_soft = 0;
87d11ceb 1843 newsk->sk_priority = 0;
2c8c56e1 1844 newsk->sk_incoming_cpu = raw_smp_processor_id();
648845ab
TZ
1845 if (likely(newsk->sk_net_refcnt))
1846 sock_inuse_add(sock_net(newsk), 1);
d979a39d 1847
4dc6dc71
ED
1848 /*
1849 * Before updating sk_refcnt, we must commit prior changes to memory
1850 * (Documentation/RCU/rculist_nulls.txt for details)
1851 */
1852 smp_wmb();
41c6d650 1853 refcount_set(&newsk->sk_refcnt, 2);
87d11ceb
ACM
1854
1855 /*
1856 * Increment the counter in the same struct proto as the master
1857 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1858 * is the same as sk->sk_prot->socks, as this field was copied
1859 * with memcpy).
1860 *
 1861	 * This _changes_ the previous behaviour, where
 1862	 * tcp_create_openreq_child was always incrementing the
 1863	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
 1864	 * to be taken into account in all callers. -acme
1865 */
1866 sk_refcnt_debug_inc(newsk);
972692e0 1867 sk_set_socket(newsk, NULL);
43815482 1868 newsk->sk_wq = NULL;
87d11ceb
ACM
1869
1870 if (newsk->sk_prot->sockets_allocated)
180d8cd9 1871 sk_sockets_allocated_inc(newsk);
704da560 1872
080a270f
HFS
1873 if (sock_needs_netstamp(sk) &&
1874 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
704da560 1875 net_enable_timestamp();
87d11ceb
ACM
1876 }
1877out:
1878 return newsk;
1879}
e56c57d0 1880EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 1881
94352d45
ACM
1882void sk_free_unlock_clone(struct sock *sk)
1883{
 1884	/* It is still a raw copy of the parent, so invalidate
 1885	 * the destructor and do a plain sk_free() */
1886 sk->sk_destruct = NULL;
1887 bh_unlock_sock(sk);
1888 sk_free(sk);
1889}
1890EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1891
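
/*
 * Illustrative sketch (not part of the original file): a minimal caller of
 * sk_clone_lock(), mirroring the pattern used by inet_csk_clone_lock().
 * The helper name my_clone_for_accept() is hypothetical; what it shows is
 * the contract documented above: on success the clone comes back with
 * bh_lock_sock() held, so the caller must bh_unlock_sock() it, while the
 * internal error paths have already unlocked and freed the clone.
 */
static struct sock *my_clone_for_accept(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;	/* failure paths already did sk_free_unlock_clone() */

	newsk->sk_state = TCP_SYN_RECV;	/* protocol-specific fixups go here */
	bh_unlock_sock(newsk);		/* mandatory: drop the bh lock taken by the clone */
	return newsk;
}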
9958089a
AK
1892void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1893{
d6a4e26a
ED
1894 u32 max_segs = 1;
1895
6bd4f355 1896 sk_dst_set(sk, dst);
0a6b2a1d 1897 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1898 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1899 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1900 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1901 if (sk_can_gso(sk)) {
f70f250a 1902 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1903 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1904 } else {
9958089a 1905 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1906 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1907 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1908 }
9958089a 1909 }
d6a4e26a 1910 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1911}
1912EXPORT_SYMBOL_GPL(sk_setup_caps);
1913
1da177e4
LT
1914/*
1915 * Simple resource managers for sockets.
1916 */
1917
1918
4ec93edb
YH
1919/*
1920 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1921 */
1922void sock_wfree(struct sk_buff *skb)
1923{
1924 struct sock *sk = skb->sk;
d99927f4 1925 unsigned int len = skb->truesize;
1da177e4 1926
d99927f4
ED
1927 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1928 /*
1929 * Keep a reference on sk_wmem_alloc, this will be released
1930 * after sk_write_space() call
1931 */
14afee4b 1932 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1933 sk->sk_write_space(sk);
d99927f4
ED
1934 len = 1;
1935 }
2b85a34e 1936 /*
d99927f4
ED
1937 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1938 * could not do because of in-flight packets
2b85a34e 1939 */
14afee4b 1940 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1941 __sk_free(sk);
1da177e4 1942}
2a91525c 1943EXPORT_SYMBOL(sock_wfree);
1da177e4 1944
1d2077ac
ED
1945/* This variant of sock_wfree() is used by TCP,
1946 * since it sets SOCK_USE_WRITE_QUEUE.
1947 */
1948void __sock_wfree(struct sk_buff *skb)
1949{
1950 struct sock *sk = skb->sk;
1951
14afee4b 1952 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1953 __sk_free(sk);
1954}
1955
9e17f8a4
ED
1956void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1957{
1958 skb_orphan(skb);
1959 skb->sk = sk;
1960#ifdef CONFIG_INET
1961 if (unlikely(!sk_fullsock(sk))) {
1962 skb->destructor = sock_edemux;
1963 sock_hold(sk);
1964 return;
1965 }
1966#endif
1967 skb->destructor = sock_wfree;
1968 skb_set_hash_from_sk(skb, sk);
1969 /*
 1970	 * We used to take a refcount on sk, but the following operation
 1971	 * is enough to guarantee sk_free() won't free this sock until
 1972	 * all in-flight packets are completed.
1973 */
14afee4b 1974 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1975}
1976EXPORT_SYMBOL(skb_set_owner_w);
1977
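/*
 * Illustrative sketch (hypothetical helper, not in the original file): how
 * skb_set_owner_w() pairs with sock_wfree() above. Once owned, the skb's
 * truesize is accounted in sk->sk_wmem_alloc, and a later kfree_skb() runs
 * sock_wfree(), releasing the charge and calling sk_write_space().
 */
static struct sk_buff *my_alloc_owned_skb(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (!skb)
		return NULL;
	skb_set_owner_w(skb, sk);	/* charge sk_wmem_alloc, set sock_wfree */
	return skb;			/* kfree_skb(skb) later uncharges the socket */
}
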
1d2077ac
ED
1978/* This helper is used by netem, as it can hold packets in its
1979 * delay queue. We want to allow the owner socket to send more
1980 * packets, as if they were already TX completed by a typical driver.
1981 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1982 * rely on it (sch_fq for example).
1d2077ac 1983 */
f2f872f9
ED
1984void skb_orphan_partial(struct sk_buff *skb)
1985{
f6ba8d33 1986 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1987 return;
1988
f2f872f9
ED
1989 if (skb->destructor == sock_wfree
1990#ifdef CONFIG_INET
1991 || skb->destructor == tcp_wfree
1992#endif
1993 ) {
f6ba8d33
ED
1994 struct sock *sk = skb->sk;
1995
41c6d650 1996 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1997 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1998 skb->destructor = sock_efree;
1999 }
f2f872f9
ED
2000 } else {
2001 skb_orphan(skb);
2002 }
2003}
2004EXPORT_SYMBOL(skb_orphan_partial);
2005
4ec93edb
YH
2006/*
2007 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
2008 */
2009void sock_rfree(struct sk_buff *skb)
2010{
2011 struct sock *sk = skb->sk;
d361fd59 2012 unsigned int len = skb->truesize;
1da177e4 2013
d361fd59
ED
2014 atomic_sub(len, &sk->sk_rmem_alloc);
2015 sk_mem_uncharge(sk, len);
1da177e4 2016}
2a91525c 2017EXPORT_SYMBOL(sock_rfree);
1da177e4 2018
7768eed8
OH
2019/*
2020 * Buffer destructor for skbs that are not used directly in read or write
2021 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2022 */
62bccb8c
AD
2023void sock_efree(struct sk_buff *skb)
2024{
2025 sock_put(skb->sk);
2026}
2027EXPORT_SYMBOL(sock_efree);
2028
976d0201 2029kuid_t sock_i_uid(struct sock *sk)
1da177e4 2030{
976d0201 2031 kuid_t uid;
1da177e4 2032
f064af1e 2033 read_lock_bh(&sk->sk_callback_lock);
976d0201 2034 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 2035 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2036 return uid;
2037}
2a91525c 2038EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
2039
2040unsigned long sock_i_ino(struct sock *sk)
2041{
2042 unsigned long ino;
2043
f064af1e 2044 read_lock_bh(&sk->sk_callback_lock);
1da177e4 2045 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 2046 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2047 return ino;
2048}
2a91525c 2049EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
2050
2051/*
 2052 * Allocate an skb from the socket's send buffer.
2053 */
86a76caf 2054struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 2055 gfp_t priority)
1da177e4 2056{
14afee4b 2057 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 2058 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
2059 if (skb) {
2060 skb_set_owner_w(skb, sk);
2061 return skb;
2062 }
2063 }
2064 return NULL;
2065}
2a91525c 2066EXPORT_SYMBOL(sock_wmalloc);
1da177e4 2067
98ba0bd5
WB
2068static void sock_ofree(struct sk_buff *skb)
2069{
2070 struct sock *sk = skb->sk;
2071
2072 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2073}
2074
2075struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2076 gfp_t priority)
2077{
2078 struct sk_buff *skb;
2079
2080 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2081 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2082 sysctl_optmem_max)
2083 return NULL;
2084
2085 skb = alloc_skb(size, priority);
2086 if (!skb)
2087 return NULL;
2088
2089 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2090 skb->sk = sk;
2091 skb->destructor = sock_ofree;
2092 return skb;
2093}
2094
4ec93edb 2095/*
1da177e4 2096 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 2097 */
dd0fc66f 2098void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 2099{
95c96174 2100 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
2101 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2102 void *mem;
2103 /* First do the add, to avoid the race if kmalloc
4ec93edb 2104 * might sleep.
1da177e4
LT
2105 */
2106 atomic_add(size, &sk->sk_omem_alloc);
2107 mem = kmalloc(size, priority);
2108 if (mem)
2109 return mem;
2110 atomic_sub(size, &sk->sk_omem_alloc);
2111 }
2112 return NULL;
2113}
2a91525c 2114EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2115
79e88659
DB
2116/* Free an option memory block. Note, we actually want the inline
2117 * here as this allows gcc to detect the nullify and fold away the
2118 * condition entirely.
1da177e4 2119 */
79e88659
DB
2120static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2121 const bool nullify)
1da177e4 2122{
e53da5fb
DM
2123 if (WARN_ON_ONCE(!mem))
2124 return;
79e88659
DB
2125 if (nullify)
2126 kzfree(mem);
2127 else
2128 kfree(mem);
1da177e4
LT
2129 atomic_sub(size, &sk->sk_omem_alloc);
2130}
79e88659
DB
2131
2132void sock_kfree_s(struct sock *sk, void *mem, int size)
2133{
2134 __sock_kfree_s(sk, mem, size, false);
2135}
2a91525c 2136EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2137
79e88659
DB
2138void sock_kzfree_s(struct sock *sk, void *mem, int size)
2139{
2140 __sock_kfree_s(sk, mem, size, true);
2141}
2142EXPORT_SYMBOL(sock_kzfree_s);
2143
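/*
 * Illustrative sketch (hypothetical helper): the expected pairing for
 * option memory. sock_kmalloc() charges sk_omem_alloc against
 * sysctl_optmem_max; the matching free must pass the same size so the
 * charge is dropped, and sock_kzfree_s() is the variant for key material.
 */
static int my_store_opt(struct sock *sk, const void *src, int size)
{
	void *buf = sock_kmalloc(sk, size, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	memcpy(buf, src, size);
	/* ... use buf ... */
	sock_kfree_s(sk, buf, size);	/* size must match the allocation */
	return 0;
}
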
1da177e4
LT
2144/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 2145   I think these locks should be removed for datagram sockets.
2146 */
2a91525c 2147static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2148{
2149 DEFINE_WAIT(wait);
2150
9cd3e072 2151 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2152 for (;;) {
2153 if (!timeo)
2154 break;
2155 if (signal_pending(current))
2156 break;
2157 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2158 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2159 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2160 break;
2161 if (sk->sk_shutdown & SEND_SHUTDOWN)
2162 break;
2163 if (sk->sk_err)
2164 break;
2165 timeo = schedule_timeout(timeo);
2166 }
aa395145 2167 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2168 return timeo;
2169}
2170
2171
2172/*
2173 * Generic send/receive buffer handlers
2174 */
2175
4cc7f68d
HX
2176struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2177 unsigned long data_len, int noblock,
28d64271 2178 int *errcode, int max_page_order)
1da177e4 2179{
2e4e4410 2180 struct sk_buff *skb;
1da177e4
LT
2181 long timeo;
2182 int err;
2183
1da177e4 2184 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2185 for (;;) {
1da177e4
LT
2186 err = sock_error(sk);
2187 if (err != 0)
2188 goto failure;
2189
2190 err = -EPIPE;
2191 if (sk->sk_shutdown & SEND_SHUTDOWN)
2192 goto failure;
2193
2e4e4410
ED
2194 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2195 break;
28d64271 2196
9cd3e072 2197 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2198 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2199 err = -EAGAIN;
2200 if (!timeo)
1da177e4 2201 goto failure;
2e4e4410
ED
2202 if (signal_pending(current))
2203 goto interrupted;
2204 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2205 }
2e4e4410
ED
2206 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2207 errcode, sk->sk_allocation);
2208 if (skb)
2209 skb_set_owner_w(skb, sk);
1da177e4
LT
2210 return skb;
2211
2212interrupted:
2213 err = sock_intr_errno(timeo);
2214failure:
2215 *errcode = err;
2216 return NULL;
2217}
4cc7f68d 2218EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2219
4ec93edb 2220struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2221 int noblock, int *errcode)
2222{
28d64271 2223 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2224}
2a91525c 2225EXPORT_SYMBOL(sock_alloc_send_skb);
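
/*
 * Illustrative sketch (hypothetical sendmsg fragment): the common calling
 * pattern for sock_alloc_send_skb(), as used by datagram protocols. The
 * call may block up to the socket's send timeout unless noblock is set,
 * and reports the reason for failure through errcode.
 */
static struct sk_buff *my_build_dgram(struct sock *sk, size_t len,
				      int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, err);
	if (!skb)
		return NULL;	/* *err is -EAGAIN, -EPIPE, sock_intr_errno(), ... */
	skb_reserve(skb, MAX_HEADER);	/* leave room for lower-layer headers */
	return skb;
}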
1da177e4 2226
39771b12
WB
2227int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2228 struct sockcm_cookie *sockc)
2229{
3dd17e63
SHY
2230 u32 tsflags;
2231
39771b12
WB
2232 switch (cmsg->cmsg_type) {
2233 case SO_MARK:
2234 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2235 return -EPERM;
2236 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2237 return -EINVAL;
2238 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2239 break;
7f1bc6e9 2240 case SO_TIMESTAMPING_OLD:
3dd17e63
SHY
2241 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2242 return -EINVAL;
2243
2244 tsflags = *(u32 *)CMSG_DATA(cmsg);
2245 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2246 return -EINVAL;
2247
2248 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2249 sockc->tsflags |= tsflags;
2250 break;
80b14dee
RC
2251 case SCM_TXTIME:
2252 if (!sock_flag(sk, SOCK_TXTIME))
2253 return -EINVAL;
2254 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2255 return -EINVAL;
2256 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2257 break;
779f1ede
SHY
2258 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2259 case SCM_RIGHTS:
2260 case SCM_CREDENTIALS:
2261 break;
39771b12
WB
2262 default:
2263 return -EINVAL;
2264 }
2265 return 0;
2266}
2267EXPORT_SYMBOL(__sock_cmsg_send);
2268
f28ea365
EJ
2269int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2270 struct sockcm_cookie *sockc)
2271{
2272 struct cmsghdr *cmsg;
39771b12 2273 int ret;
f28ea365
EJ
2274
2275 for_each_cmsghdr(cmsg, msg) {
2276 if (!CMSG_OK(msg, cmsg))
2277 return -EINVAL;
2278 if (cmsg->cmsg_level != SOL_SOCKET)
2279 continue;
39771b12
WB
2280 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2281 if (ret)
2282 return ret;
f28ea365
EJ
2283 }
2284 return 0;
2285}
2286EXPORT_SYMBOL(sock_cmsg_send);
2287
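/*
 * Illustrative sketch (userspace view, not kernel code): how a sender hands
 * SO_MARK to __sock_cmsg_send() above through sendmsg() ancillary data.
 * Setting the mark this way requires CAP_NET_ADMIN, matching the
 * ns_capable() check in the kernel path. A real program would compile this
 * against <sys/socket.h> and <string.h>.
 */
#if 0
static int send_with_mark(int fd, struct iovec *iov, uint32_t mark)
{
	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
	struct msghdr msg = {
		.msg_iov	= iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type	 = SO_MARK;
	cmsg->cmsg_len	 = CMSG_LEN(sizeof(uint32_t));	/* exact length is checked */
	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
	return sendmsg(fd, &msg, 0);
}
#endif
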
06044751
ED
2288static void sk_enter_memory_pressure(struct sock *sk)
2289{
2290 if (!sk->sk_prot->enter_memory_pressure)
2291 return;
2292
2293 sk->sk_prot->enter_memory_pressure(sk);
2294}
2295
2296static void sk_leave_memory_pressure(struct sock *sk)
2297{
2298 if (sk->sk_prot->leave_memory_pressure) {
2299 sk->sk_prot->leave_memory_pressure(sk);
2300 } else {
2301 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2302
2303 if (memory_pressure && *memory_pressure)
2304 *memory_pressure = 0;
2305 }
2306}
2307
5640f768
ED
2308/* On 32bit arches, an skb frag is limited to 2^15 */
2309#define SKB_FRAG_PAGE_ORDER get_order(32768)
2310
400dfd3a
ED
2311/**
2312 * skb_page_frag_refill - check that a page_frag contains enough room
2313 * @sz: minimum size of the fragment we want to get
2314 * @pfrag: pointer to page_frag
82d5e2b8 2315 * @gfp: priority for memory allocation
400dfd3a
ED
2316 *
2317 * Note: While this allocator tries to use high order pages, there is
 2318 * no guarantee that allocations succeed. Therefore, @sz MUST be
 2319 * less than or equal to PAGE_SIZE.
2320 */
d9b2938a 2321bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2322{
5640f768 2323 if (pfrag->page) {
fe896d18 2324 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2325 pfrag->offset = 0;
2326 return true;
2327 }
400dfd3a 2328 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2329 return true;
2330 put_page(pfrag->page);
2331 }
2332
d9b2938a
ED
2333 pfrag->offset = 0;
2334 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2335 /* Avoid direct reclaim but allow kswapd to wake */
2336 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2337 __GFP_COMP | __GFP_NOWARN |
2338 __GFP_NORETRY,
d9b2938a 2339 SKB_FRAG_PAGE_ORDER);
5640f768 2340 if (likely(pfrag->page)) {
d9b2938a 2341 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2342 return true;
2343 }
d9b2938a
ED
2344 }
2345 pfrag->page = alloc_page(gfp);
2346 if (likely(pfrag->page)) {
2347 pfrag->size = PAGE_SIZE;
2348 return true;
2349 }
400dfd3a
ED
2350 return false;
2351}
2352EXPORT_SYMBOL(skb_page_frag_refill);
2353
2354bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2355{
2356 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2357 return true;
2358
5640f768
ED
2359 sk_enter_memory_pressure(sk);
2360 sk_stream_moderate_sndbuf(sk);
2361 return false;
2362}
2363EXPORT_SYMBOL(sk_page_frag_refill);
2364
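/*
 * Illustrative sketch (hypothetical send path fragment): the usual pattern
 * around sk_page_frag_refill(). On success the caller copies into
 * pfrag->page at pfrag->offset and advances the offset; if the data is
 * attached to an skb frag, the caller also takes its own page reference.
 */
static int my_copy_to_frag(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* memory pressure was entered for us */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
			   copy, &msg->msg_iter) != copy)
		return -EFAULT;
	pfrag->offset += copy;		/* consume the fragment space */
	return copy;
}
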
1da177e4 2365static void __lock_sock(struct sock *sk)
f39234d6
NK
2366 __releases(&sk->sk_lock.slock)
2367 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2368{
2369 DEFINE_WAIT(wait);
2370
e71a4783 2371 for (;;) {
1da177e4
LT
2372 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2373 TASK_UNINTERRUPTIBLE);
2374 spin_unlock_bh(&sk->sk_lock.slock);
2375 schedule();
2376 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2377 if (!sock_owned_by_user(sk))
1da177e4
LT
2378 break;
2379 }
2380 finish_wait(&sk->sk_lock.wq, &wait);
2381}
2382
8873c064 2383void __release_sock(struct sock *sk)
f39234d6
NK
2384 __releases(&sk->sk_lock.slock)
2385 __acquires(&sk->sk_lock.slock)
1da177e4 2386{
5413d1ba 2387 struct sk_buff *skb, *next;
1da177e4 2388
5413d1ba 2389 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2390 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2391
5413d1ba 2392 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2393
5413d1ba
ED
2394 do {
2395 next = skb->next;
e4cbb02a 2396 prefetch(next);
7fee226a 2397 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2398 skb_mark_not_on_list(skb);
c57943a1 2399 sk_backlog_rcv(sk, skb);
1da177e4 2400
5413d1ba 2401 cond_resched();
1da177e4
LT
2402
2403 skb = next;
2404 } while (skb != NULL);
2405
5413d1ba
ED
2406 spin_lock_bh(&sk->sk_lock.slock);
2407 }
8eae939f
ZY
2408
2409 /*
 2410	 * Doing the zeroing here guarantees we cannot loop forever
 2411	 * while a wild producer attempts to flood us.
2412 */
2413 sk->sk_backlog.len = 0;
1da177e4
LT
2414}
2415
d41a69f1
ED
2416void __sk_flush_backlog(struct sock *sk)
2417{
2418 spin_lock_bh(&sk->sk_lock.slock);
2419 __release_sock(sk);
2420 spin_unlock_bh(&sk->sk_lock.slock);
2421}
2422
1da177e4
LT
2423/**
2424 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2425 * @sk: sock to wait on
2426 * @timeo: for how long
dfbafc99 2427 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2428 *
 2429 * Now socket state including sk->sk_err is changed only under the lock,
 2430 * hence we may omit checks after joining the wait queue.
 2431 * We check the receive queue before schedule() only as an optimization;
 2432 * it is very likely that release_sock() added new data.
2433 */
dfbafc99 2434int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2435{
d9dc8b0f 2436 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2437 int rc;
1da177e4 2438
d9dc8b0f 2439 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2440 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2441 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2442 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2443 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2444 return rc;
2445}
1da177e4
LT
2446EXPORT_SYMBOL(sk_wait_data);
2447
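/*
 * Illustrative sketch (hypothetical recvmsg fragment, not in the original
 * file): the loop shape sk_wait_data() is designed for. It is called with
 * the socket lock held; sk_wait_event() drops and retakes the lock around
 * schedule(), so queue state must be re-read after every wait.
 */
static struct sk_buff *my_wait_for_skb(struct sock *sk, long *timeo, int *err)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);	/* returns a negative errno */
			return NULL;
		}
		if ((sk->sk_shutdown & RCV_SHUTDOWN) ||
		    !*timeo || signal_pending(current)) {
			*err = -EAGAIN;	/* simplified; real code distinguishes these */
			return NULL;
		}
		sk_wait_data(sk, timeo, NULL);
	}
	return skb;
}
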
3ab224be 2448/**
f8c3bf00 2449 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2450 * @sk: socket
2451 * @size: memory size to allocate
f8c3bf00 2452 * @amt: pages to allocate
3ab224be
HA
2453 * @kind: allocation type
2454 *
f8c3bf00 2455 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2456 */
f8c3bf00 2457int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2458{
2459 struct proto *prot = sk->sk_prot;
f8c3bf00 2460 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2461 bool charged = true;
e805605c 2462
baac50bb 2463 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2464 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2465 goto suppress_allocation;
3ab224be
HA
2466
2467 /* Under limit. */
e805605c 2468 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2469 sk_leave_memory_pressure(sk);
3ab224be
HA
2470 return 1;
2471 }
2472
e805605c
JW
2473 /* Under pressure. */
2474 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2475 sk_enter_memory_pressure(sk);
3ab224be 2476
e805605c
JW
2477 /* Over hard limit. */
2478 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2479 goto suppress_allocation;
2480
2481 /* guarantee minimum buffer size under pressure */
2482 if (kind == SK_MEM_RECV) {
a3dcaf17 2483 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2484 return 1;
180d8cd9 2485
3ab224be 2486 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2487 int wmem0 = sk_get_wmem0(sk, prot);
2488
3ab224be 2489 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2490 if (sk->sk_wmem_queued < wmem0)
3ab224be 2491 return 1;
a3dcaf17 2492 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2493 return 1;
a3dcaf17 2494 }
3ab224be
HA
2495 }
2496
180d8cd9 2497 if (sk_has_memory_pressure(sk)) {
5bf325a5 2498 u64 alloc;
1748376b 2499
180d8cd9 2500 if (!sk_under_memory_pressure(sk))
1748376b 2501 return 1;
180d8cd9
GC
2502 alloc = sk_sockets_allocated_read_positive(sk);
2503 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2504 sk_mem_pages(sk->sk_wmem_queued +
2505 atomic_read(&sk->sk_rmem_alloc) +
2506 sk->sk_forward_alloc))
2507 return 1;
2508 }
2509
2510suppress_allocation:
2511
2512 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2513 sk_stream_moderate_sndbuf(sk);
2514
2515 /* Fail only if socket is _under_ its sndbuf.
 2516	 * In this case we cannot block, so we have to fail.
2517 */
2518 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2519 return 1;
2520 }
2521
d6f19938
YS
2522 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2523 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2524
0e90b31f 2525 sk_memory_allocated_sub(sk, amt);
180d8cd9 2526
baac50bb
JW
2527 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2528 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2529
3ab224be
HA
2530 return 0;
2531}
f8c3bf00
PA
2532EXPORT_SYMBOL(__sk_mem_raise_allocated);
2533
2534/**
2535 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2536 * @sk: socket
2537 * @size: memory size to allocate
2538 * @kind: allocation type
2539 *
2540 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2541 * rmem allocation. This function assumes that protocols which have
2542 * memory_pressure use sk_wmem_queued as write buffer accounting.
2543 */
2544int __sk_mem_schedule(struct sock *sk, int size, int kind)
2545{
2546 int ret, amt = sk_mem_pages(size);
2547
2548 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2549 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2550 if (!ret)
2551 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2552 return ret;
2553}
3ab224be
HA
2554EXPORT_SYMBOL(__sk_mem_schedule);
2555
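/*
 * Illustrative sketch (hypothetical receive path fragment): how a protocol
 * charges an incoming skb against the limits enforced by
 * __sk_mem_raise_allocated() above. sk_rmem_schedule() falls through to
 * __sk_mem_schedule(..., SK_MEM_RECV) when the forward allocation is
 * short; skb_set_owner_r() then consumes it via sk_mem_charge().
 */
static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over the protocol memory limits */

	skb_set_owner_r(skb, sk);	/* account rmem + forward alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}
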
2556/**
f8c3bf00 2557 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2558 * @sk: socket
f8c3bf00
PA
2559 * @amount: number of quanta
2560 *
2561 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2562 */
f8c3bf00 2563void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2564{
1a24e04e 2565 sk_memory_allocated_sub(sk, amount);
3ab224be 2566
baac50bb
JW
2567 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2568 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2569
180d8cd9
GC
2570 if (sk_under_memory_pressure(sk) &&
2571 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2572 sk_leave_memory_pressure(sk);
3ab224be 2573}
f8c3bf00
PA
2574EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2575
2576/**
2577 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2578 * @sk: socket
2579 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2580 */
2581void __sk_mem_reclaim(struct sock *sk, int amount)
2582{
2583 amount >>= SK_MEM_QUANTUM_SHIFT;
2584 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2585 __sk_mem_reduce_allocated(sk, amount);
2586}
3ab224be
HA
2587EXPORT_SYMBOL(__sk_mem_reclaim);
2588
627d2d6b 2589int sk_set_peek_off(struct sock *sk, int val)
2590{
627d2d6b 2591 sk->sk_peek_off = val;
2592 return 0;
2593}
2594EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2595
1da177e4
LT
2596/*
2597 * Set of default routines for initialising struct proto_ops when
2598 * the protocol does not support a particular function. In certain
2599 * cases where it makes no sense for a protocol to have a "do nothing"
2600 * function, some default processing is provided.
2601 */
2602
2603int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2604{
2605 return -EOPNOTSUPP;
2606}
2a91525c 2607EXPORT_SYMBOL(sock_no_bind);
1da177e4 2608
4ec93edb 2609int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2610 int len, int flags)
2611{
2612 return -EOPNOTSUPP;
2613}
2a91525c 2614EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2615
2616int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2617{
2618 return -EOPNOTSUPP;
2619}
2a91525c 2620EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2621
cdfbabfb
DH
2622int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2623 bool kern)
1da177e4
LT
2624{
2625 return -EOPNOTSUPP;
2626}
2a91525c 2627EXPORT_SYMBOL(sock_no_accept);
1da177e4 2628
4ec93edb 2629int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2630 int peer)
1da177e4
LT
2631{
2632 return -EOPNOTSUPP;
2633}
2a91525c 2634EXPORT_SYMBOL(sock_no_getname);
1da177e4 2635
1da177e4
LT
2636int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2637{
2638 return -EOPNOTSUPP;
2639}
2a91525c 2640EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2641
2642int sock_no_listen(struct socket *sock, int backlog)
2643{
2644 return -EOPNOTSUPP;
2645}
2a91525c 2646EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2647
2648int sock_no_shutdown(struct socket *sock, int how)
2649{
2650 return -EOPNOTSUPP;
2651}
2a91525c 2652EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2653
2654int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2655 char __user *optval, unsigned int optlen)
1da177e4
LT
2656{
2657 return -EOPNOTSUPP;
2658}
2a91525c 2659EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2660
2661int sock_no_getsockopt(struct socket *sock, int level, int optname,
2662 char __user *optval, int __user *optlen)
2663{
2664 return -EOPNOTSUPP;
2665}
2a91525c 2666EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2667
1b784140 2668int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2669{
2670 return -EOPNOTSUPP;
2671}
2a91525c 2672EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2673
306b13eb
TH
2674int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2675{
2676 return -EOPNOTSUPP;
2677}
2678EXPORT_SYMBOL(sock_no_sendmsg_locked);
2679
1b784140
YX
2680int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2681 int flags)
1da177e4
LT
2682{
2683 return -EOPNOTSUPP;
2684}
2a91525c 2685EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2686
2687int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2688{
2689 /* Mirror missing mmap method error code */
2690 return -ENODEV;
2691}
2a91525c 2692EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2693
2694ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2695{
2696 ssize_t res;
2697 struct msghdr msg = {.msg_flags = flags};
2698 struct kvec iov;
2699 char *kaddr = kmap(page);
2700 iov.iov_base = kaddr + offset;
2701 iov.iov_len = size;
2702 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2703 kunmap(page);
2704 return res;
2705}
2a91525c 2706EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2707
306b13eb
TH
2708ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2709 int offset, size_t size, int flags)
2710{
2711 ssize_t res;
2712 struct msghdr msg = {.msg_flags = flags};
2713 struct kvec iov;
2714 char *kaddr = kmap(page);
2715
2716 iov.iov_base = kaddr + offset;
2717 iov.iov_len = size;
2718 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2719 kunmap(page);
2720 return res;
2721}
2722EXPORT_SYMBOL(sock_no_sendpage_locked);
2723
1da177e4
LT
2724/*
2725 * Default Socket Callbacks
2726 */
2727
2728static void sock_def_wakeup(struct sock *sk)
2729{
43815482
ED
2730 struct socket_wq *wq;
2731
2732 rcu_read_lock();
2733 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2734 if (skwq_has_sleeper(wq))
43815482
ED
2735 wake_up_interruptible_all(&wq->wait);
2736 rcu_read_unlock();
1da177e4
LT
2737}
2738
2739static void sock_def_error_report(struct sock *sk)
2740{
43815482
ED
2741 struct socket_wq *wq;
2742
2743 rcu_read_lock();
2744 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2745 if (skwq_has_sleeper(wq))
a9a08845 2746 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2747 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2748 rcu_read_unlock();
1da177e4
LT
2749}
2750
676d2369 2751static void sock_def_readable(struct sock *sk)
1da177e4 2752{
43815482
ED
2753 struct socket_wq *wq;
2754
2755 rcu_read_lock();
2756 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2757 if (skwq_has_sleeper(wq))
a9a08845
LT
2758 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2759 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2760 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2761 rcu_read_unlock();
1da177e4
LT
2762}
2763
2764static void sock_def_write_space(struct sock *sk)
2765{
43815482
ED
2766 struct socket_wq *wq;
2767
2768 rcu_read_lock();
1da177e4
LT
2769
2770 /* Do not wake up a writer until he can make "significant"
2771 * progress. --DaveM
2772 */
14afee4b 2773 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2774 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2775 if (skwq_has_sleeper(wq))
a9a08845
LT
2776 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2777 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2778
2779 /* Should agree with poll, otherwise some programs break */
2780 if (sock_writeable(sk))
8d8ad9d7 2781 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2782 }
2783
43815482 2784 rcu_read_unlock();
1da177e4
LT
2785}
2786
2787static void sock_def_destruct(struct sock *sk)
2788{
1da177e4
LT
2789}
2790
2791void sk_send_sigurg(struct sock *sk)
2792{
2793 if (sk->sk_socket && sk->sk_socket->file)
2794 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2795 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2796}
2a91525c 2797EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2798
2799void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2800 unsigned long expires)
2801{
2802 if (!mod_timer(timer, expires))
2803 sock_hold(sk);
2804}
1da177e4
LT
2805EXPORT_SYMBOL(sk_reset_timer);
2806
2807void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2808{
25cc4ae9 2809 if (del_timer(timer))
1da177e4
LT
2810 __sock_put(sk);
2811}
1da177e4
LT
2812EXPORT_SYMBOL(sk_stop_timer);
2813
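/*
 * Illustrative sketch (hypothetical protocol timer): the refcount contract
 * behind sk_reset_timer()/sk_stop_timer(). Arming takes a sock_hold() only
 * when the timer was not already pending; the handler must drop that
 * reference itself, and sk_stop_timer() drops it when it cancels a
 * pending timer.
 */
static void my_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	/* ... protocol work ... */
	sock_put(sk);	/* release the reference taken when the timer was armed */
}

/* setup, e.g. from the proto's init hook:
 *	timer_setup(&sk->sk_timer, my_timer_handler, 0);
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 */
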
2814void sock_init_data(struct socket *sock, struct sock *sk)
2815{
581319c5 2816 sk_init_common(sk);
1da177e4
LT
2817 sk->sk_send_head = NULL;
2818
99767f27 2819 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2820
1da177e4
LT
2821 sk->sk_allocation = GFP_KERNEL;
2822 sk->sk_rcvbuf = sysctl_rmem_default;
2823 sk->sk_sndbuf = sysctl_wmem_default;
2824 sk->sk_state = TCP_CLOSE;
972692e0 2825 sk_set_socket(sk, sock);
1da177e4
LT
2826
2827 sock_set_flag(sk, SOCK_ZAPPED);
2828
e71a4783 2829 if (sock) {
1da177e4 2830 sk->sk_type = sock->type;
43815482 2831 sk->sk_wq = sock->wq;
1da177e4 2832 sock->sk = sk;
86741ec2
LC
2833 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2834 } else {
43815482 2835 sk->sk_wq = NULL;
86741ec2
LC
2836 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2837 }
1da177e4 2838
1da177e4 2839 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2840 if (sk->sk_kern_sock)
2841 lockdep_set_class_and_name(
2842 &sk->sk_callback_lock,
2843 af_kern_callback_keys + sk->sk_family,
2844 af_family_kern_clock_key_strings[sk->sk_family]);
2845 else
2846 lockdep_set_class_and_name(
2847 &sk->sk_callback_lock,
443aef0e
PZ
2848 af_callback_keys + sk->sk_family,
2849 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2850
2851 sk->sk_state_change = sock_def_wakeup;
2852 sk->sk_data_ready = sock_def_readable;
2853 sk->sk_write_space = sock_def_write_space;
2854 sk->sk_error_report = sock_def_error_report;
2855 sk->sk_destruct = sock_def_destruct;
2856
5640f768
ED
2857 sk->sk_frag.page = NULL;
2858 sk->sk_frag.offset = 0;
ef64a54f 2859 sk->sk_peek_off = -1;
1da177e4 2860
109f6e39
EB
2861 sk->sk_peer_pid = NULL;
2862 sk->sk_peer_cred = NULL;
1da177e4
LT
2863 sk->sk_write_pending = 0;
2864 sk->sk_rcvlowat = 1;
2865 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2866 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2867
6c7c98ba 2868 sk->sk_stamp = SK_DEFAULT_STAMP;
3a0ed3e9
DD
2869#if BITS_PER_LONG==32
2870 seqlock_init(&sk->sk_stamp_seq);
2871#endif
52267790 2872 atomic_set(&sk->sk_zckey, 0);
1da177e4 2873
e0d1095a 2874#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2875 sk->sk_napi_id = 0;
64b0dc51 2876 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2877#endif
2878
76a9ebe8
ED
2879 sk->sk_max_pacing_rate = ~0UL;
2880 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2881 sk->sk_pacing_shift = 10;
70da268b 2882 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2883
2884 sk_rx_queue_clear(sk);
4dc6dc71
ED
2885 /*
2886 * Before updating sk_refcnt, we must commit prior changes to memory
2887 * (Documentation/RCU/rculist_nulls.txt for details)
2888 */
2889 smp_wmb();
41c6d650 2890 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2891 atomic_set(&sk->sk_drops, 0);
1da177e4 2892}
2a91525c 2893EXPORT_SYMBOL(sock_init_data);
1da177e4 2894
b5606c2d 2895void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2896{
2897 might_sleep();
a5b5bb9a 2898 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2899 if (sk->sk_lock.owned)
1da177e4 2900 __lock_sock(sk);
d2e9117c 2901 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2902 spin_unlock(&sk->sk_lock.slock);
2903 /*
2904 * The sk_lock has mutex_lock() semantics here:
2905 */
fcc70d5f 2906 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2907 local_bh_enable();
1da177e4 2908}
fcc70d5f 2909EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2910
b5606c2d 2911void release_sock(struct sock *sk)
1da177e4 2912{
a5b5bb9a 2913 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2914 if (sk->sk_backlog.tail)
2915 __release_sock(sk);
46d3ceab 2916
c3f9b018
ED
2917 /* Warning : release_cb() might need to release sk ownership,
2918 * ie call sock_release_ownership(sk) before us.
2919 */
46d3ceab
ED
2920 if (sk->sk_prot->release_cb)
2921 sk->sk_prot->release_cb(sk);
2922
c3f9b018 2923 sock_release_ownership(sk);
a5b5bb9a
IM
2924 if (waitqueue_active(&sk->sk_lock.wq))
2925 wake_up(&sk->sk_lock.wq);
2926 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2927}
2928EXPORT_SYMBOL(release_sock);
2929
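/*
 * Illustrative sketch (hypothetical helper): the canonical process-context
 * locking pattern served by lock_sock()/release_sock(). While owned,
 * softirq input is diverted to the backlog, and release_sock() above
 * replays it through __release_sock() before waking other lockers.
 */
static void my_update_sndbuf(struct sock *sk, int val)
{
	lock_sock(sk);		/* may sleep; takes socket ownership */
	sk->sk_sndbuf = max_t(int, val, SOCK_MIN_SNDBUF);
	release_sock(sk);	/* processes the backlog, then wakes waiters */
}
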
8a74ad60
ED
2930/**
2931 * lock_sock_fast - fast version of lock_sock
2932 * @sk: socket
2933 *
 2934 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2935 * return false if fast path is taken:
2936 *
8a74ad60 2937 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2938 *
2939 * return true if slow path is taken:
2940 *
8a74ad60
ED
2941 * sk_lock.slock unlocked, owned = 1, BH enabled
2942 */
2943bool lock_sock_fast(struct sock *sk)
2944{
2945 might_sleep();
2946 spin_lock_bh(&sk->sk_lock.slock);
2947
2948 if (!sk->sk_lock.owned)
2949 /*
2950 * Note : We must disable BH
2951 */
2952 return false;
2953
2954 __lock_sock(sk);
2955 sk->sk_lock.owned = 1;
2956 spin_unlock(&sk->sk_lock.slock);
2957 /*
2958 * The sk_lock has mutex_lock() semantics here:
2959 */
2960 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2961 local_bh_enable();
2962 return true;
2963}
2964EXPORT_SYMBOL(lock_sock_fast);
2965
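/*
 * Illustrative sketch (hypothetical helper): the intended pairing for
 * lock_sock_fast(). The returned bool must be handed to unlock_sock_fast()
 * so the matching unlock (spin_unlock_bh vs release_sock) is chosen.
 */
static int my_read_rcvbuf(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int val = sk->sk_rcvbuf;	/* short, non-blocking critical section */

	unlock_sock_fast(sk, slow);
	return val;
}
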
1da177e4 2966int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2967{
b7aa0bf7 2968 struct timeval tv;
9dae3497
YS
2969
2970 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3a0ed3e9 2971 tv = ktime_to_timeval(sock_read_timestamp(sk));
b7aa0bf7 2972 if (tv.tv_sec == -1)
1da177e4 2973 return -ENOENT;
b7aa0bf7 2974 if (tv.tv_sec == 0) {
3a0ed3e9
DD
2975 ktime_t kt = ktime_get_real();
2976 sock_write_timestamp(sk, kt);
2977 tv = ktime_to_timeval(kt);
b7aa0bf7
ED
2978 }
2979 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2980}
1da177e4
LT
2981EXPORT_SYMBOL(sock_get_timestamp);
2982
ae40eb1e
ED
2983int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2984{
2985 struct timespec ts;
9dae3497
YS
2986
2987 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3a0ed3e9 2988 ts = ktime_to_timespec(sock_read_timestamp(sk));
ae40eb1e
ED
2989 if (ts.tv_sec == -1)
2990 return -ENOENT;
2991 if (ts.tv_sec == 0) {
3a0ed3e9
DD
2992 ktime_t kt = ktime_get_real();
2993 sock_write_timestamp(sk, kt);
ae40eb1e
ED
2994 ts = ktime_to_timespec(sk->sk_stamp);
2995 }
2996 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2997}
2998EXPORT_SYMBOL(sock_get_timestampns);
2999
20d49473 3000void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 3001{
20d49473 3002 if (!sock_flag(sk, flag)) {
08e29af3
ED
3003 unsigned long previous_flags = sk->sk_flags;
3004
20d49473
PO
3005 sock_set_flag(sk, flag);
3006 /*
3007 * we just set one of the two flags which require net
3008 * time stamping, but time stamping might have been on
3009 * already because of the other one
3010 */
080a270f
HFS
3011 if (sock_needs_netstamp(sk) &&
3012 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 3013 net_enable_timestamp();
1da177e4
LT
3014 }
3015}
1da177e4 3016
cb820f8e
RC
3017int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3018 int level, int type)
3019{
3020 struct sock_exterr_skb *serr;
364a9e93 3021 struct sk_buff *skb;
cb820f8e
RC
3022 int copied, err;
3023
3024 err = -EAGAIN;
364a9e93 3025 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
3026 if (skb == NULL)
3027 goto out;
3028
3029 copied = skb->len;
3030 if (copied > len) {
3031 msg->msg_flags |= MSG_TRUNC;
3032 copied = len;
3033 }
51f3d02b 3034 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
3035 if (err)
3036 goto out_free_skb;
3037
3038 sock_recv_timestamp(msg, sk, skb);
3039
3040 serr = SKB_EXT_ERR(skb);
3041 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3042
3043 msg->msg_flags |= MSG_ERRQUEUE;
3044 err = copied;
3045
cb820f8e
RC
3046out_free_skb:
3047 kfree_skb(skb);
3048out:
3049 return err;
3050}
3051EXPORT_SYMBOL(sock_recv_errqueue);
3052
1da177e4
LT
3053/*
 3054 * Get a socket option on a socket.
 3055 *
 3056 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 3057 * asynchronous errors should be reported by getsockopt. We assume
 3058 * this means if you specify SO_ERROR (otherwise what's the point of it).
3059 */
3060int sock_common_getsockopt(struct socket *sock, int level, int optname,
3061 char __user *optval, int __user *optlen)
3062{
3063 struct sock *sk = sock->sk;
3064
3065 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3066}
1da177e4
LT
3067EXPORT_SYMBOL(sock_common_getsockopt);
3068
3fdadf7d 3069#ifdef CONFIG_COMPAT
543d9cfe
ACM
3070int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3071 char __user *optval, int __user *optlen)
3fdadf7d
DM
3072{
3073 struct sock *sk = sock->sk;
3074
1e51f951 3075 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
3076 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3077 optval, optlen);
3fdadf7d
DM
3078 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3079}
3080EXPORT_SYMBOL(compat_sock_common_getsockopt);
3081#endif
3082
1b784140
YX
3083int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3084 int flags)
1da177e4
LT
3085{
3086 struct sock *sk = sock->sk;
3087 int addr_len = 0;
3088 int err;
3089
1b784140 3090 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
3091 flags & ~MSG_DONTWAIT, &addr_len);
3092 if (err >= 0)
3093 msg->msg_namelen = addr_len;
3094 return err;
3095}
1da177e4
LT
3096EXPORT_SYMBOL(sock_common_recvmsg);
3097
3098/*
3099 * Set socket options on an inet socket.
3100 */
3101int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3102 char __user *optval, unsigned int optlen)
1da177e4
LT
3103{
3104 struct sock *sk = sock->sk;
3105
3106 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3107}
1da177e4
LT
3108EXPORT_SYMBOL(sock_common_setsockopt);
3109
3fdadf7d 3110#ifdef CONFIG_COMPAT
543d9cfe 3111int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3112 char __user *optval, unsigned int optlen)
3fdadf7d
DM
3113{
3114 struct sock *sk = sock->sk;
3115
543d9cfe
ACM
3116 if (sk->sk_prot->compat_setsockopt != NULL)
3117 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3118 optval, optlen);
3fdadf7d
DM
3119 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3120}
3121EXPORT_SYMBOL(compat_sock_common_setsockopt);
3122#endif
3123
1da177e4
LT
3124void sk_common_release(struct sock *sk)
3125{
3126 if (sk->sk_prot->destroy)
3127 sk->sk_prot->destroy(sk);
3128
3129 /*
 3130	 * Observation: when sk_common_release() is called, processes have
 3131	 * no access to the socket. But the network stack still does.
3132 * Step one, detach it from networking:
3133 *
3134 * A. Remove from hash tables.
3135 */
3136
3137 sk->sk_prot->unhash(sk);
3138
3139 /*
 3140	 * At this point the socket cannot receive new packets, but it is
 3141	 * possible that some packets are in flight because some CPU is running
 3142	 * the receiver and did the hash table lookup before we unhashed the
 3143	 * socket. They will reach the receive queue and will be purged by the
 3144	 * socket destructor.
 3145	 *
 3146	 * Also we still have packets pending on the receive queue and probably
 3147	 * our own packets waiting in device queues. sock_destroy will drain the
 3148	 * receive queue, but transmitted packets will delay socket destruction
 3149	 * until the last reference is released.
3149 */
3150
3151 sock_orphan(sk);
3152
3153 xfrm_sk_free_policy(sk);
3154
e6848976 3155 sk_refcnt_debug_release(sk);
5640f768 3156
1da177e4
LT
3157 sock_put(sk);
3158}
1da177e4
LT
3159EXPORT_SYMBOL(sk_common_release);
3160
a2d133b1
JH
3161void sk_get_meminfo(const struct sock *sk, u32 *mem)
3162{
3163 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3164
3165 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3166 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3167 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3168 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3169 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3170 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3171 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3172 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3173 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3174}
3175
13ff3d6f
PE
3176#ifdef CONFIG_PROC_FS
3177#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3178struct prot_inuse {
3179 int val[PROTO_INUSE_NR];
3180};
13ff3d6f
PE
3181
3182static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3183
70ee1159
PE
3184void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3185{
08fc7f81 3186 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3187}
3188EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3189
3190int sock_prot_inuse_get(struct net *net, struct proto *prot)
3191{
3192 int cpu, idx = prot->inuse_idx;
3193 int res = 0;
3194
3195 for_each_possible_cpu(cpu)
08fc7f81 3196 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3197
3198 return res >= 0 ? res : 0;
3199}
3200EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3201
648845ab
TZ
3202static void sock_inuse_add(struct net *net, int val)
3203{
3204 this_cpu_add(*net->core.sock_inuse, val);
3205}
3206
3207int sock_inuse_get(struct net *net)
3208{
3209 int cpu, res = 0;
3210
3211 for_each_possible_cpu(cpu)
3212 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3213
3214 return res;
3215}
3216
3217EXPORT_SYMBOL_GPL(sock_inuse_get);
3218
2c8c1e72 3219static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3220{
08fc7f81 3221 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3222 if (net->core.prot_inuse == NULL)
3223 return -ENOMEM;
3224
3225 net->core.sock_inuse = alloc_percpu(int);
3226 if (net->core.sock_inuse == NULL)
3227 goto out;
3228
3229 return 0;
3230
3231out:
3232 free_percpu(net->core.prot_inuse);
3233 return -ENOMEM;
70ee1159
PE
3234}
3235
2c8c1e72 3236static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3237{
08fc7f81 3238 free_percpu(net->core.prot_inuse);
648845ab 3239 free_percpu(net->core.sock_inuse);
70ee1159
PE
3240}
3241
3242static struct pernet_operations net_inuse_ops = {
3243 .init = sock_inuse_init_net,
3244 .exit = sock_inuse_exit_net,
3245};
3246
3247static __init int net_inuse_init(void)
3248{
3249 if (register_pernet_subsys(&net_inuse_ops))
3250 panic("Cannot initialize net inuse counters");
3251
3252 return 0;
3253}
3254
3255core_initcall(net_inuse_init);
13ff3d6f
PE
3256
3257static void assign_proto_idx(struct proto *prot)
3258{
3259 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3260
3261 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3262 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3263 return;
3264 }
3265
3266 set_bit(prot->inuse_idx, proto_inuse_idx);
3267}
3268
3269static void release_proto_idx(struct proto *prot)
3270{
3271 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3272 clear_bit(prot->inuse_idx, proto_inuse_idx);
3273}
3274#else
3275static inline void assign_proto_idx(struct proto *prot)
3276{
3277}
3278
3279static inline void release_proto_idx(struct proto *prot)
3280{
3281}
648845ab
TZ
3282
3283static void sock_inuse_add(struct net *net, int val)
3284{
3285}
13ff3d6f
PE
3286#endif
3287
0159dfd3
ED
3288static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3289{
3290 if (!rsk_prot)
3291 return;
3292 kfree(rsk_prot->slab_name);
3293 rsk_prot->slab_name = NULL;
adf78eda
JL
3294 kmem_cache_destroy(rsk_prot->slab);
3295 rsk_prot->slab = NULL;
0159dfd3
ED
3296}
3297
3298static int req_prot_init(const struct proto *prot)
3299{
3300 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3301
3302 if (!rsk_prot)
3303 return 0;
3304
3305 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3306 prot->name);
3307 if (!rsk_prot->slab_name)
3308 return -ENOMEM;
3309
3310 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3311 rsk_prot->obj_size, 0,
e699e2c6
SB
3312 SLAB_ACCOUNT | prot->slab_flags,
3313 NULL);
0159dfd3
ED
3314
3315 if (!rsk_prot->slab) {
3316 pr_crit("%s: Can't create request sock SLAB cache!\n",
3317 prot->name);
3318 return -ENOMEM;
3319 }
3320 return 0;
3321}
3322
b733c007
PE
3323int proto_register(struct proto *prot, int alloc_slab)
3324{
1da177e4 3325 if (alloc_slab) {
30c2c9f1
DW
3326 prot->slab = kmem_cache_create_usercopy(prot->name,
3327 prot->obj_size, 0,
e699e2c6
SB
3328 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3329 prot->slab_flags,
289a4860 3330 prot->useroffset, prot->usersize,
271b72c7 3331 NULL);
1da177e4
LT
3332
3333 if (prot->slab == NULL) {
e005d193
JP
3334 pr_crit("%s: Can't create sock SLAB cache!\n",
3335 prot->name);
60e7663d 3336 goto out;
1da177e4 3337 }
2e6599cb 3338
0159dfd3
ED
3339 if (req_prot_init(prot))
3340 goto out_free_request_sock_slab;
8feaf0c0 3341
6d6ee43e 3342 if (prot->twsk_prot != NULL) {
faf23422 3343 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3344
7e56b5d6 3345 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3346 goto out_free_request_sock_slab;
3347
6d6ee43e 3348 prot->twsk_prot->twsk_slab =
7e56b5d6 3349 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3350 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3351 0,
e699e2c6 3352 SLAB_ACCOUNT |
52db70dc 3353 prot->slab_flags,
20c2df83 3354 NULL);
6d6ee43e 3355 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3356 goto out_free_timewait_sock_slab_name;
3357 }
1da177e4
LT
3358 }
3359
36b77a52 3360 mutex_lock(&proto_list_mutex);
1da177e4 3361 list_add(&prot->node, &proto_list);
13ff3d6f 3362 assign_proto_idx(prot);
36b77a52 3363 mutex_unlock(&proto_list_mutex);
b733c007
PE
3364 return 0;
3365
8feaf0c0 3366out_free_timewait_sock_slab_name:
7e56b5d6 3367 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3368out_free_request_sock_slab:
0159dfd3
ED
3369 req_prot_cleanup(prot->rsk_prot);
3370
2e6599cb
ACM
3371 kmem_cache_destroy(prot->slab);
3372 prot->slab = NULL;
b733c007
PE
3373out:
3374 return -ENOBUFS;
1da177e4 3375}
1da177e4
LT
3376EXPORT_SYMBOL(proto_register);
3377
3378void proto_unregister(struct proto *prot)
3379{
36b77a52 3380 mutex_lock(&proto_list_mutex);
13ff3d6f 3381 release_proto_idx(prot);
0a3f4358 3382 list_del(&prot->node);
36b77a52 3383 mutex_unlock(&proto_list_mutex);
1da177e4 3384
adf78eda
JL
3385 kmem_cache_destroy(prot->slab);
3386 prot->slab = NULL;
1da177e4 3387
0159dfd3 3388 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3389
6d6ee43e 3390 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3391 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3392 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3393 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3394 }
1da177e4 3395}
1da177e4
LT
3396EXPORT_SYMBOL(proto_unregister);
3397
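/*
 * Illustrative sketch (hypothetical protocol, not in the original file):
 * the minimal shape a protocol passes to proto_register(). With alloc_slab
 * set, sockets of this proto are carved from a dedicated SLAB cache sized
 * by .obj_size, created above in proto_register().
 */
static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* normally sizeof(struct my_sock) */
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);	/* 1 => allocate the slab */
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}
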
bf2ae2e4
XL
3398int sock_load_diag_module(int family, int protocol)
3399{
3400 if (!protocol) {
3401 if (!sock_is_registered(family))
3402 return -ENOENT;
3403
3404 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3405 NETLINK_SOCK_DIAG, family);
3406 }
3407
3408#ifdef CONFIG_INET
3409 if (family == AF_INET &&
c34c1287 3410 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3411 !rcu_access_pointer(inet_protos[protocol]))
3412 return -ENOENT;
3413#endif
3414
3415 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3416 NETLINK_SOCK_DIAG, family, protocol);
3417}
3418EXPORT_SYMBOL(sock_load_diag_module);
3419
1da177e4 3420#ifdef CONFIG_PROC_FS
1da177e4 3421static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3422 __acquires(proto_list_mutex)
1da177e4 3423{
36b77a52 3424 mutex_lock(&proto_list_mutex);
60f0438a 3425 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3426}
3427
3428static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3429{
60f0438a 3430 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3431}
3432
3433static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3434 __releases(proto_list_mutex)
1da177e4 3435{
36b77a52 3436 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3437}
3438
3439static char proto_method_implemented(const void *method)
3440{
3441 return method == NULL ? 'n' : 'y';
3442}
180d8cd9
GC
3443static long sock_prot_memory_allocated(struct proto *proto)
3444{
cb75a36c 3445 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3446}
3447
3448static char *sock_prot_memory_pressure(struct proto *proto)
3449{
3450 return proto->memory_pressure != NULL ?
3451 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3452}
1da177e4
LT
3453
3454static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3455{
180d8cd9 3456
8d987e5c 3457 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3458 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3459 proto->name,
3460 proto->obj_size,
14e943db 3461 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3462 sock_prot_memory_allocated(proto),
3463 sock_prot_memory_pressure(proto),
1da177e4
LT
3464 proto->max_header,
3465 proto->slab == NULL ? "no" : "yes",
3466 module_name(proto->owner),
3467 proto_method_implemented(proto->close),
3468 proto_method_implemented(proto->connect),
3469 proto_method_implemented(proto->disconnect),
3470 proto_method_implemented(proto->accept),
3471 proto_method_implemented(proto->ioctl),
3472 proto_method_implemented(proto->init),
3473 proto_method_implemented(proto->destroy),
3474 proto_method_implemented(proto->shutdown),
3475 proto_method_implemented(proto->setsockopt),
3476 proto_method_implemented(proto->getsockopt),
3477 proto_method_implemented(proto->sendmsg),
3478 proto_method_implemented(proto->recvmsg),
3479 proto_method_implemented(proto->sendpage),
3480 proto_method_implemented(proto->bind),
3481 proto_method_implemented(proto->backlog_rcv),
3482 proto_method_implemented(proto->hash),
3483 proto_method_implemented(proto->unhash),
3484 proto_method_implemented(proto->get_port),
3485 proto_method_implemented(proto->enter_memory_pressure));
3486}
3487
3488static int proto_seq_show(struct seq_file *seq, void *v)
3489{
60f0438a 3490 if (v == &proto_list)
1da177e4
LT
3491 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3492 "protocol",
3493 "size",
3494 "sockets",
3495 "memory",
3496 "press",
3497 "maxhdr",
3498 "slab",
3499 "module",
3500 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3501 else
60f0438a 3502 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3503 return 0;
3504}
3505
f690808e 3506static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3507 .start = proto_seq_start,
3508 .next = proto_seq_next,
3509 .stop = proto_seq_stop,
3510 .show = proto_seq_show,
3511};
3512
14e943db
ED
3513static __net_init int proto_init_net(struct net *net)
3514{
c3506372
CH
3515 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3516 sizeof(struct seq_net_private)))
14e943db
ED
3517 return -ENOMEM;
3518
3519 return 0;
3520}
3521
3522static __net_exit void proto_exit_net(struct net *net)
3523{
ece31ffd 3524 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3525}
3526
3527
3528static __net_initdata struct pernet_operations proto_net_ops = {
3529 .init = proto_init_net,
3530 .exit = proto_exit_net,
1da177e4
LT
3531};
3532
3533static int __init proto_init(void)
3534{
14e943db 3535 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3536}
3537
3538subsys_initcall(proto_init);
3539
3540#endif /* PROC_FS */
7db6b048
SS
3541
3542#ifdef CONFIG_NET_RX_BUSY_POLL
3543bool sk_busy_loop_end(void *p, unsigned long start_time)
3544{
3545 struct sock *sk = p;
3546
3547 return !skb_queue_empty(&sk->sk_receive_queue) ||
3548 sk_busy_loop_timeout(sk, start_time);
3549}
3550EXPORT_SYMBOL(sk_busy_loop_end);
3551#endif /* CONFIG_NET_RX_BUSY_POLL */