1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
02c30a84 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
4ec93edb 35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
80b14dee 94#include <asm/unaligned.h>
4fc268d2 95#include <linux/capability.h>
1da177e4 96#include <linux/errno.h>
cb820f8e 97#include <linux/errqueue.h>
98#include <linux/types.h>
99#include <linux/socket.h>
100#include <linux/in.h>
101#include <linux/kernel.h>
102#include <linux/module.h>
103#include <linux/proc_fs.h>
104#include <linux/seq_file.h>
105#include <linux/sched.h>
f1083048 106#include <linux/sched/mm.h>
107#include <linux/timer.h>
108#include <linux/string.h>
109#include <linux/sockios.h>
110#include <linux/net.h>
111#include <linux/mm.h>
112#include <linux/slab.h>
113#include <linux/interrupt.h>
114#include <linux/poll.h>
115#include <linux/tcp.h>
116#include <linux/init.h>
a1f8e7f7 117#include <linux/highmem.h>
3f551f94 118#include <linux/user_namespace.h>
c5905afb 119#include <linux/static_key.h>
3969eb38 120#include <linux/memcontrol.h>
8c1ae10d 121#include <linux/prefetch.h>
1da177e4 122
7c0f6ba6 123#include <linux/uaccess.h>
124
125#include <linux/netdevice.h>
126#include <net/protocol.h>
127#include <linux/skbuff.h>
457c4cbc 128#include <net/net_namespace.h>
2e6599cb 129#include <net/request_sock.h>
1da177e4 130#include <net/sock.h>
20d49473 131#include <linux/net_tstamp.h>
132#include <net/xfrm.h>
133#include <linux/ipsec.h>
f8451725 134#include <net/cls_cgroup.h>
5bc1421e 135#include <net/netprio_cgroup.h>
eb4cb008 136#include <linux/sock_diag.h>
137
138#include <linux/filter.h>
538950a1 139#include <net/sock_reuseport.h>
1da177e4 140
141#include <trace/events/sock.h>
142
1da177e4 143#include <net/tcp.h>
076bb0c8 144#include <net/busy_poll.h>
06021292 145
36b77a52 146static DEFINE_MUTEX(proto_list_mutex);
147static LIST_HEAD(proto_list);
148
149static void sock_inuse_add(struct net *net, int val);
150
151/**
152 * sk_ns_capable - General socket capability test
153 * @sk: Socket to use a capability on or through
154 * @user_ns: The user namespace of the capability to use
155 * @cap: The capability to use
156 *
 157 * Test to see if the opener of the socket had the capability @cap when the
 158 * socket was created and that the current process has it in the user
 159 * namespace @user_ns.
160 */
161bool sk_ns_capable(const struct sock *sk,
162 struct user_namespace *user_ns, int cap)
163{
164 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 ns_capable(user_ns, cap);
166}
167EXPORT_SYMBOL(sk_ns_capable);
168
169/**
170 * sk_capable - Socket global capability test
171 * @sk: Socket to use a capability on or through
e793c0f7 172 * @cap: The global capability to use
173 *
 174 * Test to see if the opener of the socket had the capability @cap when the
 175 * socket was created and that the current process has it in all user
 176 * namespaces.
177 */
178bool sk_capable(const struct sock *sk, int cap)
179{
180 return sk_ns_capable(sk, &init_user_ns, cap);
181}
182EXPORT_SYMBOL(sk_capable);
183
184/**
185 * sk_net_capable - Network namespace socket capability test
186 * @sk: Socket to use a capability on or through
187 * @cap: The capability to use
188 *
 189 * Test to see if the opener of the socket had the capability @cap when the
 190 * socket was created and that the current process has it over the network
 191 * namespace the socket is a member of.
192 */
193bool sk_net_capable(const struct sock *sk, int cap)
194{
195 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196}
197EXPORT_SYMBOL(sk_net_capable);
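/*
 * Illustrative sketch, not part of sock.c: a hypothetical protocol-level
 * setsockopt handler could gate a privileged option on sk_net_capable(),
 * requiring both that the socket's opener had CAP_NET_ADMIN when the socket
 * was created and that the current caller has it over the socket's network
 * namespace. The function and option below are made up for illustration.
 */
#if 0
static int example_set_privileged_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;
	/* ... apply the privileged setting ... */
	return 0;
}
#endif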
198
199/*
200 * Each address family might have different locking rules, so we have
201 * one slock key per address family and separate keys for internal and
202 * userspace sockets.
da21f24d 203 */
a5b5bb9a 204static struct lock_class_key af_family_keys[AF_MAX];
cdfbabfb 205static struct lock_class_key af_family_kern_keys[AF_MAX];
a5b5bb9a 206static struct lock_class_key af_family_slock_keys[AF_MAX];
cdfbabfb 207static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
a5b5bb9a 208
209/*
210 * Make lock validator output more readable. (we pre-construct these
211 * strings build-time, so that runtime initialization of socket
212 * locks is fast):
213 */
214
215#define _sock_locks(x) \
216 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
217 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
218 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
219 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
220 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
221 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
222 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
223 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
224 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
225 x "27" , x "28" , x "AF_CAN" , \
226 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
227 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
228 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
229 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
230 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
231 x "AF_MAX"
cdfbabfb 232
36cbd3dc 233static const char *const af_family_key_strings[AF_MAX+1] = {
cdfbabfb 234 _sock_locks("sk_lock-")
a5b5bb9a 235};
36cbd3dc 236static const char *const af_family_slock_key_strings[AF_MAX+1] = {
cdfbabfb 237 _sock_locks("slock-")
a5b5bb9a 238};
36cbd3dc 239static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 _sock_locks("clock-")
241};
242
243static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 _sock_locks("k-sk_lock-")
245};
246static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-slock-")
248};
249static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 _sock_locks("k-clock-")
443aef0e 251};
581319c5 252static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
6b431d50 253 _sock_locks("rlock-")
254};
255static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
6b431d50 256 _sock_locks("wlock-")
257};
258static const char *const af_family_elock_key_strings[AF_MAX+1] = {
6b431d50 259 _sock_locks("elock-")
581319c5 260};
261
262/*
581319c5 263 * sk_callback_lock and sk queues locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
265 */
266static struct lock_class_key af_callback_keys[AF_MAX];
267static struct lock_class_key af_rlock_keys[AF_MAX];
268static struct lock_class_key af_wlock_keys[AF_MAX];
269static struct lock_class_key af_elock_keys[AF_MAX];
cdfbabfb 270static struct lock_class_key af_kern_callback_keys[AF_MAX];
da21f24d 271
1da177e4 272/* Run time adjustable parameters. */
ab32ea5d 273__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
6d8ebc8a 274EXPORT_SYMBOL(sysctl_wmem_max);
ab32ea5d 275__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
6d8ebc8a 276EXPORT_SYMBOL(sysctl_rmem_max);
277__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
1da177e4 279
25985edc 280/* Maximal space eaten by iovec or ancillary data plus some space */
ab32ea5d 281int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
2a91525c 282EXPORT_SYMBOL(sysctl_optmem_max);
1da177e4 283
284int sysctl_tstamp_allow_data __read_mostly = 1;
285
286DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287EXPORT_SYMBOL_GPL(memalloc_socks_key);
c93bdd0e 288
289/**
290 * sk_set_memalloc - sets %SOCK_MEMALLOC
291 * @sk: socket to set it on
292 *
293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294 * It's the responsibility of the admin to adjust min_free_kbytes
295 * to meet the requirements
296 */
297void sk_set_memalloc(struct sock *sk)
298{
299 sock_set_flag(sk, SOCK_MEMALLOC);
300 sk->sk_allocation |= __GFP_MEMALLOC;
a7950ae8 301 static_branch_inc(&memalloc_socks_key);
302}
303EXPORT_SYMBOL_GPL(sk_set_memalloc);
304
305void sk_clear_memalloc(struct sock *sk)
306{
307 sock_reset_flag(sk, SOCK_MEMALLOC);
308 sk->sk_allocation &= ~__GFP_MEMALLOC;
a7950ae8 309 static_branch_dec(&memalloc_socks_key);
310
311 /*
312 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 * it has rmem allocations due to the last swapfile being deactivated
315 * but there is a risk that the socket is unusable due to exceeding
316 * the rmem limits. Reclaim the reserves and obey rmem limits again.
c76562b6 317 */
5d753610 318 sk_mem_reclaim(sk);
319}
320EXPORT_SYMBOL_GPL(sk_clear_memalloc);
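/*
 * Illustrative sketch, not part of sock.c: a kernel user that backs swap
 * with a network transport (swap-over-NBD/NFS style setups) would mark its
 * transport socket with sk_set_memalloc() so writeback can dip into the
 * emergency reserves under memory pressure, and clear the flag again when
 * the last such swapfile is deactivated. The helper below is hypothetical.
 */
#if 0
static void example_mark_swap_transport(struct socket *transport, bool enable)
{
	if (enable)
		sk_set_memalloc(transport->sk);
	else
		sk_clear_memalloc(transport->sk);
}
#endif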
321
322int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
323{
324 int ret;
f1083048 325 unsigned int noreclaim_flag;
326
327 /* these should have been dropped before queueing */
328 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
329
f1083048 330 noreclaim_flag = memalloc_noreclaim_save();
b4b9e355 331 ret = sk->sk_backlog_rcv(sk, skb);
f1083048 332 memalloc_noreclaim_restore(noreclaim_flag);
333
334 return ret;
335}
336EXPORT_SYMBOL(__sk_backlog_rcv);
337
a9beb86a 338static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
fe0c72f3 339{
340 struct __kernel_sock_timeval tv;
341 int size;
342
343 if (timeo == MAX_SCHEDULE_TIMEOUT) {
344 tv.tv_sec = 0;
345 tv.tv_usec = 0;
346 } else {
347 tv.tv_sec = timeo / HZ;
348 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
349 }
350
351 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
352 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
353 *(struct old_timeval32 *)optval = tv32;
354 return sizeof(tv32);
355 }
356
357 if (old_timeval) {
358 struct __kernel_old_timeval old_tv;
359 old_tv.tv_sec = tv.tv_sec;
360 old_tv.tv_usec = tv.tv_usec;
361 *(struct __kernel_old_timeval *)optval = old_tv;
362 size = sizeof(old_tv);
363 } else {
364 *(struct __kernel_sock_timeval *)optval = tv;
365 size = sizeof(tv);
366 }
367
368 return size;
369}
370
a9beb86a 371static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
1da177e4 372{
a9beb86a 373 struct __kernel_sock_timeval tv;
1da177e4 374
375 if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
376 struct old_timeval32 tv32;
377
378 if (optlen < sizeof(tv32))
379 return -EINVAL;
380
381 if (copy_from_user(&tv32, optval, sizeof(tv32)))
382 return -EFAULT;
383 tv.tv_sec = tv32.tv_sec;
384 tv.tv_usec = tv32.tv_usec;
385 } else if (old_timeval) {
386 struct __kernel_old_timeval old_tv;
387
388 if (optlen < sizeof(old_tv))
389 return -EINVAL;
390 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
391 return -EFAULT;
392 tv.tv_sec = old_tv.tv_sec;
393 tv.tv_usec = old_tv.tv_usec;
394 } else {
395 if (optlen < sizeof(tv))
396 return -EINVAL;
397 if (copy_from_user(&tv, optval, sizeof(tv)))
398 return -EFAULT;
399 }
400 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
401 return -EDOM;
1da177e4 402
ba78073e 403 if (tv.tv_sec < 0) {
404 static int warned __read_mostly;
405
ba78073e 406 *timeo_p = 0;
50aab54f 407 if (warned < 10 && net_ratelimit()) {
ba78073e 408 warned++;
409 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
410 __func__, current->comm, task_pid_nr(current));
50aab54f 411 }
412 return 0;
413 }
414 *timeo_p = MAX_SCHEDULE_TIMEOUT;
415 if (tv.tv_sec == 0 && tv.tv_usec == 0)
416 return 0;
417 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
418 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
419 return 0;
420}
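/*
 * Illustrative userspace sketch, not part of sock.c: sock_set_timeout() is
 * what ultimately services SO_RCVTIMEO/SO_SNDTIMEO. A timeout of {0, 0}
 * means "block forever" (MAX_SCHEDULE_TIMEOUT), a negative tv_sec is clamped
 * to a zero timeout with the rate-limited warning above, and anything else
 * is converted to jiffies.
 */
#if 0
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };	/* 2.5s */

	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
		perror("setsockopt(SO_RCVTIMEO)");
#endif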
421
422static void sock_warn_obsolete_bsdism(const char *name)
423{
424 static int warned;
425 static char warncomm[TASK_COMM_LEN];
426 if (strcmp(warncomm, current->comm) && warned < 5) {
427 strcpy(warncomm, current->comm);
428 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
429 warncomm, name);
430 warned++;
431 }
432}
433
434static bool sock_needs_netstamp(const struct sock *sk)
435{
436 switch (sk->sk_family) {
437 case AF_UNSPEC:
438 case AF_UNIX:
439 return false;
440 default:
441 return true;
442 }
443}
444
08e29af3 445static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
4ec93edb 446{
447 if (sk->sk_flags & flags) {
448 sk->sk_flags &= ~flags;
449 if (sock_needs_netstamp(sk) &&
450 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
20d49473 451 net_disable_timestamp();
452 }
453}
454
455
e6afc8ac 456int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
f0088a50 457{
458 unsigned long flags;
459 struct sk_buff_head *list = &sk->sk_receive_queue;
f0088a50 460
0fd7bac6 461 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
766e9037 462 atomic_inc(&sk->sk_drops);
3847ce32 463 trace_sock_rcvqueue_full(sk, skb);
766e9037 464 return -ENOMEM;
465 }
466
c76562b6 467 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
468 atomic_inc(&sk->sk_drops);
469 return -ENOBUFS;
470 }
471
472 skb->dev = NULL;
473 skb_set_owner_r(skb, sk);
49ad9599 474
 475 /* we escape from the rcu protected region, make sure we don't leak
 476 * a non-refcounted dst
477 */
478 skb_dst_force(skb);
479
3b885787 480 spin_lock_irqsave(&list->lock, flags);
3bc3b96f 481 sock_skb_set_dropcount(sk, skb);
482 __skb_queue_tail(list, skb);
483 spin_unlock_irqrestore(&list->lock, flags);
484
485 if (!sock_flag(sk, SOCK_DEAD))
676d2369 486 sk->sk_data_ready(sk);
766e9037 487 return 0;
f0088a50 488}
e6afc8ac 489EXPORT_SYMBOL(__sock_queue_rcv_skb);
490
491int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
492{
493 int err;
494
495 err = sk_filter(sk, skb);
496 if (err)
497 return err;
498
499 return __sock_queue_rcv_skb(sk, skb);
500}
501EXPORT_SYMBOL(sock_queue_rcv_skb);
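/*
 * Illustrative sketch, not part of sock.c: a simple datagram protocol's
 * receive path typically ends by handing the skb to the socket layer via
 * sock_queue_rcv_skb(); on failure (socket filter rejected the packet or
 * the receive buffer is full) the caller drops the skb itself. The function
 * below is hypothetical.
 */
#if 0
static int example_proto_deliver(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0) {
		kfree_skb(skb);
		return err;
	}
	return 0;
}
#endif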
502
4f0c40d9 503int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
c3f24cfb 504 const int nested, unsigned int trim_cap, bool refcounted)
505{
506 int rc = NET_RX_SUCCESS;
507
4f0c40d9 508 if (sk_filter_trim_cap(sk, skb, trim_cap))
509 goto discard_and_relse;
510
511 skb->dev = NULL;
512
274f482d 513 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
514 atomic_inc(&sk->sk_drops);
515 goto discard_and_relse;
516 }
517 if (nested)
518 bh_lock_sock_nested(sk);
519 else
520 bh_lock_sock(sk);
521 if (!sock_owned_by_user(sk)) {
522 /*
523 * trylock + unlock semantics:
524 */
525 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
526
c57943a1 527 rc = sk_backlog_rcv(sk, skb);
528
529 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
f545a38f 530 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
531 bh_unlock_sock(sk);
532 atomic_inc(&sk->sk_drops);
533 goto discard_and_relse;
534 }
535
536 bh_unlock_sock(sk);
537out:
538 if (refcounted)
539 sock_put(sk);
540 return rc;
541discard_and_relse:
542 kfree_skb(skb);
543 goto out;
544}
4f0c40d9 545EXPORT_SYMBOL(__sk_receive_skb);
546
547struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
548{
b6c6712a 549 struct dst_entry *dst = __sk_dst_get(sk);
550
551 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
e022f0b4 552 sk_tx_queue_clear(sk);
9b8805a3 553 sk->sk_dst_pending_confirm = 0;
a9b3cd7f 554 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
555 dst_release(dst);
556 return NULL;
557 }
558
559 return dst;
560}
561EXPORT_SYMBOL(__sk_dst_check);
562
563struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
564{
565 struct dst_entry *dst = sk_dst_get(sk);
566
567 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
568 sk_dst_reset(sk);
569 dst_release(dst);
570 return NULL;
571 }
572
573 return dst;
574}
575EXPORT_SYMBOL(sk_dst_check);
576
f5dd3d0c 577static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
578{
579 int ret = -ENOPROTOOPT;
580#ifdef CONFIG_NETDEVICES
3b1e0a65 581 struct net *net = sock_net(sk);
582
583 /* Sorry... */
584 ret = -EPERM;
5e1fccc0 585 if (!ns_capable(net->user_ns, CAP_NET_RAW))
586 goto out;
587
588 ret = -EINVAL;
589 if (ifindex < 0)
590 goto out;
591
592 sk->sk_bound_dev_if = ifindex;
593 if (sk->sk_prot->rehash)
594 sk->sk_prot->rehash(sk);
595 sk_dst_reset(sk);
596
597 ret = 0;
598
599out:
600#endif
601
602 return ret;
603}
604
605static int sock_setbindtodevice(struct sock *sk, char __user *optval,
606 int optlen)
607{
608 int ret = -ENOPROTOOPT;
609#ifdef CONFIG_NETDEVICES
610 struct net *net = sock_net(sk);
611 char devname[IFNAMSIZ];
612 int index;
613
614 ret = -EINVAL;
615 if (optlen < 0)
616 goto out;
617
618 /* Bind this socket to a particular device like "eth0",
619 * as specified in the passed interface name. If the
620 * name is "" or the option length is zero the socket
621 * is not bound.
622 */
623 if (optlen > IFNAMSIZ - 1)
624 optlen = IFNAMSIZ - 1;
625 memset(devname, 0, sizeof(devname));
626
627 ret = -EFAULT;
628 if (copy_from_user(devname, optval, optlen))
629 goto out;
630
000ba2e4
DM
631 index = 0;
632 if (devname[0] != '\0') {
bf8e56bf 633 struct net_device *dev;
4878809f 634
635 rcu_read_lock();
636 dev = dev_get_by_name_rcu(net, devname);
637 if (dev)
638 index = dev->ifindex;
639 rcu_read_unlock();
4878809f
DM
640 ret = -ENODEV;
641 if (!dev)
642 goto out;
4878809f
DM
643 }
644
645 lock_sock(sk);
f5dd3d0c 646 ret = sock_setbindtodevice_locked(sk, index);
647 release_sock(sk);
648
4878809f
DM
649out:
650#endif
651
652 return ret;
653}
654
655static int sock_getbindtodevice(struct sock *sk, char __user *optval,
656 int __user *optlen, int len)
657{
658 int ret = -ENOPROTOOPT;
659#ifdef CONFIG_NETDEVICES
660 struct net *net = sock_net(sk);
c91f6df2 661 char devname[IFNAMSIZ];
c91f6df2
BH
662
663 if (sk->sk_bound_dev_if == 0) {
664 len = 0;
665 goto zero;
666 }
667
668 ret = -EINVAL;
669 if (len < IFNAMSIZ)
670 goto out;
671
672 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
673 if (ret)
c91f6df2 674 goto out;
c91f6df2
BH
675
676 len = strlen(devname) + 1;
677
678 ret = -EFAULT;
679 if (copy_to_user(optval, devname, len))
680 goto out;
681
682zero:
683 ret = -EFAULT;
684 if (put_user(len, optlen))
685 goto out;
686
687 ret = 0;
688
689out:
690#endif
691
692 return ret;
693}
694
695static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
696{
697 if (valbool)
698 sock_set_flag(sk, bit);
699 else
700 sock_reset_flag(sk, bit);
701}
702
f60e5990 703bool sk_mc_loop(struct sock *sk)
704{
705 if (dev_recursion_level())
706 return false;
707 if (!sk)
708 return true;
709 switch (sk->sk_family) {
710 case AF_INET:
711 return inet_sk(sk)->mc_loop;
712#if IS_ENABLED(CONFIG_IPV6)
713 case AF_INET6:
714 return inet6_sk(sk)->mc_loop;
715#endif
716 }
717 WARN_ON(1);
718 return true;
719}
720EXPORT_SYMBOL(sk_mc_loop);
721
1da177e4
LT
722/*
723 * This is meant for all protocols to use and covers goings on
724 * at the socket level. Everything here is generic.
725 */
726
727int sock_setsockopt(struct socket *sock, int level, int optname,
b7058842 728 char __user *optval, unsigned int optlen)
1da177e4 729{
80b14dee 730 struct sock_txtime sk_txtime;
2a91525c 731 struct sock *sk = sock->sk;
732 int val;
733 int valbool;
734 struct linger ling;
735 int ret = 0;
4ec93edb 736
737 /*
738 * Options without arguments
739 */
740
4878809f 741 if (optname == SO_BINDTODEVICE)
c91f6df2 742 return sock_setbindtodevice(sk, optval, optlen);
4878809f 743
744 if (optlen < sizeof(int))
745 return -EINVAL;
4ec93edb 746
1da177e4
LT
747 if (get_user(val, (int __user *)optval))
748 return -EFAULT;
4ec93edb 749
2a91525c 750 valbool = val ? 1 : 0;
1da177e4
LT
751
752 lock_sock(sk);
753
2a91525c 754 switch (optname) {
e71a4783 755 case SO_DEBUG:
2a91525c 756 if (val && !capable(CAP_NET_ADMIN))
e71a4783 757 ret = -EACCES;
2a91525c 758 else
c0ef877b 759 sock_valbool_flag(sk, SOCK_DBG, valbool);
760 break;
761 case SO_REUSEADDR:
cdb8744d 762 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
e71a4783 763 break;
764 case SO_REUSEPORT:
765 sk->sk_reuseport = valbool;
766 break;
e71a4783 767 case SO_TYPE:
49c794e9 768 case SO_PROTOCOL:
0d6038ee 769 case SO_DOMAIN:
e71a4783
SH
770 case SO_ERROR:
771 ret = -ENOPROTOOPT;
772 break;
773 case SO_DONTROUTE:
c0ef877b 774 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
0fbe82e6 775 sk_dst_reset(sk);
776 break;
777 case SO_BROADCAST:
778 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
779 break;
780 case SO_SNDBUF:
 781 /* Don't return an error here; BSD doesn't, and if you think
 782 * about it, this is right. Otherwise apps have to
 783 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 784 * are treated in BSD as hints
785 */
786 val = min_t(u32, val, sysctl_wmem_max);
b0573dea 787set_sndbuf:
e71a4783 788 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
b98b0bc8 789 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
82981930 790 /* Wake up sending tasks if we upped the value. */
e71a4783
SH
791 sk->sk_write_space(sk);
792 break;
1da177e4 793
e71a4783
SH
794 case SO_SNDBUFFORCE:
795 if (!capable(CAP_NET_ADMIN)) {
796 ret = -EPERM;
797 break;
798 }
799 goto set_sndbuf;
b0573dea 800
e71a4783
SH
801 case SO_RCVBUF:
 802 /* Don't return an error here; BSD doesn't, and if you think
 803 * about it, this is right. Otherwise apps have to
 804 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 805 * are treated in BSD as hints
806 */
807 val = min_t(u32, val, sysctl_rmem_max);
b0573dea 808set_rcvbuf:
809 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
810 /*
811 * We double it on the way in to account for
812 * "struct sk_buff" etc. overhead. Applications
813 * assume that the SO_RCVBUF setting they make will
814 * allow that much actual data to be received on that
815 * socket.
816 *
817 * Applications are unaware that "struct sk_buff" and
818 * other overheads allocate from the receive buffer
819 * during socket buffer allocation.
820 *
821 * And after considering the possible alternatives,
822 * returning the value we actually used in getsockopt
823 * is the most desirable behavior.
824 */
b98b0bc8 825 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
e71a4783
SH
826 break;
827
828 case SO_RCVBUFFORCE:
829 if (!capable(CAP_NET_ADMIN)) {
830 ret = -EPERM;
1da177e4 831 break;
832 }
833 goto set_rcvbuf;
1da177e4 834
e71a4783 835 case SO_KEEPALIVE:
4b9d07a4
UB
836 if (sk->sk_prot->keepalive)
837 sk->sk_prot->keepalive(sk, valbool);
e71a4783
SH
838 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
839 break;
840
841 case SO_OOBINLINE:
842 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
843 break;
844
845 case SO_NO_CHECK:
28448b80 846 sk->sk_no_check_tx = valbool;
e71a4783
SH
847 break;
848
849 case SO_PRIORITY:
5e1fccc0
EB
850 if ((val >= 0 && val <= 6) ||
851 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
852 sk->sk_priority = val;
853 else
854 ret = -EPERM;
855 break;
856
857 case SO_LINGER:
858 if (optlen < sizeof(ling)) {
859 ret = -EINVAL; /* 1003.1g */
1da177e4 860 break;
e71a4783 861 }
2a91525c 862 if (copy_from_user(&ling, optval, sizeof(ling))) {
e71a4783 863 ret = -EFAULT;
1da177e4 864 break;
e71a4783
SH
865 }
866 if (!ling.l_onoff)
867 sock_reset_flag(sk, SOCK_LINGER);
868 else {
1da177e4 869#if (BITS_PER_LONG == 32)
e71a4783
SH
870 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
871 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1da177e4 872 else
e71a4783
SH
873#endif
874 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
875 sock_set_flag(sk, SOCK_LINGER);
876 }
877 break;
878
879 case SO_BSDCOMPAT:
880 sock_warn_obsolete_bsdism("setsockopt");
881 break;
882
883 case SO_PASSCRED:
884 if (valbool)
885 set_bit(SOCK_PASSCRED, &sock->flags);
886 else
887 clear_bit(SOCK_PASSCRED, &sock->flags);
888 break;
889
7f1bc6e9 890 case SO_TIMESTAMP_OLD:
887feae3 891 case SO_TIMESTAMP_NEW:
7f1bc6e9 892 case SO_TIMESTAMPNS_OLD:
887feae3 893 case SO_TIMESTAMPNS_NEW:
e71a4783 894 if (valbool) {
887feae3
DD
895 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
896 sock_set_flag(sk, SOCK_TSTAMP_NEW);
897 else
898 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
899
900 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
901 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
902 else
903 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
e71a4783 904 sock_set_flag(sk, SOCK_RCVTSTAMP);
20d49473 905 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
92f37fd2 906 } else {
e71a4783 907 sock_reset_flag(sk, SOCK_RCVTSTAMP);
92f37fd2 908 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
887feae3 909 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
92f37fd2 910 }
e71a4783
SH
911 break;
912
9718475e
DD
913 case SO_TIMESTAMPING_NEW:
914 sock_set_flag(sk, SOCK_TSTAMP_NEW);
ff7653f9 915 /* fall through */
7f1bc6e9 916 case SO_TIMESTAMPING_OLD:
20d49473 917 if (val & ~SOF_TIMESTAMPING_MASK) {
f249fb78 918 ret = -EINVAL;
919 break;
920 }
b245be1f 921
09c2d251 922 if (val & SOF_TIMESTAMPING_OPT_ID &&
4ed2d765 923 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
ac5cc977
WC
924 if (sk->sk_protocol == IPPROTO_TCP &&
925 sk->sk_type == SOCK_STREAM) {
6db8b963
SHY
926 if ((1 << sk->sk_state) &
927 (TCPF_CLOSE | TCPF_LISTEN)) {
928 ret = -EINVAL;
929 break;
930 }
931 sk->sk_tskey = tcp_sk(sk)->snd_una;
932 } else {
933 sk->sk_tskey = 0;
934 }
935 }
1c885808
FY
936
937 if (val & SOF_TIMESTAMPING_OPT_STATS &&
938 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
939 ret = -EINVAL;
940 break;
941 }
942
b9f40e21 943 sk->sk_tsflags = val;
20d49473
PO
944 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
945 sock_enable_timestamp(sk,
946 SOCK_TIMESTAMPING_RX_SOFTWARE);
9718475e
DD
947 else {
948 if (optname == SO_TIMESTAMPING_NEW)
949 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
950
20d49473 951 sock_disable_timestamp(sk,
08e29af3 952 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
9718475e 953 }
20d49473
PO
954 break;
955
e71a4783
SH
956 case SO_RCVLOWAT:
957 if (val < 0)
958 val = INT_MAX;
d1361840
ED
959 if (sock->ops->set_rcvlowat)
960 ret = sock->ops->set_rcvlowat(sk, val);
961 else
962 sk->sk_rcvlowat = val ? : 1;
963 break;
964
45bdc661 965 case SO_RCVTIMEO_OLD:
a9beb86a
DD
966 case SO_RCVTIMEO_NEW:
967 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
e71a4783
SH
968 break;
969
45bdc661 970 case SO_SNDTIMEO_OLD:
a9beb86a
DD
971 case SO_SNDTIMEO_NEW:
972 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
e71a4783 973 break;
1da177e4 974
975 case SO_ATTACH_FILTER:
976 ret = -EINVAL;
977 if (optlen == sizeof(struct sock_fprog)) {
978 struct sock_fprog fprog;
1da177e4 979
e71a4783
SH
980 ret = -EFAULT;
981 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1da177e4 982 break;
e71a4783
SH
983
984 ret = sk_attach_filter(&fprog, sk);
985 }
986 break;
987
89aa0758
AS
988 case SO_ATTACH_BPF:
989 ret = -EINVAL;
990 if (optlen == sizeof(u32)) {
991 u32 ufd;
992
993 ret = -EFAULT;
994 if (copy_from_user(&ufd, optval, sizeof(ufd)))
995 break;
996
997 ret = sk_attach_bpf(ufd, sk);
998 }
999 break;
1000
538950a1
CG
1001 case SO_ATTACH_REUSEPORT_CBPF:
1002 ret = -EINVAL;
1003 if (optlen == sizeof(struct sock_fprog)) {
1004 struct sock_fprog fprog;
1005
1006 ret = -EFAULT;
1007 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1008 break;
1009
1010 ret = sk_reuseport_attach_filter(&fprog, sk);
1011 }
1012 break;
1013
1014 case SO_ATTACH_REUSEPORT_EBPF:
1015 ret = -EINVAL;
1016 if (optlen == sizeof(u32)) {
1017 u32 ufd;
1018
1019 ret = -EFAULT;
1020 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1021 break;
1022
1023 ret = sk_reuseport_attach_bpf(ufd, sk);
1024 }
1025 break;
1026
e71a4783 1027 case SO_DETACH_FILTER:
55b33325 1028 ret = sk_detach_filter(sk);
e71a4783 1029 break;
1da177e4 1030
d59577b6
VB
1031 case SO_LOCK_FILTER:
1032 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1033 ret = -EPERM;
1034 else
1035 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1036 break;
1037
1038 case SO_PASSSEC:
1039 if (valbool)
1040 set_bit(SOCK_PASSSEC, &sock->flags);
1041 else
1042 clear_bit(SOCK_PASSSEC, &sock->flags);
1043 break;
4a19ec58 1044 case SO_MARK:
50254256 1045 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
4a19ec58 1046 ret = -EPERM;
50254256 1047 } else if (val != sk->sk_mark) {
4a19ec58 1048 sk->sk_mark = val;
50254256
DB
1049 sk_dst_reset(sk);
1050 }
4a19ec58 1051 break;
877ce7c1 1052
3b885787 1053 case SO_RXQ_OVFL:
8083f0fc 1054 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
3b885787 1055 break;
6e3e939f
JB
1056
1057 case SO_WIFI_STATUS:
1058 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1059 break;
1060
ef64a54f
PE
1061 case SO_PEEK_OFF:
1062 if (sock->ops->set_peek_off)
12663bfc 1063 ret = sock->ops->set_peek_off(sk, val);
ef64a54f
PE
1064 else
1065 ret = -EOPNOTSUPP;
1066 break;
3bdc0eba
BG
1067
1068 case SO_NOFCS:
1069 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1070 break;
1071
7d4c04fc
KJ
1072 case SO_SELECT_ERR_QUEUE:
1073 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1074 break;
1075
e0d1095a 1076#ifdef CONFIG_NET_RX_BUSY_POLL
64b0dc51 1077 case SO_BUSY_POLL:
dafcc438
ET
1078 /* allow unprivileged users to decrease the value */
1079 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1080 ret = -EPERM;
1081 else {
1082 if (val < 0)
1083 ret = -EINVAL;
1084 else
1085 sk->sk_ll_usec = val;
1086 }
1087 break;
1088#endif
1089
1090 case SO_MAX_PACING_RATE:
218af599
ED
1091 if (val != ~0U)
1092 cmpxchg(&sk->sk_pacing_status,
1093 SK_PACING_NONE,
1094 SK_PACING_NEEDED);
76a9ebe8 1095 sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
62748f32
ED
1096 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1097 sk->sk_max_pacing_rate);
1098 break;
1099
70da268b
ED
1100 case SO_INCOMING_CPU:
1101 sk->sk_incoming_cpu = val;
1102 break;
1103
a87cb3e4
TH
1104 case SO_CNX_ADVICE:
1105 if (val == 1)
1106 dst_negative_advice(sk);
1107 break;
76851d12
WB
1108
1109 case SO_ZEROCOPY:
28190752 1110 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1111 if (!((sk->sk_type == SOCK_STREAM &&
1112 sk->sk_protocol == IPPROTO_TCP) ||
1113 (sk->sk_type == SOCK_DGRAM &&
1114 sk->sk_protocol == IPPROTO_UDP)))
28190752 1115 ret = -ENOTSUPP;
28190752 1116 } else if (sk->sk_family != PF_RDS) {
76851d12 1117 ret = -ENOTSUPP;
28190752
SV
1118 }
1119 if (!ret) {
1120 if (val < 0 || val > 1)
1121 ret = -EINVAL;
1122 else
1123 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
28190752 1124 }
334e6413
JSP
1125 break;
1126
80b14dee
RC
1127 case SO_TXTIME:
1128 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1129 ret = -EPERM;
1130 } else if (optlen != sizeof(struct sock_txtime)) {
1131 ret = -EINVAL;
1132 } else if (copy_from_user(&sk_txtime, optval,
1133 sizeof(struct sock_txtime))) {
1134 ret = -EFAULT;
1135 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1136 ret = -EINVAL;
1137 } else {
1138 sock_valbool_flag(sk, SOCK_TXTIME, true);
1139 sk->sk_clockid = sk_txtime.clockid;
1140 sk->sk_txtime_deadline_mode =
1141 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
4b15c707
JSP
1142 sk->sk_txtime_report_errors =
1143 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1144 }
1145 break;
1146
f5dd3d0c
DH
1147 case SO_BINDTOIFINDEX:
1148 ret = sock_setbindtodevice_locked(sk, val);
1149 break;
1150
e71a4783
SH
1151 default:
1152 ret = -ENOPROTOOPT;
1153 break;
4ec93edb 1154 }
1da177e4
LT
1155 release_sock(sk);
1156 return ret;
1157}
2a91525c 1158EXPORT_SYMBOL(sock_setsockopt);
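/*
 * Illustrative userspace sketch, not part of sock.c: as the SO_SNDBUF/
 * SO_RCVBUF comments above explain, the kernel doubles the requested size to
 * cover struct sk_buff and other overhead, and getsockopt() reports the value
 * actually in use, so reading the option back normally returns twice the
 * requested amount (subject to sysctl_wmem_max and SOCK_MIN_SNDBUF clamping).
 */
#if 0
	int requested = 65536, effective = 0;
	socklen_t len = sizeof(effective);

	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &effective, &len);
	/* effective is normally 2 * requested, i.e. 131072 here */
#endif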
1159
1160
8f09898b 1161static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1162 struct ucred *ucred)
3f551f94
EB
1163{
1164 ucred->pid = pid_vnr(pid);
1165 ucred->uid = ucred->gid = -1;
1166 if (cred) {
1167 struct user_namespace *current_ns = current_user_ns();
1168
b2e4f544
EB
1169 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1170 ucred->gid = from_kgid_munged(current_ns, cred->egid);
3f551f94
EB
1171 }
1172}
1173
1174static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1175{
1176 struct user_namespace *user_ns = current_user_ns();
1177 int i;
1178
1179 for (i = 0; i < src->ngroups; i++)
1180 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1181 return -EFAULT;
1182
1183 return 0;
1184}
1185
1186int sock_getsockopt(struct socket *sock, int level, int optname,
1187 char __user *optval, int __user *optlen)
1188{
1189 struct sock *sk = sock->sk;
4ec93edb 1190
e71a4783 1191 union {
4ec93edb 1192 int val;
5daab9db 1193 u64 val64;
4ec93edb 1194 struct linger ling;
fe0c72f3
AB
1195 struct old_timeval32 tm32;
1196 struct __kernel_old_timeval tm;
a9beb86a 1197 struct __kernel_sock_timeval stm;
80b14dee 1198 struct sock_txtime txtime;
1da177e4 1199 } v;
4ec93edb 1200
4d0392be 1201 int lv = sizeof(int);
1da177e4 1202 int len;
4ec93edb 1203
e71a4783 1204 if (get_user(len, optlen))
4ec93edb 1205 return -EFAULT;
e71a4783 1206 if (len < 0)
1da177e4 1207 return -EINVAL;
4ec93edb 1208
50fee1de 1209 memset(&v, 0, sizeof(v));
df0bca04 1210
2a91525c 1211 switch (optname) {
1212 case SO_DEBUG:
1213 v.val = sock_flag(sk, SOCK_DBG);
1214 break;
1215
1216 case SO_DONTROUTE:
1217 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1218 break;
1219
1220 case SO_BROADCAST:
1b23a5df 1221 v.val = sock_flag(sk, SOCK_BROADCAST);
e71a4783
SH
1222 break;
1223
1224 case SO_SNDBUF:
1225 v.val = sk->sk_sndbuf;
1226 break;
1227
1228 case SO_RCVBUF:
1229 v.val = sk->sk_rcvbuf;
1230 break;
1231
1232 case SO_REUSEADDR:
1233 v.val = sk->sk_reuse;
1234 break;
1235
055dc21a
TH
1236 case SO_REUSEPORT:
1237 v.val = sk->sk_reuseport;
1238 break;
1239
e71a4783 1240 case SO_KEEPALIVE:
1b23a5df 1241 v.val = sock_flag(sk, SOCK_KEEPOPEN);
e71a4783
SH
1242 break;
1243
1244 case SO_TYPE:
1245 v.val = sk->sk_type;
1246 break;
1247
49c794e9
JE
1248 case SO_PROTOCOL:
1249 v.val = sk->sk_protocol;
1250 break;
1251
0d6038ee
JE
1252 case SO_DOMAIN:
1253 v.val = sk->sk_family;
1254 break;
1255
e71a4783
SH
1256 case SO_ERROR:
1257 v.val = -sock_error(sk);
2a91525c 1258 if (v.val == 0)
e71a4783
SH
1259 v.val = xchg(&sk->sk_err_soft, 0);
1260 break;
1261
1262 case SO_OOBINLINE:
1b23a5df 1263 v.val = sock_flag(sk, SOCK_URGINLINE);
e71a4783
SH
1264 break;
1265
1266 case SO_NO_CHECK:
28448b80 1267 v.val = sk->sk_no_check_tx;
e71a4783
SH
1268 break;
1269
1270 case SO_PRIORITY:
1271 v.val = sk->sk_priority;
1272 break;
1273
1274 case SO_LINGER:
1275 lv = sizeof(v.ling);
1b23a5df 1276 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
e71a4783
SH
1277 v.ling.l_linger = sk->sk_lingertime / HZ;
1278 break;
1279
1280 case SO_BSDCOMPAT:
1281 sock_warn_obsolete_bsdism("getsockopt");
1282 break;
1283
7f1bc6e9 1284 case SO_TIMESTAMP_OLD:
92f37fd2 1285 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
887feae3 1286 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
92f37fd2
ED
1287 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1288 break;
1289
7f1bc6e9 1290 case SO_TIMESTAMPNS_OLD:
887feae3
DD
1291 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1292 break;
1293
1294 case SO_TIMESTAMP_NEW:
1295 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1296 break;
1297
1298 case SO_TIMESTAMPNS_NEW:
1299 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
e71a4783
SH
1300 break;
1301
7f1bc6e9 1302 case SO_TIMESTAMPING_OLD:
b9f40e21 1303 v.val = sk->sk_tsflags;
20d49473
PO
1304 break;
1305
1306 case SO_RCVTIMEO_OLD:
1307 case SO_RCVTIMEO_NEW:
1308 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
e71a4783
SH
1309 break;
1310
a9beb86a
DD
1311 case SO_SNDTIMEO_OLD:
1312 case SO_SNDTIMEO_NEW:
1313 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
e71a4783 1314 break;
1da177e4 1315
e71a4783
SH
1316 case SO_RCVLOWAT:
1317 v.val = sk->sk_rcvlowat;
1318 break;
1da177e4 1319
e71a4783 1320 case SO_SNDLOWAT:
2a91525c 1321 v.val = 1;
e71a4783 1322 break;
1da177e4 1323
e71a4783 1324 case SO_PASSCRED:
82981930 1325 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
e71a4783 1326 break;
1da177e4 1327
e71a4783 1328 case SO_PEERCRED:
109f6e39
EB
1329 {
1330 struct ucred peercred;
1331 if (len > sizeof(peercred))
1332 len = sizeof(peercred);
1333 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1334 if (copy_to_user(optval, &peercred, len))
e71a4783
SH
1335 return -EFAULT;
1336 goto lenout;
109f6e39 1337 }
1da177e4 1338
1339 case SO_PEERGROUPS:
1340 {
1341 int ret, n;
1342
1343 if (!sk->sk_peer_cred)
1344 return -ENODATA;
1345
1346 n = sk->sk_peer_cred->group_info->ngroups;
1347 if (len < n * sizeof(gid_t)) {
1348 len = n * sizeof(gid_t);
1349 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1350 }
1351 len = n * sizeof(gid_t);
1352
1353 ret = groups_to_user((gid_t __user *)optval,
1354 sk->sk_peer_cred->group_info);
1355 if (ret)
1356 return ret;
1357 goto lenout;
1358 }
1359
e71a4783
SH
1360 case SO_PEERNAME:
1361 {
1362 char address[128];
1363
9b2c45d4
DV
1364 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1365 if (lv < 0)
e71a4783
SH
1366 return -ENOTCONN;
1367 if (lv < len)
1368 return -EINVAL;
1369 if (copy_to_user(optval, address, len))
1370 return -EFAULT;
1371 goto lenout;
1372 }
1da177e4 1373
1374 /* Dubious BSD thing... Probably nobody even uses it, but
1375 * the UNIX standard wants it for whatever reason... -DaveM
1376 */
1377 case SO_ACCEPTCONN:
1378 v.val = sk->sk_state == TCP_LISTEN;
1379 break;
1da177e4 1380
e71a4783 1381 case SO_PASSSEC:
82981930 1382 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
e71a4783 1383 break;
877ce7c1 1384
e71a4783
SH
1385 case SO_PEERSEC:
1386 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1da177e4 1387
4a19ec58
LAT
1388 case SO_MARK:
1389 v.val = sk->sk_mark;
1390 break;
1391
3b885787 1392 case SO_RXQ_OVFL:
1b23a5df 1393 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
3b885787
NH
1394 break;
1395
6e3e939f 1396 case SO_WIFI_STATUS:
1b23a5df 1397 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
6e3e939f
JB
1398 break;
1399
ef64a54f
PE
1400 case SO_PEEK_OFF:
1401 if (!sock->ops->set_peek_off)
1402 return -EOPNOTSUPP;
1403
1404 v.val = sk->sk_peek_off;
1405 break;
bc2f7996 1406 case SO_NOFCS:
1b23a5df 1407 v.val = sock_flag(sk, SOCK_NOFCS);
bc2f7996 1408 break;
c91f6df2 1409
f7b86bfe 1410 case SO_BINDTODEVICE:
c91f6df2
BH
1411 return sock_getbindtodevice(sk, optval, optlen, len);
1412
1413 case SO_GET_FILTER:
1414 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1415 if (len < 0)
1416 return len;
1417
1418 goto lenout;
c91f6df2 1419
d59577b6
VB
1420 case SO_LOCK_FILTER:
1421 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1422 break;
1423
ea02f941
MS
1424 case SO_BPF_EXTENSIONS:
1425 v.val = bpf_tell_extensions();
1426 break;
1427
7d4c04fc
KJ
1428 case SO_SELECT_ERR_QUEUE:
1429 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1430 break;
1431
e0d1095a 1432#ifdef CONFIG_NET_RX_BUSY_POLL
64b0dc51 1433 case SO_BUSY_POLL:
dafcc438
ET
1434 v.val = sk->sk_ll_usec;
1435 break;
1436#endif
1437
62748f32 1438 case SO_MAX_PACING_RATE:
76a9ebe8
ED
1439 /* 32bit version */
1440 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
62748f32
ED
1441 break;
1442
2c8c56e1
ED
1443 case SO_INCOMING_CPU:
1444 v.val = sk->sk_incoming_cpu;
1445 break;
1446
1447 case SO_MEMINFO:
1448 {
1449 u32 meminfo[SK_MEMINFO_VARS];
1450
1451 if (get_user(len, optlen))
1452 return -EFAULT;
1453
1454 sk_get_meminfo(sk, meminfo);
1455
1456 len = min_t(unsigned int, len, sizeof(meminfo));
1457 if (copy_to_user(optval, &meminfo, len))
1458 return -EFAULT;
1459
1460 goto lenout;
1461 }
6d433902
SS
1462
1463#ifdef CONFIG_NET_RX_BUSY_POLL
1464 case SO_INCOMING_NAPI_ID:
1465 v.val = READ_ONCE(sk->sk_napi_id);
1466
1467 /* aggregate non-NAPI IDs down to 0 */
1468 if (v.val < MIN_NAPI_ID)
1469 v.val = 0;
1470
1471 break;
1472#endif
1473
5daab9db
CF
1474 case SO_COOKIE:
1475 lv = sizeof(u64);
1476 if (len < lv)
1477 return -EINVAL;
1478 v.val64 = sock_gen_cookie(sk);
1479 break;
1480
76851d12
WB
1481 case SO_ZEROCOPY:
1482 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1483 break;
1484
80b14dee
RC
1485 case SO_TXTIME:
1486 lv = sizeof(v.txtime);
1487 v.txtime.clockid = sk->sk_clockid;
1488 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1489 SOF_TXTIME_DEADLINE_MODE : 0;
4b15c707
JSP
1490 v.txtime.flags |= sk->sk_txtime_report_errors ?
1491 SOF_TXTIME_REPORT_ERRORS : 0;
80b14dee
RC
1492 break;
1493
f5dd3d0c
DH
1494 case SO_BINDTOIFINDEX:
1495 v.val = sk->sk_bound_dev_if;
1496 break;
1497
e71a4783 1498 default:
1499 /* We implement the SO_SNDLOWAT etc to not be settable
1500 * (1003.1g 7).
1501 */
e71a4783 1502 return -ENOPROTOOPT;
1da177e4 1503 }
e71a4783 1504
1da177e4
LT
1505 if (len > lv)
1506 len = lv;
1507 if (copy_to_user(optval, &v, len))
1508 return -EFAULT;
1509lenout:
4ec93edb
YH
1510 if (put_user(len, optlen))
1511 return -EFAULT;
1512 return 0;
1da177e4
LT
1513}
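/*
 * Illustrative userspace sketch, not part of sock.c: the SO_PEERCRED branch
 * above is what serves credential queries on connected AF_UNIX sockets; a
 * server can read the peer's pid/uid/gid like this (struct ucred needs
 * _GNU_SOURCE in userspace).
 */
#if 0
	struct ucred cred;
	socklen_t len = sizeof(cred);

	if (getsockopt(connfd, SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
		printf("peer pid=%d uid=%d gid=%d\n",
		       (int)cred.pid, (int)cred.uid, (int)cred.gid);
#endif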
1514
1515/*
1516 * Initialize an sk_lock.
1517 *
1518 * (We also register the sk_lock with the lock validator.)
1519 */
b6f99a21 1520static inline void sock_lock_init(struct sock *sk)
a5b5bb9a 1521{
cdfbabfb
DH
1522 if (sk->sk_kern_sock)
1523 sock_lock_init_class_and_name(
1524 sk,
1525 af_family_kern_slock_key_strings[sk->sk_family],
1526 af_family_kern_slock_keys + sk->sk_family,
1527 af_family_kern_key_strings[sk->sk_family],
1528 af_family_kern_keys + sk->sk_family);
1529 else
1530 sock_lock_init_class_and_name(
1531 sk,
ed07536e
PZ
1532 af_family_slock_key_strings[sk->sk_family],
1533 af_family_slock_keys + sk->sk_family,
1534 af_family_key_strings[sk->sk_family],
1535 af_family_keys + sk->sk_family);
a5b5bb9a
IM
1536}
1537
1538/*
1539 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1540 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
68835aba 1541 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
4dc6dc71 1542 */
f1a6c4da
PE
1543static void sock_copy(struct sock *nsk, const struct sock *osk)
1544{
1545#ifdef CONFIG_SECURITY_NETWORK
1546 void *sptr = nsk->sk_security;
1547#endif
68835aba
ED
1548 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1549
1550 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1551 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1552
f1a6c4da
PE
1553#ifdef CONFIG_SECURITY_NETWORK
1554 nsk->sk_security = sptr;
1555 security_sk_clone(osk, nsk);
1556#endif
1557}
1558
1559static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1560 int family)
c308c1b2
PE
1561{
1562 struct sock *sk;
1563 struct kmem_cache *slab;
1564
1565 slab = prot->slab;
e912b114
ED
1566 if (slab != NULL) {
1567 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1568 if (!sk)
1569 return sk;
ba2489b0
ED
1570 if (priority & __GFP_ZERO)
1571 sk_prot_clear_nulls(sk, prot->obj_size);
fcbdf09d 1572 } else
c308c1b2
PE
1573 sk = kmalloc(prot->obj_size, priority);
1574
2e4afe7b
PE
1575 if (sk != NULL) {
1576 if (security_sk_alloc(sk, family, priority))
1577 goto out_free;
1578
1579 if (!try_module_get(prot->owner))
1580 goto out_free_sec;
e022f0b4 1581 sk_tx_queue_clear(sk);
2e4afe7b
PE
1582 }
1583
c308c1b2 1584 return sk;
2e4afe7b
PE
1585
1586out_free_sec:
1587 security_sk_free(sk);
1588out_free:
1589 if (slab != NULL)
1590 kmem_cache_free(slab, sk);
1591 else
1592 kfree(sk);
1593 return NULL;
c308c1b2
PE
1594}
1595
1596static void sk_prot_free(struct proto *prot, struct sock *sk)
1597{
1598 struct kmem_cache *slab;
2e4afe7b 1599 struct module *owner;
c308c1b2 1600
2e4afe7b 1601 owner = prot->owner;
c308c1b2 1602 slab = prot->slab;
2e4afe7b 1603
bd1060a1 1604 cgroup_sk_free(&sk->sk_cgrp_data);
2d758073 1605 mem_cgroup_sk_free(sk);
2e4afe7b 1606 security_sk_free(sk);
c308c1b2
PE
1607 if (slab != NULL)
1608 kmem_cache_free(slab, sk);
1609 else
1610 kfree(sk);
2e4afe7b 1611 module_put(owner);
c308c1b2
PE
1612}
1613
1614/**
1615 * sk_alloc - All socket objects are allocated here
c4ea43c5 1616 * @net: the applicable net namespace
4dc3b16b
PP
1617 * @family: protocol family
1618 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1619 * @prot: struct proto associated with this new sock instance
11aa9c28 1620 * @kern: is this to be a kernel socket?
1da177e4 1621 */
1b8d7ae4 1622struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
11aa9c28 1623 struct proto *prot, int kern)
1da177e4 1624{
c308c1b2 1625 struct sock *sk;
1da177e4 1626
154adbc8 1627 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1da177e4 1628 if (sk) {
154adbc8
PE
1629 sk->sk_family = family;
1630 /*
1631 * See comment in struct sock definition to understand
1632 * why we need sk_prot_creator -acme
1633 */
1634 sk->sk_prot = sk->sk_prot_creator = prot;
cdfbabfb 1635 sk->sk_kern_sock = kern;
154adbc8 1636 sock_lock_init(sk);
26abe143 1637 sk->sk_net_refcnt = kern ? 0 : 1;
648845ab 1638 if (likely(sk->sk_net_refcnt)) {
26abe143 1639 get_net(net);
648845ab
TZ
1640 sock_inuse_add(net, 1);
1641 }
1642
26abe143 1643 sock_net_set(sk, net);
14afee4b 1644 refcount_set(&sk->sk_wmem_alloc, 1);
f8451725 1645
2d758073 1646 mem_cgroup_sk_alloc(sk);
d979a39d 1647 cgroup_sk_alloc(&sk->sk_cgrp_data);
2a56a1fe
TH
1648 sock_update_classid(&sk->sk_cgrp_data);
1649 sock_update_netprioidx(&sk->sk_cgrp_data);
1da177e4 1650 }
a79af59e 1651
2e4afe7b 1652 return sk;
1da177e4 1653}
2a91525c 1654EXPORT_SYMBOL(sk_alloc);
1da177e4 1655
1656/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1657 * grace period. This is the case for UDP sockets and TCP listeners.
1658 */
1659static void __sk_destruct(struct rcu_head *head)
1da177e4 1660{
a4298e45 1661 struct sock *sk = container_of(head, struct sock, sk_rcu);
1da177e4 1662 struct sk_filter *filter;
1da177e4
LT
1663
1664 if (sk->sk_destruct)
1665 sk->sk_destruct(sk);
1666
a898def2 1667 filter = rcu_dereference_check(sk->sk_filter,
14afee4b 1668 refcount_read(&sk->sk_wmem_alloc) == 0);
1da177e4 1669 if (filter) {
309dd5fc 1670 sk_filter_uncharge(sk, filter);
a9b3cd7f 1671 RCU_INIT_POINTER(sk->sk_filter, NULL);
1da177e4 1672 }
538950a1
CG
1673 if (rcu_access_pointer(sk->sk_reuseport_cb))
1674 reuseport_detach_sock(sk);
1da177e4 1675
08e29af3 1676 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1da177e4
LT
1677
1678 if (atomic_read(&sk->sk_omem_alloc))
e005d193
JP
1679 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1680 __func__, atomic_read(&sk->sk_omem_alloc));
1da177e4 1681
22a0e18e
ED
1682 if (sk->sk_frag.page) {
1683 put_page(sk->sk_frag.page);
1684 sk->sk_frag.page = NULL;
1685 }
1686
109f6e39
EB
1687 if (sk->sk_peer_cred)
1688 put_cred(sk->sk_peer_cred);
1689 put_pid(sk->sk_peer_pid);
26abe143
EB
1690 if (likely(sk->sk_net_refcnt))
1691 put_net(sock_net(sk));
c308c1b2 1692 sk_prot_free(sk->sk_prot_creator, sk);
1da177e4 1693}
2b85a34e 1694
a4298e45
ED
1695void sk_destruct(struct sock *sk)
1696{
1697 if (sock_flag(sk, SOCK_RCU_FREE))
1698 call_rcu(&sk->sk_rcu, __sk_destruct);
1699 else
1700 __sk_destruct(&sk->sk_rcu);
1701}
1702
1703static void __sk_free(struct sock *sk)
1704{
648845ab
TZ
1705 if (likely(sk->sk_net_refcnt))
1706 sock_inuse_add(sock_net(sk), -1);
1707
9709020c 1708 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
eb4cb008
CG
1709 sock_diag_broadcast_destroy(sk);
1710 else
1711 sk_destruct(sk);
1712}
1713
2b85a34e
ED
1714void sk_free(struct sock *sk)
1715{
1716 /*
25985edc 1717 * We subtract one from sk_wmem_alloc and can know if
2b85a34e
ED
1718 * some packets are still in some tx queue.
1719 * If not null, sock_wfree() will call __sk_free(sk) later
1720 */
14afee4b 1721 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2b85a34e
ED
1722 __sk_free(sk);
1723}
2a91525c 1724EXPORT_SYMBOL(sk_free);
1da177e4 1725
581319c5
PA
1726static void sk_init_common(struct sock *sk)
1727{
1728 skb_queue_head_init(&sk->sk_receive_queue);
1729 skb_queue_head_init(&sk->sk_write_queue);
1730 skb_queue_head_init(&sk->sk_error_queue);
1731
1732 rwlock_init(&sk->sk_callback_lock);
1733 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1734 af_rlock_keys + sk->sk_family,
1735 af_family_rlock_key_strings[sk->sk_family]);
1736 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1737 af_wlock_keys + sk->sk_family,
1738 af_family_wlock_key_strings[sk->sk_family]);
1739 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1740 af_elock_keys + sk->sk_family,
1741 af_family_elock_key_strings[sk->sk_family]);
1742 lockdep_set_class_and_name(&sk->sk_callback_lock,
1743 af_callback_keys + sk->sk_family,
1744 af_family_clock_key_strings[sk->sk_family]);
1745}
1746
1747/**
1748 * sk_clone_lock - clone a socket, and lock its clone
1749 * @sk: the socket to clone
1750 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1751 *
1752 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1753 */
1754struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 1755{
8fd1d178 1756 struct sock *newsk;
278571ba 1757 bool is_charged = true;
87d11ceb 1758
8fd1d178 1759 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
87d11ceb
ACM
1760 if (newsk != NULL) {
1761 struct sk_filter *filter;
1762
892c141e 1763 sock_copy(newsk, sk);
87d11ceb 1764
9d538fa6
CP
1765 newsk->sk_prot_creator = sk->sk_prot;
1766
87d11ceb 1767 /* SANITY */
8a681736
SV
1768 if (likely(newsk->sk_net_refcnt))
1769 get_net(sock_net(newsk));
87d11ceb
ACM
1770 sk_node_init(&newsk->sk_node);
1771 sock_lock_init(newsk);
1772 bh_lock_sock(newsk);
fa438ccf 1773 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
8eae939f 1774 newsk->sk_backlog.len = 0;
87d11ceb
ACM
1775
1776 atomic_set(&newsk->sk_rmem_alloc, 0);
2b85a34e
ED
1777 /*
1778 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1779 */
14afee4b 1780 refcount_set(&newsk->sk_wmem_alloc, 1);
87d11ceb 1781 atomic_set(&newsk->sk_omem_alloc, 0);
581319c5 1782 sk_init_common(newsk);
87d11ceb
ACM
1783
1784 newsk->sk_dst_cache = NULL;
9b8805a3 1785 newsk->sk_dst_pending_confirm = 0;
87d11ceb
ACM
1786 newsk->sk_wmem_queued = 0;
1787 newsk->sk_forward_alloc = 0;
9caad864 1788 atomic_set(&newsk->sk_drops, 0);
87d11ceb 1789 newsk->sk_send_head = NULL;
87d11ceb 1790 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
52267790 1791 atomic_set(&newsk->sk_zckey, 0);
87d11ceb
ACM
1792
1793 sock_reset_flag(newsk, SOCK_DONE);
edbe69ef 1794 mem_cgroup_sk_alloc(newsk);
c0576e39 1795 cgroup_sk_alloc(&newsk->sk_cgrp_data);
87d11ceb 1796
eefca20e
ED
1797 rcu_read_lock();
1798 filter = rcu_dereference(sk->sk_filter);
87d11ceb 1799 if (filter != NULL)
278571ba
AS
1800 /* though it's an empty new sock, the charging may fail
1801 * if sysctl_optmem_max was changed between creation of
1802 * original socket and cloning
1803 */
1804 is_charged = sk_filter_charge(newsk, filter);
eefca20e
ED
1805 RCU_INIT_POINTER(newsk->sk_filter, filter);
1806 rcu_read_unlock();
87d11ceb 1807
d188ba86 1808 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
a97e50cc
DB
1809 /* We need to make sure that we don't uncharge the new
1810 * socket if we couldn't charge it in the first place
1811 * as otherwise we uncharge the parent's filter.
1812 */
1813 if (!is_charged)
1814 RCU_INIT_POINTER(newsk->sk_filter, NULL);
94352d45 1815 sk_free_unlock_clone(newsk);
87d11ceb
ACM
1816 newsk = NULL;
1817 goto out;
1818 }
fa463497 1819 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
87d11ceb
ACM
1820
1821 newsk->sk_err = 0;
e551c32d 1822 newsk->sk_err_soft = 0;
87d11ceb 1823 newsk->sk_priority = 0;
2c8c56e1 1824 newsk->sk_incoming_cpu = raw_smp_processor_id();
648845ab
TZ
1825 if (likely(newsk->sk_net_refcnt))
1826 sock_inuse_add(sock_net(newsk), 1);
d979a39d 1827
1828 /*
1829 * Before updating sk_refcnt, we must commit prior changes to memory
1830 * (Documentation/RCU/rculist_nulls.txt for details)
1831 */
1832 smp_wmb();
41c6d650 1833 refcount_set(&newsk->sk_refcnt, 2);
87d11ceb
ACM
1834
1835 /*
1836 * Increment the counter in the same struct proto as the master
1837 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1838 * is the same as sk->sk_prot->socks, as this field was copied
1839 * with memcpy).
1840 *
1841 * This _changes_ the previous behaviour, where
 1842	 * tcp_create_openreq_child was always incrementing the
 1843	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1844 * to be taken into account in all callers. -acme
1845 */
1846 sk_refcnt_debug_inc(newsk);
972692e0 1847 sk_set_socket(newsk, NULL);
43815482 1848 newsk->sk_wq = NULL;
87d11ceb
ACM
1849
1850 if (newsk->sk_prot->sockets_allocated)
180d8cd9 1851 sk_sockets_allocated_inc(newsk);
704da560 1852
080a270f
HFS
1853 if (sock_needs_netstamp(sk) &&
1854 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
704da560 1855 net_enable_timestamp();
87d11ceb
ACM
1856 }
1857out:
1858 return newsk;
1859}
e56c57d0 1860EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 1861
94352d45
ACM
1862void sk_free_unlock_clone(struct sock *sk)
1863{
 1864	/* It is still a raw copy of the parent, so invalidate
 1865	 * the destructor and do a plain sk_free() */
1866 sk->sk_destruct = NULL;
1867 bh_unlock_sock(sk);
1868 sk_free(sk);
1869}
1870EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
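/*
 * Illustrative sketch, not part of sock.c: how a protocol's child-socket
 * creation path might use sk_clone_lock() and honour the locking contract
 * documented above. The function name and the "protocol specific init"
 * placeholder are hypothetical.
 */
static struct sock *example_clone_child(const struct sock *listener)
{
	struct sock *newsk = sk_clone_lock(listener, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	/* protocol specific initialisation of newsk would go here; on
	 * failure the clone must be released with sk_free_unlock_clone()
	 */
	bh_unlock_sock(newsk);	/* caller must unlock even on success */
	return newsk;
}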
1871
9958089a
AK
1872void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1873{
d6a4e26a
ED
1874 u32 max_segs = 1;
1875
6bd4f355 1876 sk_dst_set(sk, dst);
0a6b2a1d 1877 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1878 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1879 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1880 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1881 if (sk_can_gso(sk)) {
f70f250a 1882 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1883 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1884 } else {
9958089a 1885 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1886 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1887 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1888 }
9958089a 1889 }
d6a4e26a 1890 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1891}
1892EXPORT_SYMBOL_GPL(sk_setup_caps);
1893
1da177e4
LT
1894/*
1895 * Simple resource managers for sockets.
1896 */
1897
1898
4ec93edb
YH
1899/*
1900 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1901 */
1902void sock_wfree(struct sk_buff *skb)
1903{
1904 struct sock *sk = skb->sk;
d99927f4 1905 unsigned int len = skb->truesize;
1da177e4 1906
d99927f4
ED
1907 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1908 /*
1909 * Keep a reference on sk_wmem_alloc, this will be released
1910 * after sk_write_space() call
1911 */
14afee4b 1912 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1913 sk->sk_write_space(sk);
d99927f4
ED
1914 len = 1;
1915 }
2b85a34e 1916 /*
d99927f4
ED
1917 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1918 * could not do because of in-flight packets
2b85a34e 1919 */
14afee4b 1920 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1921 __sk_free(sk);
1da177e4 1922}
2a91525c 1923EXPORT_SYMBOL(sock_wfree);
1da177e4 1924
1d2077ac
ED
1925/* This variant of sock_wfree() is used by TCP,
1926 * since it sets SOCK_USE_WRITE_QUEUE.
1927 */
1928void __sock_wfree(struct sk_buff *skb)
1929{
1930 struct sock *sk = skb->sk;
1931
14afee4b 1932 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1933 __sk_free(sk);
1934}
1935
9e17f8a4
ED
1936void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1937{
1938 skb_orphan(skb);
1939 skb->sk = sk;
1940#ifdef CONFIG_INET
1941 if (unlikely(!sk_fullsock(sk))) {
1942 skb->destructor = sock_edemux;
1943 sock_hold(sk);
1944 return;
1945 }
1946#endif
1947 skb->destructor = sock_wfree;
1948 skb_set_hash_from_sk(skb, sk);
1949 /*
 1950	 * We used to take a refcount on sk, but the following operation
 1951	 * is enough to guarantee sk_free() won't free this sock until
 1952	 * all in-flight packets are completed
1953 */
14afee4b 1954 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1955}
1956EXPORT_SYMBOL(skb_set_owner_w);
1957
1d2077ac
ED
1958/* This helper is used by netem, as it can hold packets in its
1959 * delay queue. We want to allow the owner socket to send more
1960 * packets, as if they were already TX completed by a typical driver.
1961 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1962 * rely on it (sch_fq for example).
1d2077ac 1963 */
f2f872f9
ED
1964void skb_orphan_partial(struct sk_buff *skb)
1965{
f6ba8d33 1966 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1967 return;
1968
f2f872f9
ED
1969 if (skb->destructor == sock_wfree
1970#ifdef CONFIG_INET
1971 || skb->destructor == tcp_wfree
1972#endif
1973 ) {
f6ba8d33
ED
1974 struct sock *sk = skb->sk;
1975
41c6d650 1976 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1977 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1978 skb->destructor = sock_efree;
1979 }
f2f872f9
ED
1980 } else {
1981 skb_orphan(skb);
1982 }
1983}
1984EXPORT_SYMBOL(skb_orphan_partial);
1985
4ec93edb
YH
1986/*
1987 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1988 */
1989void sock_rfree(struct sk_buff *skb)
1990{
1991 struct sock *sk = skb->sk;
d361fd59 1992 unsigned int len = skb->truesize;
1da177e4 1993
d361fd59
ED
1994 atomic_sub(len, &sk->sk_rmem_alloc);
1995 sk_mem_uncharge(sk, len);
1da177e4 1996}
2a91525c 1997EXPORT_SYMBOL(sock_rfree);
1da177e4 1998
7768eed8
OH
1999/*
2000 * Buffer destructor for skbs that are not used directly in read or write
2001 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2002 */
62bccb8c
AD
2003void sock_efree(struct sk_buff *skb)
2004{
2005 sock_put(skb->sk);
2006}
2007EXPORT_SYMBOL(sock_efree);
2008
976d0201 2009kuid_t sock_i_uid(struct sock *sk)
1da177e4 2010{
976d0201 2011 kuid_t uid;
1da177e4 2012
f064af1e 2013 read_lock_bh(&sk->sk_callback_lock);
976d0201 2014 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 2015 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2016 return uid;
2017}
2a91525c 2018EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
2019
2020unsigned long sock_i_ino(struct sock *sk)
2021{
2022 unsigned long ino;
2023
f064af1e 2024 read_lock_bh(&sk->sk_callback_lock);
1da177e4 2025 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 2026 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2027 return ino;
2028}
2a91525c 2029EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
2030
2031/*
2032 * Allocate a skb from the socket's send buffer.
2033 */
86a76caf 2034struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 2035 gfp_t priority)
1da177e4 2036{
14afee4b 2037 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 2038 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
2039 if (skb) {
2040 skb_set_owner_w(skb, sk);
2041 return skb;
2042 }
2043 }
2044 return NULL;
2045}
2a91525c 2046EXPORT_SYMBOL(sock_wmalloc);
1da177e4 2047
98ba0bd5
WB
2048static void sock_ofree(struct sk_buff *skb)
2049{
2050 struct sock *sk = skb->sk;
2051
2052 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2053}
2054
2055struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2056 gfp_t priority)
2057{
2058 struct sk_buff *skb;
2059
2060 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2061 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2062 sysctl_optmem_max)
2063 return NULL;
2064
2065 skb = alloc_skb(size, priority);
2066 if (!skb)
2067 return NULL;
2068
2069 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2070 skb->sk = sk;
2071 skb->destructor = sock_ofree;
2072 return skb;
2073}
2074
4ec93edb 2075/*
1da177e4 2076 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 2077 */
dd0fc66f 2078void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 2079{
95c96174 2080 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
2081 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2082 void *mem;
2083 /* First do the add, to avoid the race if kmalloc
4ec93edb 2084 * might sleep.
1da177e4
LT
2085 */
2086 atomic_add(size, &sk->sk_omem_alloc);
2087 mem = kmalloc(size, priority);
2088 if (mem)
2089 return mem;
2090 atomic_sub(size, &sk->sk_omem_alloc);
2091 }
2092 return NULL;
2093}
2a91525c 2094EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2095
79e88659
DB
2096/* Free an option memory block. Note, we actually want the inline
2097 * here as this allows gcc to detect the nullify and fold away the
2098 * condition entirely.
1da177e4 2099 */
79e88659
DB
2100static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2101 const bool nullify)
1da177e4 2102{
e53da5fb
DM
2103 if (WARN_ON_ONCE(!mem))
2104 return;
79e88659
DB
2105 if (nullify)
2106 kzfree(mem);
2107 else
2108 kfree(mem);
1da177e4
LT
2109 atomic_sub(size, &sk->sk_omem_alloc);
2110}
79e88659
DB
2111
2112void sock_kfree_s(struct sock *sk, void *mem, int size)
2113{
2114 __sock_kfree_s(sk, mem, size, false);
2115}
2a91525c 2116EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2117
79e88659
DB
2118void sock_kzfree_s(struct sock *sk, void *mem, int size)
2119{
2120 __sock_kfree_s(sk, mem, size, true);
2121}
2122EXPORT_SYMBOL(sock_kzfree_s);
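/*
 * Illustrative sketch, not part of sock.c: the usual pattern for option
 * memory, charged against sk_omem_alloc by sock_kmalloc() and released by
 * sock_kfree_s() (or sock_kzfree_s() for sensitive data). The function and
 * variable names are hypothetical.
 */
static int example_set_option(struct sock *sk, const void __user *optval, int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_user(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kfree_s(sk, buf, optlen);
	return 0;
}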
2123
1da177e4
LT
2124/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 2125   I think these locks should be removed for datagram sockets.
2126 */
2a91525c 2127static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2128{
2129 DEFINE_WAIT(wait);
2130
9cd3e072 2131 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2132 for (;;) {
2133 if (!timeo)
2134 break;
2135 if (signal_pending(current))
2136 break;
2137 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2138 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2139 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2140 break;
2141 if (sk->sk_shutdown & SEND_SHUTDOWN)
2142 break;
2143 if (sk->sk_err)
2144 break;
2145 timeo = schedule_timeout(timeo);
2146 }
aa395145 2147 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2148 return timeo;
2149}
2150
2151
2152/*
2153 * Generic send/receive buffer handlers
2154 */
2155
4cc7f68d
HX
2156struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2157 unsigned long data_len, int noblock,
28d64271 2158 int *errcode, int max_page_order)
1da177e4 2159{
2e4e4410 2160 struct sk_buff *skb;
1da177e4
LT
2161 long timeo;
2162 int err;
2163
1da177e4 2164 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2165 for (;;) {
1da177e4
LT
2166 err = sock_error(sk);
2167 if (err != 0)
2168 goto failure;
2169
2170 err = -EPIPE;
2171 if (sk->sk_shutdown & SEND_SHUTDOWN)
2172 goto failure;
2173
2e4e4410
ED
2174 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2175 break;
28d64271 2176
9cd3e072 2177 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2178 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2179 err = -EAGAIN;
2180 if (!timeo)
1da177e4 2181 goto failure;
2e4e4410
ED
2182 if (signal_pending(current))
2183 goto interrupted;
2184 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2185 }
2e4e4410
ED
2186 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2187 errcode, sk->sk_allocation);
2188 if (skb)
2189 skb_set_owner_w(skb, sk);
1da177e4
LT
2190 return skb;
2191
2192interrupted:
2193 err = sock_intr_errno(timeo);
2194failure:
2195 *errcode = err;
2196 return NULL;
2197}
4cc7f68d 2198EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2199
4ec93edb 2200struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2201 int noblock, int *errcode)
2202{
28d64271 2203 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2204}
2a91525c 2205EXPORT_SYMBOL(sock_alloc_send_skb);
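/*
 * Illustrative sketch, not part of sock.c: a datagram-style transmit path
 * allocating an skb charged to the send buffer; *err carries -EAGAIN,
 * -EPIPE or the signal error on failure, as implemented above. The function
 * name and the MAX_HEADER headroom choice are hypothetical.
 */
static struct sk_buff *example_alloc_tx_skb(struct sock *sk, size_t len,
					    int noblock, int *err)
{
	struct sk_buff *skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
						  noblock, err);

	if (skb)
		skb_reserve(skb, MAX_HEADER);	/* room for protocol headers */
	return skb;
}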
1da177e4 2206
39771b12
WB
2207int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2208 struct sockcm_cookie *sockc)
2209{
3dd17e63
SHY
2210 u32 tsflags;
2211
39771b12
WB
2212 switch (cmsg->cmsg_type) {
2213 case SO_MARK:
2214 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2215 return -EPERM;
2216 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2217 return -EINVAL;
2218 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2219 break;
7f1bc6e9 2220 case SO_TIMESTAMPING_OLD:
3dd17e63
SHY
2221 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2222 return -EINVAL;
2223
2224 tsflags = *(u32 *)CMSG_DATA(cmsg);
2225 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2226 return -EINVAL;
2227
2228 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2229 sockc->tsflags |= tsflags;
2230 break;
80b14dee
RC
2231 case SCM_TXTIME:
2232 if (!sock_flag(sk, SOCK_TXTIME))
2233 return -EINVAL;
2234 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2235 return -EINVAL;
2236 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2237 break;
779f1ede
SHY
2238 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2239 case SCM_RIGHTS:
2240 case SCM_CREDENTIALS:
2241 break;
39771b12
WB
2242 default:
2243 return -EINVAL;
2244 }
2245 return 0;
2246}
2247EXPORT_SYMBOL(__sock_cmsg_send);
2248
f28ea365
EJ
2249int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2250 struct sockcm_cookie *sockc)
2251{
2252 struct cmsghdr *cmsg;
39771b12 2253 int ret;
f28ea365
EJ
2254
2255 for_each_cmsghdr(cmsg, msg) {
2256 if (!CMSG_OK(msg, cmsg))
2257 return -EINVAL;
2258 if (cmsg->cmsg_level != SOL_SOCKET)
2259 continue;
39771b12
WB
2260 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2261 if (ret)
2262 return ret;
f28ea365
EJ
2263 }
2264 return 0;
2265}
2266EXPORT_SYMBOL(sock_cmsg_send);
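/*
 * Illustrative sketch, not part of sock.c: a sendmsg handler resolving
 * SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING, SCM_TXTIME) into a
 * sockcm_cookie before building the packet. The handler name is
 * hypothetical; the fields follow struct sockcm_cookie.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* sockc.mark, sockc.tsflags and sockc.transmit_time now reflect
	 * the control messages and can be applied to the outgoing skb
	 */
	return len;
}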
2267
06044751
ED
2268static void sk_enter_memory_pressure(struct sock *sk)
2269{
2270 if (!sk->sk_prot->enter_memory_pressure)
2271 return;
2272
2273 sk->sk_prot->enter_memory_pressure(sk);
2274}
2275
2276static void sk_leave_memory_pressure(struct sock *sk)
2277{
2278 if (sk->sk_prot->leave_memory_pressure) {
2279 sk->sk_prot->leave_memory_pressure(sk);
2280 } else {
2281 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2282
2283 if (memory_pressure && *memory_pressure)
2284 *memory_pressure = 0;
2285 }
2286}
2287
5640f768
ED
2288/* On 32bit arches, an skb frag is limited to 2^15 */
2289#define SKB_FRAG_PAGE_ORDER get_order(32768)
2290
400dfd3a
ED
2291/**
2292 * skb_page_frag_refill - check that a page_frag contains enough room
2293 * @sz: minimum size of the fragment we want to get
2294 * @pfrag: pointer to page_frag
82d5e2b8 2295 * @gfp: priority for memory allocation
400dfd3a
ED
2296 *
2297 * Note: While this allocator tries to use high order pages, there is
2298 * no guarantee that allocations succeed. Therefore, @sz MUST be
 2299 * less than or equal to PAGE_SIZE.
2300 */
d9b2938a 2301bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2302{
5640f768 2303 if (pfrag->page) {
fe896d18 2304 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2305 pfrag->offset = 0;
2306 return true;
2307 }
400dfd3a 2308 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2309 return true;
2310 put_page(pfrag->page);
2311 }
2312
d9b2938a
ED
2313 pfrag->offset = 0;
2314 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2315 /* Avoid direct reclaim but allow kswapd to wake */
2316 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2317 __GFP_COMP | __GFP_NOWARN |
2318 __GFP_NORETRY,
d9b2938a 2319 SKB_FRAG_PAGE_ORDER);
5640f768 2320 if (likely(pfrag->page)) {
d9b2938a 2321 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2322 return true;
2323 }
d9b2938a
ED
2324 }
2325 pfrag->page = alloc_page(gfp);
2326 if (likely(pfrag->page)) {
2327 pfrag->size = PAGE_SIZE;
2328 return true;
2329 }
400dfd3a
ED
2330 return false;
2331}
2332EXPORT_SYMBOL(skb_page_frag_refill);
2333
2334bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2335{
2336 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2337 return true;
2338
5640f768
ED
2339 sk_enter_memory_pressure(sk);
2340 sk_stream_moderate_sndbuf(sk);
2341 return false;
2342}
2343EXPORT_SYMBOL(sk_page_frag_refill);
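/*
 * Illustrative sketch, not part of sock.c: copying user data into the
 * socket's page fragment, the pattern stream protocols use when appending
 * payload pages to an skb. Assumes the socket is locked; the function name
 * is hypothetical.
 */
static int example_copy_to_frag(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* under memory pressure */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}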
2344
1da177e4 2345static void __lock_sock(struct sock *sk)
f39234d6
NK
2346 __releases(&sk->sk_lock.slock)
2347 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2348{
2349 DEFINE_WAIT(wait);
2350
e71a4783 2351 for (;;) {
1da177e4
LT
2352 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2353 TASK_UNINTERRUPTIBLE);
2354 spin_unlock_bh(&sk->sk_lock.slock);
2355 schedule();
2356 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2357 if (!sock_owned_by_user(sk))
1da177e4
LT
2358 break;
2359 }
2360 finish_wait(&sk->sk_lock.wq, &wait);
2361}
2362
8873c064 2363void __release_sock(struct sock *sk)
f39234d6
NK
2364 __releases(&sk->sk_lock.slock)
2365 __acquires(&sk->sk_lock.slock)
1da177e4 2366{
5413d1ba 2367 struct sk_buff *skb, *next;
1da177e4 2368
5413d1ba 2369 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2370 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2371
5413d1ba 2372 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2373
5413d1ba
ED
2374 do {
2375 next = skb->next;
e4cbb02a 2376 prefetch(next);
7fee226a 2377 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2378 skb_mark_not_on_list(skb);
c57943a1 2379 sk_backlog_rcv(sk, skb);
1da177e4 2380
5413d1ba 2381 cond_resched();
1da177e4
LT
2382
2383 skb = next;
2384 } while (skb != NULL);
2385
5413d1ba
ED
2386 spin_lock_bh(&sk->sk_lock.slock);
2387 }
8eae939f
ZY
2388
2389 /*
2390 * Doing the zeroing here guarantee we can not loop forever
2391 * while a wild producer attempts to flood us.
2392 */
2393 sk->sk_backlog.len = 0;
1da177e4
LT
2394}
2395
d41a69f1
ED
2396void __sk_flush_backlog(struct sock *sk)
2397{
2398 spin_lock_bh(&sk->sk_lock.slock);
2399 __release_sock(sk);
2400 spin_unlock_bh(&sk->sk_lock.slock);
2401}
2402
1da177e4
LT
2403/**
2404 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2405 * @sk: sock to wait on
2406 * @timeo: for how long
dfbafc99 2407 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2408 *
2409 * Now socket state including sk->sk_err is changed only under lock,
 2410 * hence we may omit checks after joining the wait queue.
 2411 * We check the receive queue before schedule() only as an optimization;
2412 * it is very likely that release_sock() added new data.
2413 */
dfbafc99 2414int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2415{
d9dc8b0f 2416 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2417 int rc;
1da177e4 2418
d9dc8b0f 2419 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2420 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2421 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2422 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2423 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2424 return rc;
2425}
1da177e4
LT
2426EXPORT_SYMBOL(sk_wait_data);
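/*
 * Illustrative sketch, not part of sock.c: a blocking receive path using
 * sk_wait_data() as intended, with the socket lock held by the caller
 * (lock_sock() in recvmsg). The function name is hypothetical.
 */
static struct sk_buff *example_wait_for_data(struct sock *sk, int noblock, int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		/* releases and re-acquires the socket lock while sleeping */
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}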
2427
3ab224be 2428/**
f8c3bf00 2429 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2430 * @sk: socket
2431 * @size: memory size to allocate
f8c3bf00 2432 * @amt: pages to allocate
3ab224be
HA
2433 * @kind: allocation type
2434 *
f8c3bf00 2435 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2436 */
f8c3bf00 2437int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2438{
2439 struct proto *prot = sk->sk_prot;
f8c3bf00 2440 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2441 bool charged = true;
e805605c 2442
baac50bb 2443 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2444 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2445 goto suppress_allocation;
3ab224be
HA
2446
2447 /* Under limit. */
e805605c 2448 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2449 sk_leave_memory_pressure(sk);
3ab224be
HA
2450 return 1;
2451 }
2452
e805605c
JW
2453 /* Under pressure. */
2454 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2455 sk_enter_memory_pressure(sk);
3ab224be 2456
e805605c
JW
2457 /* Over hard limit. */
2458 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2459 goto suppress_allocation;
2460
2461 /* guarantee minimum buffer size under pressure */
2462 if (kind == SK_MEM_RECV) {
a3dcaf17 2463 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2464 return 1;
180d8cd9 2465
3ab224be 2466 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2467 int wmem0 = sk_get_wmem0(sk, prot);
2468
3ab224be 2469 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2470 if (sk->sk_wmem_queued < wmem0)
3ab224be 2471 return 1;
a3dcaf17 2472 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2473 return 1;
a3dcaf17 2474 }
3ab224be
HA
2475 }
2476
180d8cd9 2477 if (sk_has_memory_pressure(sk)) {
5bf325a5 2478 u64 alloc;
1748376b 2479
180d8cd9 2480 if (!sk_under_memory_pressure(sk))
1748376b 2481 return 1;
180d8cd9
GC
2482 alloc = sk_sockets_allocated_read_positive(sk);
2483 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2484 sk_mem_pages(sk->sk_wmem_queued +
2485 atomic_read(&sk->sk_rmem_alloc) +
2486 sk->sk_forward_alloc))
2487 return 1;
2488 }
2489
2490suppress_allocation:
2491
2492 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2493 sk_stream_moderate_sndbuf(sk);
2494
2495 /* Fail only if socket is _under_ its sndbuf.
 2496	 * In this case we cannot block, so we have to fail.
2497 */
2498 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2499 return 1;
2500 }
2501
d6f19938
YS
2502 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2503 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2504
0e90b31f 2505 sk_memory_allocated_sub(sk, amt);
180d8cd9 2506
baac50bb
JW
2507 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2508 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2509
3ab224be
HA
2510 return 0;
2511}
f8c3bf00
PA
2512EXPORT_SYMBOL(__sk_mem_raise_allocated);
2513
2514/**
2515 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2516 * @sk: socket
2517 * @size: memory size to allocate
2518 * @kind: allocation type
2519 *
2520 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2521 * rmem allocation. This function assumes that protocols which have
2522 * memory_pressure use sk_wmem_queued as write buffer accounting.
2523 */
2524int __sk_mem_schedule(struct sock *sk, int size, int kind)
2525{
2526 int ret, amt = sk_mem_pages(size);
2527
2528 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2529 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2530 if (!ret)
2531 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2532 return ret;
2533}
3ab224be
HA
2534EXPORT_SYMBOL(__sk_mem_schedule);
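/*
 * Illustrative sketch, not part of sock.c: receive paths normally charge
 * memory through the sk_rmem_schedule()/sk_mem_charge() wrappers that sit
 * on top of __sk_mem_schedule(). Assumes the caller serialises access to
 * sk_receive_queue; the function name is hypothetical.
 */
static int example_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc and forward alloc */
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}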
2535
2536/**
f8c3bf00 2537 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2538 * @sk: socket
f8c3bf00
PA
2539 * @amount: number of quanta
2540 *
2541 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2542 */
f8c3bf00 2543void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2544{
1a24e04e 2545 sk_memory_allocated_sub(sk, amount);
3ab224be 2546
baac50bb
JW
2547 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2548 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2549
180d8cd9
GC
2550 if (sk_under_memory_pressure(sk) &&
2551 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2552 sk_leave_memory_pressure(sk);
3ab224be 2553}
f8c3bf00
PA
2554EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2555
2556/**
2557 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2558 * @sk: socket
2559 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2560 */
2561void __sk_mem_reclaim(struct sock *sk, int amount)
2562{
2563 amount >>= SK_MEM_QUANTUM_SHIFT;
2564 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2565 __sk_mem_reduce_allocated(sk, amount);
2566}
3ab224be
HA
2567EXPORT_SYMBOL(__sk_mem_reclaim);
2568
627d2d6b 2569int sk_set_peek_off(struct sock *sk, int val)
2570{
627d2d6b 2571 sk->sk_peek_off = val;
2572 return 0;
2573}
2574EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2575
1da177e4
LT
2576/*
2577 * Set of default routines for initialising struct proto_ops when
2578 * the protocol does not support a particular function. In certain
2579 * cases where it makes no sense for a protocol to have a "do nothing"
2580 * function, some default processing is provided.
2581 */
2582
2583int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2584{
2585 return -EOPNOTSUPP;
2586}
2a91525c 2587EXPORT_SYMBOL(sock_no_bind);
1da177e4 2588
4ec93edb 2589int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2590 int len, int flags)
2591{
2592 return -EOPNOTSUPP;
2593}
2a91525c 2594EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2595
2596int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2597{
2598 return -EOPNOTSUPP;
2599}
2a91525c 2600EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2601
cdfbabfb
DH
2602int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2603 bool kern)
1da177e4
LT
2604{
2605 return -EOPNOTSUPP;
2606}
2a91525c 2607EXPORT_SYMBOL(sock_no_accept);
1da177e4 2608
4ec93edb 2609int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2610 int peer)
1da177e4
LT
2611{
2612 return -EOPNOTSUPP;
2613}
2a91525c 2614EXPORT_SYMBOL(sock_no_getname);
1da177e4 2615
1da177e4
LT
2616int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2617{
2618 return -EOPNOTSUPP;
2619}
2a91525c 2620EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2621
2622int sock_no_listen(struct socket *sock, int backlog)
2623{
2624 return -EOPNOTSUPP;
2625}
2a91525c 2626EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2627
2628int sock_no_shutdown(struct socket *sock, int how)
2629{
2630 return -EOPNOTSUPP;
2631}
2a91525c 2632EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2633
2634int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2635 char __user *optval, unsigned int optlen)
1da177e4
LT
2636{
2637 return -EOPNOTSUPP;
2638}
2a91525c 2639EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2640
2641int sock_no_getsockopt(struct socket *sock, int level, int optname,
2642 char __user *optval, int __user *optlen)
2643{
2644 return -EOPNOTSUPP;
2645}
2a91525c 2646EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2647
1b784140 2648int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2649{
2650 return -EOPNOTSUPP;
2651}
2a91525c 2652EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2653
306b13eb
TH
2654int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2655{
2656 return -EOPNOTSUPP;
2657}
2658EXPORT_SYMBOL(sock_no_sendmsg_locked);
2659
1b784140
YX
2660int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2661 int flags)
1da177e4
LT
2662{
2663 return -EOPNOTSUPP;
2664}
2a91525c 2665EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2666
2667int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2668{
2669 /* Mirror missing mmap method error code */
2670 return -ENODEV;
2671}
2a91525c 2672EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2673
2674ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2675{
2676 ssize_t res;
2677 struct msghdr msg = {.msg_flags = flags};
2678 struct kvec iov;
2679 char *kaddr = kmap(page);
2680 iov.iov_base = kaddr + offset;
2681 iov.iov_len = size;
2682 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2683 kunmap(page);
2684 return res;
2685}
2a91525c 2686EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2687
306b13eb
TH
2688ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2689 int offset, size_t size, int flags)
2690{
2691 ssize_t res;
2692 struct msghdr msg = {.msg_flags = flags};
2693 struct kvec iov;
2694 char *kaddr = kmap(page);
2695
2696 iov.iov_base = kaddr + offset;
2697 iov.iov_len = size;
2698 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2699 kunmap(page);
2700 return res;
2701}
2702EXPORT_SYMBOL(sock_no_sendpage_locked);
2703
1da177e4
LT
2704/*
2705 * Default Socket Callbacks
2706 */
2707
2708static void sock_def_wakeup(struct sock *sk)
2709{
43815482
ED
2710 struct socket_wq *wq;
2711
2712 rcu_read_lock();
2713 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2714 if (skwq_has_sleeper(wq))
43815482
ED
2715 wake_up_interruptible_all(&wq->wait);
2716 rcu_read_unlock();
1da177e4
LT
2717}
2718
2719static void sock_def_error_report(struct sock *sk)
2720{
43815482
ED
2721 struct socket_wq *wq;
2722
2723 rcu_read_lock();
2724 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2725 if (skwq_has_sleeper(wq))
a9a08845 2726 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2727 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2728 rcu_read_unlock();
1da177e4
LT
2729}
2730
676d2369 2731static void sock_def_readable(struct sock *sk)
1da177e4 2732{
43815482
ED
2733 struct socket_wq *wq;
2734
2735 rcu_read_lock();
2736 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2737 if (skwq_has_sleeper(wq))
a9a08845
LT
2738 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2739 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2740 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2741 rcu_read_unlock();
1da177e4
LT
2742}
2743
2744static void sock_def_write_space(struct sock *sk)
2745{
43815482
ED
2746 struct socket_wq *wq;
2747
2748 rcu_read_lock();
1da177e4
LT
2749
2750 /* Do not wake up a writer until he can make "significant"
2751 * progress. --DaveM
2752 */
14afee4b 2753 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2754 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2755 if (skwq_has_sleeper(wq))
a9a08845
LT
2756 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2757 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2758
2759 /* Should agree with poll, otherwise some programs break */
2760 if (sock_writeable(sk))
8d8ad9d7 2761 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2762 }
2763
43815482 2764 rcu_read_unlock();
1da177e4
LT
2765}
2766
2767static void sock_def_destruct(struct sock *sk)
2768{
1da177e4
LT
2769}
2770
2771void sk_send_sigurg(struct sock *sk)
2772{
2773 if (sk->sk_socket && sk->sk_socket->file)
2774 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2775 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2776}
2a91525c 2777EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2778
2779void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2780 unsigned long expires)
2781{
2782 if (!mod_timer(timer, expires))
2783 sock_hold(sk);
2784}
1da177e4
LT
2785EXPORT_SYMBOL(sk_reset_timer);
2786
2787void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2788{
25cc4ae9 2789 if (del_timer(timer))
1da177e4
LT
2790 __sock_put(sk);
2791}
1da177e4
LT
2792EXPORT_SYMBOL(sk_stop_timer);
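/*
 * Illustrative sketch, not part of sock.c: sk_reset_timer() takes a socket
 * reference when it newly arms sk->sk_timer, and that reference is dropped
 * either by the timer handler (via sock_put()) or by sk_stop_timer(). The
 * function names and the one-second delay are hypothetical.
 */
static void example_arm_timer(struct sock *sk)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}

static void example_cancel_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}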
2793
2794void sock_init_data(struct socket *sock, struct sock *sk)
2795{
581319c5 2796 sk_init_common(sk);
1da177e4
LT
2797 sk->sk_send_head = NULL;
2798
99767f27 2799 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2800
1da177e4
LT
2801 sk->sk_allocation = GFP_KERNEL;
2802 sk->sk_rcvbuf = sysctl_rmem_default;
2803 sk->sk_sndbuf = sysctl_wmem_default;
2804 sk->sk_state = TCP_CLOSE;
972692e0 2805 sk_set_socket(sk, sock);
1da177e4
LT
2806
2807 sock_set_flag(sk, SOCK_ZAPPED);
2808
e71a4783 2809 if (sock) {
1da177e4 2810 sk->sk_type = sock->type;
43815482 2811 sk->sk_wq = sock->wq;
1da177e4 2812 sock->sk = sk;
86741ec2
LC
2813 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2814 } else {
43815482 2815 sk->sk_wq = NULL;
86741ec2
LC
2816 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2817 }
1da177e4 2818
1da177e4 2819 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2820 if (sk->sk_kern_sock)
2821 lockdep_set_class_and_name(
2822 &sk->sk_callback_lock,
2823 af_kern_callback_keys + sk->sk_family,
2824 af_family_kern_clock_key_strings[sk->sk_family]);
2825 else
2826 lockdep_set_class_and_name(
2827 &sk->sk_callback_lock,
443aef0e
PZ
2828 af_callback_keys + sk->sk_family,
2829 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2830
2831 sk->sk_state_change = sock_def_wakeup;
2832 sk->sk_data_ready = sock_def_readable;
2833 sk->sk_write_space = sock_def_write_space;
2834 sk->sk_error_report = sock_def_error_report;
2835 sk->sk_destruct = sock_def_destruct;
2836
5640f768
ED
2837 sk->sk_frag.page = NULL;
2838 sk->sk_frag.offset = 0;
ef64a54f 2839 sk->sk_peek_off = -1;
1da177e4 2840
109f6e39
EB
2841 sk->sk_peer_pid = NULL;
2842 sk->sk_peer_cred = NULL;
1da177e4
LT
2843 sk->sk_write_pending = 0;
2844 sk->sk_rcvlowat = 1;
2845 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2846 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2847
6c7c98ba 2848 sk->sk_stamp = SK_DEFAULT_STAMP;
3a0ed3e9
DD
2849#if BITS_PER_LONG==32
2850 seqlock_init(&sk->sk_stamp_seq);
2851#endif
52267790 2852 atomic_set(&sk->sk_zckey, 0);
1da177e4 2853
e0d1095a 2854#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2855 sk->sk_napi_id = 0;
64b0dc51 2856 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2857#endif
2858
76a9ebe8
ED
2859 sk->sk_max_pacing_rate = ~0UL;
2860 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2861 sk->sk_pacing_shift = 10;
70da268b 2862 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2863
2864 sk_rx_queue_clear(sk);
4dc6dc71
ED
2865 /*
2866 * Before updating sk_refcnt, we must commit prior changes to memory
2867 * (Documentation/RCU/rculist_nulls.txt for details)
2868 */
2869 smp_wmb();
41c6d650 2870 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2871 atomic_set(&sk->sk_drops, 0);
1da177e4 2872}
2a91525c 2873EXPORT_SYMBOL(sock_init_data);
1da177e4 2874
b5606c2d 2875void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2876{
2877 might_sleep();
a5b5bb9a 2878 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2879 if (sk->sk_lock.owned)
1da177e4 2880 __lock_sock(sk);
d2e9117c 2881 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2882 spin_unlock(&sk->sk_lock.slock);
2883 /*
2884 * The sk_lock has mutex_lock() semantics here:
2885 */
fcc70d5f 2886 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2887 local_bh_enable();
1da177e4 2888}
fcc70d5f 2889EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2890
b5606c2d 2891void release_sock(struct sock *sk)
1da177e4 2892{
a5b5bb9a 2893 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2894 if (sk->sk_backlog.tail)
2895 __release_sock(sk);
46d3ceab 2896
c3f9b018
ED
2897 /* Warning : release_cb() might need to release sk ownership,
2898 * ie call sock_release_ownership(sk) before us.
2899 */
46d3ceab
ED
2900 if (sk->sk_prot->release_cb)
2901 sk->sk_prot->release_cb(sk);
2902
c3f9b018 2903 sock_release_ownership(sk);
a5b5bb9a
IM
2904 if (waitqueue_active(&sk->sk_lock.wq))
2905 wake_up(&sk->sk_lock.wq);
2906 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2907}
2908EXPORT_SYMBOL(release_sock);
2909
8a74ad60
ED
2910/**
2911 * lock_sock_fast - fast version of lock_sock
2912 * @sk: socket
2913 *
 2914 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2915 * return false if fast path is taken:
2916 *
8a74ad60 2917 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2918 *
2919 * return true if slow path is taken:
2920 *
8a74ad60
ED
2921 * sk_lock.slock unlocked, owned = 1, BH enabled
2922 */
2923bool lock_sock_fast(struct sock *sk)
2924{
2925 might_sleep();
2926 spin_lock_bh(&sk->sk_lock.slock);
2927
2928 if (!sk->sk_lock.owned)
2929 /*
2930 * Note : We must disable BH
2931 */
2932 return false;
2933
2934 __lock_sock(sk);
2935 sk->sk_lock.owned = 1;
2936 spin_unlock(&sk->sk_lock.slock);
2937 /*
2938 * The sk_lock has mutex_lock() semantics here:
2939 */
2940 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2941 local_bh_enable();
2942 return true;
2943}
2944EXPORT_SYMBOL(lock_sock_fast);
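/*
 * Illustrative sketch, not part of sock.c: lock_sock_fast() must be paired
 * with unlock_sock_fast(), which needs the returned bool to know whether
 * the slow path (full lock_sock() semantics) was taken. The function name
 * is hypothetical.
 */
static void example_short_section(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-blocking work on the socket goes here */

	unlock_sock_fast(sk, slow);
}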
2945
1da177e4 2946int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2947{
b7aa0bf7 2948 struct timeval tv;
9dae3497
YS
2949
2950 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3a0ed3e9 2951 tv = ktime_to_timeval(sock_read_timestamp(sk));
b7aa0bf7 2952 if (tv.tv_sec == -1)
1da177e4 2953 return -ENOENT;
b7aa0bf7 2954 if (tv.tv_sec == 0) {
3a0ed3e9
DD
2955 ktime_t kt = ktime_get_real();
2956 sock_write_timestamp(sk, kt);
2957 tv = ktime_to_timeval(kt);
b7aa0bf7
ED
2958 }
2959 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2960}
1da177e4
LT
2961EXPORT_SYMBOL(sock_get_timestamp);
2962
ae40eb1e
ED
2963int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2964{
2965 struct timespec ts;
9dae3497
YS
2966
2967 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3a0ed3e9 2968 ts = ktime_to_timespec(sock_read_timestamp(sk));
ae40eb1e
ED
2969 if (ts.tv_sec == -1)
2970 return -ENOENT;
2971 if (ts.tv_sec == 0) {
3a0ed3e9
DD
2972 ktime_t kt = ktime_get_real();
2973 sock_write_timestamp(sk, kt);
ae40eb1e
ED
2974 ts = ktime_to_timespec(sk->sk_stamp);
2975 }
2976 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2977}
2978EXPORT_SYMBOL(sock_get_timestampns);
2979
20d49473 2980void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2981{
20d49473 2982 if (!sock_flag(sk, flag)) {
08e29af3
ED
2983 unsigned long previous_flags = sk->sk_flags;
2984
20d49473
PO
2985 sock_set_flag(sk, flag);
2986 /*
2987 * we just set one of the two flags which require net
2988 * time stamping, but time stamping might have been on
2989 * already because of the other one
2990 */
080a270f
HFS
2991 if (sock_needs_netstamp(sk) &&
2992 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2993 net_enable_timestamp();
1da177e4
LT
2994 }
2995}
1da177e4 2996
cb820f8e
RC
2997int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2998 int level, int type)
2999{
3000 struct sock_exterr_skb *serr;
364a9e93 3001 struct sk_buff *skb;
cb820f8e
RC
3002 int copied, err;
3003
3004 err = -EAGAIN;
364a9e93 3005 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
3006 if (skb == NULL)
3007 goto out;
3008
3009 copied = skb->len;
3010 if (copied > len) {
3011 msg->msg_flags |= MSG_TRUNC;
3012 copied = len;
3013 }
51f3d02b 3014 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
3015 if (err)
3016 goto out_free_skb;
3017
3018 sock_recv_timestamp(msg, sk, skb);
3019
3020 serr = SKB_EXT_ERR(skb);
3021 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3022
3023 msg->msg_flags |= MSG_ERRQUEUE;
3024 err = copied;
3025
cb820f8e
RC
3026out_free_skb:
3027 kfree_skb(skb);
3028out:
3029 return err;
3030}
3031EXPORT_SYMBOL(sock_recv_errqueue);
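/*
 * Illustrative sketch, not part of sock.c: a protocol recvmsg handler that
 * delegates MSG_ERRQUEUE reads to the generic helper above. The handler
 * name and the SOL_IP/IP_RECVERR cmsg level/type are merely examples.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);

	/* normal receive path would follow here */
	return -EAGAIN;
}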
3032
1da177e4
LT
3033/*
 3034 * Get a socket option on a socket.
3035 *
3036 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3037 * asynchronous errors should be reported by getsockopt. We assume
 3038 * this means if you specify SO_ERROR (otherwise what's the point of it).
3039 */
3040int sock_common_getsockopt(struct socket *sock, int level, int optname,
3041 char __user *optval, int __user *optlen)
3042{
3043 struct sock *sk = sock->sk;
3044
3045 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3046}
1da177e4
LT
3047EXPORT_SYMBOL(sock_common_getsockopt);
3048
3fdadf7d 3049#ifdef CONFIG_COMPAT
543d9cfe
ACM
3050int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3051 char __user *optval, int __user *optlen)
3fdadf7d
DM
3052{
3053 struct sock *sk = sock->sk;
3054
1e51f951 3055 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
3056 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3057 optval, optlen);
3fdadf7d
DM
3058 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3059}
3060EXPORT_SYMBOL(compat_sock_common_getsockopt);
3061#endif
3062
1b784140
YX
3063int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3064 int flags)
1da177e4
LT
3065{
3066 struct sock *sk = sock->sk;
3067 int addr_len = 0;
3068 int err;
3069
1b784140 3070 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
3071 flags & ~MSG_DONTWAIT, &addr_len);
3072 if (err >= 0)
3073 msg->msg_namelen = addr_len;
3074 return err;
3075}
1da177e4
LT
3076EXPORT_SYMBOL(sock_common_recvmsg);
3077
3078/*
3079 * Set socket options on an inet socket.
3080 */
3081int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3082 char __user *optval, unsigned int optlen)
1da177e4
LT
3083{
3084 struct sock *sk = sock->sk;
3085
3086 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3087}
1da177e4
LT
3088EXPORT_SYMBOL(sock_common_setsockopt);
3089
3fdadf7d 3090#ifdef CONFIG_COMPAT
543d9cfe 3091int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3092 char __user *optval, unsigned int optlen)
3fdadf7d
DM
3093{
3094 struct sock *sk = sock->sk;
3095
543d9cfe
ACM
3096 if (sk->sk_prot->compat_setsockopt != NULL)
3097 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3098 optval, optlen);
3fdadf7d
DM
3099 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3100}
3101EXPORT_SYMBOL(compat_sock_common_setsockopt);
3102#endif
3103
1da177e4
LT
3104void sk_common_release(struct sock *sk)
3105{
3106 if (sk->sk_prot->destroy)
3107 sk->sk_prot->destroy(sk);
3108
3109 /*
 3110	 * Observation: when sock_common_release is called, processes have
 3111	 * no access to the socket, but the network stack still does.
3112 * Step one, detach it from networking:
3113 *
3114 * A. Remove from hash tables.
3115 */
3116
3117 sk->sk_prot->unhash(sk);
3118
3119 /*
 3120	 * At this point the socket cannot receive new packets, but it is possible
 3121	 * that some packets are in flight because some CPU is running the receiver
 3122	 * and did the hash table lookup before we unhashed the socket. They will
 3123	 * reach the receive queue and will be purged by the socket destructor.
 3124	 *
 3125	 * Also we still have packets pending on the receive queue and probably
 3126	 * our own packets waiting in device queues. sock_destroy will drain the
 3127	 * receive queue, but transmitted packets will delay socket destruction
 3128	 * until the last reference is released.
3129 */
3130
3131 sock_orphan(sk);
3132
3133 xfrm_sk_free_policy(sk);
3134
e6848976 3135 sk_refcnt_debug_release(sk);
5640f768 3136
1da177e4
LT
3137 sock_put(sk);
3138}
1da177e4
LT
3139EXPORT_SYMBOL(sk_common_release);
3140
a2d133b1
JH
3141void sk_get_meminfo(const struct sock *sk, u32 *mem)
3142{
3143 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3144
3145 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3146 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3147 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3148 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3149 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3150 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3151 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3152 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3153 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3154}
3155
13ff3d6f
PE
3156#ifdef CONFIG_PROC_FS
3157#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3158struct prot_inuse {
3159 int val[PROTO_INUSE_NR];
3160};
13ff3d6f
PE
3161
3162static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3163
70ee1159
PE
3164void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3165{
08fc7f81 3166 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3167}
3168EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3169
3170int sock_prot_inuse_get(struct net *net, struct proto *prot)
3171{
3172 int cpu, idx = prot->inuse_idx;
3173 int res = 0;
3174
3175 for_each_possible_cpu(cpu)
08fc7f81 3176 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3177
3178 return res >= 0 ? res : 0;
3179}
3180EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3181
648845ab
TZ
3182static void sock_inuse_add(struct net *net, int val)
3183{
3184 this_cpu_add(*net->core.sock_inuse, val);
3185}
3186
3187int sock_inuse_get(struct net *net)
3188{
3189 int cpu, res = 0;
3190
3191 for_each_possible_cpu(cpu)
3192 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3193
3194 return res;
3195}
3196
3197EXPORT_SYMBOL_GPL(sock_inuse_get);
3198
2c8c1e72 3199static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3200{
08fc7f81 3201 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3202 if (net->core.prot_inuse == NULL)
3203 return -ENOMEM;
3204
3205 net->core.sock_inuse = alloc_percpu(int);
3206 if (net->core.sock_inuse == NULL)
3207 goto out;
3208
3209 return 0;
3210
3211out:
3212 free_percpu(net->core.prot_inuse);
3213 return -ENOMEM;
70ee1159
PE
3214}
3215
2c8c1e72 3216static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3217{
08fc7f81 3218 free_percpu(net->core.prot_inuse);
648845ab 3219 free_percpu(net->core.sock_inuse);
70ee1159
PE
3220}
3221
3222static struct pernet_operations net_inuse_ops = {
3223 .init = sock_inuse_init_net,
3224 .exit = sock_inuse_exit_net,
3225};
3226
3227static __init int net_inuse_init(void)
3228{
3229 if (register_pernet_subsys(&net_inuse_ops))
3230 panic("Cannot initialize net inuse counters");
3231
3232 return 0;
3233}
3234
3235core_initcall(net_inuse_init);
13ff3d6f
PE
3236
3237static void assign_proto_idx(struct proto *prot)
3238{
3239 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3240
3241 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3242 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3243 return;
3244 }
3245
3246 set_bit(prot->inuse_idx, proto_inuse_idx);
3247}
3248
3249static void release_proto_idx(struct proto *prot)
3250{
3251 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3252 clear_bit(prot->inuse_idx, proto_inuse_idx);
3253}
3254#else
3255static inline void assign_proto_idx(struct proto *prot)
3256{
3257}
3258
3259static inline void release_proto_idx(struct proto *prot)
3260{
3261}
648845ab
TZ
3262
3263static void sock_inuse_add(struct net *net, int val)
3264{
3265}
13ff3d6f
PE
3266#endif
3267
0159dfd3
ED
3268static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3269{
3270 if (!rsk_prot)
3271 return;
3272 kfree(rsk_prot->slab_name);
3273 rsk_prot->slab_name = NULL;
adf78eda
JL
3274 kmem_cache_destroy(rsk_prot->slab);
3275 rsk_prot->slab = NULL;
0159dfd3
ED
3276}
3277
3278static int req_prot_init(const struct proto *prot)
3279{
3280 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3281
3282 if (!rsk_prot)
3283 return 0;
3284
3285 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3286 prot->name);
3287 if (!rsk_prot->slab_name)
3288 return -ENOMEM;
3289
3290 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3291 rsk_prot->obj_size, 0,
e699e2c6
SB
3292 SLAB_ACCOUNT | prot->slab_flags,
3293 NULL);
0159dfd3
ED
3294
3295 if (!rsk_prot->slab) {
3296 pr_crit("%s: Can't create request sock SLAB cache!\n",
3297 prot->name);
3298 return -ENOMEM;
3299 }
3300 return 0;
3301}
3302
b733c007
PE
3303int proto_register(struct proto *prot, int alloc_slab)
3304{
1da177e4 3305 if (alloc_slab) {
30c2c9f1
DW
3306 prot->slab = kmem_cache_create_usercopy(prot->name,
3307 prot->obj_size, 0,
e699e2c6
SB
3308 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3309 prot->slab_flags,
289a4860 3310 prot->useroffset, prot->usersize,
271b72c7 3311 NULL);
1da177e4
LT
3312
3313 if (prot->slab == NULL) {
e005d193
JP
3314 pr_crit("%s: Can't create sock SLAB cache!\n",
3315 prot->name);
60e7663d 3316 goto out;
1da177e4 3317 }
2e6599cb 3318
0159dfd3
ED
3319 if (req_prot_init(prot))
3320 goto out_free_request_sock_slab;
8feaf0c0 3321
6d6ee43e 3322 if (prot->twsk_prot != NULL) {
faf23422 3323 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3324
7e56b5d6 3325 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3326 goto out_free_request_sock_slab;
3327
6d6ee43e 3328 prot->twsk_prot->twsk_slab =
7e56b5d6 3329 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3330 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3331 0,
e699e2c6 3332 SLAB_ACCOUNT |
52db70dc 3333 prot->slab_flags,
20c2df83 3334 NULL);
6d6ee43e 3335 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3336 goto out_free_timewait_sock_slab_name;
3337 }
1da177e4
LT
3338 }
3339
36b77a52 3340 mutex_lock(&proto_list_mutex);
1da177e4 3341 list_add(&prot->node, &proto_list);
13ff3d6f 3342 assign_proto_idx(prot);
36b77a52 3343 mutex_unlock(&proto_list_mutex);
b733c007
PE
3344 return 0;
3345
8feaf0c0 3346out_free_timewait_sock_slab_name:
7e56b5d6 3347 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3348out_free_request_sock_slab:
0159dfd3
ED
3349 req_prot_cleanup(prot->rsk_prot);
3350
2e6599cb
ACM
3351 kmem_cache_destroy(prot->slab);
3352 prot->slab = NULL;
b733c007
PE
3353out:
3354 return -ENOBUFS;
1da177e4 3355}
1da177e4
LT
3356EXPORT_SYMBOL(proto_register);
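/*
 * Illustrative sketch, not part of sock.c: a minimal module registering a
 * protocol with its own slab cache and unregistering it on exit. The
 * protocol name, module hooks and obj_size are hypothetical.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);	/* 1 => allocate a slab */
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}

module_init(example_proto_init);
module_exit(example_proto_exit);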
3357
3358void proto_unregister(struct proto *prot)
3359{
36b77a52 3360 mutex_lock(&proto_list_mutex);
13ff3d6f 3361 release_proto_idx(prot);
0a3f4358 3362 list_del(&prot->node);
36b77a52 3363 mutex_unlock(&proto_list_mutex);
1da177e4 3364
adf78eda
JL
3365 kmem_cache_destroy(prot->slab);
3366 prot->slab = NULL;
1da177e4 3367
0159dfd3 3368 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3369
6d6ee43e 3370 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3371 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3372 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3373 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3374 }
1da177e4 3375}
1da177e4
LT
3376EXPORT_SYMBOL(proto_unregister);
3377
bf2ae2e4
XL
3378int sock_load_diag_module(int family, int protocol)
3379{
3380 if (!protocol) {
3381 if (!sock_is_registered(family))
3382 return -ENOENT;
3383
3384 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3385 NETLINK_SOCK_DIAG, family);
3386 }
3387
3388#ifdef CONFIG_INET
3389 if (family == AF_INET &&
c34c1287 3390 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3391 !rcu_access_pointer(inet_protos[protocol]))
3392 return -ENOENT;
3393#endif
3394
3395 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3396 NETLINK_SOCK_DIAG, family, protocol);
3397}
3398EXPORT_SYMBOL(sock_load_diag_module);
3399
1da177e4 3400#ifdef CONFIG_PROC_FS
1da177e4 3401static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3402 __acquires(proto_list_mutex)
1da177e4 3403{
36b77a52 3404 mutex_lock(&proto_list_mutex);
60f0438a 3405 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3406}
3407
3408static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3409{
60f0438a 3410 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3411}
3412
3413static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3414 __releases(proto_list_mutex)
1da177e4 3415{
36b77a52 3416 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3417}
3418
3419static char proto_method_implemented(const void *method)
3420{
3421 return method == NULL ? 'n' : 'y';
3422}
180d8cd9
GC
3423static long sock_prot_memory_allocated(struct proto *proto)
3424{
cb75a36c 3425 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3426}
3427
3428static char *sock_prot_memory_pressure(struct proto *proto)
3429{
3430 return proto->memory_pressure != NULL ?
3431 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3432}
1da177e4
LT
3433
3434static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3435{
180d8cd9 3436
8d987e5c 3437 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3438 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3439 proto->name,
3440 proto->obj_size,
14e943db 3441 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3442 sock_prot_memory_allocated(proto),
3443 sock_prot_memory_pressure(proto),
1da177e4
LT
3444 proto->max_header,
3445 proto->slab == NULL ? "no" : "yes",
3446 module_name(proto->owner),
3447 proto_method_implemented(proto->close),
3448 proto_method_implemented(proto->connect),
3449 proto_method_implemented(proto->disconnect),
3450 proto_method_implemented(proto->accept),
3451 proto_method_implemented(proto->ioctl),
3452 proto_method_implemented(proto->init),
3453 proto_method_implemented(proto->destroy),
3454 proto_method_implemented(proto->shutdown),
3455 proto_method_implemented(proto->setsockopt),
3456 proto_method_implemented(proto->getsockopt),
3457 proto_method_implemented(proto->sendmsg),
3458 proto_method_implemented(proto->recvmsg),
3459 proto_method_implemented(proto->sendpage),
3460 proto_method_implemented(proto->bind),
3461 proto_method_implemented(proto->backlog_rcv),
3462 proto_method_implemented(proto->hash),
3463 proto_method_implemented(proto->unhash),
3464 proto_method_implemented(proto->get_port),
3465 proto_method_implemented(proto->enter_memory_pressure));
3466}
3467
3468static int proto_seq_show(struct seq_file *seq, void *v)
3469{
60f0438a 3470 if (v == &proto_list)
1da177e4
LT
3471 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3472 "protocol",
3473 "size",
3474 "sockets",
3475 "memory",
3476 "press",
3477 "maxhdr",
3478 "slab",
3479 "module",
3480 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3481 else
60f0438a 3482 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3483 return 0;
3484}
3485
f690808e 3486static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3487 .start = proto_seq_start,
3488 .next = proto_seq_next,
3489 .stop = proto_seq_stop,
3490 .show = proto_seq_show,
3491};
3492
14e943db
ED
3493static __net_init int proto_init_net(struct net *net)
3494{
c3506372
CH
3495 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3496 sizeof(struct seq_net_private)))
14e943db
ED
3497 return -ENOMEM;
3498
3499 return 0;
3500}
3501
3502static __net_exit void proto_exit_net(struct net *net)
3503{
ece31ffd 3504 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3505}
3506
3507
3508static __net_initdata struct pernet_operations proto_net_ops = {
3509 .init = proto_init_net,
3510 .exit = proto_exit_net,
1da177e4
LT
3511};
3512
3513static int __init proto_init(void)
3514{
14e943db 3515 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3516}
3517
3518subsys_initcall(proto_init);
3519
3520#endif /* PROC_FS */
7db6b048
SS
3521
3522#ifdef CONFIG_NET_RX_BUSY_POLL
3523bool sk_busy_loop_end(void *p, unsigned long start_time)
3524{
3525 struct sock *sk = p;
3526
3527 return !skb_queue_empty(&sk->sk_receive_queue) ||
3528 sk_busy_loop_timeout(sk, start_time);
3529}
3530EXPORT_SYMBOL(sk_busy_loop_end);
3531#endif /* CONFIG_NET_RX_BUSY_POLL */