/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

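/* Example (illustrative sketch, not part of the original file): how a
 * protocol might use the helpers above to gate a privileged per-socket
 * setting. The function name below is hypothetical; compare the SO_MARK
 * case in sock_setsockopt() later in this file, which performs a similar
 * ns_capable() check.
 *
 *	static int hypothetical_set_privileged_opt(struct sock *sk, u32 val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;	// neither opener nor caller privileged
 *		sk->sk_mark = val;
 *		return 0;
 *	}
 */
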
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x) \
	x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
	x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
	x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
	x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
	x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
	x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
	x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
	x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
	x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
	x "27" , x "28" , x "AF_CAN" , \
	x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
	x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
	x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
	x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
	x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" ,
	"rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" ,
	"rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" ,
	"rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" ,
	"rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" ,
	"rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" ,
	"rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" ,
	"rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" ,
	"rlock-27" , "rlock-28" , "rlock-AF_CAN" ,
	"rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" ,
	"rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
	"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
	"rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
	"rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
	"rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	"wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
	"wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK",
	"wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" ,
	"wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" ,
	"wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" ,
	"wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" ,
	"wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" ,
	"wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" ,
	"wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" ,
	"wlock-27" , "wlock-28" , "wlock-AF_CAN" ,
	"wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" ,
	"wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
	"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
	"wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
	"wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
	"wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	"elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
	"elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK",
	"elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" ,
	"elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" ,
	"elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" ,
	"elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" ,
	"elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" ,
	"elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" ,
	"elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" ,
	"elock-27" , "elock-28" , "elock-AF_CAN" ,
	"elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" ,
	"elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
	"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
	"elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
	"elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
	"elock-AF_MAX"
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

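/* Example (illustrative sketch): the intended SOCK_MEMALLOC life cycle, as
 * a swap-over-network user might drive it. The function below is
 * hypothetical, not an in-tree API:
 *
 *	static void hypothetical_enable_swap_socket(struct sock *sk)
 *	{
 *		sk_set_memalloc(sk);	// may dip into emergency reserves
 *		// ... socket carries swap I/O while the static key is on ...
 *		sk_clear_memalloc(sk);	// reclaim reserves; rmem limits apply again
 *	}
 */
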
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

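/* Worked example of the conversion above, assuming HZ == 1000 (so that
 * USEC_PER_SEC / HZ == 1000 usec per jiffy):
 *
 *	tv = { .tv_sec = 1, .tv_usec = 500000 }
 *	*timeo_p = 1 * HZ + DIV_ROUND_UP(500000, 1000)
 *	         = 1000 + 500 = 1500 jiffies
 *
 * DIV_ROUND_UP() rounds a sub-jiffy remainder up, so a tiny nonzero
 * timeout still yields at least one jiffy instead of truncating to a
 * non-blocking zero.
 */
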
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

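/* Example (user-space sketch): how the sk_drops accounting done above can
 * be observed through SO_RXQ_OVFL ancillary data. Error handling omitted:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_RXQ_OVFL, &one, sizeof(one));
 *	// recvmsg() will now attach a SOL_SOCKET/SO_RXQ_OVFL cmsg carrying
 *	// a __u32 drop count, stamped via sock_skb_set_dropcount() above.
 */
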
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

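/* Example (user-space sketch): the caller's view of the helper above.
 * Requires CAP_NET_RAW; the device name "eth0" is illustrative:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	// passing "" (or option length 0) unbinds the socket again
 */
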
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this. BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

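	/* Example (user-space sketch) of the doubling described above:
	 *
	 *	int req = 65536, got; socklen_t len = sizeof(got);
	 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	 *	// got == 131072 (2 * req), provided req <= sysctl_rmem_max
	 */
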
	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

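/* The two memcpy() calls above rely on the field layout of struct sock
 * (see include/net/sock.h), conceptually:
 *
 *	struct sock {
 *		...                     // copied by the first memcpy()
 *		sk_dontcopy_begin       // start of the skipped window
 *		...                     // refcount, node links: NOT copied
 *		sk_dontcopy_end         // end of the skipped window
 *		...                     // copied by the second memcpy()
 *	};
 *
 * (Layout sketch only; the markers are zero-size members in the real
 * structure.)
 */
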
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

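/* Example (conceptual sketch): why the RCU deferral above matters. A
 * lockless reader, such as a UDP socket lookup, may do:
 *
 *	rcu_read_lock();
 *	sk = some_lockless_lookup(...);		// hypothetical helper
 *	if (sk && refcount_inc_not_zero(&sk->sk_refcnt))
 *		use(sk);			// memory not yet freed
 *	rcu_read_unlock();
 *
 * Freeing SOCK_RCU_FREE sockets only after a grace period guarantees the
 * object stays valid for the whole read-side critical section, even if
 * the refcount grab fails.
 */
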
static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not zero, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
				   af_rlock_keys + sk->sk_family,
				   af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
				   af_wlock_keys + sk->sk_family,
				   af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
				   af_elock_keys + sk->sk_family,
				   af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache = NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * the original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

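/* Example (illustrative sketch): the calling convention documented above,
 * as a protocol's accept/clone path might use it:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		// ... protocol-specific initialization of newsk ...
 *		bh_unlock_sock(newsk);	// the caller owns the unlock
 *	}
 */
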
94352d45
ACM
1771void sk_free_unlock_clone(struct sock *sk)
1772{
1773 /* It is still raw copy of parent, so invalidate
1774 * destructor and make plain sk_free() */
1775 sk->sk_destruct = NULL;
1776 bh_unlock_sock(sk);
1777 sk_free(sk);
1778}
1779EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1780
9958089a
AK
1781void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1782{
d6a4e26a
ED
1783 u32 max_segs = 1;
1784
6bd4f355 1785 sk_dst_set(sk, dst);
0a6b2a1d 1786 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1787 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1788 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1789 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1790 if (sk_can_gso(sk)) {
f70f250a 1791 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1792 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1793 } else {
9958089a 1794 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1795 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1796 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1797 }
9958089a 1798 }
d6a4e26a 1799 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1800}
1801EXPORT_SYMBOL_GPL(sk_setup_caps);
1802
1da177e4
LT
1803/*
1804 * Simple resource managers for sockets.
1805 */
1806
1807
4ec93edb
YH
1808/*
1809 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1810 */
1811void sock_wfree(struct sk_buff *skb)
1812{
1813 struct sock *sk = skb->sk;
d99927f4 1814 unsigned int len = skb->truesize;
1da177e4 1815
d99927f4
ED
1816 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1817 /*
1818 * Keep a reference on sk_wmem_alloc; it will be released
1819 * after the sk_write_space() call
1820 */
14afee4b 1821 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1822 sk->sk_write_space(sk);
d99927f4
ED
1823 len = 1;
1824 }
2b85a34e 1825 /*
d99927f4
ED
1826 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1827 * could not do because of in-flight packets
2b85a34e 1828 */
14afee4b 1829 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1830 __sk_free(sk);
1da177e4 1831}
2a91525c 1832EXPORT_SYMBOL(sock_wfree);
1da177e4 1833
1d2077ac
ED
1834/* This variant of sock_wfree() is used by TCP,
1835 * since it sets SOCK_USE_WRITE_QUEUE.
1836 */
1837void __sock_wfree(struct sk_buff *skb)
1838{
1839 struct sock *sk = skb->sk;
1840
14afee4b 1841 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1842 __sk_free(sk);
1843}
1844
9e17f8a4
ED
1845void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1846{
1847 skb_orphan(skb);
1848 skb->sk = sk;
1849#ifdef CONFIG_INET
1850 if (unlikely(!sk_fullsock(sk))) {
1851 skb->destructor = sock_edemux;
1852 sock_hold(sk);
1853 return;
1854 }
1855#endif
1856 skb->destructor = sock_wfree;
1857 skb_set_hash_from_sk(skb, sk);
1858 /*
1859 * We used to take a refcount on sk, but the following operation
1860 * is enough to guarantee sk_free() won't free this sock until
1861 * all in-flight packets have completed
1862 */
14afee4b 1863 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1864}
1865EXPORT_SYMBOL(skb_set_owner_w);
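
As a usage illustration, a minimal sketch of the pairing between skb_set_owner_w() and sock_wfree() on a transmit path; the helper name my_proto_alloc_xmit is hypothetical:

#include <linux/skbuff.h>
#include <net/sock.h>

/* Hypothetical helper: charge a freshly allocated skb to sk. When the
 * skb is freed, sock_wfree() above uncharges sk_wmem_alloc and calls
 * sk->sk_write_space() so blocked writers can make progress.
 */
static struct sk_buff *my_proto_alloc_xmit(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (!skb)
		return NULL;
	skb_set_owner_w(skb, sk);	/* sets skb->destructor = sock_wfree */
	return skb;
}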
1866
1d2077ac
ED
1867/* This helper is used by netem, as it can hold packets in its
1868 * delay queue. We want to allow the owner socket to send more
1869 * packets, as if they were already TX completed by a typical driver.
1870 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1871 * rely on it (sch_fq for example).
1d2077ac 1872 */
f2f872f9
ED
1873void skb_orphan_partial(struct sk_buff *skb)
1874{
f6ba8d33 1875 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
1876 return;
1877
f2f872f9
ED
1878 if (skb->destructor == sock_wfree
1879#ifdef CONFIG_INET
1880 || skb->destructor == tcp_wfree
1881#endif
1882 ) {
f6ba8d33
ED
1883 struct sock *sk = skb->sk;
1884
41c6d650 1885 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 1886 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
1887 skb->destructor = sock_efree;
1888 }
f2f872f9
ED
1889 } else {
1890 skb_orphan(skb);
1891 }
1892}
1893EXPORT_SYMBOL(skb_orphan_partial);
1894
4ec93edb
YH
1895/*
1896 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
1897 */
1898void sock_rfree(struct sk_buff *skb)
1899{
1900 struct sock *sk = skb->sk;
d361fd59 1901 unsigned int len = skb->truesize;
1da177e4 1902
d361fd59
ED
1903 atomic_sub(len, &sk->sk_rmem_alloc);
1904 sk_mem_uncharge(sk, len);
1da177e4 1905}
2a91525c 1906EXPORT_SYMBOL(sock_rfree);
1da177e4 1907
7768eed8
OH
1908/*
1909 * Buffer destructor for skbs that are not used directly in read or write
1910 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1911 */
62bccb8c
AD
1912void sock_efree(struct sk_buff *skb)
1913{
1914 sock_put(skb->sk);
1915}
1916EXPORT_SYMBOL(sock_efree);
1917
976d0201 1918kuid_t sock_i_uid(struct sock *sk)
1da177e4 1919{
976d0201 1920 kuid_t uid;
1da177e4 1921
f064af1e 1922 read_lock_bh(&sk->sk_callback_lock);
976d0201 1923 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 1924 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1925 return uid;
1926}
2a91525c 1927EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
1928
1929unsigned long sock_i_ino(struct sock *sk)
1930{
1931 unsigned long ino;
1932
f064af1e 1933 read_lock_bh(&sk->sk_callback_lock);
1da177e4 1934 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 1935 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
1936 return ino;
1937}
2a91525c 1938EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
1939
1940/*
1941 * Allocate a skb from the socket's send buffer.
1942 */
86a76caf 1943struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 1944 gfp_t priority)
1da177e4 1945{
14afee4b 1946 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 1947 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
1948 if (skb) {
1949 skb_set_owner_w(skb, sk);
1950 return skb;
1951 }
1952 }
1953 return NULL;
1954}
2a91525c 1955EXPORT_SYMBOL(sock_wmalloc);
1da177e4 1956
98ba0bd5
WB
1957static void sock_ofree(struct sk_buff *skb)
1958{
1959 struct sock *sk = skb->sk;
1960
1961 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1962}
1963
1964struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1965 gfp_t priority)
1966{
1967 struct sk_buff *skb;
1968
1969 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1970 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1971 sysctl_optmem_max)
1972 return NULL;
1973
1974 skb = alloc_skb(size, priority);
1975 if (!skb)
1976 return NULL;
1977
1978 atomic_add(skb->truesize, &sk->sk_omem_alloc);
1979 skb->sk = sk;
1980 skb->destructor = sock_ofree;
1981 return skb;
1982}
1983
4ec93edb 1984/*
1da177e4 1985 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 1986 */
dd0fc66f 1987void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 1988{
95c96174 1989 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
1990 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1991 void *mem;
1992 /* First do the add, to avoid the race if kmalloc
4ec93edb 1993 * might sleep.
1da177e4
LT
1994 */
1995 atomic_add(size, &sk->sk_omem_alloc);
1996 mem = kmalloc(size, priority);
1997 if (mem)
1998 return mem;
1999 atomic_sub(size, &sk->sk_omem_alloc);
2000 }
2001 return NULL;
2002}
2a91525c 2003EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2004
79e88659
DB
2005/* Free an option memory block. Note, we actually want the inline
2006 * here as this allows gcc to detect the nullify and fold away the
2007 * condition entirely.
1da177e4 2008 */
79e88659
DB
2009static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2010 const bool nullify)
1da177e4 2011{
e53da5fb
DM
2012 if (WARN_ON_ONCE(!mem))
2013 return;
79e88659
DB
2014 if (nullify)
2015 kzfree(mem);
2016 else
2017 kfree(mem);
1da177e4
LT
2018 atomic_sub(size, &sk->sk_omem_alloc);
2019}
79e88659
DB
2020
2021void sock_kfree_s(struct sock *sk, void *mem, int size)
2022{
2023 __sock_kfree_s(sk, mem, size, false);
2024}
2a91525c 2025EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2026
79e88659
DB
2027void sock_kzfree_s(struct sock *sk, void *mem, int size)
2028{
2029 __sock_kfree_s(sk, mem, size, true);
2030}
2031EXPORT_SYMBOL(sock_kzfree_s);
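
A sketch of the expected pairing for option memory: free with the same size that was charged so sk_omem_alloc balances, and prefer sock_kzfree_s() when the buffer held key material (struct my_opt and my_opt_roundtrip are hypothetical):

#include <net/sock.h>

struct my_opt {			/* hypothetical per-socket option blob */
	u32 flags;
	u8 key[16];
};

static int my_opt_roundtrip(struct sock *sk)
{
	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	/* ... fill in *opt from the setsockopt payload ... */
	sock_kzfree_s(sk, opt, sizeof(*opt));	/* zeroes, then uncharges */
	return 0;
}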
2032
1da177e4
LT
2033/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2034 I think these locks should be removed for datagram sockets.
2035 */
2a91525c 2036static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2037{
2038 DEFINE_WAIT(wait);
2039
9cd3e072 2040 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2041 for (;;) {
2042 if (!timeo)
2043 break;
2044 if (signal_pending(current))
2045 break;
2046 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2047 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2048 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2049 break;
2050 if (sk->sk_shutdown & SEND_SHUTDOWN)
2051 break;
2052 if (sk->sk_err)
2053 break;
2054 timeo = schedule_timeout(timeo);
2055 }
aa395145 2056 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2057 return timeo;
2058}
2059
2060
2061/*
2062 * Generic send/receive buffer handlers
2063 */
2064
4cc7f68d
HX
2065struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2066 unsigned long data_len, int noblock,
28d64271 2067 int *errcode, int max_page_order)
1da177e4 2068{
2e4e4410 2069 struct sk_buff *skb;
1da177e4
LT
2070 long timeo;
2071 int err;
2072
1da177e4 2073 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2074 for (;;) {
1da177e4
LT
2075 err = sock_error(sk);
2076 if (err != 0)
2077 goto failure;
2078
2079 err = -EPIPE;
2080 if (sk->sk_shutdown & SEND_SHUTDOWN)
2081 goto failure;
2082
2e4e4410
ED
2083 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2084 break;
28d64271 2085
9cd3e072 2086 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2087 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2088 err = -EAGAIN;
2089 if (!timeo)
1da177e4 2090 goto failure;
2e4e4410
ED
2091 if (signal_pending(current))
2092 goto interrupted;
2093 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2094 }
2e4e4410
ED
2095 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2096 errcode, sk->sk_allocation);
2097 if (skb)
2098 skb_set_owner_w(skb, sk);
1da177e4
LT
2099 return skb;
2100
2101interrupted:
2102 err = sock_intr_errno(timeo);
2103failure:
2104 *errcode = err;
2105 return NULL;
2106}
4cc7f68d 2107EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2108
4ec93edb 2109struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2110 int noblock, int *errcode)
2111{
28d64271 2112 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2113}
2a91525c 2114EXPORT_SYMBOL(sock_alloc_send_skb);
1da177e4 2115
39771b12
WB
2116int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2117 struct sockcm_cookie *sockc)
2118{
3dd17e63
SHY
2119 u32 tsflags;
2120
39771b12
WB
2121 switch (cmsg->cmsg_type) {
2122 case SO_MARK:
2123 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2124 return -EPERM;
2125 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2126 return -EINVAL;
2127 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2128 break;
3dd17e63
SHY
2129 case SO_TIMESTAMPING:
2130 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2131 return -EINVAL;
2132
2133 tsflags = *(u32 *)CMSG_DATA(cmsg);
2134 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2135 return -EINVAL;
2136
2137 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2138 sockc->tsflags |= tsflags;
2139 break;
779f1ede
SHY
2140 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2141 case SCM_RIGHTS:
2142 case SCM_CREDENTIALS:
2143 break;
39771b12
WB
2144 default:
2145 return -EINVAL;
2146 }
2147 return 0;
2148}
2149EXPORT_SYMBOL(__sock_cmsg_send);
2150
f28ea365
EJ
2151int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2152 struct sockcm_cookie *sockc)
2153{
2154 struct cmsghdr *cmsg;
39771b12 2155 int ret;
f28ea365
EJ
2156
2157 for_each_cmsghdr(cmsg, msg) {
2158 if (!CMSG_OK(msg, cmsg))
2159 return -EINVAL;
2160 if (cmsg->cmsg_level != SOL_SOCKET)
2161 continue;
39771b12
WB
2162 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2163 if (ret)
2164 return ret;
f28ea365
EJ
2165 }
2166 return 0;
2167}
2168EXPORT_SYMBOL(sock_cmsg_send);
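
How a sendmsg() implementation would consume SOL_SOCKET control messages; a sketch assuming the tsflags seeding used by the in-tree callers (my_cmsg_sendmsg is hypothetical):

#include <net/sock.h>

static int my_cmsg_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err))
			return err;
	}
	/* sockc.mark and sockc.tsflags now reflect any SO_MARK and
	 * SO_TIMESTAMPING cmsgs and can be applied to outgoing skbs.
	 */
	return len;
}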
2169
06044751
ED
2170static void sk_enter_memory_pressure(struct sock *sk)
2171{
2172 if (!sk->sk_prot->enter_memory_pressure)
2173 return;
2174
2175 sk->sk_prot->enter_memory_pressure(sk);
2176}
2177
2178static void sk_leave_memory_pressure(struct sock *sk)
2179{
2180 if (sk->sk_prot->leave_memory_pressure) {
2181 sk->sk_prot->leave_memory_pressure(sk);
2182 } else {
2183 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2184
2185 if (memory_pressure && *memory_pressure)
2186 *memory_pressure = 0;
2187 }
2188}
2189
5640f768
ED
2190/* On 32bit arches, an skb frag is limited to 2^15 */
2191#define SKB_FRAG_PAGE_ORDER get_order(32768)
2192
400dfd3a
ED
2193/**
2194 * skb_page_frag_refill - check that a page_frag contains enough room
2195 * @sz: minimum size of the fragment we want to get
2196 * @pfrag: pointer to page_frag
82d5e2b8 2197 * @gfp: priority for memory allocation
400dfd3a
ED
2198 *
2199 * Note: While this allocator tries to use high order pages, there is
2200 * no guarantee that allocations succeed. Therefore, @sz MUST be
2201 * less than or equal to PAGE_SIZE.
2202 */
d9b2938a 2203bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2204{
5640f768 2205 if (pfrag->page) {
fe896d18 2206 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2207 pfrag->offset = 0;
2208 return true;
2209 }
400dfd3a 2210 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2211 return true;
2212 put_page(pfrag->page);
2213 }
2214
d9b2938a
ED
2215 pfrag->offset = 0;
2216 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2217 /* Avoid direct reclaim but allow kswapd to wake */
2218 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2219 __GFP_COMP | __GFP_NOWARN |
2220 __GFP_NORETRY,
d9b2938a 2221 SKB_FRAG_PAGE_ORDER);
5640f768 2222 if (likely(pfrag->page)) {
d9b2938a 2223 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2224 return true;
2225 }
d9b2938a
ED
2226 }
2227 pfrag->page = alloc_page(gfp);
2228 if (likely(pfrag->page)) {
2229 pfrag->size = PAGE_SIZE;
2230 return true;
2231 }
400dfd3a
ED
2232 return false;
2233}
2234EXPORT_SYMBOL(skb_page_frag_refill);
2235
2236bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2237{
2238 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2239 return true;
2240
5640f768
ED
2241 sk_enter_memory_pressure(sk);
2242 sk_stream_moderate_sndbuf(sk);
2243 return false;
2244}
2245EXPORT_SYMBOL(sk_page_frag_refill);
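
The usual consumption pattern, modelled on tcp_sendmsg(): refill, copy at pfrag->offset, advance the offset. A sketch that copies from a kernel buffer (my_append is hypothetical; a real caller would also take a page reference when attaching the frag to an skb):

#include <linux/mm.h>
#include <net/sock.h>

static int my_append(struct sock *sk, const u8 *data, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	while (len > 0) {
		int use;

		if (!sk_page_frag_refill(sk, pfrag))
			return -ENOMEM;	/* memory pressure was signalled */

		use = min_t(int, len, pfrag->size - pfrag->offset);
		memcpy(page_address(pfrag->page) + pfrag->offset, data, use);
		pfrag->offset += use;
		data += use;
		len -= use;
	}
	return 0;
}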
2246
2c3682f0 2247int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
8c05dbf0 2248 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2c3682f0
JF
2249 int first_coalesce)
2250{
8c05dbf0
JF
2251 int sg_curr = *sg_curr_index, use = 0, rc = 0;
2252 unsigned int size = *sg_curr_size;
2c3682f0 2253 struct page_frag *pfrag;
2c3682f0 2254 struct scatterlist *sge;
2c3682f0
JF
2255
2256 len -= size;
2257 pfrag = sk_page_frag(sk);
2258
2259 while (len > 0) {
8c05dbf0
JF
2260 unsigned int orig_offset;
2261
2c3682f0
JF
2262 if (!sk_page_frag_refill(sk, pfrag)) {
2263 rc = -ENOMEM;
2264 goto out;
2265 }
2266
2267 use = min_t(int, len, pfrag->size - pfrag->offset);
2268
2269 if (!sk_wmem_schedule(sk, use)) {
2270 rc = -ENOMEM;
2271 goto out;
2272 }
2273
2274 sk_mem_charge(sk, use);
2275 size += use;
2276 orig_offset = pfrag->offset;
2277 pfrag->offset += use;
2278
8c05dbf0
JF
2279 sge = sg + sg_curr - 1;
2280 if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
2c3682f0
JF
2281 sge->offset + sge->length == orig_offset) {
2282 sge->length += use;
2283 } else {
8c05dbf0 2284 sge = sg + sg_curr;
2c3682f0
JF
2285 sg_unmark_end(sge);
2286 sg_set_page(sge, pfrag->page, use, orig_offset);
2287 get_page(pfrag->page);
8c05dbf0
JF
2288 sg_curr++;
2289
2290 if (sg_curr == MAX_SKB_FRAGS)
2291 sg_curr = 0;
2292
2293 if (sg_curr == sg_start) {
2c3682f0
JF
2294 rc = -ENOSPC;
2295 break;
2296 }
2297 }
2298
2299 len -= use;
2300 }
2301out:
8c05dbf0
JF
2302 *sg_curr_size = size;
2303 *sg_curr_index = sg_curr;
2c3682f0
JF
2304 return rc;
2305}
2306EXPORT_SYMBOL(sk_alloc_sg);
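
A sketch of driving the in/out cursor arguments over a scatterlist ring, in the style of the sockmap/TLS callers; reading sg_curr_size as the byte count already accumulated follows from the "len -= size" above (my_fill_ring is hypothetical):

static int my_fill_ring(struct sock *sk, int want_bytes,
			struct scatterlist *ring, int start)
{
	int next = start;	/* in/out: index of the next free slot */
	unsigned int size = 0;	/* in/out: bytes already held in the ring */

	/* first_coalesce == start prevents merging into slot start - 1 */
	return sk_alloc_sg(sk, want_bytes, ring, start, &next, &size, start);
}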
2307
1da177e4 2308static void __lock_sock(struct sock *sk)
f39234d6
NK
2309 __releases(&sk->sk_lock.slock)
2310 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2311{
2312 DEFINE_WAIT(wait);
2313
e71a4783 2314 for (;;) {
1da177e4
LT
2315 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2316 TASK_UNINTERRUPTIBLE);
2317 spin_unlock_bh(&sk->sk_lock.slock);
2318 schedule();
2319 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2320 if (!sock_owned_by_user(sk))
1da177e4
LT
2321 break;
2322 }
2323 finish_wait(&sk->sk_lock.wq, &wait);
2324}
2325
2326static void __release_sock(struct sock *sk)
f39234d6
NK
2327 __releases(&sk->sk_lock.slock)
2328 __acquires(&sk->sk_lock.slock)
1da177e4 2329{
5413d1ba 2330 struct sk_buff *skb, *next;
1da177e4 2331
5413d1ba 2332 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2333 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2334
5413d1ba 2335 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2336
5413d1ba
ED
2337 do {
2338 next = skb->next;
e4cbb02a 2339 prefetch(next);
7fee226a 2340 WARN_ON_ONCE(skb_dst_is_noref(skb));
1da177e4 2341 skb->next = NULL;
c57943a1 2342 sk_backlog_rcv(sk, skb);
1da177e4 2343
5413d1ba 2344 cond_resched();
1da177e4
LT
2345
2346 skb = next;
2347 } while (skb != NULL);
2348
5413d1ba
ED
2349 spin_lock_bh(&sk->sk_lock.slock);
2350 }
8eae939f
ZY
2351
2352 /*
2353 * Doing the zeroing here guarantee we can not loop forever
2354 * while a wild producer attempts to flood us.
2355 */
2356 sk->sk_backlog.len = 0;
1da177e4
LT
2357}
2358
d41a69f1
ED
2359void __sk_flush_backlog(struct sock *sk)
2360{
2361 spin_lock_bh(&sk->sk_lock.slock);
2362 __release_sock(sk);
2363 spin_unlock_bh(&sk->sk_lock.slock);
2364}
2365
1da177e4
LT
2366/**
2367 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2368 * @sk: sock to wait on
2369 * @timeo: for how long
dfbafc99 2370 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2371 *
2372 * Now the socket state, including sk->sk_err, is changed only under
2373 * the lock, hence we may omit checks after joining the wait queue.
2374 * We check the receive queue before schedule() only as an optimization;
2375 * it is very likely that release_sock() added new data.
2376 */
dfbafc99 2377int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2378{
d9dc8b0f 2379 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2380 int rc;
1da177e4 2381
d9dc8b0f 2382 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2383 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2384 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2385 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2386 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2387 return rc;
2388}
1da177e4
LT
2389EXPORT_SYMBOL(sk_wait_data);
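
Typical receive-side use, as in the protocol recvmsg() handlers: call under lock_sock() with the last seen tail skb, and note that sk_wait_event() drops and retakes the socket lock while sleeping (my_wait_for_data is a hypothetical sketch):

#include <linux/sched/signal.h>
#include <net/sock.h>

/* Caller holds the socket lock. Returns 0 once data is queued. */
static int my_wait_for_data(struct sock *sk, long *timeo)
{
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!*timeo)
			return -EAGAIN;
		if (signal_pending(current))
			return sock_intr_errno(*timeo);
		sk_wait_data(sk, timeo,
			     skb_peek_tail(&sk->sk_receive_queue));
	}
	return 0;
}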
2390
3ab224be 2391/**
f8c3bf00 2392 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2393 * @sk: socket
2394 * @size: memory size to allocate
f8c3bf00 2395 * @amt: pages to allocate
3ab224be
HA
2396 * @kind: allocation type
2397 *
f8c3bf00 2398 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2399 */
f8c3bf00 2400int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2401{
2402 struct proto *prot = sk->sk_prot;
f8c3bf00 2403 long allocated = sk_memory_allocated_add(sk, amt);
e805605c 2404
baac50bb
JW
2405 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2406 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
e805605c 2407 goto suppress_allocation;
3ab224be
HA
2408
2409 /* Under limit. */
e805605c 2410 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2411 sk_leave_memory_pressure(sk);
3ab224be
HA
2412 return 1;
2413 }
2414
e805605c
JW
2415 /* Under pressure. */
2416 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2417 sk_enter_memory_pressure(sk);
3ab224be 2418
e805605c
JW
2419 /* Over hard limit. */
2420 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2421 goto suppress_allocation;
2422
2423 /* guarantee minimum buffer size under pressure */
2424 if (kind == SK_MEM_RECV) {
a3dcaf17 2425 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2426 return 1;
180d8cd9 2427
3ab224be 2428 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2429 int wmem0 = sk_get_wmem0(sk, prot);
2430
3ab224be 2431 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2432 if (sk->sk_wmem_queued < wmem0)
3ab224be 2433 return 1;
a3dcaf17 2434 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2435 return 1;
a3dcaf17 2436 }
3ab224be
HA
2437 }
2438
180d8cd9 2439 if (sk_has_memory_pressure(sk)) {
1748376b
ED
2440 int alloc;
2441
180d8cd9 2442 if (!sk_under_memory_pressure(sk))
1748376b 2443 return 1;
180d8cd9
GC
2444 alloc = sk_sockets_allocated_read_positive(sk);
2445 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2446 sk_mem_pages(sk->sk_wmem_queued +
2447 atomic_read(&sk->sk_rmem_alloc) +
2448 sk->sk_forward_alloc))
2449 return 1;
2450 }
2451
2452suppress_allocation:
2453
2454 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2455 sk_stream_moderate_sndbuf(sk);
2456
2457 /* Fail only if socket is _under_ its sndbuf.
2458 * In this case we cannot block, so we have to fail.
2459 */
2460 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2461 return 1;
2462 }
2463
3847ce32
SM
2464 trace_sock_exceed_buf_limit(sk, prot, allocated);
2465
0e90b31f 2466 sk_memory_allocated_sub(sk, amt);
180d8cd9 2467
baac50bb
JW
2468 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2469 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2470
3ab224be
HA
2471 return 0;
2472}
f8c3bf00
PA
2473EXPORT_SYMBOL(__sk_mem_raise_allocated);
2474
2475/**
2476 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2477 * @sk: socket
2478 * @size: memory size to allocate
2479 * @kind: allocation type
2480 *
2481 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2482 * rmem allocation. This function assumes that protocols which have
2483 * memory_pressure use sk_wmem_queued as write buffer accounting.
2484 */
2485int __sk_mem_schedule(struct sock *sk, int size, int kind)
2486{
2487 int ret, amt = sk_mem_pages(size);
2488
2489 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2490 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2491 if (!ret)
2492 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2493 return ret;
2494}
3ab224be
HA
2495EXPORT_SYMBOL(__sk_mem_schedule);
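
In numbers: SK_MEM_QUANTUM is PAGE_SIZE, so on a 4096-byte-page machine a 6000-byte charge becomes sk_mem_pages(6000) = 2 quanta, i.e. 8192 bytes added to sk_forward_alloc. A sketch of the usual schedule-then-charge pairing on the receive side (my_charge_rmem is hypothetical):

static int my_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	/* e.g. truesize 6000 -> 2 quanta -> forward_alloc += 8192 */
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;
	sk_mem_charge(sk, skb->truesize);	/* forward_alloc -= 6000 */
	return 0;
}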
2496
2497/**
f8c3bf00 2498 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2499 * @sk: socket
f8c3bf00
PA
2500 * @amount: number of quanta
2501 *
2502 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2503 */
f8c3bf00 2504void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2505{
1a24e04e 2506 sk_memory_allocated_sub(sk, amount);
3ab224be 2507
baac50bb
JW
2508 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2509 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2510
180d8cd9
GC
2511 if (sk_under_memory_pressure(sk) &&
2512 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2513 sk_leave_memory_pressure(sk);
3ab224be 2514}
f8c3bf00
PA
2515EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2516
2517/**
2518 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2519 * @sk: socket
2520 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2521 */
2522void __sk_mem_reclaim(struct sock *sk, int amount)
2523{
2524 amount >>= SK_MEM_QUANTUM_SHIFT;
2525 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2526 __sk_mem_reduce_allocated(sk, amount);
2527}
3ab224be
HA
2528EXPORT_SYMBOL(__sk_mem_reclaim);
2529
627d2d6b 2530int sk_set_peek_off(struct sock *sk, int val)
2531{
627d2d6b 2532 sk->sk_peek_off = val;
2533 return 0;
2534}
2535EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2536
1da177e4
LT
2537/*
2538 * Set of default routines for initialising struct proto_ops when
2539 * the protocol does not support a particular function. In certain
2540 * cases where it makes no sense for a protocol to have a "do nothing"
2541 * function, some default processing is provided.
2542 */
2543
2544int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2545{
2546 return -EOPNOTSUPP;
2547}
2a91525c 2548EXPORT_SYMBOL(sock_no_bind);
1da177e4 2549
4ec93edb 2550int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2551 int len, int flags)
2552{
2553 return -EOPNOTSUPP;
2554}
2a91525c 2555EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2556
2557int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2558{
2559 return -EOPNOTSUPP;
2560}
2a91525c 2561EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2562
cdfbabfb
DH
2563int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2564 bool kern)
1da177e4
LT
2565{
2566 return -EOPNOTSUPP;
2567}
2a91525c 2568EXPORT_SYMBOL(sock_no_accept);
1da177e4 2569
4ec93edb 2570int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2571 int peer)
1da177e4
LT
2572{
2573 return -EOPNOTSUPP;
2574}
2a91525c 2575EXPORT_SYMBOL(sock_no_getname);
1da177e4 2576
ade994f4 2577__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1da177e4
LT
2578{
2579 return 0;
2580}
2a91525c 2581EXPORT_SYMBOL(sock_no_poll);
1da177e4
LT
2582
2583int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2584{
2585 return -EOPNOTSUPP;
2586}
2a91525c 2587EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2588
2589int sock_no_listen(struct socket *sock, int backlog)
2590{
2591 return -EOPNOTSUPP;
2592}
2a91525c 2593EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2594
2595int sock_no_shutdown(struct socket *sock, int how)
2596{
2597 return -EOPNOTSUPP;
2598}
2a91525c 2599EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2600
2601int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2602 char __user *optval, unsigned int optlen)
1da177e4
LT
2603{
2604 return -EOPNOTSUPP;
2605}
2a91525c 2606EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2607
2608int sock_no_getsockopt(struct socket *sock, int level, int optname,
2609 char __user *optval, int __user *optlen)
2610{
2611 return -EOPNOTSUPP;
2612}
2a91525c 2613EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2614
1b784140 2615int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2616{
2617 return -EOPNOTSUPP;
2618}
2a91525c 2619EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2620
306b13eb
TH
2621int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2622{
2623 return -EOPNOTSUPP;
2624}
2625EXPORT_SYMBOL(sock_no_sendmsg_locked);
2626
1b784140
YX
2627int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2628 int flags)
1da177e4
LT
2629{
2630 return -EOPNOTSUPP;
2631}
2a91525c 2632EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2633
2634int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2635{
2636 /* Mirror missing mmap method error code */
2637 return -ENODEV;
2638}
2a91525c 2639EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2640
2641ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2642{
2643 ssize_t res;
2644 struct msghdr msg = {.msg_flags = flags};
2645 struct kvec iov;
2646 char *kaddr = kmap(page);
2647 iov.iov_base = kaddr + offset;
2648 iov.iov_len = size;
2649 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2650 kunmap(page);
2651 return res;
2652}
2a91525c 2653EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2654
306b13eb
TH
2655ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2656 int offset, size_t size, int flags)
2657{
2658 ssize_t res;
2659 struct msghdr msg = {.msg_flags = flags};
2660 struct kvec iov;
2661 char *kaddr = kmap(page);
2662
2663 iov.iov_base = kaddr + offset;
2664 iov.iov_len = size;
2665 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2666 kunmap(page);
2667 return res;
2668}
2669EXPORT_SYMBOL(sock_no_sendpage_locked);
2670
1da177e4
LT
2671/*
2672 * Default Socket Callbacks
2673 */
2674
2675static void sock_def_wakeup(struct sock *sk)
2676{
43815482
ED
2677 struct socket_wq *wq;
2678
2679 rcu_read_lock();
2680 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2681 if (skwq_has_sleeper(wq))
43815482
ED
2682 wake_up_interruptible_all(&wq->wait);
2683 rcu_read_unlock();
1da177e4
LT
2684}
2685
2686static void sock_def_error_report(struct sock *sk)
2687{
43815482
ED
2688 struct socket_wq *wq;
2689
2690 rcu_read_lock();
2691 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2692 if (skwq_has_sleeper(wq))
a9a08845 2693 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2694 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2695 rcu_read_unlock();
1da177e4
LT
2696}
2697
676d2369 2698static void sock_def_readable(struct sock *sk)
1da177e4 2699{
43815482
ED
2700 struct socket_wq *wq;
2701
2702 rcu_read_lock();
2703 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2704 if (skwq_has_sleeper(wq))
a9a08845
LT
2705 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2706 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2707 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2708 rcu_read_unlock();
1da177e4
LT
2709}
2710
2711static void sock_def_write_space(struct sock *sk)
2712{
43815482
ED
2713 struct socket_wq *wq;
2714
2715 rcu_read_lock();
1da177e4
LT
2716
2717 /* Do not wake up a writer until it can make "significant"
2718 * progress. --DaveM
2719 */
14afee4b 2720 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2721 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2722 if (skwq_has_sleeper(wq))
a9a08845
LT
2723 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2724 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2725
2726 /* Should agree with poll, otherwise some programs break */
2727 if (sock_writeable(sk))
8d8ad9d7 2728 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2729 }
2730
43815482 2731 rcu_read_unlock();
1da177e4
LT
2732}
2733
2734static void sock_def_destruct(struct sock *sk)
2735{
1da177e4
LT
2736}
2737
2738void sk_send_sigurg(struct sock *sk)
2739{
2740 if (sk->sk_socket && sk->sk_socket->file)
2741 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2742 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2743}
2a91525c 2744EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2745
2746void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2747 unsigned long expires)
2748{
2749 if (!mod_timer(timer, expires))
2750 sock_hold(sk);
2751}
1da177e4
LT
2752EXPORT_SYMBOL(sk_reset_timer);
2753
2754void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2755{
25cc4ae9 2756 if (del_timer(timer))
1da177e4
LT
2757 __sock_put(sk);
2758}
1da177e4
LT
2759EXPORT_SYMBOL(sk_stop_timer);
2760
2761void sock_init_data(struct socket *sock, struct sock *sk)
2762{
581319c5 2763 sk_init_common(sk);
1da177e4
LT
2764 sk->sk_send_head = NULL;
2765
99767f27 2766 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2767
1da177e4
LT
2768 sk->sk_allocation = GFP_KERNEL;
2769 sk->sk_rcvbuf = sysctl_rmem_default;
2770 sk->sk_sndbuf = sysctl_wmem_default;
2771 sk->sk_state = TCP_CLOSE;
972692e0 2772 sk_set_socket(sk, sock);
1da177e4
LT
2773
2774 sock_set_flag(sk, SOCK_ZAPPED);
2775
e71a4783 2776 if (sock) {
1da177e4 2777 sk->sk_type = sock->type;
43815482 2778 sk->sk_wq = sock->wq;
1da177e4 2779 sock->sk = sk;
86741ec2
LC
2780 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2781 } else {
43815482 2782 sk->sk_wq = NULL;
86741ec2
LC
2783 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2784 }
1da177e4 2785
1da177e4 2786 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2787 if (sk->sk_kern_sock)
2788 lockdep_set_class_and_name(
2789 &sk->sk_callback_lock,
2790 af_kern_callback_keys + sk->sk_family,
2791 af_family_kern_clock_key_strings[sk->sk_family]);
2792 else
2793 lockdep_set_class_and_name(
2794 &sk->sk_callback_lock,
443aef0e
PZ
2795 af_callback_keys + sk->sk_family,
2796 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2797
2798 sk->sk_state_change = sock_def_wakeup;
2799 sk->sk_data_ready = sock_def_readable;
2800 sk->sk_write_space = sock_def_write_space;
2801 sk->sk_error_report = sock_def_error_report;
2802 sk->sk_destruct = sock_def_destruct;
2803
5640f768
ED
2804 sk->sk_frag.page = NULL;
2805 sk->sk_frag.offset = 0;
ef64a54f 2806 sk->sk_peek_off = -1;
1da177e4 2807
109f6e39
EB
2808 sk->sk_peer_pid = NULL;
2809 sk->sk_peer_cred = NULL;
1da177e4
LT
2810 sk->sk_write_pending = 0;
2811 sk->sk_rcvlowat = 1;
2812 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2813 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2814
6c7c98ba 2815 sk->sk_stamp = SK_DEFAULT_STAMP;
52267790 2816 atomic_set(&sk->sk_zckey, 0);
1da177e4 2817
e0d1095a 2818#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2819 sk->sk_napi_id = 0;
64b0dc51 2820 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2821#endif
2822
62748f32 2823 sk->sk_max_pacing_rate = ~0U;
7eec4174 2824 sk->sk_pacing_rate = ~0U;
3a9b76fd 2825 sk->sk_pacing_shift = 10;
70da268b 2826 sk->sk_incoming_cpu = -1;
4dc6dc71
ED
2827 /*
2828 * Before updating sk_refcnt, we must commit prior changes to memory
2829 * (Documentation/RCU/rculist_nulls.txt for details)
2830 */
2831 smp_wmb();
41c6d650 2832 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2833 atomic_set(&sk->sk_drops, 0);
1da177e4 2834}
2a91525c 2835EXPORT_SYMBOL(sock_init_data);
1da177e4 2836
b5606c2d 2837void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2838{
2839 might_sleep();
a5b5bb9a 2840 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2841 if (sk->sk_lock.owned)
1da177e4 2842 __lock_sock(sk);
d2e9117c 2843 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2844 spin_unlock(&sk->sk_lock.slock);
2845 /*
2846 * The sk_lock has mutex_lock() semantics here:
2847 */
fcc70d5f 2848 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2849 local_bh_enable();
1da177e4 2850}
fcc70d5f 2851EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2852
b5606c2d 2853void release_sock(struct sock *sk)
1da177e4 2854{
a5b5bb9a 2855 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2856 if (sk->sk_backlog.tail)
2857 __release_sock(sk);
46d3ceab 2858
c3f9b018
ED
2859 /* Warning: release_cb() might need to release sk ownership,
2860 * i.e. call sock_release_ownership(sk) before us.
2861 */
46d3ceab
ED
2862 if (sk->sk_prot->release_cb)
2863 sk->sk_prot->release_cb(sk);
2864
c3f9b018 2865 sock_release_ownership(sk);
a5b5bb9a
IM
2866 if (waitqueue_active(&sk->sk_lock.wq))
2867 wake_up(&sk->sk_lock.wq);
2868 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2869}
2870EXPORT_SYMBOL(release_sock);
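
The canonical process-context pattern: while the socket is owned, softirq input is queued to sk_backlog, and release_sock() replays it via __release_sock() before waking lock waiters (a sketch):

static void my_update_state(struct sock *sk)
{
	lock_sock(sk);		/* may sleep; marks the socket owned */
	/* ... modify socket state; incoming packets are backlogged ... */
	release_sock(sk);	/* processes the backlog, wakes waiters */
}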
2871
8a74ad60
ED
2872/**
2873 * lock_sock_fast - fast version of lock_sock
2874 * @sk: socket
2875 *
2876 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2877 * Returns false if the fast path is taken:
2878 *
8a74ad60 2879 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2880 *
2881 * Returns true if the slow path is taken:
2882 *
8a74ad60
ED
2883 * sk_lock.slock unlocked, owned = 1, BH enabled
2884 */
2885bool lock_sock_fast(struct sock *sk)
2886{
2887 might_sleep();
2888 spin_lock_bh(&sk->sk_lock.slock);
2889
2890 if (!sk->sk_lock.owned)
2891 /*
2892 * Note: we return with sk_lock.slock held and BH disabled
2893 */
2894 return false;
2895
2896 __lock_sock(sk);
2897 sk->sk_lock.owned = 1;
2898 spin_unlock(&sk->sk_lock.slock);
2899 /*
2900 * The sk_lock has mutex_lock() semantics here:
2901 */
2902 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2903 local_bh_enable();
2904 return true;
2905}
2906EXPORT_SYMBOL(lock_sock_fast);
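
unlock_sock_fast() in include/net/sock.h consumes the returned boolean, so a short critical section looks like this sketch (my_read_err is hypothetical):

static int my_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;

	unlock_sock_fast(sk, slow);	/* re-enables BH or releases lock */
	return err;
}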
2907
1da177e4 2908int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2909{
b7aa0bf7 2910 struct timeval tv;
1da177e4 2911 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2912 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
b7aa0bf7
ED
2913 tv = ktime_to_timeval(sk->sk_stamp);
2914 if (tv.tv_sec == -1)
1da177e4 2915 return -ENOENT;
b7aa0bf7
ED
2916 if (tv.tv_sec == 0) {
2917 sk->sk_stamp = ktime_get_real();
2918 tv = ktime_to_timeval(sk->sk_stamp);
2919 }
2920 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2921}
1da177e4
LT
2922EXPORT_SYMBOL(sock_get_timestamp);
2923
ae40eb1e
ED
2924int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2925{
2926 struct timespec ts;
2927 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2928 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ae40eb1e
ED
2929 ts = ktime_to_timespec(sk->sk_stamp);
2930 if (ts.tv_sec == -1)
2931 return -ENOENT;
2932 if (ts.tv_sec == 0) {
2933 sk->sk_stamp = ktime_get_real();
2934 ts = ktime_to_timespec(sk->sk_stamp);
2935 }
2936 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2937}
2938EXPORT_SYMBOL(sock_get_timestampns);
2939
20d49473 2940void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2941{
20d49473 2942 if (!sock_flag(sk, flag)) {
08e29af3
ED
2943 unsigned long previous_flags = sk->sk_flags;
2944
20d49473
PO
2945 sock_set_flag(sk, flag);
2946 /*
2947 * We just set one of the two flags that require net
2948 * time stamping, but time stamping might already have been
2949 * on because of the other one
2950 */
080a270f
HFS
2951 if (sock_needs_netstamp(sk) &&
2952 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2953 net_enable_timestamp();
1da177e4
LT
2954 }
2955}
1da177e4 2956
cb820f8e
RC
2957int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2958 int level, int type)
2959{
2960 struct sock_exterr_skb *serr;
364a9e93 2961 struct sk_buff *skb;
cb820f8e
RC
2962 int copied, err;
2963
2964 err = -EAGAIN;
364a9e93 2965 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
2966 if (skb == NULL)
2967 goto out;
2968
2969 copied = skb->len;
2970 if (copied > len) {
2971 msg->msg_flags |= MSG_TRUNC;
2972 copied = len;
2973 }
51f3d02b 2974 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
2975 if (err)
2976 goto out_free_skb;
2977
2978 sock_recv_timestamp(msg, sk, skb);
2979
2980 serr = SKB_EXT_ERR(skb);
2981 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2982
2983 msg->msg_flags |= MSG_ERRQUEUE;
2984 err = copied;
2985
cb820f8e
RC
2986out_free_skb:
2987 kfree_skb(skb);
2988out:
2989 return err;
2990}
2991EXPORT_SYMBOL(sock_recv_errqueue);
2992
1da177e4
LT
2993/*
2994 * Get a socket option on a socket.
2995 *
2996 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2997 * asynchronous errors should be reported by getsockopt. We assume
2998 * this means if you specify SO_ERROR (otherwise what's the point of it?).
2999 */
3000int sock_common_getsockopt(struct socket *sock, int level, int optname,
3001 char __user *optval, int __user *optlen)
3002{
3003 struct sock *sk = sock->sk;
3004
3005 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3006}
1da177e4
LT
3007EXPORT_SYMBOL(sock_common_getsockopt);
3008
3fdadf7d 3009#ifdef CONFIG_COMPAT
543d9cfe
ACM
3010int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3011 char __user *optval, int __user *optlen)
3fdadf7d
DM
3012{
3013 struct sock *sk = sock->sk;
3014
1e51f951 3015 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
3016 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3017 optval, optlen);
3fdadf7d
DM
3018 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3019}
3020EXPORT_SYMBOL(compat_sock_common_getsockopt);
3021#endif
3022
1b784140
YX
3023int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3024 int flags)
1da177e4
LT
3025{
3026 struct sock *sk = sock->sk;
3027 int addr_len = 0;
3028 int err;
3029
1b784140 3030 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
3031 flags & ~MSG_DONTWAIT, &addr_len);
3032 if (err >= 0)
3033 msg->msg_namelen = addr_len;
3034 return err;
3035}
1da177e4
LT
3036EXPORT_SYMBOL(sock_common_recvmsg);
3037
3038/*
3039 * Set socket options on an inet socket.
3040 */
3041int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3042 char __user *optval, unsigned int optlen)
1da177e4
LT
3043{
3044 struct sock *sk = sock->sk;
3045
3046 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3047}
1da177e4
LT
3048EXPORT_SYMBOL(sock_common_setsockopt);
3049
3fdadf7d 3050#ifdef CONFIG_COMPAT
543d9cfe 3051int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3052 char __user *optval, unsigned int optlen)
3fdadf7d
DM
3053{
3054 struct sock *sk = sock->sk;
3055
543d9cfe
ACM
3056 if (sk->sk_prot->compat_setsockopt != NULL)
3057 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3058 optval, optlen);
3fdadf7d
DM
3059 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3060}
3061EXPORT_SYMBOL(compat_sock_common_setsockopt);
3062#endif
3063
1da177e4
LT
3064void sk_common_release(struct sock *sk)
3065{
3066 if (sk->sk_prot->destroy)
3067 sk->sk_prot->destroy(sk);
3068
3069 /*
3070 * Observation: when sock_common_release is called, processes have
3071 * no access to the socket, but the net still does.
3072 * Step one, detach it from networking:
3073 *
3074 * A. Remove from hash tables.
3075 */
3076
3077 sk->sk_prot->unhash(sk);
3078
3079 /*
3080 * At this point the socket cannot receive new packets, but it is possible
3081 * that some packets are in flight because some CPU runs the receiver and
3082 * did the hash table lookup before we unhashed the socket. They will reach
3083 * the receive queue and will be purged by the socket destructor.
3084 *
3085 * Also we may still have packets pending on the receive queue and, probably,
3086 * our own packets waiting in device queues. sock_destroy will drain the
3087 * receive queue, but transmitted packets will delay socket destruction
3088 * until the last reference is released.
3089 */
3090
3091 sock_orphan(sk);
3092
3093 xfrm_sk_free_policy(sk);
3094
e6848976 3095 sk_refcnt_debug_release(sk);
5640f768 3096
1da177e4
LT
3097 sock_put(sk);
3098}
1da177e4
LT
3099EXPORT_SYMBOL(sk_common_release);
3100
a2d133b1
JH
3101void sk_get_meminfo(const struct sock *sk, u32 *mem)
3102{
3103 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3104
3105 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3106 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3107 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3108 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3109 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3110 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3111 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3112 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3113 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3114}
3115
13ff3d6f
PE
3116#ifdef CONFIG_PROC_FS
3117#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3118struct prot_inuse {
3119 int val[PROTO_INUSE_NR];
3120};
13ff3d6f
PE
3121
3122static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3123
70ee1159
PE
3124void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3125{
08fc7f81 3126 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3127}
3128EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3129
3130int sock_prot_inuse_get(struct net *net, struct proto *prot)
3131{
3132 int cpu, idx = prot->inuse_idx;
3133 int res = 0;
3134
3135 for_each_possible_cpu(cpu)
08fc7f81 3136 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3137
3138 return res >= 0 ? res : 0;
3139}
3140EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3141
648845ab
TZ
3142static void sock_inuse_add(struct net *net, int val)
3143{
3144 this_cpu_add(*net->core.sock_inuse, val);
3145}
3146
3147int sock_inuse_get(struct net *net)
3148{
3149 int cpu, res = 0;
3150
3151 for_each_possible_cpu(cpu)
3152 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3153
3154 return res;
3155}
3156
3157EXPORT_SYMBOL_GPL(sock_inuse_get);
3158
2c8c1e72 3159static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3160{
08fc7f81 3161 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3162 if (net->core.prot_inuse == NULL)
3163 return -ENOMEM;
3164
3165 net->core.sock_inuse = alloc_percpu(int);
3166 if (net->core.sock_inuse == NULL)
3167 goto out;
3168
3169 return 0;
3170
3171out:
3172 free_percpu(net->core.prot_inuse);
3173 return -ENOMEM;
70ee1159
PE
3174}
3175
2c8c1e72 3176static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3177{
08fc7f81 3178 free_percpu(net->core.prot_inuse);
648845ab 3179 free_percpu(net->core.sock_inuse);
70ee1159
PE
3180}
3181
3182static struct pernet_operations net_inuse_ops = {
3183 .init = sock_inuse_init_net,
3184 .exit = sock_inuse_exit_net,
3185};
3186
3187static __init int net_inuse_init(void)
3188{
3189 if (register_pernet_subsys(&net_inuse_ops))
3190 panic("Cannot initialize net inuse counters");
3191
3192 return 0;
3193}
3194
3195core_initcall(net_inuse_init);
13ff3d6f
PE
3196
3197static void assign_proto_idx(struct proto *prot)
3198{
3199 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3200
3201 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3202 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3203 return;
3204 }
3205
3206 set_bit(prot->inuse_idx, proto_inuse_idx);
3207}
3208
3209static void release_proto_idx(struct proto *prot)
3210{
3211 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3212 clear_bit(prot->inuse_idx, proto_inuse_idx);
3213}
3214#else
3215static inline void assign_proto_idx(struct proto *prot)
3216{
3217}
3218
3219static inline void release_proto_idx(struct proto *prot)
3220{
3221}
648845ab
TZ
3222
3223static void sock_inuse_add(struct net *net, int val)
3224{
3225}
13ff3d6f
PE
3226#endif
3227
0159dfd3
ED
3228static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3229{
3230 if (!rsk_prot)
3231 return;
3232 kfree(rsk_prot->slab_name);
3233 rsk_prot->slab_name = NULL;
adf78eda
JL
3234 kmem_cache_destroy(rsk_prot->slab);
3235 rsk_prot->slab = NULL;
0159dfd3
ED
3236}
3237
3238static int req_prot_init(const struct proto *prot)
3239{
3240 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3241
3242 if (!rsk_prot)
3243 return 0;
3244
3245 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3246 prot->name);
3247 if (!rsk_prot->slab_name)
3248 return -ENOMEM;
3249
3250 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3251 rsk_prot->obj_size, 0,
e96f78ab 3252 prot->slab_flags, NULL);
0159dfd3
ED
3253
3254 if (!rsk_prot->slab) {
3255 pr_crit("%s: Can't create request sock SLAB cache!\n",
3256 prot->name);
3257 return -ENOMEM;
3258 }
3259 return 0;
3260}
3261
b733c007
PE
3262int proto_register(struct proto *prot, int alloc_slab)
3263{
1da177e4 3264 if (alloc_slab) {
30c2c9f1
DW
3265 prot->slab = kmem_cache_create_usercopy(prot->name,
3266 prot->obj_size, 0,
271b72c7 3267 SLAB_HWCACHE_ALIGN | prot->slab_flags,
289a4860 3268 prot->useroffset, prot->usersize,
271b72c7 3269 NULL);
1da177e4
LT
3270
3271 if (prot->slab == NULL) {
e005d193
JP
3272 pr_crit("%s: Can't create sock SLAB cache!\n",
3273 prot->name);
60e7663d 3274 goto out;
1da177e4 3275 }
2e6599cb 3276
0159dfd3
ED
3277 if (req_prot_init(prot))
3278 goto out_free_request_sock_slab;
8feaf0c0 3279
6d6ee43e 3280 if (prot->twsk_prot != NULL) {
faf23422 3281 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3282
7e56b5d6 3283 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3284 goto out_free_request_sock_slab;
3285
6d6ee43e 3286 prot->twsk_prot->twsk_slab =
7e56b5d6 3287 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3288 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3289 0,
52db70dc 3290 prot->slab_flags,
20c2df83 3291 NULL);
6d6ee43e 3292 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3293 goto out_free_timewait_sock_slab_name;
3294 }
1da177e4
LT
3295 }
3296
36b77a52 3297 mutex_lock(&proto_list_mutex);
1da177e4 3298 list_add(&prot->node, &proto_list);
13ff3d6f 3299 assign_proto_idx(prot);
36b77a52 3300 mutex_unlock(&proto_list_mutex);
b733c007
PE
3301 return 0;
3302
8feaf0c0 3303out_free_timewait_sock_slab_name:
7e56b5d6 3304 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3305out_free_request_sock_slab:
0159dfd3
ED
3306 req_prot_cleanup(prot->rsk_prot);
3307
2e6599cb
ACM
3308 kmem_cache_destroy(prot->slab);
3309 prot->slab = NULL;
b733c007
PE
3310out:
3311 return -ENOBUFS;
1da177e4 3312}
1da177e4
LT
3313EXPORT_SYMBOL(proto_register);
3314
3315void proto_unregister(struct proto *prot)
3316{
36b77a52 3317 mutex_lock(&proto_list_mutex);
13ff3d6f 3318 release_proto_idx(prot);
0a3f4358 3319 list_del(&prot->node);
36b77a52 3320 mutex_unlock(&proto_list_mutex);
1da177e4 3321
adf78eda
JL
3322 kmem_cache_destroy(prot->slab);
3323 prot->slab = NULL;
1da177e4 3324
0159dfd3 3325 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3326
6d6ee43e 3327 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3328 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3329 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3330 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3331 }
1da177e4 3332}
1da177e4
LT
3333EXPORT_SYMBOL(proto_unregister);
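
A minimal registration sketch for a protocol with no request or timewait socks; the name, module hooks and sizes are hypothetical, and alloc_slab = 1 asks proto_register() to create the kmem cache shown above:

static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),	/* real protos embed sock */
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}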
3334
bf2ae2e4
XL
3335int sock_load_diag_module(int family, int protocol)
3336{
3337 if (!protocol) {
3338 if (!sock_is_registered(family))
3339 return -ENOENT;
3340
3341 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3342 NETLINK_SOCK_DIAG, family);
3343 }
3344
3345#ifdef CONFIG_INET
3346 if (family == AF_INET &&
3347 !rcu_access_pointer(inet_protos[protocol]))
3348 return -ENOENT;
3349#endif
3350
3351 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3352 NETLINK_SOCK_DIAG, family, protocol);
3353}
3354EXPORT_SYMBOL(sock_load_diag_module);
3355
1da177e4 3356#ifdef CONFIG_PROC_FS
1da177e4 3357static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3358 __acquires(proto_list_mutex)
1da177e4 3359{
36b77a52 3360 mutex_lock(&proto_list_mutex);
60f0438a 3361 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3362}
3363
3364static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3365{
60f0438a 3366 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3367}
3368
3369static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3370 __releases(proto_list_mutex)
1da177e4 3371{
36b77a52 3372 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3373}
3374
3375static char proto_method_implemented(const void *method)
3376{
3377 return method == NULL ? 'n' : 'y';
3378}
180d8cd9
GC
3379static long sock_prot_memory_allocated(struct proto *proto)
3380{
cb75a36c 3381 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3382}
3383
3384static char *sock_prot_memory_pressure(struct proto *proto)
3385{
3386 return proto->memory_pressure != NULL ?
3387 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3388}
1da177e4
LT
3389
3390static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3391{
180d8cd9 3392
8d987e5c 3393 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3394 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3395 proto->name,
3396 proto->obj_size,
14e943db 3397 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3398 sock_prot_memory_allocated(proto),
3399 sock_prot_memory_pressure(proto),
1da177e4
LT
3400 proto->max_header,
3401 proto->slab == NULL ? "no" : "yes",
3402 module_name(proto->owner),
3403 proto_method_implemented(proto->close),
3404 proto_method_implemented(proto->connect),
3405 proto_method_implemented(proto->disconnect),
3406 proto_method_implemented(proto->accept),
3407 proto_method_implemented(proto->ioctl),
3408 proto_method_implemented(proto->init),
3409 proto_method_implemented(proto->destroy),
3410 proto_method_implemented(proto->shutdown),
3411 proto_method_implemented(proto->setsockopt),
3412 proto_method_implemented(proto->getsockopt),
3413 proto_method_implemented(proto->sendmsg),
3414 proto_method_implemented(proto->recvmsg),
3415 proto_method_implemented(proto->sendpage),
3416 proto_method_implemented(proto->bind),
3417 proto_method_implemented(proto->backlog_rcv),
3418 proto_method_implemented(proto->hash),
3419 proto_method_implemented(proto->unhash),
3420 proto_method_implemented(proto->get_port),
3421 proto_method_implemented(proto->enter_memory_pressure));
3422}
3423
3424static int proto_seq_show(struct seq_file *seq, void *v)
3425{
60f0438a 3426 if (v == &proto_list)
1da177e4
LT
3427 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3428 "protocol",
3429 "size",
3430 "sockets",
3431 "memory",
3432 "press",
3433 "maxhdr",
3434 "slab",
3435 "module",
3436 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3437 else
60f0438a 3438 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3439 return 0;
3440}
3441
f690808e 3442static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3443 .start = proto_seq_start,
3444 .next = proto_seq_next,
3445 .stop = proto_seq_stop,
3446 .show = proto_seq_show,
3447};
3448
3449static int proto_seq_open(struct inode *inode, struct file *file)
3450{
14e943db
ED
3451 return seq_open_net(inode, file, &proto_seq_ops,
3452 sizeof(struct seq_net_private));
1da177e4
LT
3453}
3454
9a32144e 3455static const struct file_operations proto_seq_fops = {
1da177e4
LT
3456 .open = proto_seq_open,
3457 .read = seq_read,
3458 .llseek = seq_lseek,
14e943db
ED
3459 .release = seq_release_net,
3460};
3461
3462static __net_init int proto_init_net(struct net *net)
3463{
d6444062 3464 if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
14e943db
ED
3465 return -ENOMEM;
3466
3467 return 0;
3468}
3469
3470static __net_exit void proto_exit_net(struct net *net)
3471{
ece31ffd 3472 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3473}
3474
3475
3476static __net_initdata struct pernet_operations proto_net_ops = {
3477 .init = proto_init_net,
3478 .exit = proto_exit_net,
1da177e4
LT
3479};
3480
3481static int __init proto_init(void)
3482{
14e943db 3483 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3484}
3485
3486subsys_initcall(proto_init);
3487
3488#endif /* PROC_FS */
7db6b048
SS
3489
3490#ifdef CONFIG_NET_RX_BUSY_POLL
3491bool sk_busy_loop_end(void *p, unsigned long start_time)
3492{
3493 struct sock *sk = p;
3494
3495 return !skb_queue_empty(&sk->sk_receive_queue) ||
3496 sk_busy_loop_timeout(sk, start_time);
3497}
3498EXPORT_SYMBOL(sk_busy_loop_end);
3499#endif /* CONFIG_NET_RX_BUSY_POLL */