/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it in all user
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

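/*
 * Note: the "27" and "28" slots below never received symbolic names;
 * they correspond to address families (AF_IB and AF_MPLS in current
 * uapi headers) that have no dedicated lock strings here.
 */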
#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
	"rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
	"rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
	"rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
	"rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
	"rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
	"rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
	"rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
	"rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
	"rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
	"rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
	"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
	"rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
	"rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	"wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
	"wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
	"wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
	"wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
	"wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
	"wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
	"wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
	"wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
	"wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
	"wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
	"wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
	"wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
	"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
	"wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
	"wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	"elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
	"elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
	"elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
	"elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
	"elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
	"elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
	"elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
	"elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
	"elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
	"elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
	"elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
	"elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
	"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
	"elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
	"elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

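/*
 * Copy a user-supplied struct timeval into a timeout in jiffies.  A
 * timeout of zero means "block forever" (MAX_SCHEDULE_TIMEOUT); a
 * negative tv_sec is clamped to "do not block" with a ratelimited
 * warning.
 */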
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

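/*
 * AF_UNSPEC and AF_UNIX sockets never carry packet timestamps, so they
 * are excluded from the global netstamp enable/disable bookkeeping.
 */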
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

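/*
 * Deliver an skb to a socket that may be owned by user context: run the
 * backlog receive handler directly when the socket is unowned (annotated
 * for lockdep as trylock + unlock), otherwise queue onto the socket
 * backlog, dropping the packet if that would overflow sk_rcvbuf.
 */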
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

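/*
 * Report whether multicast packets sent on this socket should also be
 * looped back locally.  Defaults to true for unknown families, and
 * returns false when the transmit path has already recursed too deeply.
 */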
bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't and, if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't and, if you think
		 * about it, this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

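		/* SOF_TIMESTAMPING_OPT_ID tags outgoing packets with an id.
		 * For TCP stream sockets the id is derived from the write
		 * sequence (anchored at the current snd_una), so enabling it
		 * is refused while the connection is in CLOSE or LISTEN state.
		 */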
		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (val != ~0U)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

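/*
 * Copy the supplementary group list to userspace, translating each gid
 * into the caller's user namespace.
 */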
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

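/*
 * Generic SOL_SOCKET option getter: each result is staged in a small
 * on-stack union and copied out, clamped to the caller-supplied length.
 */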
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

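/*
 * Allocate a struct sock, preferably from the protocol's own slab cache.
 * For slab allocations __GFP_ZERO is handled by hand: sk_prot_clear_nulls()
 * clears the object while leaving the nulls list pointer intact, so
 * concurrent RCU lookups never observe a torn node.
 */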
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

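/*
 * Initialise the queues and callback lock common to every socket, and
 * register their per-address-family lockdep classes and names.
 */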
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err	   = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

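/*
 * Attach a route cache entry to the socket and derive the socket's
 * hardware offload capabilities (scatter-gather, checksumming, GSO
 * size and segment limits) from the output device's feature flags.
 */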
9958089a
AK
1774void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1775{
d6a4e26a
ED
1776 u32 max_segs = 1;
1777
6bd4f355 1778 sk_dst_set(sk, dst);
0a6b2a1d 1779 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1780 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1781 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1782 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1783 if (sk_can_gso(sk)) {
f70f250a 1784 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1785 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1786 } else {
9958089a 1787 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1788 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1789 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1790 }
9958089a 1791 }
d6a4e26a 1792 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1793}
1794EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 * Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after the sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
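
/* Editor's note: a brief usage sketch, not part of the original source.
 * A freshly built skb is attached to its sending socket so sk_wmem_alloc
 * tracks it and sock_wfree() releases the charge when the skb is freed.
 * my_alloc_tx() is a hypothetical name.
 */
#if 0	/* example only */
static struct sk_buff *my_alloc_tx(struct sock *sk, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, sk->sk_allocation);

	if (skb)
		skb_set_owner_w(skb, sk);	/* charges sk_wmem_alloc */
	return skb;
}
#endif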

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
	    ) {
		struct sock *sk = skb->sk;

		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
			skb->destructor = sock_efree;
		}
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate an skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);

		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
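
/* Editor's note: a minimal usage sketch, not part of the original source.
 * It shows how a protocol might pull a control frame out of the send
 * buffer with sock_wmalloc(); my_build_frame() and MY_FRAME_LEN are
 * hypothetical names invented for illustration.
 */
#if 0	/* example only */
static int my_send_control_frame(struct sock *sk)
{
	/* non-forced: fails once sk_wmem_alloc already fills sk_sndbuf */
	struct sk_buff *skb = sock_wmalloc(sk, MY_FRAME_LEN, 0, GFP_KERNEL);

	if (!skb)
		return -ENOBUFS;
	my_build_frame(skb);		/* fill in the protocol payload */
	return dev_queue_xmit(skb);	/* skb is already owned by sk */
}
#endif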

static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}

struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    sysctl_optmem_max)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
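
/* Editor's note: a short sketch (not in the original file) of the intended
 * pairing: every sock_kmalloc() must be undone with sock_kfree_s() (or
 * sock_kzfree_s() for key material) using the same size, so that
 * sk_omem_alloc stays balanced. struct my_opt is hypothetical.
 */
#if 0	/* example only */
static int my_set_option(struct sock *sk, int len)
{
	struct my_opt *opt = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	/* ... install opt on the socket; on the error path: */
	sock_kfree_s(sk, opt, len);	/* uncharges sk_omem_alloc */
	return 0;
}
#endif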

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}

/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
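
/* Editor's note: an illustrative (non-original) sendmsg-style caller.
 * sock_alloc_send_skb() sleeps until write space is available, honouring
 * the send timeout and pending signals. MY_HDR_LEN and my_queue_xmit()
 * are made-up names.
 */
#if 0	/* example only */
static int my_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, MY_HDR_LEN + len,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, -ERESTARTSYS, ... */
	skb_reserve(skb, MY_HDR_LEN);
	if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
		kfree_skb(skb);
		return -EFAULT;
	}
	return my_queue_xmit(skb);	/* hypothetical transmit hook */
}
#endif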

int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
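
/* Editor's note: sketch (not in the original source) of how transmit paths
 * seed a sockcm_cookie with the socket defaults and then let
 * sock_cmsg_send() override them from SOL_SOCKET control messages, in the
 * style of tcp_sendmsg()/udp_sendmsg(). my_apply_cmsgs() is hypothetical.
 */
#if 0	/* example only */
static int my_apply_cmsgs(struct sock *sk, struct msghdr *msg)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };

	if (msg->msg_controllen) {
		int err = sock_cmsg_send(sk, msg, &sockc);

		if (unlikely(err))
			return err;
	}
	/* use sockc.mark / sockc.tsflags when building the skb */
	return 0;
}
#endif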

static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && *memory_pressure)
			*memory_pressure = 0;
	}
}

/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
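
/* Editor's note: illustrative (non-original) fill loop. A sender refills
 * the per-socket page_frag and copies user data at pfrag->offset,
 * advancing the offset by whatever it consumed. my_copy_to_frag() is a
 * hypothetical helper.
 */
#if 0	/* example only */
static int my_copy_to_frag(struct sock *sk, struct iov_iter *from, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	while (len > 0) {
		int copy;

		if (!sk_page_frag_refill(sk, pfrag))
			return -ENOBUFS;	/* also enters memory pressure */
		copy = min_t(int, len, pfrag->size - pfrag->offset);
		if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
				   copy, from) != copy)
			return -EFAULT;
		pfrag->offset += copy;
		len -= copy;
	}
	return 0;
}
#endif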

int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
		int first_coalesce)
{
	int sg_curr = *sg_curr_index, use = 0, rc = 0;
	unsigned int size = *sg_curr_size;
	struct page_frag *pfrag;
	struct scatterlist *sge;

	len -= size;
	pfrag = sk_page_frag(sk);

	while (len > 0) {
		unsigned int orig_offset;

		if (!sk_page_frag_refill(sk, pfrag)) {
			rc = -ENOMEM;
			goto out;
		}

		use = min_t(int, len, pfrag->size - pfrag->offset);

		if (!sk_wmem_schedule(sk, use)) {
			rc = -ENOMEM;
			goto out;
		}

		sk_mem_charge(sk, use);
		size += use;
		orig_offset = pfrag->offset;
		pfrag->offset += use;

		/* coalesce with the previous entry, not with sg[0] */
		sge = sg + sg_curr - 1;
		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
		    sge->offset + sge->length == orig_offset) {
			sge->length += use;
		} else {
			sge = sg + sg_curr;
			sg_unmark_end(sge);
			sg_set_page(sge, pfrag->page, use, orig_offset);
			get_page(pfrag->page);
			sg_curr++;

			if (sg_curr == MAX_SKB_FRAGS)
				sg_curr = 0;

			if (sg_curr == sg_start) {
				rc = -ENOSPC;
				break;
			}
		}

		len -= use;
	}
out:
	*sg_curr_size = size;
	*sg_curr_index = sg_curr;
	return rc;
}
EXPORT_SYMBOL(sk_alloc_sg);

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
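
/* Editor's note: a condensed (non-original) receive loop showing the
 * intended sk_wait_data() pattern: peek under the socket lock, sleep when
 * nothing has been appended, and retry until data, signal or timeout.
 * my_wait_for_skb() is hypothetical.
 */
#if 0	/* example only */
static struct sk_buff *my_wait_for_skb(struct sock *sk, int noblock, int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);	/* caller holds the socket lock */
	}
	return skb;
}
#endif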

/**
 * __sk_mem_raise_allocated - increase memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @amt: pages to allocate
 * @kind: allocation type
 *
 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
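
/* Editor's note: sketch (not part of the file) of the receive-side charging
 * pattern built on top of __sk_mem_schedule() via the sk_rmem_schedule()
 * wrapper: account the skb before queueing it, or drop it.
 * my_queue_rcv_skb() is hypothetical.
 */
#if 0	/* example only */
static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over the protocol memory limits */

	skb->sk = sk;
	skb->destructor = sock_rfree;	/* will uncharge on free */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	sk_mem_charge(sk, skb->truesize);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}
#endif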

/**
 * __sk_mem_reduce_allocated - reclaim memory_allocated
 * @sk: socket
 * @amount: number of quanta
 *
 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);

/**
 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);

ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head = NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
		sk->sk_uid = SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq = NULL;
		sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
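
/* Editor's note: the canonical (non-original) lock_sock()/release_sock()
 * bracket for process context. Packets arriving while the socket is owned
 * are backlogged and replayed by __release_sock() on release.
 * my_setsockopt_path() is hypothetical.
 */
#if 0	/* example only */
static void my_setsockopt_path(struct sock *sk)
{
	lock_sock(sk);
	/* ... mutate socket state; softirq input goes to the backlog ... */
	release_sock(sk);	/* runs the backlog, then wakes waiters */
}
#endif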

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Return true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
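
/* Editor's note: illustrative (non-original) use of the fast lock. The
 * returned bool must be handed back to unlock_sock_fast() so it can undo
 * whichever path (spinlock vs. full ownership) was taken.
 * my_quick_update() is hypothetical.
 */
#if 0	/* example only */
static void my_quick_update(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-blocking critical section */
	unlock_sock_fast(sk, slow);
}
#endif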

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}

int sock_inuse_get(struct net *net)
{
	int cpu, res = 0;

	for_each_possible_cpu(cpu)
		res += *per_cpu_ptr(net->core.sock_inuse, cpu);

	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
	if (net->core.prot_inuse == NULL)
		return -ENOMEM;

	net->core.sock_inuse = alloc_percpu(int);
	if (net->core.sock_inuse == NULL)
		goto out;

	return 0;

out:
	free_percpu(net->core.prot_inuse);
	return -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
#endif

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
3307
3308void proto_unregister(struct proto *prot)
3309{
36b77a52 3310 mutex_lock(&proto_list_mutex);
13ff3d6f 3311 release_proto_idx(prot);
0a3f4358 3312 list_del(&prot->node);
36b77a52 3313 mutex_unlock(&proto_list_mutex);
1da177e4 3314
adf78eda
JL
3315 kmem_cache_destroy(prot->slab);
3316 prot->slab = NULL;
1da177e4 3317
0159dfd3 3318 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3319
6d6ee43e 3320 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3321 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3322 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3323 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3324 }
1da177e4 3325}
1da177e4
LT
3326EXPORT_SYMBOL(proto_unregister);
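
/* Editor's note: minimal (non-original) module skeleton showing the
 * proto_register()/proto_unregister() pairing; "my_proto" and
 * struct my_sock are hypothetical.
 */
#if 0	/* example only */
static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct my_sock),
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto, 1);	/* 1: allocate a slab cache */
}

static void __exit my_proto_exit(void)
{
	proto_unregister(&my_proto);
}
#endif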

int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	       proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */