/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and that the current process has it over the network
 * namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
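
/* Example (userspace sketch, hypothetical fd): the timeval format that
 * sock_set_timeout() above parses for SO_RCVTIMEO/SO_SNDTIMEO. A timeval
 * of zero disables the timeout (MAX_SCHEDULE_TIMEOUT, i.e. block forever),
 * and a tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
 *		perror("setsockopt(SO_RCVTIMEO)");
 */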

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

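/* Example (userspace sketch, hypothetical fd and device name): binding a
 * socket to one interface, as handled by sock_setbindtodevice() above;
 * requires CAP_NET_RAW, and an empty name (or zero optlen) unbinds again:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)))
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */
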
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

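	/* Example of the doubling above (userspace sketch, hypothetical fd):
	 * after setting SO_RCVBUF to 65536, getsockopt() reports the value
	 * the kernel actually uses, i.e. 131072, assuming the request is
	 * under sysctl_rmem_max:
	 *
	 *	int val = 65536, out = 0;
	 *	socklen_t len = sizeof(out);
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
	 *	// out is now 131072, i.e. val * 2
	 */
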
	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

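	/* Example (userspace sketch, hypothetical fd): requesting software
	 * TX timestamps with identifiers, as validated above. With OPT_ID
	 * on a connected TCP socket, sk_tskey starts at snd_una so timestamp
	 * completions can be correlated with the bytes that were sent:
	 *
	 *	int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
	 *		    SOF_TIMESTAMPING_SOFTWARE |
	 *		    SOF_TIMESTAMPING_OPT_ID;
	 *
	 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
	 *		       &flags, sizeof(flags)))
	 *		perror("setsockopt(SO_TIMESTAMPING)");
	 */
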
	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
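
/* Example (userspace sketch, hypothetical fd): enabling SO_LINGER with a
 * ten second timeout, matching the struct linger parsing above; setting
 * l_onoff back to 0 clears SOCK_LINGER again:
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 10 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling)))
 *		perror("setsockopt(SO_LINGER)");
 */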

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

void sk_destruct(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

static void __sk_free(struct sock *sk)
{
	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
			sock_update_memcg(newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan the skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

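/* Example (in-kernel sketch, hypothetical caller): option memory allocated
 * with sock_kmalloc() is charged against sk_omem_alloc, so it must be freed
 * through sock_kfree_s() (or sock_kzfree_s() for key material) with the
 * same size so the charge is returned:
 *
 *	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */
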
1da177e4
LT
1787/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1788 I think, these locks should be removed for datagram sockets.
1789 */
2a91525c 1790static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
1791{
1792 DEFINE_WAIT(wait);
1793
9cd3e072 1794 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
1795 for (;;) {
1796 if (!timeo)
1797 break;
1798 if (signal_pending(current))
1799 break;
1800 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 1801 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1da177e4
LT
1802 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1803 break;
1804 if (sk->sk_shutdown & SEND_SHUTDOWN)
1805 break;
1806 if (sk->sk_err)
1807 break;
1808 timeo = schedule_timeout(timeo);
1809 }
aa395145 1810 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
1811 return timeo;
1812}


/*
 * Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
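
/* Illustrative sketch (not part of the original file): a minimal datagram
 * transmit path built on sock_alloc_send_skb(), which blocks (subject to
 * the send timeout) until write-buffer space is available. MAX_HEADER is
 * real; the function name and the handling details are hypothetical.
 */
#if 0
static int example_xmit(struct sock *sk, size_t len, int noblock)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, &err);
	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, sock_intr_errno(), ... */

	skb_reserve(skb, MAX_HEADER);
	/* ... copy len bytes of payload and hand the skb to the stack ... */
	kfree_skb(skb);		/* sketch only; a real path transmits it */
	return 0;
}
#endif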

int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
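
/* Illustrative sketch (not part of the original file): a sendmsg handler
 * letting sock_cmsg_send() parse SOL_SOCKET control messages (currently
 * SO_MARK) into a sockcm_cookie before building the packet. The function
 * name is hypothetical.
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sockcm_cookie sockc = { .mark = sk->sk_mark };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* ... use sockc.mark when constructing the skb ... */
	return len;
}
#endif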

/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
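
/* Illustrative sketch (not part of the original file): appending data to
 * the per-socket (or per-task) page fragment, as stream protocols do when
 * building paged skbs. It assumes the fragment page is a lowmem page
 * (sk->sk_allocation without __GFP_HIGHMEM); the function name is
 * hypothetical.
 */
#if 0
static int example_append(struct sock *sk, const char *data, int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int copy;

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
	/* ... attach [offset, offset + copy) to an skb frag, taking a
	 * page reference, then advance the cursor:
	 */
	pfrag->offset += copy;
	return copy;
}
#endif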

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk: sock to wait on
 * @timeo: for how long
 * @skb: last skb seen on sk_receive_queue
 *
 * Socket state, including sk->sk_err, is changed only under the socket
 * lock, hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
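
/* Illustrative sketch (not part of the original file): the classic recvmsg
 * blocking loop around sk_wait_data(), which drops and retakes the socket
 * lock across schedule(). The function name is hypothetical.
 */
#if 0
static int example_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);

	lock_sock(sk);
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!timeo) {
			release_sock(sk);
			return -EAGAIN;
		}
		if (signal_pending(current)) {
			release_sock(sk);
			return sock_intr_errno(timeo);
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	release_sock(sk);
	return 0;
}
#endif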

/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
			return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk_memory_allocated_sub(sk, amount);
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
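
/* Illustrative sketch (not part of the original file): how receive-side
 * charging usually reaches __sk_mem_schedule(). sk_rmem_schedule() is the
 * real inline wrapper that checks sk_forward_alloc first and falls back
 * to __sk_mem_schedule(..., SK_MEM_RECV); the function name below is
 * hypothetical.
 */
#if 0
static int example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;	/* over limits, allocation suppressed */
	skb_set_owner_r(skb, sk);	/* consumes forward allocation */
	return 0;
}
#endif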


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
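
/* Illustrative sketch (not part of the original file): a connection-less
 * protocol wiring the sock_no_*() stubs into its proto_ops so that
 * unsupported calls fail cleanly. PF_EXAMPLE and the example_*() handlers
 * are hypothetical; datagram_poll is the real generic helper.
 */
#if 0
static const struct proto_ops example_dgram_ops = {
	.family		= PF_EXAMPLE,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= example_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= example_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= example_sendmsg,
	.recvmsg	= example_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};
#endif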
2269/*
2270 * Default Socket Callbacks
2271 */
2272
2273static void sock_def_wakeup(struct sock *sk)
2274{
43815482
ED
2275 struct socket_wq *wq;
2276
2277 rcu_read_lock();
2278 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2279 if (skwq_has_sleeper(wq))
43815482
ED
2280 wake_up_interruptible_all(&wq->wait);
2281 rcu_read_unlock();
1da177e4
LT
2282}
2283
2284static void sock_def_error_report(struct sock *sk)
2285{
43815482
ED
2286 struct socket_wq *wq;
2287
2288 rcu_read_lock();
2289 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2290 if (skwq_has_sleeper(wq))
43815482 2291 wake_up_interruptible_poll(&wq->wait, POLLERR);
8d8ad9d7 2292 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2293 rcu_read_unlock();
1da177e4
LT
2294}
2295
676d2369 2296static void sock_def_readable(struct sock *sk)
1da177e4 2297{
43815482
ED
2298 struct socket_wq *wq;
2299
2300 rcu_read_lock();
2301 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2302 if (skwq_has_sleeper(wq))
2c6607c6 2303 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
37e5540b 2304 POLLRDNORM | POLLRDBAND);
8d8ad9d7 2305 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2306 rcu_read_unlock();
1da177e4
LT
2307}
2308
2309static void sock_def_write_space(struct sock *sk)
2310{
43815482
ED
2311 struct socket_wq *wq;
2312
2313 rcu_read_lock();
1da177e4
LT
2314
2315 /* Do not wake up a writer until he can make "significant"
2316 * progress. --DaveM
2317 */
e71a4783 2318 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2319 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2320 if (skwq_has_sleeper(wq))
43815482 2321 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
37e5540b 2322 POLLWRNORM | POLLWRBAND);
1da177e4
LT
2323
2324 /* Should agree with poll, otherwise some programs break */
2325 if (sock_writeable(sk))
8d8ad9d7 2326 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2327 }
2328
43815482 2329 rcu_read_unlock();
1da177e4
LT
2330}
2331
2332static void sock_def_destruct(struct sock *sk)
2333{
1da177e4
LT
2334}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
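
/* Illustrative sketch (not part of the original file): a protocol's
 * create() hook calling sock_init_data() and then overriding one of the
 * default callbacks it installs. PF_EXAMPLE, example_proto and
 * example_data_ready are hypothetical.
 */
#if 0
static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* queues, timer, default callbacks */
	sk->sk_data_ready = example_data_ready;
	return 0;
}
#endif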

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
2465
8a74ad60
ED
2466/**
2467 * lock_sock_fast - fast version of lock_sock
2468 * @sk: socket
2469 *
2470 * This version should be used for very small section, where process wont block
2471 * return false if fast path is taken
2472 * sk_lock.slock locked, owned = 0, BH disabled
2473 * return true if slow path is taken
2474 * sk_lock.slock unlocked, owned = 1, BH enabled
2475 */
2476bool lock_sock_fast(struct sock *sk)
2477{
2478 might_sleep();
2479 spin_lock_bh(&sk->sk_lock.slock);
2480
2481 if (!sk->sk_lock.owned)
2482 /*
2483 * Note : We must disable BH
2484 */
2485 return false;
2486
2487 __lock_sock(sk);
2488 sk->sk_lock.owned = 1;
2489 spin_unlock(&sk->sk_lock.slock);
2490 /*
2491 * The sk_lock has mutex_lock() semantics here:
2492 */
2493 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2494 local_bh_enable();
2495 return true;
2496}
2497EXPORT_SYMBOL(lock_sock_fast);
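
/* Illustrative sketch (not part of the original file): the intended
 * lock_sock_fast()/unlock_sock_fast() pairing for a short section that
 * never sleeps. The function name is hypothetical.
 */
#if 0
static void example_update_counter(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-sleeping critical section */

	unlock_sock_fast(sk, slow);	/* undoes whichever path was taken */
}
#endif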

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
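
/* Illustrative sketch (not part of the original file): the userspace side
 * of sock_recv_errqueue(), draining a socket's error queue with
 * MSG_ERRQUEUE (e.g. for tx timestamps). Buffer sizes are arbitrary.
 */
#if 0
/* userspace code, not kernel code */
#include <sys/socket.h>

static int drain_errqueue(int fd)
{
	char data[256], ctrl[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= ctrl,
		.msg_controllen	= sizeof(ctrl),
	};

	return recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
}
#endif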

/*
 * Get a socket option on a socket.
 *
 * FIX: POSIX 1003.1g is very ambiguous here. It states that
 * asynchronous errors should be reported by getsockopt. We assume
 * this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 * Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
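
/* Illustrative sketch (not part of the original file): a minimal module
 * init/exit pairing proto_register() (with alloc_slab=1, so a kmem cache
 * named after the proto is created) with proto_unregister().
 * example_proto and struct example_sock are hypothetical.
 */
#if 0
struct example_sock {
	struct sock sk;
};

static struct proto example_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct example_sock),
};

static int __init example_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&example_proto);
}
#endif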

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}


static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */