af_unix: Annotate data-race of sk->sk_state in unix_inq_len().
net/unix/af_unix.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:        Implementation of BSD Unix domain sockets.
 *
 * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *              Linus Torvalds  :       Assorted bug cures.
 *              Niibe Yutaka    :       async I/O support.
 *              Carsten Paeth   :       PF_UNIX check, address fixes.
 *              Alan Cox        :       Limit size of allocated blocks.
 *              Alan Cox        :       Fixed the stupid socketpair bug.
 *              Alan Cox        :       BSD compatibility fine tuning.
 *              Alan Cox        :       Fixed a bug in connect when interrupted.
 *              Alan Cox        :       Sorted out a proper draft version of
 *                                      file descriptor passing hacked up from
 *                                      Mike Shaver's work.
 *              Marty Leisner   :       Fixes to fd passing
 *              Nick Nevin      :       recvmsg bugfix.
 *              Alan Cox        :       Started proper garbage collector
 *              Heiko EiBfeldt  :       Missing verify_area check
 *              Alan Cox        :       Started POSIXisms
 *              Andreas Schwab  :       Replace inode by dentry for proper
 *                                      reference counting
 *              Kirk Petersen   :       Made this a module
 *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
 *                                      Lots of bug fixes.
 *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
 *                                      by above two patches.
 *           Andrea Arcangeli   :       If possible we block in connect(2)
 *                                      if the max backlog of the listen socket
 *                                      has been reached. This won't break
 *                                      old apps and it will avoid a huge amount
 *                                      of socks being hashed (this is for
 *                                      unix_gc() performance reasons).
 *                                      Security fix that limits the max
 *                                      number of socks to 2*max_files and
 *                                      the number of skbs queueable in the
 *                                      dgram receiver.
 *              Artur Skawina   :       Hash function optimizations
 *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
 *            Malcolm Beattie   :       Set peercred for socketpair
 *           Michal Ostrowski   :       Module initialization cleanup.
 *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
 *                                      the core infrastructure is doing that
 *                                      for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *      [TO FIX]
 *      ECONNREFUSED is not returned from one end of a connected() socket to the
 *              other the moment one end closes.
 *      fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *      [NOT TO FIX]
 *      accept() returns a path name even if the connecting socket has closed
 *              in the meantime (BSD loses the path and gives up).
 *      accept() returns 0 length path for an unbound connector. BSD returns 16
 *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *      BSD af_unix apparently has connect forgetting to block properly.
 *              (need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *      Bug fixes and improvements.
 *              - client shutdown killed server socket.
 *              - removed all useless cli/sti pairs.
 *
 *      Semantic changes/extensions.
 *              - generic control message passing.
 *              - SCM_CREDENTIALS control message.
 *              - "Abstract" (not FS based) socket bindings.
 *                Abstract names are sequences of bytes (not zero terminated)
 *                started by 0, so that this name space does not intersect
 *                with BSD names.
 */
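
/* Editor's illustration (not part of the kernel build): how the two
 * address flavours described above look from userspace.  The name and
 * lengths below are arbitrary examples.
 *
 *        struct sockaddr_un un = { .sun_family = AF_UNIX };
 *
 *        // Filesystem-based name: a NUL-terminated path in sun_path.
 *        strcpy(un.sun_path, "/tmp/mysock");
 *        bind(fd, (struct sockaddr *)&un, sizeof(un));
 *
 *        // Abstract name: sun_path starts with a zero byte; the name is
 *        // the following bytes, and the address length delimits it.
 *        un.sun_path[0] = '\0';
 *        memcpy(un.sun_path + 1, "myname", 6);
 *        bind(fd, (struct sockaddr *)&un,
 *             offsetof(struct sockaddr_un, sun_path) + 1 + 6);
 */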

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */

static unsigned int unix_unbound_hash(struct sock *sk)
{
        unsigned long hash = (unsigned long)sk;

        hash ^= hash >> 16;
        hash ^= hash >> 8;
        hash ^= sk->sk_type;

        return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
        return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        __wsum csum = csum_partial(sunaddr, addr_len, 0);
        unsigned int hash;

        hash = (__force unsigned int)csum_fold(csum);
        hash ^= hash >> 8;
        hash ^= type;

        return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
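
/* Editor's note: assuming UNIX_HASH_MOD == UNIX_HASH_SIZE / 2 - 1 (which
 * the bsd_socket_buckets[UNIX_HASH_SIZE / 2] arrays above rely on), the
 * three helpers partition the hash space: unix_unbound_hash() and
 * unix_bsd_hash() both return values in [0, UNIX_HASH_MOD], while
 * unix_abstract_hash() returns values in [UNIX_HASH_MOD + 1,
 * 2 * UNIX_HASH_MOD + 1], so abstract sockets never share a bucket with
 * unbound or pathname sockets in the per-netns table.
 */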

static void unix_table_double_lock(struct net *net,
                                   unsigned int hash1, unsigned int hash2)
{
        if (hash1 == hash2) {
                spin_lock(&net->unx.table.locks[hash1]);
                return;
        }

        if (hash1 > hash2)
                swap(hash1, hash2);

        spin_lock(&net->unx.table.locks[hash1]);
        spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

static void unix_table_double_unlock(struct net *net,
                                     unsigned int hash1, unsigned int hash2)
{
        if (hash1 == hash2) {
                spin_unlock(&net->unx.table.locks[hash1]);
                return;
        }

        spin_unlock(&net->unx.table.locks[hash1]);
        spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
        return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
        return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(const struct sock *sk)
{
        return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
        return skb_queue_len_lockless(&sk->sk_receive_queue) >
                READ_ONCE(sk->sk_max_ack_backlog);
}

struct sock *unix_peer_get(struct sock *s)
{
        struct sock *peer;

        unix_state_lock(s);
        peer = unix_peer(s);
        if (peer)
                sock_hold(peer);
        unix_state_unlock(s);
        return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
                                             int addr_len)
{
        struct unix_address *addr;

        addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
        if (!addr)
                return NULL;

        refcount_set(&addr->refcnt, 1);
        addr->len = addr_len;
        memcpy(addr->name, sunaddr, addr_len);

        return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
        if (refcount_dec_and_test(&addr->refcnt))
                kfree(addr);
}

/*
 *      Check unix socket name:
 *              - should not be zero length.
 *              - if it does not start with a zero byte, it should be
 *                NULL terminated (FS object)
 *              - if it starts with a zero byte, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
        if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
            addr_len > sizeof(*sunaddr))
                return -EINVAL;

        if (sunaddr->sun_family != AF_UNIX)
                return -EINVAL;

        return 0;
}
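
/* Editor's illustration of the checks above from userspace (hypothetical
 * values; a validated address may of course still fail to resolve later):
 *
 *        struct sockaddr_un un = { .sun_family = AF_UNIX };
 *
 *        // Rejected: addr_len leaves no room for any sun_path byte.
 *        connect(fd, (struct sockaddr *)&un,
 *                offsetof(struct sockaddr_un, sun_path));        // -EINVAL
 *
 *        // Passes validation: one sun_path byte (a zero, so an abstract
 *        // name); unix_find_other() may still return -ECONNREFUSED.
 *        un.sun_path[0] = '\0';
 *        connect(fd, (struct sockaddr *)&un,
 *                offsetof(struct sockaddr_un, sun_path) + 1);
 */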

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
        struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
        short offset = offsetof(struct sockaddr_storage, __data);

        BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

        /* This may look like an off by one error but it is a bit more
         * subtle.  108 is the longest valid AF_UNIX path for a binding.
         * sun_path[108] doesn't as such exist.  However in kernel space
         * we are guaranteed that it is a valid memory location in our
         * kernel address buffer because syscall functions always pass
         * a pointer of struct sockaddr_storage which has a bigger buffer
         * than 108.  Also, we must terminate sun_path for strlen() in
         * getname_kernel().
         */
        addr->__data[addr_len - offset] = 0;

        /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
         * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
         * know the actual buffer.
         */
        return strlen(addr->__data) + offset + 1;
}
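
/* Worked example (editor's illustration): for sun_path = "/tmp/x" passed
 * with addr_len = offsetof(struct sockaddr_un, sun_path) + 7 (six path
 * bytes plus the NUL), the store above zeroes __data[7], strlen() finds
 * 6, and the function returns 6 + offset + 1, i.e. the same addr_len.
 * For a full, unterminated 108-byte path the terminating zero lands in
 * sun_path[108], which is only valid because the buffer really is a
 * struct sockaddr_storage.
 */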

static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}

static void __unix_insert_socket(struct net *net, struct sock *sk)
{
        DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
        sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

static void __unix_set_addr_hash(struct net *net, struct sock *sk,
                                 struct unix_address *addr, unsigned int hash)
{
        __unix_remove_socket(sk);
        smp_store_release(&unix_sk(sk)->addr, addr);

        sk->sk_hash = hash;
        __unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
        spin_lock(&net->unx.table.locks[sk->sk_hash]);
        __unix_remove_socket(sk);
        spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
        spin_lock(&net->unx.table.locks[sk->sk_hash]);
        __unix_insert_socket(net, sk);
        spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
        spin_lock(&bsd_socket_locks[sk->sk_hash]);
        sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
        spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

static void unix_remove_bsd_socket(struct sock *sk)
{
        if (!hlist_unhashed(&sk->sk_bind_node)) {
                spin_lock(&bsd_socket_locks[sk->sk_hash]);
                __sk_del_bind_node(sk);
                spin_unlock(&bsd_socket_locks[sk->sk_hash]);

                sk_node_init(&sk->sk_bind_node);
        }
}

static struct sock *__unix_find_socket_byname(struct net *net,
                                              struct sockaddr_un *sunname,
                                              int len, unsigned int hash)
{
        struct sock *s;

        sk_for_each(s, &net->unx.table.buckets[hash]) {
                struct unix_sock *u = unix_sk(s);

                if (u->addr->len == len &&
                    !memcmp(u->addr->name, sunname, len))
                        return s;
        }
        return NULL;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
                                                   struct sockaddr_un *sunname,
                                                   int len, unsigned int hash)
{
        struct sock *s;

        spin_lock(&net->unx.table.locks[hash]);
        s = __unix_find_socket_byname(net, sunname, len, hash);
        if (s)
                sock_hold(s);
        spin_unlock(&net->unx.table.locks[hash]);
        return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
        unsigned int hash = unix_bsd_hash(i);
        struct sock *s;

        spin_lock(&bsd_socket_locks[hash]);
        sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
                struct dentry *dentry = unix_sk(s)->path.dentry;

                if (dentry && d_backing_inode(dentry) == i) {
                        sock_hold(s);
                        spin_unlock(&bsd_socket_locks[hash]);
                        return s;
                }
        }
        spin_unlock(&bsd_socket_locks[hash]);
        return NULL;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */
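
/* Editor's sketch of the relay path described above, in terms of the
 * helpers defined below:
 *
 *        sender's poll()/sendmsg() sees the peer's queue is full
 *          -> unix_dgram_peer_wake_me(sk, other)
 *               -> unix_dgram_peer_wake_connect() enqueues sk's
 *                  u->peer_wake entry on other's peer_wait queue
 *        receiver dequeues a datagram and wakes other's peer_wait
 *          -> unix_dgram_peer_wake_relay() removes the entry and does a
 *             wake_up on sk's own wait queue, so the sender re-polls
 */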

static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
                                      void *key)
{
        struct unix_sock *u;
        wait_queue_head_t *u_sleep;

        u = container_of(q, struct unix_sock, peer_wake);

        __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
                            q);
        u->peer_wake.private = NULL;

        /* relaying can only happen while the wq still exists */
        u_sleep = sk_sleep(&u->sk);
        if (u_sleep)
                wake_up_interruptible_poll(u_sleep, key_to_poll(key));

        return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
        struct unix_sock *u, *u_other;
        int rc;

        u = unix_sk(sk);
        u_other = unix_sk(other);
        rc = 0;
        spin_lock(&u_other->peer_wait.lock);

        if (!u->peer_wake.private) {
                u->peer_wake.private = other;
                __add_wait_queue(&u_other->peer_wait, &u->peer_wake);

                rc = 1;
        }

        spin_unlock(&u_other->peer_wait.lock);
        return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
                                            struct sock *other)
{
        struct unix_sock *u, *u_other;

        u = unix_sk(sk);
        u_other = unix_sk(other);
        spin_lock(&u_other->peer_wait.lock);

        if (u->peer_wake.private == other) {
                __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
                u->peer_wake.private = NULL;
        }

        spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
                                                   struct sock *other)
{
        unix_dgram_peer_wake_disconnect(sk, other);
        wake_up_interruptible_poll(sk_sleep(sk),
                                   EPOLLOUT |
                                   EPOLLWRNORM |
                                   EPOLLWRBAND);
}

/* preconditions:
 *      - unix_peer(sk) == other
 *      - association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
        int connected;

        connected = unix_dgram_peer_wake_connect(sk, other);

        /* If other is SOCK_DEAD, we want to make sure we signal
         * POLLOUT, such that a subsequent write() can get a
         * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
         * to other and it's full, we will hang waiting for POLLOUT.
         */
        if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
                return 1;

        if (connected)
                unix_dgram_peer_wake_disconnect(sk, other);

        return 0;
}

static int unix_writable(const struct sock *sk)
{
        return sk->sk_state != TCP_LISTEN &&
               (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
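
/* Editor's note: the shift by 2 means a socket counts as writable while
 * its queued write memory is at most a quarter of sk_sndbuf.  E.g. with
 * sk_sndbuf == 212992 (a common net.core.wmem_default, used here purely
 * as an illustration), writability is lost once sk_wmem_alloc exceeds
 * 53248 bytes.
 */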

static void unix_write_space(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        if (unix_writable(sk)) {
                wq = rcu_dereference(sk->sk_wq);
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible_sync_poll(&wq->wait,
                                EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
                sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
        rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
        if (!skb_queue_empty(&sk->sk_receive_queue)) {
                skb_queue_purge(&sk->sk_receive_queue);
                wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

                /* If one link of a bidirectional dgram pipe is disconnected,
                 * we signal an error. Messages are lost. Do not do this
                 * when the peer was not connected to us.
                 */
                if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
                        WRITE_ONCE(other->sk_err, ECONNRESET);
                        sk_error_report(other);
                }
        }
}

static void unix_sock_destructor(struct sock *sk)
{
        struct unix_sock *u = unix_sk(sk);

        skb_queue_purge(&sk->sk_receive_queue);

        DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
        DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
        DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_info("Attempt to release alive unix socket: %p\n", sk);
                return;
        }

        if (u->addr)
                unix_release_addr(u->addr);

        atomic_long_dec(&unix_nr_socks);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
        pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
                atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct sock *skpair;
        struct sk_buff *skb;
        struct path path;
        int state;

        unix_remove_socket(sock_net(sk), sk);
        unix_remove_bsd_socket(sk);

        /* Clear state */
        unix_state_lock(sk);
        sock_orphan(sk);
        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
        path         = u->path;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        state = sk->sk_state;
        WRITE_ONCE(sk->sk_state, TCP_CLOSE);

        skpair = unix_peer(sk);
        unix_peer(sk) = NULL;

        unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (u->oob_skb) {
                kfree_skb(u->oob_skb);
                u->oob_skb = NULL;
        }
#endif

        wake_up_interruptible_all(&u->peer_wait);

        if (skpair != NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_lock(skpair);
                        /* No more writes */
                        WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                WRITE_ONCE(skpair->sk_err, ECONNRESET);
                        unix_state_unlock(skpair);
                        skpair->sk_state_change(skpair);
                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
                }

                unix_dgram_peer_wake_disconnect(sk, skpair);
                sock_put(skpair); /* It may now die */
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                if (state == TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                UNIXCB(skb).consumed = skb->len;
                kfree_skb(skb);
        }

        if (path.dentry)
                path_put(&path);

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What the above comment does talk about? --ANK(980817)
         */

        if (READ_ONCE(unix_tot_inflight))
                unix_gc();              /* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
        const struct cred *old_cred;
        struct pid *old_pid;

        spin_lock(&sk->sk_peer_lock);
        old_pid = sk->sk_peer_pid;
        old_cred = sk->sk_peer_cred;
        sk->sk_peer_pid  = get_pid(task_tgid(current));
        sk->sk_peer_cred = get_current_cred();
        spin_unlock(&sk->sk_peer_lock);

        put_pid(old_pid);
        put_cred(old_cred);
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
        const struct cred *old_cred;
        struct pid *old_pid;

        if (sk < peersk) {
                spin_lock(&sk->sk_peer_lock);
                spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(&peersk->sk_peer_lock);
                spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
        }
        old_pid = sk->sk_peer_pid;
        old_cred = sk->sk_peer_cred;
        sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
        sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

        spin_unlock(&sk->sk_peer_lock);
        spin_unlock(&peersk->sk_peer_lock);

        put_pid(old_pid);
        put_cred(old_cred);
}
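
/* The credentials recorded by init_peercred()/copy_peercred() are what
 * userspace reads back via SO_PEERCRED; a minimal sketch (editor's
 * example, error handling omitted):
 *
 *        struct ucred cred;
 *        socklen_t len = sizeof(cred);
 *
 *        getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len);
 *        // cred.pid/cred.uid/cred.gid describe the peer as of
 *        // connect()/listen() time, not getsockopt() time.
 */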

static int unix_listen(struct socket *sock, int backlog)
{
        int err;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);

        err = -EOPNOTSUPP;
        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
                goto out;       /* Only stream/seqpacket sockets accept */
        err = -EINVAL;
        if (!READ_ONCE(u->addr))
                goto out;       /* No listens on an unbound socket */
        unix_state_lock(sk);
        if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (backlog > sk->sk_max_ack_backlog)
                wake_up_interruptible_all(&u->peer_wait);
        sk->sk_max_ack_backlog  = backlog;
        WRITE_ONCE(sk->sk_state, TCP_LISTEN);

        /* set credentials so connect can copy them */
        init_peercred(sk);
        err = 0;

out_unlock:
        unix_state_unlock(sk);
out:
        return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
                                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
                                       struct pipe_inode_info *, size_t size,
                                       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
                                  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
        struct sk_buff *skb;
        struct unix_sock *u;
        int nr_fds = 0;

        spin_lock(&sk->sk_receive_queue.lock);
        skb = skb_peek(&sk->sk_receive_queue);
        while (skb) {
                u = unix_sk(skb->sk);
                nr_fds += atomic_read(&u->scm_stat.nr_fds);
                skb = skb_peek_next(skb, &sk->sk_receive_queue);
        }
        spin_unlock(&sk->sk_receive_queue.lock);

        return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
        struct sock *sk = sock->sk;
        unsigned char s_state;
        struct unix_sock *u;
        int nr_fds = 0;

        if (sk) {
                s_state = READ_ONCE(sk->sk_state);
                u = unix_sk(sk);

                /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
                 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
                 * SOCK_DGRAM is ordinary. So, no lock is needed.
                 */
                if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
                        nr_fds = atomic_read(&u->scm_stat.nr_fds);
                else if (s_state == TCP_LISTEN)
                        nr_fds = unix_count_nr_fds(sk);

                seq_printf(m, "scm_fds: %u\n", nr_fds);
        }
}
#else
#define unix_show_fdinfo NULL
#endif

static const struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .read_skb =     unix_stream_read_skb,
        .mmap =         sock_no_mmap,
        .splice_read =  unix_stream_splice_read,
        .set_peek_off = sk_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};

static const struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_dgram_sendmsg,
        .read_skb =     unix_read_skb,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = sk_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};

static const struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_seqpacket_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = sk_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};

static void unix_close(struct sock *sk, long timeout)
{
        /* Nothing to do here, unix socket does not need a ->close().
         * This is merely for sockmap.
         */
}

static void unix_unhash(struct sock *sk)
{
        /* Nothing to do here, unix socket does not need a ->unhash().
         * This is merely for sockmap.
         */
}

static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
        if (level == SOL_SOCKET) {
                switch (optname) {
                case SO_PEERPIDFD:
                        return true;
                default:
                        return false;
                }
        }

        return false;
}

struct proto unix_dgram_proto = {
        .name                   = "UNIX",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
#endif
};

struct proto unix_stream_proto = {
        .name                   = "UNIX-STREAM",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .unhash                 = unix_unhash,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_stream_bpf_update_proto,
#endif
};

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
        struct unix_sock *u;
        struct sock *sk;
        int err;

        atomic_long_inc(&unix_nr_socks);
        if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
                err = -ENFILE;
                goto err;
        }

        if (type == SOCK_STREAM)
                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
        else /* dgram and seqpacket */
                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

        if (!sk) {
                err = -ENOMEM;
                goto err;
        }

        sock_init_data(sock, sk);

        sk->sk_hash             = unix_unbound_hash(sk);
        sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
        sk->sk_write_space      = unix_write_space;
        sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
        sk->sk_destruct         = unix_sock_destructor;
        u = unix_sk(sk);
        u->listener = NULL;
        u->vertex = NULL;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        spin_lock_init(&u->lock);
        mutex_init(&u->iolock); /* single task reading lock */
        mutex_init(&u->bindlock); /* single task binding lock */
        init_waitqueue_head(&u->peer_wait);
        init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
        memset(&u->scm_stat, 0, sizeof(struct scm_stat));
        unix_insert_unbound_socket(net, sk);

        sock_prot_inuse_add(net, sk->sk_prot, 1);

        return sk;

err:
        atomic_long_dec(&unix_nr_socks);
        return ERR_PTR(err);
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct sock *sk;

        if (protocol && protocol != PF_UNIX)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        switch (sock->type) {
        case SOCK_STREAM:
                sock->ops = &unix_stream_ops;
                break;
                /*
                 *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
                 *      nothing uses it.
                 */
        case SOCK_RAW:
                sock->type = SOCK_DGRAM;
                fallthrough;
        case SOCK_DGRAM:
                sock->ops = &unix_dgram_ops;
                break;
        case SOCK_SEQPACKET:
                sock->ops = &unix_seqpacket_ops;
                break;
        default:
                return -ESOCKTNOSUPPORT;
        }

        sk = unix_create1(net, sock, kern, sock->type);
        if (IS_ERR(sk))
                return PTR_ERR(sk);

        return 0;
}

static int unix_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (!sk)
                return 0;

        sk->sk_prot->close(sk, 0);
        unix_release_sock(sk, 0);
        sock->sk = NULL;

        return 0;
}

static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct path path;
        struct sock *sk;
        int err;

        unix_mkname_bsd(sunaddr, addr_len);
        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                goto fail;

        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto path_put;

        err = -ECONNREFUSED;
        inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(inode->i_mode))
                goto path_put;

        sk = unix_find_socket_byinode(inode);
        if (!sk)
                goto path_put;

        err = -EPROTOTYPE;
        if (sk->sk_type == type)
                touch_atime(&path);
        else
                goto sock_put;

        path_put(&path);

        return sk;

sock_put:
        sock_put(sk);
path_put:
        path_put(&path);
fail:
        return ERR_PTR(err);
}

static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct dentry *dentry;
        struct sock *sk;

        sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
        if (!sk)
                return ERR_PTR(-ECONNREFUSED);

        dentry = unix_sk(sk)->path.dentry;
        if (dentry)
                touch_atime(&unix_sk(sk)->path);

        return sk;
}

static struct sock *unix_find_other(struct net *net,
                                    struct sockaddr_un *sunaddr,
                                    int addr_len, int type)
{
        struct sock *sk;

        if (sunaddr->sun_path[0])
                sk = unix_find_bsd(sunaddr, addr_len, type);
        else
                sk = unix_find_abstract(net, sunaddr, addr_len, type);

        return sk;
}

static int unix_autobind(struct sock *sk)
{
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        u32 lastnum, ordernum;
        int err;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                return err;

        if (u->addr)
                goto out;

        err = -ENOMEM;
        addr = kzalloc(sizeof(*addr) +
                       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
        addr->name->sun_family = AF_UNIX;
        refcount_set(&addr->refcnt, 1);

        old_hash = sk->sk_hash;
        ordernum = get_random_u32();
        lastnum = ordernum & 0xFFFFF;
retry:
        ordernum = (ordernum + 1) & 0xFFFFF;
        sprintf(addr->name->sun_path + 1, "%05x", ordernum);

        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
                unix_table_double_unlock(net, old_hash, new_hash);

                /* __unix_find_socket_byname() may take a long time if many
                 * names are already in use.
                 */
                cond_resched();

                if (ordernum == lastnum) {
                        /* Give up if all names seem to be in use. */
                        err = -ENOSPC;
                        unix_release_addr(addr);
                        goto out;
                }

                goto retry;
        }

        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        err = 0;

out:    mutex_unlock(&u->bindlock);
        return err;
}
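
/* A hedged userspace view of autobind: bind() with only the address
 * family (addr_len == sizeof(sa_family_t)) lands here, and the kernel
 * picks an abstract name of five hex digits that can be read back with
 * getsockname():
 *
 *        struct sockaddr_un un = { .sun_family = AF_UNIX };
 *        socklen_t len = sizeof(un);
 *
 *        bind(fd, (struct sockaddr *)&un, sizeof(sa_family_t));
 *        getsockname(fd, (struct sockaddr *)&un, &len);
 *        // un.sun_path[0] == '\0', followed by e.g. "00f3a"
 */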

static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
                         int addr_len)
{
        umode_t mode = S_IFSOCK |
               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct mnt_idmap *idmap;
        struct unix_address *addr;
        struct dentry *dentry;
        struct path parent;
        int err;

        addr_len = unix_mkname_bsd(sunaddr, addr_len);
        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        /*
         * Get the parent directory, calculate the hash for last
         * component.
         */
        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out;
        }

        /*
         * All right, let's create it.
         */
        idmap = mnt_idmap(parent.mnt);
        err = security_path_mknod(&parent, dentry, mode, 0);
        if (!err)
                err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
        if (err)
                goto out_path;
        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_unlink;
        if (u->addr)
                goto out_unlock;

        old_hash = sk->sk_hash;
        new_hash = unix_bsd_hash(d_backing_inode(dentry));
        unix_table_double_lock(net, old_hash, new_hash);
        u->path.mnt = mntget(parent.mnt);
        u->path.dentry = dget(dentry);
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        unix_insert_bsd_socket(sk);
        mutex_unlock(&u->bindlock);
        done_path_create(&parent, dentry);
        return 0;

out_unlock:
        mutex_unlock(&u->bindlock);
        err = -EINVAL;
out_unlink:
        /* failed after successful mknod?  unlink what we'd created... */
        vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
        done_path_create(&parent, dentry);
out:
        unix_release_addr(addr);
        return err == -EEXIST ? -EADDRINUSE : err;
}

static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
                              int addr_len)
{
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        int err;

        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out;

        if (u->addr) {
                err = -EINVAL;
                goto out_mutex;
        }

        old_hash = sk->sk_hash;
        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
                goto out_spin;

        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        mutex_unlock(&u->bindlock);
        return 0;

out_spin:
        unix_table_double_unlock(net, old_hash, new_hash);
        err = -EADDRINUSE;
out_mutex:
        mutex_unlock(&u->bindlock);
out:
        unix_release_addr(addr);
        return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        int err;

        if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
            sunaddr->sun_family == AF_UNIX)
                return unix_autobind(sk);

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                return err;

        if (sunaddr->sun_path[0])
                err = unix_bind_bsd(sk, sunaddr, addr_len);
        else
                err = unix_bind_abstract(sk, sunaddr, addr_len);

        return err;
}

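/* Lock a pair of unix sockets in a stable order (ascending address) so
 * that concurrent double-lockers cannot deadlock; a NULL or identical
 * sk2 degrades to locking sk1 alone.  This mirrors the bucket-index
 * ordering used by unix_table_double_lock() above.
 */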
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_lock(sk1);
                return;
        }
        if (sk1 > sk2)
                swap(sk1, sk2);

        unix_state_lock(sk1);
        unix_state_lock_nested(sk2, U_LOCK_SECOND);
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_unlock(sk1);
                return;
        }
        unix_state_unlock(sk1);
        unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                              int alen, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
        struct sock *sk = sock->sk;
        struct sock *other;
        int err;

        err = -EINVAL;
        if (alen < offsetofend(struct sockaddr, sa_family))
                goto out;

        if (addr->sa_family != AF_UNSPEC) {
                err = unix_validate_addr(sunaddr, alen);
                if (err)
                        goto out;

                err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
                if (err)
                        goto out;

                if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
                     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
                    !READ_ONCE(unix_sk(sk)->addr)) {
                        err = unix_autobind(sk);
                        if (err)
                                goto out;
                }

restart:
                other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        goto out;
                }

                unix_state_double_lock(sk, other);

                /* Apparently VFS overslept socket death. Retry. */
                if (sock_flag(other, SOCK_DEAD)) {
                        unix_state_double_unlock(sk, other);
                        sock_put(other);
                        goto restart;
                }

                err = -EPERM;
                if (!unix_may_send(sk, other))
                        goto out_unlock;

                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;

                WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
                WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
        } else {
                /*
                 *      1003.1g breaking connected state with AF_UNSPEC
                 */
                other = NULL;
                unix_state_double_lock(sk, other);
        }

        /*
         * If it was connected, reconnect.
         */
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);

                unix_peer(sk) = other;
                if (!other)
                        WRITE_ONCE(sk->sk_state, TCP_CLOSE);
                unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

                unix_state_double_unlock(sk, other);

                if (other != old_peer) {
                        unix_dgram_disconnected(sk, old_peer);

                        unix_state_lock(old_peer);
                        if (!unix_peer(old_peer))
                                WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
                        unix_state_unlock(old_peer);
                }

                sock_put(old_peer);
        } else {
                unix_peer(sk) = other;
                unix_state_double_unlock(sk, other);
        }

        return 0;

out_unlock:
        unix_state_double_unlock(sk, other);
        sock_put(other);
out:
        return err;
}
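
/* Per the 1003.1g note above, userspace can disconnect a datagram
 * socket by "connecting" it to AF_UNSPEC (editor's sketch):
 *
 *        struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *        connect(fd, &sa, sizeof(sa));        // peer dropped, state -> TCP_CLOSE
 */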

static long unix_wait_for_peer(struct sock *other, long timeo)
        __releases(&unix_sk(other)->lock)
{
        struct unix_sock *u = unix_sk(other);
        int sched;
        DEFINE_WAIT(wait);

        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

        sched = !sock_flag(other, SOCK_DEAD) &&
                !(other->sk_shutdown & RCV_SHUTDOWN) &&
                unix_recvq_full_lockless(other);

        unix_state_unlock(other);

        if (sched)
                timeo = schedule_timeout(timeo);

        finish_wait(&u->peer_wait, &wait);
        return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
        if (err)
                goto out;

        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
            !READ_ONCE(u->addr)) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we allocate after the state is locked,
           we will have to recheck everything again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

1563         /* Latch our state.
1564
1565            This is a tricky place. We need to grab our state lock while
1566            still holding the lock on the peer, which is dangerous because
1567            a deadlock is possible. The connect-to-self case and simultaneous
1568            connect attempts are eliminated by checking the socket
1569            state: other is TCP_LISTEN, and if sk were TCP_LISTEN we
1570            would have bailed out before attempting to grab the lock.
1571
1572            Still, we have to recheck the state after the socket is locked.
1573          */
1574         st = sk->sk_state;
1575
1576         switch (st) {
1577         case TCP_CLOSE:
1578                 /* This is ok... continue with connect */
1579                 break;
1580         case TCP_ESTABLISHED:
1581                 /* Socket is already connected */
1582                 err = -EISCONN;
1583                 goto out_unlock;
1584         default:
1585                 err = -EINVAL;
1586                 goto out_unlock;
1587         }
1588
1589         unix_state_lock_nested(sk, U_LOCK_SECOND);
1590
1591         if (sk->sk_state != st) {
1592                 unix_state_unlock(sk);
1593                 unix_state_unlock(other);
1594                 sock_put(other);
1595                 goto restart;
1596         }
1597
1598         err = security_unix_stream_connect(sk, other, newsk);
1599         if (err) {
1600                 unix_state_unlock(sk);
1601                 goto out_unlock;
1602         }
1603
1604         /* The way is open! Quickly set all the necessary fields... */
1605
1606         sock_hold(sk);
1607         unix_peer(newsk)        = sk;
1608         newsk->sk_state         = TCP_ESTABLISHED;
1609         newsk->sk_type          = sk->sk_type;
1610         init_peercred(newsk);
1611         newu = unix_sk(newsk);
1612         newu->listener = other;
1613         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1614         otheru = unix_sk(other);
1615
1616         /* copy address information from listening to new sock
1617          *
1618          * The contents of *(otheru->addr) and otheru->path
1619          * are seen fully set up here, since we have found
1620          * otheru in hash under its lock.  Insertion into the
1621          * hash chain we'd found it in had been done in an
1622          * earlier critical area protected by the chain's lock,
1623          * the same one where we'd set *(otheru->addr) contents,
1624          * as well as otheru->path and otheru->addr itself.
1625          *
1626          * Using smp_store_release() here to set newu->addr
1627          * is enough to make those stores, as well as stores
1628          * to newu->path visible to anyone who gets newu->addr
1629          * by smp_load_acquire().  IOW, the same guarantees
1630          * as for unix_sock instances bound in unix_bind() or
1631          * in unix_autobind().
1632          */
1633         if (otheru->path.dentry) {
1634                 path_get(&otheru->path);
1635                 newu->path = otheru->path;
1636         }
1637         refcount_inc(&otheru->addr->refcnt);
1638         smp_store_release(&newu->addr, otheru->addr);
1639
1640         /* Set credentials */
1641         copy_peercred(sk, other);
1642
1643         sock->state     = SS_CONNECTED;
1644         WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1645         sock_hold(newsk);
1646
1647         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1648         unix_peer(sk)   = newsk;
1649
1650         unix_state_unlock(sk);
1651
1652         /* queue the connection request and wake up the listening sock */
1653         spin_lock(&other->sk_receive_queue.lock);
1654         __skb_queue_tail(&other->sk_receive_queue, skb);
1655         spin_unlock(&other->sk_receive_queue.lock);
1656         unix_state_unlock(other);
1657         other->sk_data_ready(other);
1658         sock_put(other);
1659         return 0;
1660
1661 out_unlock:
1662         if (other)
1663                 unix_state_unlock(other);
1664
1665 out:
1666         kfree_skb(skb);
1667         if (newsk)
1668                 unix_release_sock(newsk, 0);
1669         if (other)
1670                 sock_put(other);
1671         return err;
1672 }
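
/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * client side that exercises unix_stream_connect() above. The pathname
 * "/tmp/example.sock" and the helper name are hypothetical. With O_NONBLOCK
 * set, a full listen backlog yields EAGAIN here instead of blocking in
 * unix_wait_for_peer().
 *
 *     #include <stdio.h>
 *     #include <string.h>
 *     #include <sys/socket.h>
 *     #include <sys/un.h>
 *     #include <unistd.h>
 *
 *     int example_stream_connect(void)
 *     {
 *             struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *             int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *             if (fd < 0)
 *                     return -1;
 *             strncpy(sun.sun_path, "/tmp/example.sock",
 *                     sizeof(sun.sun_path) - 1);
 *             if (connect(fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
 *                     perror("connect");
 *                     close(fd);
 *                     return -1;
 *             }
 *             return fd;
 *     }
 */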
1673
1674 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1675 {
1676         struct sock *ska = socka->sk, *skb = sockb->sk;
1677
1678         /* Join our sockets back to back */
1679         sock_hold(ska);
1680         sock_hold(skb);
1681         unix_peer(ska) = skb;
1682         unix_peer(skb) = ska;
1683         init_peercred(ska);
1684         init_peercred(skb);
1685
1686         ska->sk_state = TCP_ESTABLISHED;
1687         skb->sk_state = TCP_ESTABLISHED;
1688         socka->state  = SS_CONNECTED;
1689         sockb->state  = SS_CONNECTED;
1690         return 0;
1691 }
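
/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * path into unix_socketpair() above; both descriptors come back already
 * connected (TCP_ESTABLISHED/SS_CONNECTED as set above), so no
 * bind/listen/accept is needed.
 *
 *     #include <sys/socket.h>
 *     #include <unistd.h>
 *
 *     int example_socketpair(void)
 *     {
 *             int sv[2];
 *
 *             if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
 *                     return -1;
 *             write(sv[0], "ping", 4);        // now readable on sv[1]
 *             close(sv[0]);
 *             close(sv[1]);
 *             return 0;
 *     }
 */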
1692
1693 static void unix_sock_inherit_flags(const struct socket *old,
1694                                     struct socket *new)
1695 {
1696         if (test_bit(SOCK_PASSCRED, &old->flags))
1697                 set_bit(SOCK_PASSCRED, &new->flags);
1698         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1699                 set_bit(SOCK_PASSPIDFD, &new->flags);
1700         if (test_bit(SOCK_PASSSEC, &old->flags))
1701                 set_bit(SOCK_PASSSEC, &new->flags);
1702 }
1703
1704 static int unix_accept(struct socket *sock, struct socket *newsock,
1705                        struct proto_accept_arg *arg)
1706 {
1707         struct sock *sk = sock->sk;
1708         struct sk_buff *skb;
1709         struct sock *tsk;
1710
1711         arg->err = -EOPNOTSUPP;
1712         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1713                 goto out;
1714
1715         arg->err = -EINVAL;
1716         if (sk->sk_state != TCP_LISTEN)
1717                 goto out;
1718
1719         /* If the socket state is TCP_LISTEN it cannot change (for now...),
1720          * so no locks are necessary.
1721          */
1722
1723         skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1724                                 &arg->err);
1725         if (!skb) {
1726                 /* This means receive shutdown. */
1727                 if (arg->err == 0)
1728                         arg->err = -EINVAL;
1729                 goto out;
1730         }
1731
1732         tsk = skb->sk;
1733         skb_free_datagram(sk, skb);
1734         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1735
1736         /* attach accepted sock to socket */
1737         unix_state_lock(tsk);
1738         unix_update_edges(unix_sk(tsk));
1739         newsock->state = SS_CONNECTED;
1740         unix_sock_inherit_flags(sock, newsock);
1741         sock_graft(tsk, newsock);
1742         unix_state_unlock(tsk);
1743         return 0;
1744
1745 out:
1746         return arg->err;
1747 }
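
/* Editor's note: a minimal listener sketch (not part of this file); the
 * pathname is hypothetical. accept() ends up in unix_accept() above, which
 * dequeues the skb queued by unix_stream_connect() and grafts the embryo
 * sock onto the new socket.
 *
 *     #include <string.h>
 *     #include <sys/socket.h>
 *     #include <sys/un.h>
 *     #include <unistd.h>
 *
 *     int example_listener(void)
 *     {
 *             struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *             int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *             if (fd < 0)
 *                     return -1;
 *             strncpy(sun.sun_path, "/tmp/example.sock",
 *                     sizeof(sun.sun_path) - 1);
 *             unlink(sun.sun_path);
 *             if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) < 0 ||
 *                 listen(fd, 16) < 0) {
 *                     close(fd);
 *                     return -1;
 *             }
 *             return accept(fd, NULL, NULL);  // blocks unless O_NONBLOCK
 *     }
 */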
1748
1749
1750 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1751 {
1752         struct sock *sk = sock->sk;
1753         struct unix_address *addr;
1754         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1755         int err = 0;
1756
1757         if (peer) {
1758                 sk = unix_peer_get(sk);
1759
1760                 err = -ENOTCONN;
1761                 if (!sk)
1762                         goto out;
1763                 err = 0;
1764         } else {
1765                 sock_hold(sk);
1766         }
1767
1768         addr = smp_load_acquire(&unix_sk(sk)->addr);
1769         if (!addr) {
1770                 sunaddr->sun_family = AF_UNIX;
1771                 sunaddr->sun_path[0] = 0;
1772                 err = offsetof(struct sockaddr_un, sun_path);
1773         } else {
1774                 err = addr->len;
1775                 memcpy(sunaddr, addr->name, addr->len);
1776
1777                 if (peer)
1778                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1779                                                CGROUP_UNIX_GETPEERNAME);
1780                 else
1781                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1782                                                CGROUP_UNIX_GETSOCKNAME);
1783         }
1784         sock_put(sk);
1785 out:
1786         return err;
1787 }
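
/* Editor's note: a minimal sketch (not part of this file) of what the
 * unnamed-socket branch above looks like from userspace: a socket that was
 * never bound reports only sun_family, i.e. a length of
 * offsetof(struct sockaddr_un, sun_path).
 *
 *     #include <stddef.h>
 *     #include <stdio.h>
 *     #include <sys/socket.h>
 *     #include <sys/un.h>
 *
 *     void example_getsockname(int fd)
 *     {
 *             struct sockaddr_un sun;
 *             socklen_t len = sizeof(sun);
 *
 *             if (getsockname(fd, (struct sockaddr *)&sun, &len) == 0 &&
 *                 len == offsetof(struct sockaddr_un, sun_path))
 *                     printf("socket is unnamed\n");
 *     }
 */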
1788
1789 /* The "user->unix_inflight" variable is protected by the garbage
1790  * collection lock, and we just read it locklessly here. If you go
1791  * over the limit, there might be a tiny race in actually noticing
1792  * it across threads. Tough.
1793  */
1794 static inline bool too_many_unix_fds(struct task_struct *p)
1795 {
1796         struct user_struct *user = current_user();
1797
1798         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1799                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1800         return false;
1801 }
1802
1803 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1804 {
1805         if (too_many_unix_fds(current))
1806                 return -ETOOMANYREFS;
1807
1808         UNIXCB(skb).fp = scm->fp;
1809         scm->fp = NULL;
1810
1811         if (unix_prepare_fpl(UNIXCB(skb).fp))
1812                 return -ENOMEM;
1813
1814         return 0;
1815 }
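
/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * SCM_RIGHTS path that lands in unix_attach_fds(): the passed descriptors
 * travel in UNIXCB(skb).fp and are charged against the sender's
 * RLIMIT_NOFILE by too_many_unix_fds() below.
 *
 *     #include <string.h>
 *     #include <sys/socket.h>
 *     #include <sys/uio.h>
 *
 *     int example_send_fd(int sock, int fd_to_pass)
 *     {
 *             union {
 *                     char buf[CMSG_SPACE(sizeof(int))];
 *                     struct cmsghdr align;
 *             } u = { 0 };
 *             struct iovec iov = { .iov_base = (void *)"x", .iov_len = 1 };
 *             struct msghdr msg = {
 *                     .msg_iov = &iov, .msg_iovlen = 1,
 *                     .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *             };
 *             struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *             cmsg->cmsg_level = SOL_SOCKET;
 *             cmsg->cmsg_type = SCM_RIGHTS;
 *             cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *             memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *             return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
 *     }
 */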
1816
1817 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1818 {
1819         scm->fp = UNIXCB(skb).fp;
1820         UNIXCB(skb).fp = NULL;
1821
1822         unix_destroy_fpl(scm->fp);
1823 }
1824
1825 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1826 {
1827         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1828 }
1829
1830 static void unix_destruct_scm(struct sk_buff *skb)
1831 {
1832         struct scm_cookie scm;
1833
1834         memset(&scm, 0, sizeof(scm));
1835         scm.pid  = UNIXCB(skb).pid;
1836         if (UNIXCB(skb).fp)
1837                 unix_detach_fds(&scm, skb);
1838
1839         /* Alas, it calls VFS */
1840         /* So fscking what? fput() has been SMP-safe since last summer */
1841         scm_destroy(&scm);
1842         sock_wfree(skb);
1843 }
1844
1845 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1846 {
1847         int err = 0;
1848
1849         UNIXCB(skb).pid  = get_pid(scm->pid);
1850         UNIXCB(skb).uid = scm->creds.uid;
1851         UNIXCB(skb).gid = scm->creds.gid;
1852         UNIXCB(skb).fp = NULL;
1853         unix_get_secdata(scm, skb);
1854         if (scm->fp && send_fds)
1855                 err = unix_attach_fds(scm, skb);
1856
1857         skb->destructor = unix_destruct_scm;
1858         return err;
1859 }
1860
1861 static bool unix_passcred_enabled(const struct socket *sock,
1862                                   const struct sock *other)
1863 {
1864         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1865                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1866                !other->sk_socket ||
1867                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1868                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1869 }
1870
1871 /*
1872  * Some apps rely on write() giving SCM_CREDENTIALS.
1873  * We include credentials if the source or destination socket
1874  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1875  */
1876 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1877                             const struct sock *other)
1878 {
1879         if (UNIXCB(skb).pid)
1880                 return;
1881         if (unix_passcred_enabled(sock, other)) {
1882                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1883                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1884         }
1885 }
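
/* Editor's note: a minimal sketch (not part of this file) of the receive
 * side that makes maybe_add_creds() matter: enabling SO_PASSCRED on either
 * end attaches credentials, which arrive as a cmsg with cmsg_level ==
 * SOL_SOCKET and cmsg_type == SCM_CREDENTIALS holding a struct ucred
 * {pid, uid, gid}.
 *
 *     #include <sys/socket.h>
 *
 *     int example_enable_passcred(int sock)
 *     {
 *             int one = 1;
 *
 *             return setsockopt(sock, SOL_SOCKET, SO_PASSCRED,
 *                               &one, sizeof(one));
 *     }
 */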
1886
1887 static bool unix_skb_scm_eq(struct sk_buff *skb,
1888                             struct scm_cookie *scm)
1889 {
1890         return UNIXCB(skb).pid == scm->pid &&
1891                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1892                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1893                unix_secdata_eq(scm, skb);
1894 }
1895
1896 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1897 {
1898         struct scm_fp_list *fp = UNIXCB(skb).fp;
1899         struct unix_sock *u = unix_sk(sk);
1900
1901         if (unlikely(fp && fp->count)) {
1902                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1903                 unix_add_edges(fp, u);
1904         }
1905 }
1906
1907 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1908 {
1909         struct scm_fp_list *fp = UNIXCB(skb).fp;
1910         struct unix_sock *u = unix_sk(sk);
1911
1912         if (unlikely(fp && fp->count)) {
1913                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1914                 unix_del_edges(fp);
1915         }
1916 }
1917
1918 /*
1919  *      Send AF_UNIX data.
1920  */
1921
1922 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1923                               size_t len)
1924 {
1925         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1926         struct sock *sk = sock->sk, *other = NULL;
1927         struct unix_sock *u = unix_sk(sk);
1928         struct scm_cookie scm;
1929         struct sk_buff *skb;
1930         int data_len = 0;
1931         int sk_locked;
1932         long timeo;
1933         int err;
1934
1935         err = scm_send(sock, msg, &scm, false);
1936         if (err < 0)
1937                 return err;
1938
1939         wait_for_unix_gc(scm.fp);
1940
1941         err = -EOPNOTSUPP;
1942         if (msg->msg_flags&MSG_OOB)
1943                 goto out;
1944
1945         if (msg->msg_namelen) {
1946                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1947                 if (err)
1948                         goto out;
1949
1950                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1951                                                             msg->msg_name,
1952                                                             &msg->msg_namelen,
1953                                                             NULL);
1954                 if (err)
1955                         goto out;
1956         } else {
1957                 sunaddr = NULL;
1958                 err = -ENOTCONN;
1959                 other = unix_peer_get(sk);
1960                 if (!other)
1961                         goto out;
1962         }
1963
1964         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1965              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1966             !READ_ONCE(u->addr)) {
1967                 err = unix_autobind(sk);
1968                 if (err)
1969                         goto out;
1970         }
1971
1972         err = -EMSGSIZE;
1973         if (len > sk->sk_sndbuf - 32)
1974                 goto out;
1975
1976         if (len > SKB_MAX_ALLOC) {
1977                 data_len = min_t(size_t,
1978                                  len - SKB_MAX_ALLOC,
1979                                  MAX_SKB_FRAGS * PAGE_SIZE);
1980                 data_len = PAGE_ALIGN(data_len);
1981
1982                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1983         }
1984
1985         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1986                                    msg->msg_flags & MSG_DONTWAIT, &err,
1987                                    PAGE_ALLOC_COSTLY_ORDER);
1988         if (skb == NULL)
1989                 goto out;
1990
1991         err = unix_scm_to_skb(&scm, skb, true);
1992         if (err < 0)
1993                 goto out_free;
1994
1995         skb_put(skb, len - data_len);
1996         skb->data_len = data_len;
1997         skb->len = len;
1998         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1999         if (err)
2000                 goto out_free;
2001
2002         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2003
2004 restart:
2005         if (!other) {
2006                 err = -ECONNRESET;
2007                 if (sunaddr == NULL)
2008                         goto out_free;
2009
2010                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2011                                         sk->sk_type);
2012                 if (IS_ERR(other)) {
2013                         err = PTR_ERR(other);
2014                         other = NULL;
2015                         goto out_free;
2016                 }
2017         }
2018
2019         if (sk_filter(other, skb) < 0) {
2020                 /* Toss the packet but do not return any error to the sender */
2021                 err = len;
2022                 goto out_free;
2023         }
2024
2025         sk_locked = 0;
2026         unix_state_lock(other);
2027 restart_locked:
2028         err = -EPERM;
2029         if (!unix_may_send(sk, other))
2030                 goto out_unlock;
2031
2032         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2033                 /*
2034                  *      Check with 1003.1g - what error should
2035                  *      a datagram send return here?
2036                  */
2037                 unix_state_unlock(other);
2038                 sock_put(other);
2039
2040                 if (!sk_locked)
2041                         unix_state_lock(sk);
2042
2043                 err = 0;
2044                 if (sk->sk_type == SOCK_SEQPACKET) {
2045                         /* We are here only when racing with unix_release_sock(),
2046                          * which is clearing @other. Never change the state to
2047                          * TCP_CLOSE here, unlike SOCK_DGRAM.
2048                          */
2049                         unix_state_unlock(sk);
2050                         err = -EPIPE;
2051                 } else if (unix_peer(sk) == other) {
2052                         unix_peer(sk) = NULL;
2053                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2054
2055                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2056                         unix_state_unlock(sk);
2057
2058                         unix_dgram_disconnected(sk, other);
2059                         sock_put(other);
2060                         err = -ECONNREFUSED;
2061                 } else {
2062                         unix_state_unlock(sk);
2063                 }
2064
2065                 other = NULL;
2066                 if (err)
2067                         goto out_free;
2068                 goto restart;
2069         }
2070
2071         err = -EPIPE;
2072         if (other->sk_shutdown & RCV_SHUTDOWN)
2073                 goto out_unlock;
2074
2075         if (sk->sk_type != SOCK_SEQPACKET) {
2076                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2077                 if (err)
2078                         goto out_unlock;
2079         }
2080
2081         /* other == sk && unix_peer(other) != sk if
2082          * - unix_peer(sk) == NULL and the destination address is bound to sk
2083          * - unix_peer(sk) == sk at get time, but disconnected before the lock
2084          */
2085         if (other != sk &&
2086             unlikely(unix_peer(other) != sk &&
2087             unix_recvq_full_lockless(other))) {
2088                 if (timeo) {
2089                         timeo = unix_wait_for_peer(other, timeo);
2090
2091                         err = sock_intr_errno(timeo);
2092                         if (signal_pending(current))
2093                                 goto out_free;
2094
2095                         goto restart;
2096                 }
2097
2098                 if (!sk_locked) {
2099                         unix_state_unlock(other);
2100                         unix_state_double_lock(sk, other);
2101                 }
2102
2103                 if (unix_peer(sk) != other ||
2104                     unix_dgram_peer_wake_me(sk, other)) {
2105                         err = -EAGAIN;
2106                         sk_locked = 1;
2107                         goto out_unlock;
2108                 }
2109
2110                 if (!sk_locked) {
2111                         sk_locked = 1;
2112                         goto restart_locked;
2113                 }
2114         }
2115
2116         if (unlikely(sk_locked))
2117                 unix_state_unlock(sk);
2118
2119         if (sock_flag(other, SOCK_RCVTSTAMP))
2120                 __net_timestamp(skb);
2121         maybe_add_creds(skb, sock, other);
2122         scm_stat_add(other, skb);
2123         skb_queue_tail(&other->sk_receive_queue, skb);
2124         unix_state_unlock(other);
2125         other->sk_data_ready(other);
2126         sock_put(other);
2127         scm_destroy(&scm);
2128         return len;
2129
2130 out_unlock:
2131         if (sk_locked)
2132                 unix_state_unlock(sk);
2133         unix_state_unlock(other);
2134 out_free:
2135         kfree_skb(skb);
2136 out:
2137         if (other)
2138                 sock_put(other);
2139         scm_destroy(&scm);
2140         return err;
2141 }
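
/* Editor's note: a minimal userspace sketch (not part of this file) of the
 * datagram path above; the destination pathname is hypothetical. On a full
 * peer queue a blocking sender waits in unix_wait_for_peer(), while
 * MSG_DONTWAIT (or O_NONBLOCK) yields EAGAIN.
 *
 *     #include <string.h>
 *     #include <sys/socket.h>
 *     #include <sys/types.h>
 *     #include <sys/un.h>
 *
 *     ssize_t example_dgram_send(int fd, const void *p, size_t n)
 *     {
 *             struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *             strncpy(sun.sun_path, "/tmp/example.dgram",
 *                     sizeof(sun.sun_path) - 1);
 *             return sendto(fd, p, n, 0,
 *                           (struct sockaddr *)&sun, sizeof(sun));
 *     }
 */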
2142
2143 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2144  * bytes, with a minimum of a full page.
2145  */
2146 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
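
/* Editor's note: a worked instance of the macro above, assuming the common
 * 4 KiB PAGE_SIZE: get_order(32768) == 3, so UNIX_SKB_FRAGS_SZ == 4096 << 3
 * == 32768 bytes. With 64 KiB pages, get_order(32768) == 0 and the limit is
 * a single 64 KiB page, honouring the "minimum of a full page" rule.
 */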
2147
2148 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2149 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2150                      struct scm_cookie *scm, bool fds_sent)
2151 {
2152         struct unix_sock *ousk = unix_sk(other);
2153         struct sk_buff *skb;
2154         int err = 0;
2155
2156         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2157
2158         if (!skb)
2159                 return err;
2160
2161         err = unix_scm_to_skb(scm, skb, !fds_sent);
2162         if (err < 0) {
2163                 kfree_skb(skb);
2164                 return err;
2165         }
2166         skb_put(skb, 1);
2167         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2168
2169         if (err) {
2170                 kfree_skb(skb);
2171                 return err;
2172         }
2173
2174         unix_state_lock(other);
2175
2176         if (sock_flag(other, SOCK_DEAD) ||
2177             (other->sk_shutdown & RCV_SHUTDOWN)) {
2178                 unix_state_unlock(other);
2179                 kfree_skb(skb);
2180                 return -EPIPE;
2181         }
2182
2183         maybe_add_creds(skb, sock, other);
2184         skb_get(skb);
2185
2186         scm_stat_add(other, skb);
2187
2188         spin_lock(&other->sk_receive_queue.lock);
2189         if (ousk->oob_skb)
2190                 consume_skb(ousk->oob_skb);
2191         WRITE_ONCE(ousk->oob_skb, skb);
2192         __skb_queue_tail(&other->sk_receive_queue, skb);
2193         spin_unlock(&other->sk_receive_queue.lock);
2194
2195         sk_send_sigurg(other);
2196         unix_state_unlock(other);
2197         other->sk_data_ready(other);
2198
2199         return err;
2200 }
2201 #endif
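
/* Editor's note: a minimal sketch (not part of this file) of the MSG_OOB
 * flow handled by queue_oob() above. AF_UNIX tracks a single out-of-band
 * byte (ousk->oob_skb); sending another one moves the mark, and the code
 * above suggests the superseded byte is then read as ordinary data.
 *
 *     #include <sys/socket.h>
 *
 *     int example_oob(int a, int b)
 *     {
 *             char c;
 *
 *             if (send(a, "x", 1, MSG_OOB) != 1)
 *                     return -1;
 *             return recv(b, &c, 1, MSG_OOB) == 1 ? c : -1;
 *     }
 */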
2202
2203 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2204                                size_t len)
2205 {
2206         struct sock *sk = sock->sk;
2207         struct sock *other = NULL;
2208         int err, size;
2209         struct sk_buff *skb;
2210         int sent = 0;
2211         struct scm_cookie scm;
2212         bool fds_sent = false;
2213         int data_len;
2214
2215         err = scm_send(sock, msg, &scm, false);
2216         if (err < 0)
2217                 return err;
2218
2219         wait_for_unix_gc(scm.fp);
2220
2221         err = -EOPNOTSUPP;
2222         if (msg->msg_flags & MSG_OOB) {
2223 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2224                 if (len)
2225                         len--;
2226                 else
2227 #endif
2228                         goto out_err;
2229         }
2230
2231         if (msg->msg_namelen) {
2232                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2233                 goto out_err;
2234         } else {
2235                 err = -ENOTCONN;
2236                 other = unix_peer(sk);
2237                 if (!other)
2238                         goto out_err;
2239         }
2240
2241         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2242                 goto pipe_err;
2243
2244         while (sent < len) {
2245                 size = len - sent;
2246
2247                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2248                         skb = sock_alloc_send_pskb(sk, 0, 0,
2249                                                    msg->msg_flags & MSG_DONTWAIT,
2250                                                    &err, 0);
2251                 } else {
2252                         /* Keep two messages in the pipe so it schedules better */
2253                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2254
2255                         /* allow fallback to order-0 allocations */
2256                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2257
2258                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2259
2260                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2261
2262                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2263                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2264                                                    get_order(UNIX_SKB_FRAGS_SZ));
2265                 }
2266                 if (!skb)
2267                         goto out_err;
2268
2269                 /* Only send the fds in the first buffer */
2270                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2271                 if (err < 0) {
2272                         kfree_skb(skb);
2273                         goto out_err;
2274                 }
2275                 fds_sent = true;
2276
2277                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2278                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2279                                                    sk->sk_allocation);
2280                         if (err < 0) {
2281                                 kfree_skb(skb);
2282                                 goto out_err;
2283                         }
2284                         size = err;
2285                         refcount_add(size, &sk->sk_wmem_alloc);
2286                 } else {
2287                         skb_put(skb, size - data_len);
2288                         skb->data_len = data_len;
2289                         skb->len = size;
2290                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2291                         if (err) {
2292                                 kfree_skb(skb);
2293                                 goto out_err;
2294                         }
2295                 }
2296
2297                 unix_state_lock(other);
2298
2299                 if (sock_flag(other, SOCK_DEAD) ||
2300                     (other->sk_shutdown & RCV_SHUTDOWN))
2301                         goto pipe_err_free;
2302
2303                 maybe_add_creds(skb, sock, other);
2304                 scm_stat_add(other, skb);
2305                 skb_queue_tail(&other->sk_receive_queue, skb);
2306                 unix_state_unlock(other);
2307                 other->sk_data_ready(other);
2308                 sent += size;
2309         }
2310
2311 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2312         if (msg->msg_flags & MSG_OOB) {
2313                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2314                 if (err)
2315                         goto out_err;
2316                 sent++;
2317         }
2318 #endif
2319
2320         scm_destroy(&scm);
2321
2322         return sent;
2323
2324 pipe_err_free:
2325         unix_state_unlock(other);
2326         kfree_skb(skb);
2327 pipe_err:
2328         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2329                 send_sig(SIGPIPE, current, 0);
2330         err = -EPIPE;
2331 out_err:
2332         scm_destroy(&scm);
2333         return sent ? : err;
2334 }
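
/* Editor's note: a minimal sketch (not part of this file) of the pipe_err
 * behaviour above: writing to a peer that has shut down raises SIGPIPE and
 * fails with EPIPE unless the sender passes MSG_NOSIGNAL.
 *
 *     #include <errno.h>
 *     #include <sys/socket.h>
 *
 *     ssize_t example_send_nosig(int fd, const void *p, size_t n)
 *     {
 *             ssize_t ret = send(fd, p, n, MSG_NOSIGNAL);
 *
 *             if (ret < 0 && errno == EPIPE) {
 *                     // peer gone; no SIGPIPE was delivered
 *             }
 *             return ret;
 *     }
 */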
2335
2336 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2337                                   size_t len)
2338 {
2339         int err;
2340         struct sock *sk = sock->sk;
2341
2342         err = sock_error(sk);
2343         if (err)
2344                 return err;
2345
2346         if (sk->sk_state != TCP_ESTABLISHED)
2347                 return -ENOTCONN;
2348
2349         if (msg->msg_namelen)
2350                 msg->msg_namelen = 0;
2351
2352         return unix_dgram_sendmsg(sock, msg, len);
2353 }
2354
2355 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2356                                   size_t size, int flags)
2357 {
2358         struct sock *sk = sock->sk;
2359
2360         if (sk->sk_state != TCP_ESTABLISHED)
2361                 return -ENOTCONN;
2362
2363         return unix_dgram_recvmsg(sock, msg, size, flags);
2364 }
2365
2366 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2367 {
2368         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2369
2370         if (addr) {
2371                 msg->msg_namelen = addr->len;
2372                 memcpy(msg->msg_name, addr->name, addr->len);
2373         }
2374 }
2375
2376 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2377                          int flags)
2378 {
2379         struct scm_cookie scm;
2380         struct socket *sock = sk->sk_socket;
2381         struct unix_sock *u = unix_sk(sk);
2382         struct sk_buff *skb, *last;
2383         long timeo;
2384         int skip;
2385         int err;
2386
2387         err = -EOPNOTSUPP;
2388         if (flags&MSG_OOB)
2389                 goto out;
2390
2391         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2392
2393         do {
2394                 mutex_lock(&u->iolock);
2395
2396                 skip = sk_peek_offset(sk, flags);
2397                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2398                                               &skip, &err, &last);
2399                 if (skb) {
2400                         if (!(flags & MSG_PEEK))
2401                                 scm_stat_del(sk, skb);
2402                         break;
2403                 }
2404
2405                 mutex_unlock(&u->iolock);
2406
2407                 if (err != -EAGAIN)
2408                         break;
2409         } while (timeo &&
2410                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2411                                               &err, &timeo, last));
2412
2413         if (!skb) { /* implies iolock unlocked */
2414                 unix_state_lock(sk);
2415                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2416                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2417                     (sk->sk_shutdown & RCV_SHUTDOWN))
2418                         err = 0;
2419                 unix_state_unlock(sk);
2420                 goto out;
2421         }
2422
2423         if (wq_has_sleeper(&u->peer_wait))
2424                 wake_up_interruptible_sync_poll(&u->peer_wait,
2425                                                 EPOLLOUT | EPOLLWRNORM |
2426                                                 EPOLLWRBAND);
2427
2428         if (msg->msg_name) {
2429                 unix_copy_addr(msg, skb->sk);
2430
2431                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2432                                                       msg->msg_name,
2433                                                       &msg->msg_namelen);
2434         }
2435
2436         if (size > skb->len - skip)
2437                 size = skb->len - skip;
2438         else if (size < skb->len - skip)
2439                 msg->msg_flags |= MSG_TRUNC;
2440
2441         err = skb_copy_datagram_msg(skb, skip, msg, size);
2442         if (err)
2443                 goto out_free;
2444
2445         if (sock_flag(sk, SOCK_RCVTSTAMP))
2446                 __sock_recv_timestamp(msg, sk, skb);
2447
2448         memset(&scm, 0, sizeof(scm));
2449
2450         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2451         unix_set_secdata(&scm, skb);
2452
2453         if (!(flags & MSG_PEEK)) {
2454                 if (UNIXCB(skb).fp)
2455                         unix_detach_fds(&scm, skb);
2456
2457                 sk_peek_offset_bwd(sk, skb->len);
2458         } else {
2459                 /* It is questionable: on PEEK we could:
2460                    - not return fds - good, but too simple 8)
2461                    - return fds, and then not return them on read (old
2462                      strategy, apparently wrong)
2463                    - clone fds (chosen for now; it is the most universal
2464                      solution)
2465
2466                    POSIX 1003.1g does not actually define this clearly
2467                    at all. Then again, POSIX 1003.1g doesn't define a lot
2468                    of things clearly!
2469
2470                 */
2471
2472                 sk_peek_offset_fwd(sk, size);
2473
2474                 if (UNIXCB(skb).fp)
2475                         unix_peek_fds(&scm, skb);
2476         }
2477         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2478
2479         scm_recv_unix(sock, msg, &scm, flags);
2480
2481 out_free:
2482         skb_free_datagram(sk, skb);
2483         mutex_unlock(&u->iolock);
2484 out:
2485         return err;
2486 }
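
/* Editor's note: a minimal sketch (not part of this file) of the MSG_TRUNC
 * handling above: with MSG_TRUNC set, the return value is the full datagram
 * length even when the buffer is smaller, so combining it with MSG_PEEK
 * probes the size of the next datagram without consuming it.
 *
 *     #include <sys/socket.h>
 *
 *     ssize_t example_next_dgram_len(int fd)
 *     {
 *             char c;
 *
 *             return recv(fd, &c, 1, MSG_PEEK | MSG_TRUNC);
 *     }
 */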
2487
2488 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2489                               int flags)
2490 {
2491         struct sock *sk = sock->sk;
2492
2493 #ifdef CONFIG_BPF_SYSCALL
2494         const struct proto *prot = READ_ONCE(sk->sk_prot);
2495
2496         if (prot != &unix_dgram_proto)
2497                 return prot->recvmsg(sk, msg, size, flags, NULL);
2498 #endif
2499         return __unix_dgram_recvmsg(sk, msg, size, flags);
2500 }
2501
2502 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2503 {
2504         struct unix_sock *u = unix_sk(sk);
2505         struct sk_buff *skb;
2506         int err;
2507
2508         mutex_lock(&u->iolock);
2509         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2510         mutex_unlock(&u->iolock);
2511         if (!skb)
2512                 return err;
2513
2514         return recv_actor(sk, skb);
2515 }
2516
2517 /*
2518  *      Sleep until more data has arrived. But check for races.
2519  */
2520 static long unix_stream_data_wait(struct sock *sk, long timeo,
2521                                   struct sk_buff *last, unsigned int last_len,
2522                                   bool freezable)
2523 {
2524         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2525         struct sk_buff *tail;
2526         DEFINE_WAIT(wait);
2527
2528         unix_state_lock(sk);
2529
2530         for (;;) {
2531                 prepare_to_wait(sk_sleep(sk), &wait, state);
2532
2533                 tail = skb_peek_tail(&sk->sk_receive_queue);
2534                 if (tail != last ||
2535                     (tail && tail->len != last_len) ||
2536                     sk->sk_err ||
2537                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2538                     signal_pending(current) ||
2539                     !timeo)
2540                         break;
2541
2542                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2543                 unix_state_unlock(sk);
2544                 timeo = schedule_timeout(timeo);
2545                 unix_state_lock(sk);
2546
2547                 if (sock_flag(sk, SOCK_DEAD))
2548                         break;
2549
2550                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2551         }
2552
2553         finish_wait(sk_sleep(sk), &wait);
2554         unix_state_unlock(sk);
2555         return timeo;
2556 }
2557
2558 static unsigned int unix_skb_len(const struct sk_buff *skb)
2559 {
2560         return skb->len - UNIXCB(skb).consumed;
2561 }
2562
2563 struct unix_stream_read_state {
2564         int (*recv_actor)(struct sk_buff *, int, int,
2565                           struct unix_stream_read_state *);
2566         struct socket *socket;
2567         struct msghdr *msg;
2568         struct pipe_inode_info *pipe;
2569         size_t size;
2570         int flags;
2571         unsigned int splice_flags;
2572 };
2573
2574 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2575 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2576 {
2577         struct socket *sock = state->socket;
2578         struct sock *sk = sock->sk;
2579         struct unix_sock *u = unix_sk(sk);
2580         int chunk = 1;
2581         struct sk_buff *oob_skb;
2582
2583         mutex_lock(&u->iolock);
2584         unix_state_lock(sk);
2585         spin_lock(&sk->sk_receive_queue.lock);
2586
2587         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2588                 spin_unlock(&sk->sk_receive_queue.lock);
2589                 unix_state_unlock(sk);
2590                 mutex_unlock(&u->iolock);
2591                 return -EINVAL;
2592         }
2593
2594         oob_skb = u->oob_skb;
2595
2596         if (!(state->flags & MSG_PEEK))
2597                 WRITE_ONCE(u->oob_skb, NULL);
2598         else
2599                 skb_get(oob_skb);
2600
2601         spin_unlock(&sk->sk_receive_queue.lock);
2602         unix_state_unlock(sk);
2603
2604         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2605
2606         if (!(state->flags & MSG_PEEK))
2607                 UNIXCB(oob_skb).consumed += 1;
2608
2609         consume_skb(oob_skb);
2610
2611         mutex_unlock(&u->iolock);
2612
2613         if (chunk < 0)
2614                 return -EFAULT;
2615
2616         state->msg->msg_flags |= MSG_OOB;
2617         return 1;
2618 }
2619
2620 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2621                                   int flags, int copied)
2622 {
2623         struct unix_sock *u = unix_sk(sk);
2624
2625         if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2626                 skb_unlink(skb, &sk->sk_receive_queue);
2627                 consume_skb(skb);
2628                 skb = NULL;
2629         } else {
2630                 struct sk_buff *unlinked_skb = NULL;
2631
2632                 spin_lock(&sk->sk_receive_queue.lock);
2633
2634                 if (skb == u->oob_skb) {
2635                         if (copied) {
2636                                 skb = NULL;
2637                         } else if (sock_flag(sk, SOCK_URGINLINE)) {
2638                                 if (!(flags & MSG_PEEK)) {
2639                                         WRITE_ONCE(u->oob_skb, NULL);
2640                                         consume_skb(skb);
2641                                 }
2642                         } else if (flags & MSG_PEEK) {
2643                                 skb = NULL;
2644                         } else {
2645                                 __skb_unlink(skb, &sk->sk_receive_queue);
2646                                 WRITE_ONCE(u->oob_skb, NULL);
2647                                 unlinked_skb = skb;
2648                                 skb = skb_peek(&sk->sk_receive_queue);
2649                         }
2650                 }
2651
2652                 spin_unlock(&sk->sk_receive_queue.lock);
2653
2654                 if (unlinked_skb) {
2655                         WARN_ON_ONCE(skb_unref(unlinked_skb));
2656                         kfree_skb(unlinked_skb);
2657                 }
2658         }
2659         return skb;
2660 }
2661 #endif
2662
2663 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2664 {
2665         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2666                 return -ENOTCONN;
2667
2668         return unix_read_skb(sk, recv_actor);
2669 }
2670
2671 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2672                                     bool freezable)
2673 {
2674         struct scm_cookie scm;
2675         struct socket *sock = state->socket;
2676         struct sock *sk = sock->sk;
2677         struct unix_sock *u = unix_sk(sk);
2678         int copied = 0;
2679         int flags = state->flags;
2680         int noblock = flags & MSG_DONTWAIT;
2681         bool check_creds = false;
2682         int target;
2683         int err = 0;
2684         long timeo;
2685         int skip;
2686         size_t size = state->size;
2687         unsigned int last_len;
2688
2689         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2690                 err = -EINVAL;
2691                 goto out;
2692         }
2693
2694         if (unlikely(flags & MSG_OOB)) {
2695                 err = -EOPNOTSUPP;
2696 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2697                 err = unix_stream_recv_urg(state);
2698 #endif
2699                 goto out;
2700         }
2701
2702         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2703         timeo = sock_rcvtimeo(sk, noblock);
2704
2705         memset(&scm, 0, sizeof(scm));
2706
2707         /* Lock the socket to prevent queue disordering
2708          * while we sleep in memcpy_to_msg()
2709          */
2710         mutex_lock(&u->iolock);
2711
2712         skip = max(sk_peek_offset(sk, flags), 0);
2713
2714         do {
2715                 int chunk;
2716                 bool drop_skb;
2717                 struct sk_buff *skb, *last;
2718
2719 redo:
2720                 unix_state_lock(sk);
2721                 if (sock_flag(sk, SOCK_DEAD)) {
2722                         err = -ECONNRESET;
2723                         goto unlock;
2724                 }
2725                 last = skb = skb_peek(&sk->sk_receive_queue);
2726                 last_len = last ? last->len : 0;
2727
2728 again:
2729 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2730                 if (skb) {
2731                         skb = manage_oob(skb, sk, flags, copied);
2732                         if (!skb && copied) {
2733                                 unix_state_unlock(sk);
2734                                 break;
2735                         }
2736                 }
2737 #endif
2738                 if (skb == NULL) {
2739                         if (copied >= target)
2740                                 goto unlock;
2741
2742                         /*
2743                          *      POSIX 1003.1g mandates this order.
2744                          */
2745
2746                         err = sock_error(sk);
2747                         if (err)
2748                                 goto unlock;
2749                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2750                                 goto unlock;
2751
2752                         unix_state_unlock(sk);
2753                         if (!timeo) {
2754                                 err = -EAGAIN;
2755                                 break;
2756                         }
2757
2758                         mutex_unlock(&u->iolock);
2759
2760                         timeo = unix_stream_data_wait(sk, timeo, last,
2761                                                       last_len, freezable);
2762
2763                         if (signal_pending(current)) {
2764                                 err = sock_intr_errno(timeo);
2765                                 scm_destroy(&scm);
2766                                 goto out;
2767                         }
2768
2769                         mutex_lock(&u->iolock);
2770                         goto redo;
2771 unlock:
2772                         unix_state_unlock(sk);
2773                         break;
2774                 }
2775
2776                 while (skip >= unix_skb_len(skb)) {
2777                         skip -= unix_skb_len(skb);
2778                         last = skb;
2779                         last_len = skb->len;
2780                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2781                         if (!skb)
2782                                 goto again;
2783                 }
2784
2785                 unix_state_unlock(sk);
2786
2787                 if (check_creds) {
2788                         /* Never glue messages from different writers */
2789                         if (!unix_skb_scm_eq(skb, &scm))
2790                                 break;
2791                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2792                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2793                         /* Copy credentials */
2794                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2795                         unix_set_secdata(&scm, skb);
2796                         check_creds = true;
2797                 }
2798
2799                 /* Copy address just once */
2800                 if (state->msg && state->msg->msg_name) {
2801                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2802                                          state->msg->msg_name);
2803                         unix_copy_addr(state->msg, skb->sk);
2804
2805                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2806                                                               state->msg->msg_name,
2807                                                               &state->msg->msg_namelen);
2808
2809                         sunaddr = NULL;
2810                 }
2811
2812                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2813                 skb_get(skb);
2814                 chunk = state->recv_actor(skb, skip, chunk, state);
2815                 drop_skb = !unix_skb_len(skb);
2816                 /* skb is only safe to use if !drop_skb */
2817                 consume_skb(skb);
2818                 if (chunk < 0) {
2819                         if (copied == 0)
2820                                 copied = -EFAULT;
2821                         break;
2822                 }
2823                 copied += chunk;
2824                 size -= chunk;
2825
2826                 if (drop_skb) {
2827                         /* the skb was touched by a concurrent reader;
2828                          * we should not expect anything from this skb
2829                          * anymore and must assume it is invalid - we can
2830                          * be sure it was dropped from the socket queue
2831                          *
2832                          * let's report a short read
2833                          */
2834                         err = 0;
2835                         break;
2836                 }
2837
2838                 /* Mark read part of skb as used */
2839                 if (!(flags & MSG_PEEK)) {
2840                         UNIXCB(skb).consumed += chunk;
2841
2842                         sk_peek_offset_bwd(sk, chunk);
2843
2844                         if (UNIXCB(skb).fp) {
2845                                 scm_stat_del(sk, skb);
2846                                 unix_detach_fds(&scm, skb);
2847                         }
2848
2849                         if (unix_skb_len(skb))
2850                                 break;
2851
2852                         skb_unlink(skb, &sk->sk_receive_queue);
2853                         consume_skb(skb);
2854
2855                         if (scm.fp)
2856                                 break;
2857                 } else {
2858                         /* It is questionable, see note in unix_dgram_recvmsg.
2859                          */
2860                         if (UNIXCB(skb).fp)
2861                                 unix_peek_fds(&scm, skb);
2862
2863                         sk_peek_offset_fwd(sk, chunk);
2864
2865                         if (UNIXCB(skb).fp)
2866                                 break;
2867
2868                         skip = 0;
2869                         last = skb;
2870                         last_len = skb->len;
2871                         unix_state_lock(sk);
2872                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2873                         if (skb)
2874                                 goto again;
2875                         unix_state_unlock(sk);
2876                         break;
2877                 }
2878         } while (size);
2879
2880         mutex_unlock(&u->iolock);
2881         if (state->msg)
2882                 scm_recv_unix(sock, state->msg, &scm, flags);
2883         else
2884                 scm_destroy(&scm);
2885 out:
2886         return copied ? : err;
2887 }
2888
2889 static int unix_stream_read_actor(struct sk_buff *skb,
2890                                   int skip, int chunk,
2891                                   struct unix_stream_read_state *state)
2892 {
2893         int ret;
2894
2895         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2896                                     state->msg, chunk);
2897         return ret ?: chunk;
2898 }
2899
2900 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2901                           size_t size, int flags)
2902 {
2903         struct unix_stream_read_state state = {
2904                 .recv_actor = unix_stream_read_actor,
2905                 .socket = sk->sk_socket,
2906                 .msg = msg,
2907                 .size = size,
2908                 .flags = flags
2909         };
2910
2911         return unix_stream_read_generic(&state, true);
2912 }
2913
2914 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2915                                size_t size, int flags)
2916 {
2917         struct unix_stream_read_state state = {
2918                 .recv_actor = unix_stream_read_actor,
2919                 .socket = sock,
2920                 .msg = msg,
2921                 .size = size,
2922                 .flags = flags
2923         };
2924
2925 #ifdef CONFIG_BPF_SYSCALL
2926         struct sock *sk = sock->sk;
2927         const struct proto *prot = READ_ONCE(sk->sk_prot);
2928
2929         if (prot != &unix_stream_proto)
2930                 return prot->recvmsg(sk, msg, size, flags, NULL);
2931 #endif
2932         return unix_stream_read_generic(&state, true);
2933 }
2934
2935 static int unix_stream_splice_actor(struct sk_buff *skb,
2936                                     int skip, int chunk,
2937                                     struct unix_stream_read_state *state)
2938 {
2939         return skb_splice_bits(skb, state->socket->sk,
2940                                UNIXCB(skb).consumed + skip,
2941                                state->pipe, chunk, state->splice_flags);
2942 }
2943
2944 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2945                                        struct pipe_inode_info *pipe,
2946                                        size_t size, unsigned int flags)
2947 {
2948         struct unix_stream_read_state state = {
2949                 .recv_actor = unix_stream_splice_actor,
2950                 .socket = sock,
2951                 .pipe = pipe,
2952                 .size = size,
2953                 .splice_flags = flags,
2954         };
2955
2956         if (unlikely(*ppos))
2957                 return -ESPIPE;
2958
2959         if (sock->file->f_flags & O_NONBLOCK ||
2960             flags & SPLICE_F_NONBLOCK)
2961                 state.flags = MSG_DONTWAIT;
2962
2963         return unix_stream_read_generic(&state, false);
2964 }
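
/* Editor's note: a minimal sketch (not part of this file) of the splice
 * path above, moving stream data into a pipe without a userspace copy;
 * SPLICE_F_NONBLOCK corresponds to the MSG_DONTWAIT handling above.
 *
 *     #define _GNU_SOURCE
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *
 *     ssize_t example_splice(int sock, int pipe_wr, size_t n)
 *     {
 *             return splice(sock, NULL, pipe_wr, NULL, n,
 *                           SPLICE_F_NONBLOCK);
 *     }
 */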
2965
2966 static int unix_shutdown(struct socket *sock, int mode)
2967 {
2968         struct sock *sk = sock->sk;
2969         struct sock *other;
2970
2971         if (mode < SHUT_RD || mode > SHUT_RDWR)
2972                 return -EINVAL;
2973         /* This maps:
2974          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2975          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2976          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2977          */
2978         ++mode;
2979
2980         unix_state_lock(sk);
2981         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2982         other = unix_peer(sk);
2983         if (other)
2984                 sock_hold(other);
2985         unix_state_unlock(sk);
2986         sk->sk_state_change(sk);
2987
2988         if (other &&
2989                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2990
2991                 int peer_mode = 0;
2992                 const struct proto *prot = READ_ONCE(other->sk_prot);
2993
2994                 if (prot->unhash)
2995                         prot->unhash(other);
2996                 if (mode&RCV_SHUTDOWN)
2997                         peer_mode |= SEND_SHUTDOWN;
2998                 if (mode&SEND_SHUTDOWN)
2999                         peer_mode |= RCV_SHUTDOWN;
3000                 unix_state_lock(other);
3001                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3002                 unix_state_unlock(other);
3003                 other->sk_state_change(other);
3004                 if (peer_mode == SHUTDOWN_MASK)
3005                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3006                 else if (peer_mode & RCV_SHUTDOWN)
3007                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3008         }
3009         if (other)
3010                 sock_put(other);
3011
3012         return 0;
3013 }
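
/* Editor's note: a minimal sketch (not part of this file) of the mapping
 * above: SHUT_WR on one end sets SEND_SHUTDOWN locally and RCV_SHUTDOWN on
 * the peer, so the peer's reads return 0 (EOF) once the queue drains.
 *
 *     #include <sys/socket.h>
 *     #include <unistd.h>
 *
 *     void example_half_close(int a, int b)
 *     {
 *             char c;
 *
 *             shutdown(a, SHUT_WR);
 *             while (read(b, &c, 1) > 0)      // drains, then returns 0
 *                     ;
 *     }
 */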
3014
3015 long unix_inq_len(struct sock *sk)
3016 {
3017         struct sk_buff *skb;
3018         long amount = 0;
3019
3020         if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3021                 return -EINVAL;
3022
3023         spin_lock(&sk->sk_receive_queue.lock);
3024         if (sk->sk_type == SOCK_STREAM ||
3025             sk->sk_type == SOCK_SEQPACKET) {
3026                 skb_queue_walk(&sk->sk_receive_queue, skb)
3027                         amount += unix_skb_len(skb);
3028         } else {
3029                 skb = skb_peek(&sk->sk_receive_queue);
3030                 if (skb)
3031                         amount = skb->len;
3032         }
3033         spin_unlock(&sk->sk_receive_queue.lock);
3034
3035         return amount;
3036 }
3037 EXPORT_SYMBOL_GPL(unix_inq_len);
3038
3039 long unix_outq_len(struct sock *sk)
3040 {
3041         return sk_wmem_alloc_get(sk);
3042 }
3043 EXPORT_SYMBOL_GPL(unix_outq_len);
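
/* Editor's note: a minimal sketch (not part of this file) of the userspace
 * view of unix_inq_len()/unix_outq_len() via the SIOCINQ/SIOCOUTQ ioctls
 * handled below; per the check above, SIOCINQ on a listening socket fails
 * with EINVAL.
 *
 *     #include <linux/sockios.h>
 *     #include <sys/ioctl.h>
 *
 *     int example_queued_bytes(int fd)
 *     {
 *             int n;
 *
 *             return ioctl(fd, SIOCINQ, &n) < 0 ? -1 : n;
 *     }
 */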
3044
3045 static int unix_open_file(struct sock *sk)
3046 {
3047         struct path path;
3048         struct file *f;
3049         int fd;
3050
3051         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3052                 return -EPERM;
3053
3054         if (!smp_load_acquire(&unix_sk(sk)->addr))
3055                 return -ENOENT;
3056
3057         path = unix_sk(sk)->path;
3058         if (!path.dentry)
3059                 return -ENOENT;
3060
3061         path_get(&path);
3062
3063         fd = get_unused_fd_flags(O_CLOEXEC);
3064         if (fd < 0)
3065                 goto out;
3066
3067         f = dentry_open(&path, O_PATH, current_cred());
3068         if (IS_ERR(f)) {
3069                 put_unused_fd(fd);
3070                 fd = PTR_ERR(f);
3071                 goto out;
3072         }
3073
3074         fd_install(fd, f);
3075 out:
3076         path_put(&path);
3077
3078         return fd;
3079 }
3080
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif

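/*
 * Poll callback for stream and seqpacket sockets. The mask is assembled
 * from lockless snapshots (READ_ONCE(), the *_lockless queue helpers):
 * poll runs concurrently with the datapath, and a momentarily stale
 * answer costs at most a spurious wakeup or one extra poll() pass.
 */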
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}

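/*
 * Datagram poll differs from unix_poll() mainly in writability: a
 * connected datagram socket is only writable while the peer's receive
 * queue has room. When the queue is full, unix_dgram_peer_wake_me()
 * hooks @sk onto the peer's wakeup list so EPOLLOUT can be signalled
 * once the peer reads, and this pass reports the socket as not writable.
 */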
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
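
/* The seq_file position packs a (bucket, offset) pair into one loff_t:
 * the bucket index lives in the high bits and a 1-based in-bucket offset
 * in the low BUCKET_SPACE bits. For example, set_bucket_offset(2, 1)
 * addresses the first socket of bucket 2; get_bucket()/get_offset()
 * split the cookie back apart.
 */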

static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}

static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}

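/*
 * One line of /proc/net/unix per socket. A typical row (with made-up
 * values) looks like:
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01  4335 /run/foo
 *
 * i.e. socket pointer (subject to %pK restrictions), refcount, protocol
 * (always 0), flags (__SO_ACCEPTCON for listeners), type, state and
 * inode, then the bound path, '@' standing in for NULs in abstract names.
 */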
static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);

		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;

			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start	= unix_seq_start,
	.next	= unix_seq_next,
	.stop	= unix_seq_stop,
	.show	= unix_seq_show,
};

#ifdef CONFIG_BPF_SYSCALL
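/*
 * The bpf iterator shows each socket under lock_sock(), which may sleep,
 * so it cannot walk a bucket under the bucket spinlock the /proc iterator
 * uses. Instead bpf_iter_unix_hold_batch() grabs a reference to every
 * socket in the bucket, drops the lock, and the batch is walked lockless;
 * st_bucket_done records that the whole bucket was captured so a
 * stop()/start() cycle resumes at the next bucket instead of replaying
 * this one.
 */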
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};

static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}

static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}

static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}

static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always continues from
	 * where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}

static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the sock at iter->cur_sk has
	 * already been shown by seq_show(), so release it and advance
	 * to the next sk in the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}

static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}

static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}

static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif
#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

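/*
 * Per-netns setup: each network namespace gets its own sysctl table,
 * its own /proc/net/unix and a private UNIX_HASH_SIZE-bucket socket
 * table with per-bucket locks. The error path unwinds in reverse order.
 */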
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}

static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16

static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};

static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}

static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};

static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif
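
/*
 * Once the "unix" target is registered, a loaded iterator program can be
 * pinned and read like a file, e.g. (illustrative object/path names):
 *
 *	# bpftool iter pin ./unix_iter.bpf.o /sys/fs/bpf/unix_iter
 *	# cat /sys/fs/bpf/unix_iter
 */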

static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}

/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);