net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      has been reached. This won't break
  33  *                                      old apps and it will avoid a huge amount
  34  *                                      of socks hashed (this for unix_gc()
  35  *                                      performance reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119 #include <linux/bpf-cgroup.h>
 120
 121 #include "scm.h"
 122
/* Counter of unix sockets; unix_sock_destructor() decrements it. */
static atomic_long_t unix_nr_socks;
/* Chains of sockets linked through sk_bind_node, looked up by inode via
 * unix_bsd_hash() in unix_find_socket_byinode().
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* bsd_socket_locks[i] is taken around every access to bsd_socket_buckets[i]. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 126
 127 /* SMP locking strategy:
 128  *    hash table is protected with spinlock.
 129  *    each socket state is protected by separate spinlock.
 130  */
 131
 132 static unsigned int unix_unbound_hash(struct sock *sk)
 133 {
 134         unsigned long hash = (unsigned long)sk;
 135
 136         hash ^= hash >> 16;
 137         hash ^= hash >> 8;
 138         hash ^= sk->sk_type;
 139
 140         return hash & UNIX_HASH_MOD;
 141 }
 142
 143 static unsigned int unix_bsd_hash(struct inode *i)
 144 {
 145         return i->i_ino & UNIX_HASH_MOD;
 146 }
 147
 148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 149                                        int addr_len, int type)
 150 {
 151         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 152         unsigned int hash;
 153
 154         hash = (__force unsigned int)csum_fold(csum);
 155         hash ^= hash >> 8;
 156         hash ^= type;
 157
 158         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 159 }
 160
 161 static void unix_table_double_lock(struct net *net,
 162                                    unsigned int hash1, unsigned int hash2)
 163 {
 164         if (hash1 == hash2) {
 165                 spin_lock(&net->unx.table.locks[hash1]);
 166                 return;
 167         }
 168
 169         if (hash1 > hash2)
 170                 swap(hash1, hash2);
 171
 172         spin_lock(&net->unx.table.locks[hash1]);
 173         spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
 174 }
 175
 176 static void unix_table_double_unlock(struct net *net,
 177                                      unsigned int hash1, unsigned int hash2)
 178 {
 179         if (hash1 == hash2) {
 180                 spin_unlock(&net->unx.table.locks[hash1]);
 181                 return;
 182         }
 183
 184         spin_unlock(&net->unx.table.locks[hash1]);
 185         spin_unlock(&net->unx.table.locks[hash2]);
 186 }
 187
 188 #ifdef CONFIG_SECURITY_NETWORK
 189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 190 {
 191         UNIXCB(skb).secid = scm->secid;
 192 }
 193
 194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 195 {
 196         scm->secid = UNIXCB(skb).secid;
 197 }
 198
 199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 200 {
 201         return (scm->secid == UNIXCB(skb).secid);
 202 }
 203 #else
 204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 205 { }
 206
 207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 208 { }
 209
 210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 211 {
 212         return true;
 213 }
 214 #endif /* CONFIG_SECURITY_NETWORK */
 215
 216 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 217 {
 218         return unix_peer(osk) == sk;
 219 }
 220
 221 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 222 {
 223         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 224 }
 225
 226 static inline int unix_recvq_full(const struct sock *sk)
 227 {
 228         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 229 }
 230
 231 static inline int unix_recvq_full_lockless(const struct sock *sk)
 232 {
 233         return skb_queue_len_lockless(&sk->sk_receive_queue) >
 234                 READ_ONCE(sk->sk_max_ack_backlog);
 235 }
 236
 237 struct sock *unix_peer_get(struct sock *s)
 238 {
 239         struct sock *peer;
 240
 241         unix_state_lock(s);
 242         peer = unix_peer(s);
 243         if (peer)
 244                 sock_hold(peer);
 245         unix_state_unlock(s);
 246         return peer;
 247 }
 248 EXPORT_SYMBOL_GPL(unix_peer_get);
 249
 250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 251                                              int addr_len)
 252 {
 253         struct unix_address *addr;
 254
 255         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 256         if (!addr)
 257                 return NULL;
 258
 259         refcount_set(&addr->refcnt, 1);
 260         addr->len = addr_len;
 261         memcpy(addr->name, sunaddr, addr_len);
 262
 263         return addr;
 264 }
 265
 266 static inline void unix_release_addr(struct unix_address *addr)
 267 {
 268         if (refcount_dec_and_test(&addr->refcnt))
 269                 kfree(addr);
 270 }
 271
 272 /*
 273  *      Check unix socket name:
 274  *              - should be not zero length.
 275  *              - if started by not zero, should be NULL terminated (FS object)
 276  *              - if started by zero, it is abstract name.
 277  */
 278
 279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 280 {
 281         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 282             addr_len > sizeof(*sunaddr))
 283                 return -EINVAL;
 284
 285         if (sunaddr->sun_family != AF_UNIX)
 286                 return -EINVAL;
 287
 288         return 0;
 289 }
 290
 291 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 292 {
 293         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 294         short offset = offsetof(struct sockaddr_storage, __data);
 295
 296         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 297
 298         /* This may look like an off by one error but it is a bit more
 299          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 300          * sun_path[108] doesn't as such exist.  However in kernel space
 301          * we are guaranteed that it is a valid memory location in our
 302          * kernel address buffer because syscall functions always pass
 303          * a pointer of struct sockaddr_storage which has a bigger buffer
 304          * than 108.  Also, we must terminate sun_path for strlen() in
 305          * getname_kernel().
 306          */
 307         addr->__data[addr_len - offset] = 0;
 308
 309         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 310          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 311          * know the actual buffer.
 312          */
 313         return strlen(addr->__data) + offset + 1;
 314 }
 315
 316 static void __unix_remove_socket(struct sock *sk)
 317 {
 318         sk_del_node_init(sk);
 319 }
 320
 321 static void __unix_insert_socket(struct net *net, struct sock *sk)
 322 {
 323         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 324         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 325 }
 326
 327 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 328                                  struct unix_address *addr, unsigned int hash)
 329 {
 330         __unix_remove_socket(sk);
 331         smp_store_release(&unix_sk(sk)->addr, addr);
 332
 333         sk->sk_hash = hash;
 334         __unix_insert_socket(net, sk);
 335 }
 336
 337 static void unix_remove_socket(struct net *net, struct sock *sk)
 338 {
 339         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 340         __unix_remove_socket(sk);
 341         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 342 }
 343
 344 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 345 {
 346         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 347         __unix_insert_socket(net, sk);
 348         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 349 }
 350
 351 static void unix_insert_bsd_socket(struct sock *sk)
 352 {
 353         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 354         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 355         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 356 }
 357
 358 static void unix_remove_bsd_socket(struct sock *sk)
 359 {
 360         if (!hlist_unhashed(&sk->sk_bind_node)) {
 361                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 362                 __sk_del_bind_node(sk);
 363                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 364
 365                 sk_node_init(&sk->sk_bind_node);
 366         }
 367 }
 368
 369 static struct sock *__unix_find_socket_byname(struct net *net,
 370                                               struct sockaddr_un *sunname,
 371                                               int len, unsigned int hash)
 372 {
 373         struct sock *s;
 374
 375         sk_for_each(s, &net->unx.table.buckets[hash]) {
 376                 struct unix_sock *u = unix_sk(s);
 377
 378                 if (u->addr->len == len &&
 379                     !memcmp(u->addr->name, sunname, len))
 380                         return s;
 381         }
 382         return NULL;
 383 }
 384
 385 static inline struct sock *unix_find_socket_byname(struct net *net,
 386                                                    struct sockaddr_un *sunname,
 387                                                    int len, unsigned int hash)
 388 {
 389         struct sock *s;
 390
 391         spin_lock(&net->unx.table.locks[hash]);
 392         s = __unix_find_socket_byname(net, sunname, len, hash);
 393         if (s)
 394                 sock_hold(s);
 395         spin_unlock(&net->unx.table.locks[hash]);
 396         return s;
 397 }
 398
 399 static struct sock *unix_find_socket_byinode(struct inode *i)
 400 {
 401         unsigned int hash = unix_bsd_hash(i);
 402         struct sock *s;
 403
 404         spin_lock(&bsd_socket_locks[hash]);
 405         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 406                 struct dentry *dentry = unix_sk(s)->path.dentry;
 407
 408                 if (dentry && d_backing_inode(dentry) == i) {
 409                         sock_hold(s);
 410                         spin_unlock(&bsd_socket_locks[hash]);
 411                         return s;
 412                 }
 413         }
 414         spin_unlock(&bsd_socket_locks[hash]);
 415         return NULL;
 416 }
 417
 418 /* Support code for asymmetrically connected dgram sockets
 419  *
 420  * If a datagram socket is connected to a socket not itself connected
 421  * to the first socket (eg, /dev/log), clients may only enqueue more
 422  * messages if the present receive queue of the server socket is not
 423  * "too large". This means there's a second writeability condition
 424  * poll and sendmsg need to test. The dgram recv code will do a wake
 425  * up on the peer_wait wait queue of a socket upon reception of a
 426  * datagram which needs to be propagated to sleeping would-be writers
 427  * since these might not have sent anything so far. This can't be
 428  * accomplished via poll_wait because the lifetime of the server
 429  * socket might be less than that of its clients if these break their
 430  * association with it or if the server socket is closed while clients
 431  * are still connected to it and there's no way to inform "a polling
 432  * implementation" that it should let go of a certain wait queue
 433  *
 434  * In order to propagate a wake up, a wait_queue_entry_t of the client
 435  * socket is enqueued on the peer_wait queue of the server socket
 436  * whose wake function does a wake_up on the ordinary client socket
 437  * wait queue. This connection is established whenever a write (or
 438  * poll for write) hit the flow control condition and broken when the
 439  * association to the server socket is dissolved or after a wake up
 440  * was relayed.
 441  */
 442
 443 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 444                                       void *key)
 445 {
 446         struct unix_sock *u;
 447         wait_queue_head_t *u_sleep;
 448
 449         u = container_of(q, struct unix_sock, peer_wake);
 450
 451         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 452                             q);
 453         u->peer_wake.private = NULL;
 454
 455         /* relaying can only happen while the wq still exists */
 456         u_sleep = sk_sleep(&u->sk);
 457         if (u_sleep)
 458                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 459
 460         return 0;
 461 }
 462
 463 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 464 {
 465         struct unix_sock *u, *u_other;
 466         int rc;
 467
 468         u = unix_sk(sk);
 469         u_other = unix_sk(other);
 470         rc = 0;
 471         spin_lock(&u_other->peer_wait.lock);
 472
 473         if (!u->peer_wake.private) {
 474                 u->peer_wake.private = other;
 475                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 476
 477                 rc = 1;
 478         }
 479
 480         spin_unlock(&u_other->peer_wait.lock);
 481         return rc;
 482 }
 483
 484 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 485                                             struct sock *other)
 486 {
 487         struct unix_sock *u, *u_other;
 488
 489         u = unix_sk(sk);
 490         u_other = unix_sk(other);
 491         spin_lock(&u_other->peer_wait.lock);
 492
 493         if (u->peer_wake.private == other) {
 494                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 495                 u->peer_wake.private = NULL;
 496         }
 497
 498         spin_unlock(&u_other->peer_wait.lock);
 499 }
 500
 501 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 502                                                    struct sock *other)
 503 {
 504         unix_dgram_peer_wake_disconnect(sk, other);
 505         wake_up_interruptible_poll(sk_sleep(sk),
 506                                    EPOLLOUT |
 507                                    EPOLLWRNORM |
 508                                    EPOLLWRBAND);
 509 }
 510
 511 /* preconditions:
 512  *      - unix_peer(sk) == other
 513  *      - association is stable
 514  */
 515 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 516 {
 517         int connected;
 518
 519         connected = unix_dgram_peer_wake_connect(sk, other);
 520
 521         /* If other is SOCK_DEAD, we want to make sure we signal
 522          * POLLOUT, such that a subsequent write() can get a
 523          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 524          * to other and its full, we will hang waiting for POLLOUT.
 525          */
 526         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 527                 return 1;
 528
 529         if (connected)
 530                 unix_dgram_peer_wake_disconnect(sk, other);
 531
 532         return 0;
 533 }
 534
 535 static int unix_writable(const struct sock *sk)
 536 {
 537         return sk->sk_state != TCP_LISTEN &&
 538                (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 539 }
 540
 541 static void unix_write_space(struct sock *sk)
 542 {
 543         struct socket_wq *wq;
 544
 545         rcu_read_lock();
 546         if (unix_writable(sk)) {
 547                 wq = rcu_dereference(sk->sk_wq);
 548                 if (skwq_has_sleeper(wq))
 549                         wake_up_interruptible_sync_poll(&wq->wait,
 550                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 551                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 552         }
 553         rcu_read_unlock();
 554 }
 555
 556 /* When dgram socket disconnects (or changes its peer), we clear its receive
 557  * queue of packets arrived from previous peer. First, it allows to do
 558  * flow control based only on wmem_alloc; second, sk connected to peer
 559  * may receive messages only from that peer. */
 560 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 561 {
 562         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 563                 skb_queue_purge(&sk->sk_receive_queue);
 564                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 565
 566                 /* If one link of bidirectional dgram pipe is disconnected,
 567                  * we signal error. Messages are lost. Do not make this,
 568                  * when peer was not connected to us.
 569                  */
 570                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 571                         WRITE_ONCE(other->sk_err, ECONNRESET);
 572                         sk_error_report(other);
 573                 }
 574         }
 575         other->sk_state = TCP_CLOSE;
 576 }
 577
 578 static void unix_sock_destructor(struct sock *sk)
 579 {
 580         struct unix_sock *u = unix_sk(sk);
 581
 582         skb_queue_purge(&sk->sk_receive_queue);
 583
 584         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 585         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 586         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 587         if (!sock_flag(sk, SOCK_DEAD)) {
 588                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 589                 return;
 590         }
 591
 592         if (u->addr)
 593                 unix_release_addr(u->addr);
 594
 595         atomic_long_dec(&unix_nr_socks);
 596         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 597 #ifdef UNIX_REFCNT_DEBUG
 598         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 599                 atomic_long_read(&unix_nr_socks));
 600 #endif
 601 }
 602
 603 static void unix_release_sock(struct sock *sk, int embrion)
 604 {
 605         struct unix_sock *u = unix_sk(sk);
 606         struct sock *skpair;
 607         struct sk_buff *skb;
 608         struct path path;
 609         int state;
 610
 611         unix_remove_socket(sock_net(sk), sk);
 612         unix_remove_bsd_socket(sk);
 613
 614         /* Clear state */
 615         unix_state_lock(sk);
 616         sock_orphan(sk);
 617         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 618         path         = u->path;
 619         u->path.dentry = NULL;
 620         u->path.mnt = NULL;
 621         state = sk->sk_state;
 622         sk->sk_state = TCP_CLOSE;
 623
 624         skpair = unix_peer(sk);
 625         unix_peer(sk) = NULL;
 626
 627         unix_state_unlock(sk);
 628
 629 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 630         if (u->oob_skb) {
 631                 kfree_skb(u->oob_skb);
 632                 u->oob_skb = NULL;
 633         }
 634 #endif
 635
 636         wake_up_interruptible_all(&u->peer_wait);
 637
 638         if (skpair != NULL) {
 639                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 640                         unix_state_lock(skpair);
 641                         /* No more writes */
 642                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 643                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 644                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 645                         unix_state_unlock(skpair);
 646                         skpair->sk_state_change(skpair);
 647                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 648                 }
 649
 650                 unix_dgram_peer_wake_disconnect(sk, skpair);
 651                 sock_put(skpair); /* It may now die */
 652         }
 653
 654         /* Try to flush out this socket. Throw out buffers at least */
 655
 656         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 657                 if (state == TCP_LISTEN)
 658                         unix_release_sock(skb->sk, 1);
 659                 /* passed fds are erased in the kfree_skb hook        */
 660                 UNIXCB(skb).consumed = skb->len;
 661                 kfree_skb(skb);
 662         }
 663
 664         if (path.dentry)
 665                 path_put(&path);
 666
 667         sock_put(sk);
 668
 669         /* ---- Socket is dead now and most probably destroyed ---- */
 670
 671         /*
 672          * Fixme: BSD difference: In BSD all sockets connected to us get
 673          *        ECONNRESET and we die on the spot. In Linux we behave
 674          *        like files and pipes do and wait for the last
 675          *        dereference.
 676          *
 677          * Can't we simply set sock->err?
 678          *
 679          *        What the above comment does talk about? --ANK(980817)
 680          */
 681
 682         if (READ_ONCE(unix_tot_inflight))
 683                 unix_gc();              /* Garbage collect fds */
 684 }
 685
 686 static void init_peercred(struct sock *sk)
 687 {
 688         const struct cred *old_cred;
 689         struct pid *old_pid;
 690
 691         spin_lock(&sk->sk_peer_lock);
 692         old_pid = sk->sk_peer_pid;
 693         old_cred = sk->sk_peer_cred;
 694         sk->sk_peer_pid  = get_pid(task_tgid(current));
 695         sk->sk_peer_cred = get_current_cred();
 696         spin_unlock(&sk->sk_peer_lock);
 697
 698         put_pid(old_pid);
 699         put_cred(old_cred);
 700 }
 701
 702 static void copy_peercred(struct sock *sk, struct sock *peersk)
 703 {
 704         const struct cred *old_cred;
 705         struct pid *old_pid;
 706
 707         if (sk < peersk) {
 708                 spin_lock(&sk->sk_peer_lock);
 709                 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 710         } else {
 711                 spin_lock(&peersk->sk_peer_lock);
 712                 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
 713         }
 714         old_pid = sk->sk_peer_pid;
 715         old_cred = sk->sk_peer_cred;
 716         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 717         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 718
 719         spin_unlock(&sk->sk_peer_lock);
 720         spin_unlock(&peersk->sk_peer_lock);
 721
 722         put_pid(old_pid);
 723         put_cred(old_cred);
 724 }
 725
 726 static int unix_listen(struct socket *sock, int backlog)
 727 {
 728         int err;
 729         struct sock *sk = sock->sk;
 730         struct unix_sock *u = unix_sk(sk);
 731
 732         err = -EOPNOTSUPP;
 733         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 734                 goto out;       /* Only stream/seqpacket sockets accept */
 735         err = -EINVAL;
 736         if (!u->addr)
 737                 goto out;       /* No listens on an unbound socket */
 738         unix_state_lock(sk);
 739         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 740                 goto out_unlock;
 741         if (backlog > sk->sk_max_ack_backlog)
 742                 wake_up_interruptible_all(&u->peer_wait);
 743         sk->sk_max_ack_backlog  = backlog;
 744         sk->sk_state            = TCP_LISTEN;
 745         /* set credentials so connect can copy them */
 746         init_peercred(sk);
 747         err = 0;
 748
 749 out_unlock:
 750         unix_state_unlock(sk);
 751 out:
 752         return err;
 753 }
 754
/*
 * Forward declarations of the socket operation handlers.  They are
 * referenced by the proto_ops tables (e.g. unix_stream_ops) before their
 * definitions appear.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
 784
 785 static int unix_set_peek_off(struct sock *sk, int val)
 786 {
 787         struct unix_sock *u = unix_sk(sk);
 788
 789         if (mutex_lock_interruptible(&u->iolock))
 790                 return -EINTR;
 791
 792         WRITE_ONCE(sk->sk_peek_off, val);
 793         mutex_unlock(&u->iolock);
 794
 795         return 0;
 796 }
 797
 798 #ifdef CONFIG_PROC_FS
 799 static int unix_count_nr_fds(struct sock *sk)
 800 {
 801         struct sk_buff *skb;
 802         struct unix_sock *u;
 803         int nr_fds = 0;
 804
 805         spin_lock(&sk->sk_receive_queue.lock);
 806         skb = skb_peek(&sk->sk_receive_queue);
 807         while (skb) {
 808                 u = unix_sk(skb->sk);
 809                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 810                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 811         }
 812         spin_unlock(&sk->sk_receive_queue.lock);
 813
 814         return nr_fds;
 815 }
 816
 817 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 818 {
 819         struct sock *sk = sock->sk;
 820         unsigned char s_state;
 821         struct unix_sock *u;
 822         int nr_fds = 0;
 823
 824         if (sk) {
 825                 s_state = READ_ONCE(sk->sk_state);
 826                 u = unix_sk(sk);
 827
 828                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 829                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 830                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 831                  */
 832                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 833                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 834                 else if (s_state == TCP_LISTEN)
 835                         nr_fds = unix_count_nr_fds(sk);
 836
 837                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 838         }
 839 }
 840 #else
 841 #define unix_show_fdinfo NULL
 842 #endif
 843
/*
 * Socket-layer operations for SOCK_STREAM AF_UNIX sockets; unix_create()
 * installs this table in sock->ops.
 */
static const struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .read_skb =     unix_stream_read_skb,
        .mmap =         sock_no_mmap,
        .splice_read =  unix_stream_splice_read,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 868
/*
 * Socket-layer operations for SOCK_DGRAM AF_UNIX sockets (unix_create()
 * also maps SOCK_RAW onto this table). accept and listen use the generic
 * sock_no_accept/sock_no_listen handlers.
 */
static const struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_dgram_sendmsg,
        .read_skb =     unix_read_skb,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 892
/*
 * Socket-layer operations for SOCK_SEQPACKET AF_UNIX sockets; unix_create()
 * installs this table in sock->ops. Connection setup (connect, accept,
 * listen) uses the same handlers as the stream table, polling uses
 * unix_dgram_poll, and send/receive have seqpacket-specific handlers.
 */
static const struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = unix_compat_ioctl,
#endif
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_seqpacket_recvmsg,
        .mmap =         sock_no_mmap,
        .set_peek_off = unix_set_peek_off,
        .show_fdinfo =  unix_show_fdinfo,
};
 915
/*
 * Deliberately empty ->close() handler. It is installed in the struct proto
 * tables of this file and invoked from unix_release() via sk->sk_prot->close.
 */
static void unix_close(struct sock *sk, long timeout)
{
        /* Nothing to do here, unix socket does not need a ->close().
         * This is merely for sockmap.
         */
}
 922
/*
 * Deliberately empty ->unhash() handler, installed in unix_stream_proto.
 */
static void unix_unhash(struct sock *sk)
{
        /* Nothing to do here, unix socket does not need a ->unhash().
         * This is merely for sockmap.
         */
}
 929
 930 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 931 {
 932         if (level == SOL_SOCKET) {
 933                 switch (optname) {
 934                 case SO_PEERPIDFD:
 935                         return true;
 936                 default:
 937                         return false;
 938                 }
 939         }
 940
 941         return false;
 942 }
 943
/*
 * Protocol descriptor handed to sk_alloc() by unix_create1() for every
 * socket type other than SOCK_STREAM (i.e. dgram and seqpacket).
 */
struct proto unix_dgram_proto = {
        .name                   = "UNIX",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
#endif
};
 954
/*
 * Protocol descriptor handed to sk_alloc() by unix_create1() for
 * SOCK_STREAM sockets. Unlike unix_dgram_proto it also sets ->unhash.
 */
struct proto unix_stream_proto = {
        .name                   = "UNIX-STREAM",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
        .close                  = unix_close,
        .unhash                 = unix_unhash,
        .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = unix_stream_bpf_update_proto,
#endif
};
 966
 967 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
 968 {
 969         struct unix_sock *u;
 970         struct sock *sk;
 971         int err;
 972
 973         atomic_long_inc(&unix_nr_socks);
 974         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
 975                 err = -ENFILE;
 976                 goto err;
 977         }
 978
 979         if (type == SOCK_STREAM)
 980                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
 981         else /*dgram and  seqpacket */
 982                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
 983
 984         if (!sk) {
 985                 err = -ENOMEM;
 986                 goto err;
 987         }
 988
 989         sock_init_data(sock, sk);
 990
 991         sk->sk_hash             = unix_unbound_hash(sk);
 992         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
 993         sk->sk_write_space      = unix_write_space;
 994         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 995         sk->sk_destruct         = unix_sock_destructor;
 996         u         = unix_sk(sk);
 997         u->path.dentry = NULL;
 998         u->path.mnt = NULL;
 999         spin_lock_init(&u->lock);
1000         atomic_long_set(&u->inflight, 0);
1001         INIT_LIST_HEAD(&u->link);
1002         mutex_init(&u->iolock); /* single task reading lock */
1003         mutex_init(&u->bindlock); /* single task binding lock */
1004         init_waitqueue_head(&u->peer_wait);
1005         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1006         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1007         unix_insert_unbound_socket(net, sk);
1008
1009         sock_prot_inuse_add(net, sk->sk_prot, 1);
1010
1011         return sk;
1012
1013 err:
1014         atomic_long_dec(&unix_nr_socks);
1015         return ERR_PTR(err);
1016 }
1017
1018 static int unix_create(struct net *net, struct socket *sock, int protocol,
1019                        int kern)
1020 {
1021         struct sock *sk;
1022
1023         if (protocol && protocol != PF_UNIX)
1024                 return -EPROTONOSUPPORT;
1025
1026         sock->state = SS_UNCONNECTED;
1027
1028         switch (sock->type) {
1029         case SOCK_STREAM:
1030                 sock->ops = &unix_stream_ops;
1031                 break;
1032                 /*
1033                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1034                  *      nothing uses it.
1035                  */
1036         case SOCK_RAW:
1037                 sock->type = SOCK_DGRAM;
1038                 fallthrough;
1039         case SOCK_DGRAM:
1040                 sock->ops = &unix_dgram_ops;
1041                 break;
1042         case SOCK_SEQPACKET:
1043                 sock->ops = &unix_seqpacket_ops;
1044                 break;
1045         default:
1046                 return -ESOCKTNOSUPPORT;
1047         }
1048
1049         sk = unix_create1(net, sock, kern, sock->type);
1050         if (IS_ERR(sk))
1051                 return PTR_ERR(sk);
1052
1053         return 0;
1054 }
1055
1056 static int unix_release(struct socket *sock)
1057 {
1058         struct sock *sk = sock->sk;
1059
1060         if (!sk)
1061                 return 0;
1062
1063         sk->sk_prot->close(sk, 0);
1064         unix_release_sock(sk, 0);
1065         sock->sk = NULL;
1066
1067         return 0;
1068 }
1069
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved with LOOKUP_FOLLOW and must be writable by the
 * caller. On success the path's atime is touched and the socket returned
 * by unix_find_socket_byinode() is handed back to the caller.
 *
 * Errors (as ERR_PTR): whatever kern_path() or path_permission() report,
 * -ECONNREFUSED when the inode is not a socket or no socket is bound to
 * it, and -EPROTOTYPE when the bound socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *sock_inode;
        struct sock *found;
        struct path path;
        int ret;

        unix_mkname_bsd(sunaddr, addr_len);

        ret = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (ret)
                return ERR_PTR(ret);

        ret = path_permission(&path, MAY_WRITE);
        if (ret)
                goto drop_path;

        sock_inode = d_backing_inode(path.dentry);
        if (!S_ISSOCK(sock_inode->i_mode)) {
                ret = -ECONNREFUSED;
                goto drop_path;
        }

        found = unix_find_socket_byinode(sock_inode);
        if (!found) {
                ret = -ECONNREFUSED;
                goto drop_path;
        }

        if (found->sk_type != type) {
                /* Wrong socket type: give the socket reference back. */
                ret = -EPROTOTYPE;
                sock_put(found);
                goto drop_path;
        }

        touch_atime(&path);
        path_put(&path);

        return found;

drop_path:
        path_put(&path);
        return ERR_PTR(ret);
}
1113
/*
 * Look up a socket by abstract name in @net.
 *
 * Returns the socket found by unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none. If the socket found has a
 * dentry recorded in its path, that path's atime is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct unix_sock *found_u;
        struct sock *found;

        found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
        if (!found)
                return ERR_PTR(-ECONNREFUSED);

        found_u = unix_sk(found);
        if (found_u->path.dentry)
                touch_atime(&found_u->path);

        return found;
}
1132
1133 static struct sock *unix_find_other(struct net *net,
1134                                     struct sockaddr_un *sunaddr,
1135                                     int addr_len, int type)
1136 {
1137         struct sock *sk;
1138
1139         if (sunaddr->sun_path[0])
1140                 sk = unix_find_bsd(sunaddr, addr_len, type);
1141         else
1142                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1143
1144         return sk;
1145 }
1146
/*
 * Bind @sk to an automatically generated abstract name.
 *
 * The name is a leading zero byte (the buffer comes from kzalloc())
 * followed by five hex digits of a 20-bit counter. The counter starts one
 * past a random value and is incremented (mod 2^20) until a name is found
 * that is not yet present in the hash table.
 *
 * Runs under u->bindlock. If the socket already has an address nothing is
 * changed and 0 is returned.
 *
 * Returns 0 on success, the error from mutex_lock_interruptible(),
 * -ENOMEM if the address cannot be allocated, or -ENOSPC once every
 * candidate name has been tried.
 */
static int unix_autobind(struct sock *sk)
{
        unsigned int new_hash, old_hash = sk->sk_hash;
        struct unix_sock *u = unix_sk(sk);
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        u32 lastnum, ordernum;
        int err;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                return err;

        /* Already bound: err is still 0 here. */
        if (u->addr)
                goto out;

        err = -ENOMEM;
        addr = kzalloc(sizeof(*addr) +
                       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        /* Leading zero byte plus five hex digits. */
        addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
        addr->name->sun_family = AF_UNIX;
        refcount_set(&addr->refcnt, 1);

        ordernum = get_random_u32();
        /* Remember the starting point so a full wrap-around is detected. */
        lastnum = ordernum & 0xFFFFF;
retry:
        ordernum = (ordernum + 1) & 0xFFFFF;
        sprintf(addr->name->sun_path + 1, "%05x", ordernum);

        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
                unix_table_double_unlock(net, old_hash, new_hash);

                /* __unix_find_socket_byname() may take long time if many names
                 * are already in use.
                 */
                cond_resched();

                if (ordernum == lastnum) {
                        /* Give up if all names seems to be in use. */
                        err = -ENOSPC;
                        unix_release_addr(addr);
                        goto out;
                }

                goto retry;
        }

        /* Name is free: install it while both hash chains are locked. */
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        err = 0;

out:    mutex_unlock(&u->bindlock);
        return err;
}
1207
/*
 * Bind @sk to a filesystem path.
 *
 * A socket inode is created at the path with vfs_mknod() (after the
 * security_path_mknod() check). Then, under u->bindlock, the path and the
 * address are installed on the socket and the socket is rehashed.
 *
 * If anything fails after the node was created, the node is unlinked
 * again. Returns 0 on success, -ENOMEM if the address cannot be
 * allocated, -EINVAL if the socket is already bound, -EADDRINUSE in place
 * of -EEXIST, or another error from the path/VFS/mutex helpers called.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
                         int addr_len)
{
        /* Socket file mode: the socket inode's mode filtered by the umask. */
        umode_t mode = S_IFSOCK |
               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
        unsigned int new_hash, old_hash = sk->sk_hash;
        struct unix_sock *u = unix_sk(sk);
        struct net *net = sock_net(sk);
        struct mnt_idmap *idmap;
        struct unix_address *addr;
        struct dentry *dentry;
        struct path parent;
        int err;

        addr_len = unix_mkname_bsd(sunaddr, addr_len);
        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        /*
         * Get the parent directory, calculate the hash for last
         * component.
         */
        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out;
        }

        /*
         * All right, let's create it.
         */
        idmap = mnt_idmap(parent.mnt);
        err = security_path_mknod(&parent, dentry, mode, 0);
        if (!err)
                err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
        if (err)
                goto out_path;
        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_unlink;
        /* Already bound; out_unlock sets err to -EINVAL. */
        if (u->addr)
                goto out_unlock;

        new_hash = unix_bsd_hash(d_backing_inode(dentry));
        unix_table_double_lock(net, old_hash, new_hash);
        /* The socket keeps its own references on the mount and dentry. */
        u->path.mnt = mntget(parent.mnt);
        u->path.dentry = dget(dentry);
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        unix_insert_bsd_socket(sk);
        mutex_unlock(&u->bindlock);
        done_path_create(&parent, dentry);
        return 0;

out_unlock:
        mutex_unlock(&u->bindlock);
        err = -EINVAL;
out_unlink:
        /* failed after successful mknod?  unlink what we'd created... */
        vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
        done_path_create(&parent, dentry);
out:
        unix_release_addr(addr);
        /* Report an existing path as "address in use". */
        return err == -EEXIST ? -EADDRINUSE : err;
}
1275
1276 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1277                               int addr_len)
1278 {
1279         unsigned int new_hash, old_hash = sk->sk_hash;
1280         struct unix_sock *u = unix_sk(sk);
1281         struct net *net = sock_net(sk);
1282         struct unix_address *addr;
1283         int err;
1284
1285         addr = unix_create_addr(sunaddr, addr_len);
1286         if (!addr)
1287                 return -ENOMEM;
1288
1289         err = mutex_lock_interruptible(&u->bindlock);
1290         if (err)
1291                 goto out;
1292
1293         if (u->addr) {
1294                 err = -EINVAL;
1295                 goto out_mutex;
1296         }
1297
1298         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1299         unix_table_double_lock(net, old_hash, new_hash);
1300
1301         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1302                 goto out_spin;
1303
1304         __unix_set_addr_hash(net, sk, addr, new_hash);
1305         unix_table_double_unlock(net, old_hash, new_hash);
1306         mutex_unlock(&u->bindlock);
1307         return 0;
1308
1309 out_spin:
1310         unix_table_double_unlock(net, old_hash, new_hash);
1311         err = -EADDRINUSE;
1312 out_mutex:
1313         mutex_unlock(&u->bindlock);
1314 out:
1315         unix_release_addr(addr);
1316         return err;
1317 }
1318
1319 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1320 {
1321         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1322         struct sock *sk = sock->sk;
1323         int err;
1324
1325         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1326             sunaddr->sun_family == AF_UNIX)
1327                 return unix_autobind(sk);
1328
1329         err = unix_validate_addr(sunaddr, addr_len);
1330         if (err)
1331                 return err;
1332
1333         if (sunaddr->sun_path[0])
1334                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1335         else
1336                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1337
1338         return err;
1339 }
1340
1341 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1342 {
1343         if (unlikely(sk1 == sk2) || !sk2) {
1344                 unix_state_lock(sk1);
1345                 return;
1346         }
1347         if (sk1 > sk2)
1348                 swap(sk1, sk2);
1349
1350         unix_state_lock(sk1);
1351         unix_state_lock_nested(sk2, U_LOCK_SECOND);
1352 }
1353
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock and,
 * when @sk2 is a different, non-NULL socket, release @sk2's as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        unix_state_unlock(sk1);

        if (sk2 && sk2 != sk1)
                unix_state_unlock(sk2);
}
1363
/*
 * ->connect() handler of the datagram ops table.
 *
 * For a real address the peer is looked up, both sockets' state locks are
 * taken, permission is checked, and both sockets are marked
 * TCP_ESTABLISHED. For AF_UNSPEC the new peer is NULL, which dissolves an
 * existing association (and sets sk_state to TCP_CLOSE if there was one).
 *
 * Returns 0 on success; -EINVAL if @alen cannot hold sa_family; -EPERM if
 * unix_may_send() refuses; otherwise the error from address validation,
 * the BPF connect hook, autobind, peer lookup or the security hook.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                              int alen, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
        struct sock *sk = sock->sk;
        struct sock *other;
        int err;

        err = -EINVAL;
        if (alen < offsetofend(struct sockaddr, sa_family))
                goto out;

        if (addr->sa_family != AF_UNSPEC) {
                err = unix_validate_addr(sunaddr, alen);
                if (err)
                        goto out;

                err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
                if (err)
                        goto out;

                /* Credential passing is enabled but the socket has no
                 * address yet: autobind it first.
                 */
                if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
                     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
                    !unix_sk(sk)->addr) {
                        err = unix_autobind(sk);
                        if (err)
                                goto out;
                }

restart:
                other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        goto out;
                }

                unix_state_double_lock(sk, other);

                /* Apparently VFS overslept socket death. Retry. */
                if (sock_flag(other, SOCK_DEAD)) {
                        unix_state_double_unlock(sk, other);
                        sock_put(other);
                        goto restart;
                }

                err = -EPERM;
                if (!unix_may_send(sk, other))
                        goto out_unlock;

                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;

                sk->sk_state = other->sk_state = TCP_ESTABLISHED;
        } else {
                /*
                 *      1003.1g breaking connected state with AF_UNSPEC
                 */
                other = NULL;
                /* With other == NULL only sk's state lock is taken. */
                unix_state_double_lock(sk, other);
        }

        /*
         * If it was connected, reconnect.
         */
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);

                unix_peer(sk) = other;
                if (!other)
                        sk->sk_state = TCP_CLOSE;
                unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

                unix_state_double_unlock(sk, other);

                /* Locks are dropped before dealing with the previous peer. */
                if (other != old_peer)
                        unix_dgram_disconnected(sk, old_peer);
                sock_put(old_peer);
        } else {
                unix_peer(sk) = other;
                unix_state_double_unlock(sk, other);
        }

        return 0;

out_unlock:
        unix_state_double_unlock(sk, other);
        sock_put(other);
out:
        return err;
}
1455
/*
 * Wait for room in @other's receive queue.
 *
 * Entered with @other's state lock held; the lock is released here (see
 * the __releases annotation). The caller is queued as an exclusive,
 * interruptible waiter on @other's peer_wait queue before the lock is
 * dropped, and only sleeps if @other is not dead, has not shut down
 * receiving, and its receive queue is still full.
 *
 * Returns the value of schedule_timeout() if it slept, otherwise @timeo
 * unchanged.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
        __releases(&unix_sk(other)->lock)
{
        struct unix_sock *u = unix_sk(other);
        int sched;
        DEFINE_WAIT(wait);

        prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

        /* Decide whether to sleep while the state lock is still held. */
        sched = !sock_flag(other, SOCK_DEAD) &&
                !(other->sk_shutdown & RCV_SHUTDOWN) &&
                unix_recvq_full_lockless(other);

        unix_state_unlock(other);

        if (sched)
                timeo = schedule_timeout(timeo);

        finish_wait(&u->peer_wait, &wait);
        return timeo;
}
1477
/*
 * ->connect() handler of the stream and seqpacket ops tables.
 *
 * A new sock (newsk) and a one-byte skb are allocated up front. The
 * listening socket is then looked up and locked; if its receive queue is
 * full the caller either fails with -EAGAIN (no timeout) or waits and
 * retries. Once both state locks are held, newsk is wired up as the peer
 * of sk, inherits the listener's address and path, and the skb is queued
 * on the listener's receive queue.
 *
 * Returns 0 on success. Errors set in this function: -ENOMEM (skb
 * allocation), -ECONNREFUSED (target not listening or receive shut down),
 * -EAGAIN (queue full and no timeout), sock_intr_errno() when a signal is
 * pending after waiting, -EISCONN (sk already TCP_ESTABLISHED), -EINVAL
 * (sk in any other state than TCP_CLOSE); plus errors returned by address
 * validation, the BPF connect hook, autobind, unix_create1(), the peer
 * lookup and the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        long timeo;
        int err;
        int st;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
        if (err)
                goto out;

        /* Credential passing is enabled but the socket has no address
         * yet: autobind it first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                /* Keep the out: path from releasing an error pointer. */
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                /* Keep the out: path from putting an error pointer. */
                other = NULL;
                goto out;
        }

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* unix_wait_for_peer() drops other's state lock. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.

           It is tricky place. We need to grab our state lock and cannot
           drop lock on peer. It is dangerous because deadlock is
           possible. Connect to self case and simultaneous
           attempt to connect are eliminated by checking socket
           state. other is TCP_LISTEN, if sk is TCP_LISTEN we
           check this before attempt to grab lock.

           Well, and we have to recheck the state after socket locked.
         */
        st = sk->sk_state;

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk, U_LOCK_SECOND);

        /* State changed while we were not holding our lock: start over. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Quickly set all the necessary fields... */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        sock_hold(newsk);

        smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Common cleanup: drop whatever was allocated or looked up. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1675
1676 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1677 {
1678         struct sock *ska = socka->sk, *skb = sockb->sk;
1679
1680         /* Join our sockets back to back */
1681         sock_hold(ska);
1682         sock_hold(skb);
1683         unix_peer(ska) = skb;
1684         unix_peer(skb) = ska;
1685         init_peercred(ska);
1686         init_peercred(skb);
1687
1688         ska->sk_state = TCP_ESTABLISHED;
1689         skb->sk_state = TCP_ESTABLISHED;
1690         socka->state  = SS_CONNECTED;
1691         sockb->state  = SS_CONNECTED;
1692         return 0;
1693 }
1694
1695 static void unix_sock_inherit_flags(const struct socket *old,
1696                                     struct socket *new)
1697 {
1698         if (test_bit(SOCK_PASSCRED, &old->flags))
1699                 set_bit(SOCK_PASSCRED, &new->flags);
1700         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1701                 set_bit(SOCK_PASSPIDFD, &new->flags);
1702         if (test_bit(SOCK_PASSSEC, &old->flags))
1703                 set_bit(SOCK_PASSSEC, &new->flags);
1704 }
1705
1706 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1707                        bool kern)
1708 {
1709         struct sock *sk = sock->sk;
1710         struct sock *tsk;
1711         struct sk_buff *skb;
1712         int err;
1713
1714         err = -EOPNOTSUPP;
1715         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1716                 goto out;
1717
1718         err = -EINVAL;
1719         if (sk->sk_state != TCP_LISTEN)
1720                 goto out;
1721
1722         /* If socket state is TCP_LISTEN it cannot change (for now...),
1723          * so that no locks are necessary.
1724          */
1725
1726         skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1727                                 &err);
1728         if (!skb) {
1729                 /* This means receive shutdown. */
1730                 if (err == 0)
1731                         err = -EINVAL;
1732                 goto out;
1733         }
1734
1735         tsk = skb->sk;
1736         skb_free_datagram(sk, skb);
1737         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1738
1739         /* attach accepted sock to socket */
1740         unix_state_lock(tsk);
1741         newsock->state = SS_CONNECTED;
1742         unix_sock_inherit_flags(sock, newsock);
1743         sock_graft(tsk, newsock);
1744         unix_state_unlock(tsk);
1745         return 0;
1746
1747 out:
1748         return err;
1749 }
1750
1751
1752 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1753 {
1754         struct sock *sk = sock->sk;
1755         struct unix_address *addr;
1756         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1757         int err = 0;
1758
1759         if (peer) {
1760                 sk = unix_peer_get(sk);
1761
1762                 err = -ENOTCONN;
1763                 if (!sk)
1764                         goto out;
1765                 err = 0;
1766         } else {
1767                 sock_hold(sk);
1768         }
1769
1770         addr = smp_load_acquire(&unix_sk(sk)->addr);
1771         if (!addr) {
1772                 sunaddr->sun_family = AF_UNIX;
1773                 sunaddr->sun_path[0] = 0;
1774                 err = offsetof(struct sockaddr_un, sun_path);
1775         } else {
1776                 err = addr->len;
1777                 memcpy(sunaddr, addr->name, addr->len);
1778
1779                 if (peer)
1780                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1781                                                CGROUP_UNIX_GETPEERNAME);
1782                 else
1783                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1784                                                CGROUP_UNIX_GETSOCKNAME);
1785         }
1786         sock_put(sk);
1787 out:
1788         return err;
1789 }
1790
/*
 * MSG_PEEK helper: give @scm a duplicate of the fd list attached to @skb
 * (the skb keeps its own list), then synchronise with the garbage
 * collector as explained below.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1837
1838 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1839 {
1840         int err = 0;
1841
1842         UNIXCB(skb).pid  = get_pid(scm->pid);
1843         UNIXCB(skb).uid = scm->creds.uid;
1844         UNIXCB(skb).gid = scm->creds.gid;
1845         UNIXCB(skb).fp = NULL;
1846         unix_get_secdata(scm, skb);
1847         if (scm->fp && send_fds)
1848                 err = unix_attach_fds(scm, skb);
1849
1850         skb->destructor = unix_destruct_scm;
1851         return err;
1852 }
1853
1854 static bool unix_passcred_enabled(const struct socket *sock,
1855                                   const struct sock *other)
1856 {
1857         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1858                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1859                !other->sk_socket ||
1860                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1861                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1862 }
1863
1864 /*
1865  * Some apps rely on write() giving SCM_CREDENTIALS
1866  * We include credentials if source or destination socket
1867  * asserted SOCK_PASSCRED.
1868  */
1869 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1870                             const struct sock *other)
1871 {
1872         if (UNIXCB(skb).pid)
1873                 return;
1874         if (unix_passcred_enabled(sock, other)) {
1875                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1876                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1877         }
1878 }
1879
1880 static bool unix_skb_scm_eq(struct sk_buff *skb,
1881                             struct scm_cookie *scm)
1882 {
1883         return UNIXCB(skb).pid == scm->pid &&
1884                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1885                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1886                unix_secdata_eq(scm, skb);
1887 }
1888
1889 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1890 {
1891         struct scm_fp_list *fp = UNIXCB(skb).fp;
1892         struct unix_sock *u = unix_sk(sk);
1893
1894         if (unlikely(fp && fp->count))
1895                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1896 }
1897
1898 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1899 {
1900         struct scm_fp_list *fp = UNIXCB(skb).fp;
1901         struct unix_sock *u = unix_sk(sk);
1902
1903         if (unlikely(fp && fp->count))
1904                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1905 }
1906
1907 /*
1908  *      Send AF_UNIX data.
1909  */
1910
1911 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1912                               size_t len)
1913 {
1914         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1915         struct sock *sk = sock->sk, *other = NULL;
1916         struct unix_sock *u = unix_sk(sk);
1917         struct scm_cookie scm;
1918         struct sk_buff *skb;
1919         int data_len = 0;
1920         int sk_locked;
1921         long timeo;
1922         int err;
1923
1924         wait_for_unix_gc();
1925         err = scm_send(sock, msg, &scm, false);
1926         if (err < 0)
1927                 return err;
1928
1929         err = -EOPNOTSUPP;
1930         if (msg->msg_flags&MSG_OOB)
1931                 goto out;
1932
1933         if (msg->msg_namelen) {
1934                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1935                 if (err)
1936                         goto out;
1937
1938                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1939                                                             msg->msg_name,
1940                                                             &msg->msg_namelen,
1941                                                             NULL);
1942                 if (err)
1943                         goto out;
1944         } else {
1945                 sunaddr = NULL;
1946                 err = -ENOTCONN;
1947                 other = unix_peer_get(sk);
1948                 if (!other)
1949                         goto out;
1950         }
1951
1952         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1953              test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1954                 err = unix_autobind(sk);
1955                 if (err)
1956                         goto out;
1957         }
1958
1959         err = -EMSGSIZE;
1960         if (len > sk->sk_sndbuf - 32)
1961                 goto out;
1962
1963         if (len > SKB_MAX_ALLOC) {
1964                 data_len = min_t(size_t,
1965                                  len - SKB_MAX_ALLOC,
1966                                  MAX_SKB_FRAGS * PAGE_SIZE);
1967                 data_len = PAGE_ALIGN(data_len);
1968
1969                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1970         }
1971
1972         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1973                                    msg->msg_flags & MSG_DONTWAIT, &err,
1974                                    PAGE_ALLOC_COSTLY_ORDER);
1975         if (skb == NULL)
1976                 goto out;
1977
1978         err = unix_scm_to_skb(&scm, skb, true);
1979         if (err < 0)
1980                 goto out_free;
1981
1982         skb_put(skb, len - data_len);
1983         skb->data_len = data_len;
1984         skb->len = len;
1985         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1986         if (err)
1987                 goto out_free;
1988
1989         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1990
1991 restart:
1992         if (!other) {
1993                 err = -ECONNRESET;
1994                 if (sunaddr == NULL)
1995                         goto out_free;
1996
1997                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1998                                         sk->sk_type);
1999                 if (IS_ERR(other)) {
2000                         err = PTR_ERR(other);
2001                         other = NULL;
2002                         goto out_free;
2003                 }
2004         }
2005
2006         if (sk_filter(other, skb) < 0) {
2007                 /* Toss the packet but do not return any error to the sender */
2008                 err = len;
2009                 goto out_free;
2010         }
2011
2012         sk_locked = 0;
2013         unix_state_lock(other);
2014 restart_locked:
2015         err = -EPERM;
2016         if (!unix_may_send(sk, other))
2017                 goto out_unlock;
2018
2019         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2020                 /*
2021                  *      Check with 1003.1g - what should
2022                  *      datagram error
2023                  */
2024                 unix_state_unlock(other);
2025                 sock_put(other);
2026
2027                 if (!sk_locked)
2028                         unix_state_lock(sk);
2029
2030                 err = 0;
2031                 if (sk->sk_type == SOCK_SEQPACKET) {
2032                         /* We are here only when racing with unix_release_sock()
2033                          * is clearing @other. Never change state to TCP_CLOSE
2034                          * unlike SOCK_DGRAM wants.
2035                          */
2036                         unix_state_unlock(sk);
2037                         err = -EPIPE;
2038                 } else if (unix_peer(sk) == other) {
2039                         unix_peer(sk) = NULL;
2040                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2041
2042                         sk->sk_state = TCP_CLOSE;
2043                         unix_state_unlock(sk);
2044
2045                         unix_dgram_disconnected(sk, other);
2046                         sock_put(other);
2047                         err = -ECONNREFUSED;
2048                 } else {
2049                         unix_state_unlock(sk);
2050                 }
2051
2052                 other = NULL;
2053                 if (err)
2054                         goto out_free;
2055                 goto restart;
2056         }
2057
2058         err = -EPIPE;
2059         if (other->sk_shutdown & RCV_SHUTDOWN)
2060                 goto out_unlock;
2061
2062         if (sk->sk_type != SOCK_SEQPACKET) {
2063                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2064                 if (err)
2065                         goto out_unlock;
2066         }
2067
2068         /* other == sk && unix_peer(other) != sk if
2069          * - unix_peer(sk) == NULL, destination address bound to sk
2070          * - unix_peer(sk) == sk by time of get but disconnected before lock
2071          */
2072         if (other != sk &&
2073             unlikely(unix_peer(other) != sk &&
2074             unix_recvq_full_lockless(other))) {
2075                 if (timeo) {
2076                         timeo = unix_wait_for_peer(other, timeo);
2077
2078                         err = sock_intr_errno(timeo);
2079                         if (signal_pending(current))
2080                                 goto out_free;
2081
2082                         goto restart;
2083                 }
2084
2085                 if (!sk_locked) {
2086                         unix_state_unlock(other);
2087                         unix_state_double_lock(sk, other);
2088                 }
2089
2090                 if (unix_peer(sk) != other ||
2091                     unix_dgram_peer_wake_me(sk, other)) {
2092                         err = -EAGAIN;
2093                         sk_locked = 1;
2094                         goto out_unlock;
2095                 }
2096
2097                 if (!sk_locked) {
2098                         sk_locked = 1;
2099                         goto restart_locked;
2100                 }
2101         }
2102
2103         if (unlikely(sk_locked))
2104                 unix_state_unlock(sk);
2105
2106         if (sock_flag(other, SOCK_RCVTSTAMP))
2107                 __net_timestamp(skb);
2108         maybe_add_creds(skb, sock, other);
2109         scm_stat_add(other, skb);
2110         skb_queue_tail(&other->sk_receive_queue, skb);
2111         unix_state_unlock(other);
2112         other->sk_data_ready(other);
2113         sock_put(other);
2114         scm_destroy(&scm);
2115         return len;
2116
2117 out_unlock:
2118         if (sk_locked)
2119                 unix_state_unlock(sk);
2120         unix_state_unlock(other);
2121 out_free:
2122         kfree_skb(skb);
2123 out:
2124         if (other)
2125                 sock_put(other);
2126         scm_destroy(&scm);
2127         return err;
2128 }
2129
2130 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2131  * bytes, and a minimum of a full page.
2132  */
2133 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2134
2135 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2136 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2137                      struct scm_cookie *scm, bool fds_sent)
2138 {
2139         struct unix_sock *ousk = unix_sk(other);
2140         struct sk_buff *skb;
2141         int err = 0;
2142
2143         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2144
2145         if (!skb)
2146                 return err;
2147
2148         err = unix_scm_to_skb(scm, skb, !fds_sent);
2149         if (err < 0) {
2150                 kfree_skb(skb);
2151                 return err;
2152         }
2153         skb_put(skb, 1);
2154         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2155
2156         if (err) {
2157                 kfree_skb(skb);
2158                 return err;
2159         }
2160
2161         unix_state_lock(other);
2162
2163         if (sock_flag(other, SOCK_DEAD) ||
2164             (other->sk_shutdown & RCV_SHUTDOWN)) {
2165                 unix_state_unlock(other);
2166                 kfree_skb(skb);
2167                 return -EPIPE;
2168         }
2169
2170         maybe_add_creds(skb, sock, other);
2171         skb_get(skb);
2172
2173         if (ousk->oob_skb)
2174                 consume_skb(ousk->oob_skb);
2175
2176         WRITE_ONCE(ousk->oob_skb, skb);
2177
2178         scm_stat_add(other, skb);
2179         skb_queue_tail(&other->sk_receive_queue, skb);
2180         sk_send_sigurg(other);
2181         unix_state_unlock(other);
2182         other->sk_data_ready(other);
2183
2184         return err;
2185 }
2186 #endif
2187
2188 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2189                                size_t len)
2190 {
2191         struct sock *sk = sock->sk;
2192         struct sock *other = NULL;
2193         int err, size;
2194         struct sk_buff *skb;
2195         int sent = 0;
2196         struct scm_cookie scm;
2197         bool fds_sent = false;
2198         int data_len;
2199
2200         wait_for_unix_gc();
2201         err = scm_send(sock, msg, &scm, false);
2202         if (err < 0)
2203                 return err;
2204
2205         err = -EOPNOTSUPP;
2206         if (msg->msg_flags & MSG_OOB) {
2207 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2208                 if (len)
2209                         len--;
2210                 else
2211 #endif
2212                         goto out_err;
2213         }
2214
2215         if (msg->msg_namelen) {
2216                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2217                 goto out_err;
2218         } else {
2219                 err = -ENOTCONN;
2220                 other = unix_peer(sk);
2221                 if (!other)
2222                         goto out_err;
2223         }
2224
2225         if (sk->sk_shutdown & SEND_SHUTDOWN)
2226                 goto pipe_err;
2227
2228         while (sent < len) {
2229                 size = len - sent;
2230
2231                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2232                         skb = sock_alloc_send_pskb(sk, 0, 0,
2233                                                    msg->msg_flags & MSG_DONTWAIT,
2234                                                    &err, 0);
2235                 } else {
2236                         /* Keep two messages in the pipe so it schedules better */
2237                         size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2238
2239                         /* allow fallback to order-0 allocations */
2240                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2241
2242                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2243
2244                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2245
2246                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2247                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2248                                                    get_order(UNIX_SKB_FRAGS_SZ));
2249                 }
2250                 if (!skb)
2251                         goto out_err;
2252
2253                 /* Only send the fds in the first buffer */
2254                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2255                 if (err < 0) {
2256                         kfree_skb(skb);
2257                         goto out_err;
2258                 }
2259                 fds_sent = true;
2260
2261                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2262                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2263                                                    sk->sk_allocation);
2264                         if (err < 0) {
2265                                 kfree_skb(skb);
2266                                 goto out_err;
2267                         }
2268                         size = err;
2269                         refcount_add(size, &sk->sk_wmem_alloc);
2270                 } else {
2271                         skb_put(skb, size - data_len);
2272                         skb->data_len = data_len;
2273                         skb->len = size;
2274                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2275                         if (err) {
2276                                 kfree_skb(skb);
2277                                 goto out_err;
2278                         }
2279                 }
2280
2281                 unix_state_lock(other);
2282
2283                 if (sock_flag(other, SOCK_DEAD) ||
2284                     (other->sk_shutdown & RCV_SHUTDOWN))
2285                         goto pipe_err_free;
2286
2287                 maybe_add_creds(skb, sock, other);
2288                 scm_stat_add(other, skb);
2289                 skb_queue_tail(&other->sk_receive_queue, skb);
2290                 unix_state_unlock(other);
2291                 other->sk_data_ready(other);
2292                 sent += size;
2293         }
2294
2295 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2296         if (msg->msg_flags & MSG_OOB) {
2297                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2298                 if (err)
2299                         goto out_err;
2300                 sent++;
2301         }
2302 #endif
2303
2304         scm_destroy(&scm);
2305
2306         return sent;
2307
2308 pipe_err_free:
2309         unix_state_unlock(other);
2310         kfree_skb(skb);
2311 pipe_err:
2312         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2313                 send_sig(SIGPIPE, current, 0);
2314         err = -EPIPE;
2315 out_err:
2316         scm_destroy(&scm);
2317         return sent ? : err;
2318 }
2319
2320 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2321                                   size_t len)
2322 {
2323         int err;
2324         struct sock *sk = sock->sk;
2325
2326         err = sock_error(sk);
2327         if (err)
2328                 return err;
2329
2330         if (sk->sk_state != TCP_ESTABLISHED)
2331                 return -ENOTCONN;
2332
2333         if (msg->msg_namelen)
2334                 msg->msg_namelen = 0;
2335
2336         return unix_dgram_sendmsg(sock, msg, len);
2337 }
2338
2339 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2340                                   size_t size, int flags)
2341 {
2342         struct sock *sk = sock->sk;
2343
2344         if (sk->sk_state != TCP_ESTABLISHED)
2345                 return -ENOTCONN;
2346
2347         return unix_dgram_recvmsg(sock, msg, size, flags);
2348 }
2349
2350 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2351 {
2352         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2353
2354         if (addr) {
2355                 msg->msg_namelen = addr->len;
2356                 memcpy(msg->msg_name, addr->name, addr->len);
2357         }
2358 }
2359
2360 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2361                          int flags)
2362 {
2363         struct scm_cookie scm;
2364         struct socket *sock = sk->sk_socket;
2365         struct unix_sock *u = unix_sk(sk);
2366         struct sk_buff *skb, *last;
2367         long timeo;
2368         int skip;
2369         int err;
2370
2371         err = -EOPNOTSUPP;
2372         if (flags&MSG_OOB)
2373                 goto out;
2374
2375         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2376
2377         do {
2378                 mutex_lock(&u->iolock);
2379
2380                 skip = sk_peek_offset(sk, flags);
2381                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2382                                               &skip, &err, &last);
2383                 if (skb) {
2384                         if (!(flags & MSG_PEEK))
2385                                 scm_stat_del(sk, skb);
2386                         break;
2387                 }
2388
2389                 mutex_unlock(&u->iolock);
2390
2391                 if (err != -EAGAIN)
2392                         break;
2393         } while (timeo &&
2394                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2395                                               &err, &timeo, last));
2396
2397         if (!skb) { /* implies iolock unlocked */
2398                 unix_state_lock(sk);
2399                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2400                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2401                     (sk->sk_shutdown & RCV_SHUTDOWN))
2402                         err = 0;
2403                 unix_state_unlock(sk);
2404                 goto out;
2405         }
2406
2407         if (wq_has_sleeper(&u->peer_wait))
2408                 wake_up_interruptible_sync_poll(&u->peer_wait,
2409                                                 EPOLLOUT | EPOLLWRNORM |
2410                                                 EPOLLWRBAND);
2411
2412         if (msg->msg_name) {
2413                 unix_copy_addr(msg, skb->sk);
2414
2415                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2416                                                       msg->msg_name,
2417                                                       &msg->msg_namelen);
2418         }
2419
2420         if (size > skb->len - skip)
2421                 size = skb->len - skip;
2422         else if (size < skb->len - skip)
2423                 msg->msg_flags |= MSG_TRUNC;
2424
2425         err = skb_copy_datagram_msg(skb, skip, msg, size);
2426         if (err)
2427                 goto out_free;
2428
2429         if (sock_flag(sk, SOCK_RCVTSTAMP))
2430                 __sock_recv_timestamp(msg, sk, skb);
2431
2432         memset(&scm, 0, sizeof(scm));
2433
2434         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2435         unix_set_secdata(&scm, skb);
2436
2437         if (!(flags & MSG_PEEK)) {
2438                 if (UNIXCB(skb).fp)
2439                         unix_detach_fds(&scm, skb);
2440
2441                 sk_peek_offset_bwd(sk, skb->len);
2442         } else {
2443                 /* It is questionable: on PEEK we could:
2444                    - do not return fds - good, but too simple 8)
2445                    - return fds, and do not return them on read (old strategy,
2446                      apparently wrong)
2447                    - clone fds (I chose it for now, it is the most universal
2448                      solution)
2449
2450                    POSIX 1003.1g does not actually define this clearly
2451                    at all. POSIX 1003.1g doesn't define a lot of things
2452                    clearly however!
2453
2454                 */
2455
2456                 sk_peek_offset_fwd(sk, size);
2457
2458                 if (UNIXCB(skb).fp)
2459                         unix_peek_fds(&scm, skb);
2460         }
2461         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2462
2463         scm_recv_unix(sock, msg, &scm, flags);
2464
2465 out_free:
2466         skb_free_datagram(sk, skb);
2467         mutex_unlock(&u->iolock);
2468 out:
2469         return err;
2470 }
2471
2472 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2473                               int flags)
2474 {
2475         struct sock *sk = sock->sk;
2476
2477 #ifdef CONFIG_BPF_SYSCALL
2478         const struct proto *prot = READ_ONCE(sk->sk_prot);
2479
2480         if (prot != &unix_dgram_proto)
2481                 return prot->recvmsg(sk, msg, size, flags, NULL);
2482 #endif
2483         return __unix_dgram_recvmsg(sk, msg, size, flags);
2484 }
2485
2486 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2487 {
2488         struct unix_sock *u = unix_sk(sk);
2489         struct sk_buff *skb;
2490         int err;
2491
2492         mutex_lock(&u->iolock);
2493         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2494         mutex_unlock(&u->iolock);
2495         if (!skb)
2496                 return err;
2497
2498         return recv_actor(sk, skb);
2499 }
2500
2501 /*
2502  *      Sleep until more data has arrived. But check for races..
2503  */
2504 static long unix_stream_data_wait(struct sock *sk, long timeo,
2505                                   struct sk_buff *last, unsigned int last_len,
2506                                   bool freezable)
2507 {
2508         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2509         struct sk_buff *tail;
2510         DEFINE_WAIT(wait);
2511
2512         unix_state_lock(sk);
2513
2514         for (;;) {
2515                 prepare_to_wait(sk_sleep(sk), &wait, state);
2516
2517                 tail = skb_peek_tail(&sk->sk_receive_queue);
2518                 if (tail != last ||
2519                     (tail && tail->len != last_len) ||
2520                     sk->sk_err ||
2521                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2522                     signal_pending(current) ||
2523                     !timeo)
2524                         break;
2525
2526                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2527                 unix_state_unlock(sk);
2528                 timeo = schedule_timeout(timeo);
2529                 unix_state_lock(sk);
2530
2531                 if (sock_flag(sk, SOCK_DEAD))
2532                         break;
2533
2534                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2535         }
2536
2537         finish_wait(sk_sleep(sk), &wait);
2538         unix_state_unlock(sk);
2539         return timeo;
2540 }
2541
2542 static unsigned int unix_skb_len(const struct sk_buff *skb)
2543 {
2544         return skb->len - UNIXCB(skb).consumed;
2545 }
2546
2547 struct unix_stream_read_state {
2548         int (*recv_actor)(struct sk_buff *, int, int,
2549                           struct unix_stream_read_state *);
2550         struct socket *socket;
2551         struct msghdr *msg;
2552         struct pipe_inode_info *pipe;
2553         size_t size;
2554         int flags;
2555         unsigned int splice_flags;
2556 };
2557
2558 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Handle a MSG_OOB receive: hand the single out-of-band byte tracked in
 * u->oob_skb to the recv_actor.
 *
 * Returns 1 on success (and flags the message with MSG_OOB), -EINVAL when
 * SOCK_URGINLINE is set or no OOB skb is pending, and -EFAULT when the
 * actor fails.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
        struct socket *sock = state->socket;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        int chunk = 1;
        struct sk_buff *oob_skb;

        mutex_lock(&u->iolock);
        unix_state_lock(sk);

        if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
                unix_state_unlock(sk);
                mutex_unlock(&u->iolock);
                return -EINVAL;
        }

        oob_skb = u->oob_skb;

        /* A real read detaches the skb from u->oob_skb; a peek leaves it in
         * place and takes its own reference so the consume_skb() below is
         * balanced.
         * NOTE(review): in the non-peek case consume_skb() presumably drops
         * the reference that u->oob_skb was holding - confirm against the
         * code that sets u->oob_skb.
         */
        if (!(state->flags & MSG_PEEK))
                WRITE_ONCE(u->oob_skb, NULL);
        else
                skb_get(oob_skb);
        unix_state_unlock(sk);

        /* Offset 0, length 1: exactly one byte is copied out. */
        chunk = state->recv_actor(oob_skb, 0, chunk, state);

        /* The byte is marked consumed even if the actor reported an error. */
        if (!(state->flags & MSG_PEEK))
                UNIXCB(oob_skb).consumed += 1;

        consume_skb(oob_skb);

        mutex_unlock(&u->iolock);

        if (chunk < 0)
                return -EFAULT;

        state->msg->msg_flags |= MSG_OOB;
        return 1;
}
2599
/*
 * Decide what the stream reader should do with @skb, the skb it is about to
 * read, with respect to out-of-band data.
 *
 * @copied is the number of bytes the current read has already gathered.
 *
 * Returns the skb the reader should continue with, or NULL when there is
 * nothing to read from this position right now.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
                                  int flags, int copied)
{
        struct unix_sock *u = unix_sk(sk);

        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
                /* Fully consumed skb still on the queue: drop it. */
                skb_unlink(skb, &sk->sk_receive_queue);
                consume_skb(skb);
                skb = NULL;
        } else {
                if (skb == u->oob_skb) {
                        if (copied) {
                                /* Data was already gathered: return NULL so
                                 * the OOB skb is not read in the same pass.
                                 */
                                skb = NULL;
                        } else if (sock_flag(sk, SOCK_URGINLINE)) {
                                /* Inline mode: the skb is returned and read
                                 * like ordinary data; a real read also stops
                                 * tracking it as the OOB skb.
                                 */
                                if (!(flags & MSG_PEEK)) {
                                        WRITE_ONCE(u->oob_skb, NULL);
                                        consume_skb(skb);
                                }
                        } else if (!(flags & MSG_PEEK)) {
                                /* Not inline, real read: take the skb off the
                                 * queue and continue with the new head.
                                 * u->oob_skb is left untouched here.
                                 */
                                skb_unlink(skb, &sk->sk_receive_queue);
                                consume_skb(skb);
                                skb = skb_peek(&sk->sk_receive_queue);
                        }
                }
        }
        return skb;
}
2627 #endif
2628
2629 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2630 {
2631         if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2632                 return -ENOTCONN;
2633
2634         return unix_read_skb(sk, recv_actor);
2635 }
2636
2637 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2638                                     bool freezable)
2639 {
2640         struct scm_cookie scm;
2641         struct socket *sock = state->socket;
2642         struct sock *sk = sock->sk;
2643         struct unix_sock *u = unix_sk(sk);
2644         int copied = 0;
2645         int flags = state->flags;
2646         int noblock = flags & MSG_DONTWAIT;
2647         bool check_creds = false;
2648         int target;
2649         int err = 0;
2650         long timeo;
2651         int skip;
2652         size_t size = state->size;
2653         unsigned int last_len;
2654
2655         if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2656                 err = -EINVAL;
2657                 goto out;
2658         }
2659
2660         if (unlikely(flags & MSG_OOB)) {
2661                 err = -EOPNOTSUPP;
2662 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2663                 err = unix_stream_recv_urg(state);
2664 #endif
2665                 goto out;
2666         }
2667
2668         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2669         timeo = sock_rcvtimeo(sk, noblock);
2670
2671         memset(&scm, 0, sizeof(scm));
2672
2673         /* Lock the socket to prevent queue disordering
2674          * while sleeps in memcpy_tomsg
2675          */
2676         mutex_lock(&u->iolock);
2677
2678         skip = max(sk_peek_offset(sk, flags), 0);
2679
2680         do {
2681                 int chunk;
2682                 bool drop_skb;
2683                 struct sk_buff *skb, *last;
2684
2685 redo:
2686                 unix_state_lock(sk);
2687                 if (sock_flag(sk, SOCK_DEAD)) {
2688                         err = -ECONNRESET;
2689                         goto unlock;
2690                 }
2691                 last = skb = skb_peek(&sk->sk_receive_queue);
2692                 last_len = last ? last->len : 0;
2693
2694 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2695                 if (skb) {
2696                         skb = manage_oob(skb, sk, flags, copied);
2697                         if (!skb) {
2698                                 unix_state_unlock(sk);
2699                                 if (copied)
2700                                         break;
2701                                 goto redo;
2702                         }
2703                 }
2704 #endif
2705 again:
2706                 if (skb == NULL) {
2707                         if (copied >= target)
2708                                 goto unlock;
2709
2710                         /*
2711                          *      POSIX 1003.1g mandates this order.
2712                          */
2713
2714                         err = sock_error(sk);
2715                         if (err)
2716                                 goto unlock;
2717                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2718                                 goto unlock;
2719
2720                         unix_state_unlock(sk);
2721                         if (!timeo) {
2722                                 err = -EAGAIN;
2723                                 break;
2724                         }
2725
2726                         mutex_unlock(&u->iolock);
2727
2728                         timeo = unix_stream_data_wait(sk, timeo, last,
2729                                                       last_len, freezable);
2730
2731                         if (signal_pending(current)) {
2732                                 err = sock_intr_errno(timeo);
2733                                 scm_destroy(&scm);
2734                                 goto out;
2735                         }
2736
2737                         mutex_lock(&u->iolock);
2738                         goto redo;
2739 unlock:
2740                         unix_state_unlock(sk);
2741                         break;
2742                 }
2743
2744                 while (skip >= unix_skb_len(skb)) {
2745                         skip -= unix_skb_len(skb);
2746                         last = skb;
2747                         last_len = skb->len;
2748                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2749                         if (!skb)
2750                                 goto again;
2751                 }
2752
2753                 unix_state_unlock(sk);
2754
2755                 if (check_creds) {
2756                         /* Never glue messages from different writers */
2757                         if (!unix_skb_scm_eq(skb, &scm))
2758                                 break;
2759                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2760                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2761                         /* Copy credentials */
2762                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2763                         unix_set_secdata(&scm, skb);
2764                         check_creds = true;
2765                 }
2766
2767                 /* Copy address just once */
2768                 if (state->msg && state->msg->msg_name) {
2769                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2770                                          state->msg->msg_name);
2771                         unix_copy_addr(state->msg, skb->sk);
2772
2773                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2774                                                               state->msg->msg_name,
2775                                                               &state->msg->msg_namelen);
2776
2777                         sunaddr = NULL;
2778                 }
2779
2780                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2781                 skb_get(skb);
2782                 chunk = state->recv_actor(skb, skip, chunk, state);
2783                 drop_skb = !unix_skb_len(skb);
2784                 /* skb is only safe to use if !drop_skb */
2785                 consume_skb(skb);
2786                 if (chunk < 0) {
2787                         if (copied == 0)
2788                                 copied = -EFAULT;
2789                         break;
2790                 }
2791                 copied += chunk;
2792                 size -= chunk;
2793
2794                 if (drop_skb) {
2795                         /* the skb was touched by a concurrent reader;
2796                          * we should not expect anything from this skb
2797                          * anymore and assume it invalid - we can be
2798                          * sure it was dropped from the socket queue
2799                          *
2800                          * let's report a short read
2801                          */
2802                         err = 0;
2803                         break;
2804                 }
2805
2806                 /* Mark read part of skb as used */
2807                 if (!(flags & MSG_PEEK)) {
2808                         UNIXCB(skb).consumed += chunk;
2809
2810                         sk_peek_offset_bwd(sk, chunk);
2811
2812                         if (UNIXCB(skb).fp) {
2813                                 scm_stat_del(sk, skb);
2814                                 unix_detach_fds(&scm, skb);
2815                         }
2816
2817                         if (unix_skb_len(skb))
2818                                 break;
2819
2820                         skb_unlink(skb, &sk->sk_receive_queue);
2821                         consume_skb(skb);
2822
2823                         if (scm.fp)
2824                                 break;
2825                 } else {
2826                         /* It is questionable, see note in unix_dgram_recvmsg.
2827                          */
2828                         if (UNIXCB(skb).fp)
2829                                 unix_peek_fds(&scm, skb);
2830
2831                         sk_peek_offset_fwd(sk, chunk);
2832
2833                         if (UNIXCB(skb).fp)
2834                                 break;
2835
2836                         skip = 0;
2837                         last = skb;
2838                         last_len = skb->len;
2839                         unix_state_lock(sk);
2840                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2841                         if (skb)
2842                                 goto again;
2843                         unix_state_unlock(sk);
2844                         break;
2845                 }
2846         } while (size);
2847
2848         mutex_unlock(&u->iolock);
2849         if (state->msg)
2850                 scm_recv_unix(sock, state->msg, &scm, flags);
2851         else
2852                 scm_destroy(&scm);
2853 out:
2854         return copied ? : err;
2855 }
2856
2857 static int unix_stream_read_actor(struct sk_buff *skb,
2858                                   int skip, int chunk,
2859                                   struct unix_stream_read_state *state)
2860 {
2861         int ret;
2862
2863         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2864                                     state->msg, chunk);
2865         return ret ?: chunk;
2866 }
2867
2868 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2869                           size_t size, int flags)
2870 {
2871         struct unix_stream_read_state state = {
2872                 .recv_actor = unix_stream_read_actor,
2873                 .socket = sk->sk_socket,
2874                 .msg = msg,
2875                 .size = size,
2876                 .flags = flags
2877         };
2878
2879         return unix_stream_read_generic(&state, true);
2880 }
2881
2882 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2883                                size_t size, int flags)
2884 {
2885         struct unix_stream_read_state state = {
2886                 .recv_actor = unix_stream_read_actor,
2887                 .socket = sock,
2888                 .msg = msg,
2889                 .size = size,
2890                 .flags = flags
2891         };
2892
2893 #ifdef CONFIG_BPF_SYSCALL
2894         struct sock *sk = sock->sk;
2895         const struct proto *prot = READ_ONCE(sk->sk_prot);
2896
2897         if (prot != &unix_stream_proto)
2898                 return prot->recvmsg(sk, msg, size, flags, NULL);
2899 #endif
2900         return unix_stream_read_generic(&state, true);
2901 }
2902
2903 static int unix_stream_splice_actor(struct sk_buff *skb,
2904                                     int skip, int chunk,
2905                                     struct unix_stream_read_state *state)
2906 {
2907         return skb_splice_bits(skb, state->socket->sk,
2908                                UNIXCB(skb).consumed + skip,
2909                                state->pipe, chunk, state->splice_flags);
2910 }
2911
2912 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2913                                        struct pipe_inode_info *pipe,
2914                                        size_t size, unsigned int flags)
2915 {
2916         struct unix_stream_read_state state = {
2917                 .recv_actor = unix_stream_splice_actor,
2918                 .socket = sock,
2919                 .pipe = pipe,
2920                 .size = size,
2921                 .splice_flags = flags,
2922         };
2923
2924         if (unlikely(*ppos))
2925                 return -ESPIPE;
2926
2927         if (sock->file->f_flags & O_NONBLOCK ||
2928             flags & SPLICE_F_NONBLOCK)
2929                 state.flags = MSG_DONTWAIT;
2930
2931         return unix_stream_read_generic(&state, false);
2932 }
2933
2934 static int unix_shutdown(struct socket *sock, int mode)
2935 {
2936         struct sock *sk = sock->sk;
2937         struct sock *other;
2938
2939         if (mode < SHUT_RD || mode > SHUT_RDWR)
2940                 return -EINVAL;
2941         /* This maps:
2942          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2943          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2944          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2945          */
2946         ++mode;
2947
2948         unix_state_lock(sk);
2949         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2950         other = unix_peer(sk);
2951         if (other)
2952                 sock_hold(other);
2953         unix_state_unlock(sk);
2954         sk->sk_state_change(sk);
2955
2956         if (other &&
2957                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2958
2959                 int peer_mode = 0;
2960                 const struct proto *prot = READ_ONCE(other->sk_prot);
2961
2962                 if (prot->unhash)
2963                         prot->unhash(other);
2964                 if (mode&RCV_SHUTDOWN)
2965                         peer_mode |= SEND_SHUTDOWN;
2966                 if (mode&SEND_SHUTDOWN)
2967                         peer_mode |= RCV_SHUTDOWN;
2968                 unix_state_lock(other);
2969                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2970                 unix_state_unlock(other);
2971                 other->sk_state_change(other);
2972                 if (peer_mode == SHUTDOWN_MASK)
2973                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2974                 else if (peer_mode & RCV_SHUTDOWN)
2975                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2976         }
2977         if (other)
2978                 sock_put(other);
2979
2980         return 0;
2981 }
2982
2983 long unix_inq_len(struct sock *sk)
2984 {
2985         struct sk_buff *skb;
2986         long amount = 0;
2987
2988         if (sk->sk_state == TCP_LISTEN)
2989                 return -EINVAL;
2990
2991         spin_lock(&sk->sk_receive_queue.lock);
2992         if (sk->sk_type == SOCK_STREAM ||
2993             sk->sk_type == SOCK_SEQPACKET) {
2994                 skb_queue_walk(&sk->sk_receive_queue, skb)
2995                         amount += unix_skb_len(skb);
2996         } else {
2997                 skb = skb_peek(&sk->sk_receive_queue);
2998                 if (skb)
2999                         amount = skb->len;
3000         }
3001         spin_unlock(&sk->sk_receive_queue.lock);
3002
3003         return amount;
3004 }
3005 EXPORT_SYMBOL_GPL(unix_inq_len);
3006
/* Value reported for SIOCOUTQ: whatever sk_wmem_alloc_get() returns for @sk. */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3012
3013 static int unix_open_file(struct sock *sk)
3014 {
3015         struct path path;
3016         struct file *f;
3017         int fd;
3018
3019         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3020                 return -EPERM;
3021
3022         if (!smp_load_acquire(&unix_sk(sk)->addr))
3023                 return -ENOENT;
3024
3025         path = unix_sk(sk)->path;
3026         if (!path.dentry)
3027                 return -ENOENT;
3028
3029         path_get(&path);
3030
3031         fd = get_unused_fd_flags(O_CLOEXEC);
3032         if (fd < 0)
3033                 goto out;
3034
3035         f = dentry_open(&path, O_PATH, current_cred());
3036         if (IS_ERR(f)) {
3037                 put_unused_fd(fd);
3038                 fd = PTR_ERR(f);
3039                 goto out;
3040         }
3041
3042         fd_install(fd, f);
3043 out:
3044         path_put(&path);
3045
3046         return fd;
3047 }
3048
3049 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3050 {
3051         struct sock *sk = sock->sk;
3052         long amount = 0;
3053         int err;
3054
3055         switch (cmd) {
3056         case SIOCOUTQ:
3057                 amount = unix_outq_len(sk);
3058                 err = put_user(amount, (int __user *)arg);
3059                 break;
3060         case SIOCINQ:
3061                 amount = unix_inq_len(sk);
3062                 if (amount < 0)
3063                         err = amount;
3064                 else
3065                         err = put_user(amount, (int __user *)arg);
3066                 break;
3067         case SIOCUNIXFILE:
3068                 err = unix_open_file(sk);
3069                 break;
3070 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3071         case SIOCATMARK:
3072                 {
3073                         struct sk_buff *skb;
3074                         int answ = 0;
3075
3076                         skb = skb_peek(&sk->sk_receive_queue);
3077                         if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3078                                 answ = 1;
3079                         err = put_user(answ, (int __user *)arg);
3080                 }
3081                 break;
3082 #endif
3083         default:
3084                 err = -ENOIOCTLCMD;
3085                 break;
3086         }
3087         return err;
3088 }
3089
3090 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the compat pointer in @arg, then reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
3095 #endif
3096
3097 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3098 {
3099         struct sock *sk = sock->sk;
3100         __poll_t mask;
3101         u8 shutdown;
3102
3103         sock_poll_wait(file, sock, wait);
3104         mask = 0;
3105         shutdown = READ_ONCE(sk->sk_shutdown);
3106
3107         /* exceptional events? */
3108         if (READ_ONCE(sk->sk_err))
3109                 mask |= EPOLLERR;
3110         if (shutdown == SHUTDOWN_MASK)
3111                 mask |= EPOLLHUP;
3112         if (shutdown & RCV_SHUTDOWN)
3113                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3114
3115         /* readable? */
3116         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3117                 mask |= EPOLLIN | EPOLLRDNORM;
3118         if (sk_is_readable(sk))
3119                 mask |= EPOLLIN | EPOLLRDNORM;
3120 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3121         if (READ_ONCE(unix_sk(sk)->oob_skb))
3122                 mask |= EPOLLPRI;
3123 #endif
3124
3125         /* Connection-based need to check for termination and startup */
3126         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3127             sk->sk_state == TCP_CLOSE)
3128                 mask |= EPOLLHUP;
3129
3130         /*
3131          * we set writable also when the other side has shut down the
3132          * connection. This prevents stuck sockets.
3133          */
3134         if (unix_writable(sk))
3135                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3136
3137         return mask;
3138 }
3139
3140 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3141                                     poll_table *wait)
3142 {
3143         struct sock *sk = sock->sk, *other;
3144         unsigned int writable;
3145         __poll_t mask;
3146         u8 shutdown;
3147
3148         sock_poll_wait(file, sock, wait);
3149         mask = 0;
3150         shutdown = READ_ONCE(sk->sk_shutdown);
3151
3152         /* exceptional events? */
3153         if (READ_ONCE(sk->sk_err) ||
3154             !skb_queue_empty_lockless(&sk->sk_error_queue))
3155                 mask |= EPOLLERR |
3156                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3157
3158         if (shutdown & RCV_SHUTDOWN)
3159                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3160         if (shutdown == SHUTDOWN_MASK)
3161                 mask |= EPOLLHUP;
3162
3163         /* readable? */
3164         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3165                 mask |= EPOLLIN | EPOLLRDNORM;
3166         if (sk_is_readable(sk))
3167                 mask |= EPOLLIN | EPOLLRDNORM;
3168
3169         /* Connection-based need to check for termination and startup */
3170         if (sk->sk_type == SOCK_SEQPACKET) {
3171                 if (sk->sk_state == TCP_CLOSE)
3172                         mask |= EPOLLHUP;
3173                 /* connection hasn't started yet? */
3174                 if (sk->sk_state == TCP_SYN_SENT)
3175                         return mask;
3176         }
3177
3178         /* No write status requested, avoid expensive OUT tests. */
3179         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3180                 return mask;
3181
3182         writable = unix_writable(sk);
3183         if (writable) {
3184                 unix_state_lock(sk);
3185
3186                 other = unix_peer(sk);
3187                 if (other && unix_peer(other) != sk &&
3188                     unix_recvq_full_lockless(other) &&
3189                     unix_dgram_peer_wake_me(sk, other))
3190                         writable = 0;
3191
3192                 unix_state_unlock(sk);
3193         }
3194
3195         if (writable)
3196                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3197         else
3198                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3199
3200         return mask;
3201 }
3202
3203 #ifdef CONFIG_PROC_FS
3204
/*
 * A seq_file position packs a hash bucket index and an offset within that
 * bucket into one value: the bucket lives in the bits at and above
 * BUCKET_SPACE, the offset in the bits below it.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset from a packed position, and
 * build a packed position from the two parts.
 */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3210
3211 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3212 {
3213         unsigned long offset = get_offset(*pos);
3214         unsigned long bucket = get_bucket(*pos);
3215         unsigned long count = 0;
3216         struct sock *sk;
3217
3218         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3219              sk; sk = sk_next(sk)) {
3220                 if (++count == offset)
3221                         break;
3222         }
3223
3224         return sk;
3225 }
3226
3227 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3228 {
3229         unsigned long bucket = get_bucket(*pos);
3230         struct net *net = seq_file_net(seq);
3231         struct sock *sk;
3232
3233         while (bucket < UNIX_HASH_SIZE) {
3234                 spin_lock(&net->unx.table.locks[bucket]);
3235
3236                 sk = unix_from_bucket(seq, pos);
3237                 if (sk)
3238                         return sk;
3239
3240                 spin_unlock(&net->unx.table.locks[bucket]);
3241
3242                 *pos = set_bucket_offset(++bucket, 1);
3243         }
3244
3245         return NULL;
3246 }
3247
3248 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3249                                   loff_t *pos)
3250 {
3251         unsigned long bucket = get_bucket(*pos);
3252
3253         sk = sk_next(sk);
3254         if (sk)
3255                 return sk;
3256
3257
3258         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3259
3260         *pos = set_bucket_offset(++bucket, 1);
3261
3262         return unix_get_first(seq, pos);
3263 }
3264
3265 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3266 {
3267         if (!*pos)
3268                 return SEQ_START_TOKEN;
3269
3270         return unix_get_first(seq, pos);
3271 }
3272
3273 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3274 {
3275         ++*pos;
3276
3277         if (v == SEQ_START_TOKEN)
3278                 return unix_get_first(seq, pos);
3279
3280         return unix_get_next(seq, v, pos);
3281 }
3282
3283 static void unix_seq_stop(struct seq_file *seq, void *v)
3284 {
3285         struct sock *sk = v;
3286
3287         if (sk)
3288                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3289 }
3290
3291 static int unix_seq_show(struct seq_file *seq, void *v)
3292 {
3293
3294         if (v == SEQ_START_TOKEN)
3295                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3296                          "Inode Path\n");
3297         else {
3298                 struct sock *s = v;
3299                 struct unix_sock *u = unix_sk(s);
3300                 unix_state_lock(s);
3301
3302                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3303                         s,
3304                         refcount_read(&s->sk_refcnt),
3305                         0,
3306                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3307                         s->sk_type,
3308                         s->sk_socket ?
3309                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3310                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3311                         sock_i_ino(s));
3312
3313                 if (u->addr) {  // under a hash table lock here
3314                         int i, len;
3315                         seq_putc(seq, ' ');
3316
3317                         i = 0;
3318                         len = u->addr->len -
3319                                 offsetof(struct sockaddr_un, sun_path);
3320                         if (u->addr->name->sun_path[0]) {
3321                                 len--;
3322                         } else {
3323                                 seq_putc(seq, '@');
3324                                 i++;
3325                         }
3326                         for ( ; i < len; i++)
3327                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3328                                          '@');
3329                 }
3330                 unix_state_unlock(s);
3331                 seq_putc(seq, '\n');
3332         }
3333
3334         return 0;
3335 }
3336
/* seq_file callbacks built from the unix_seq_*() functions above. */
static const struct seq_operations unix_seq_ops = {
        .start  = unix_seq_start,
        .next   = unix_seq_next,
        .stop   = unix_seq_stop,
        .show   = unix_seq_show,
};
3343
3344 #ifdef CONFIG_BPF_SYSCALL
/*
 * Private seq_file state of the BPF unix socket iterator. Sockets of one
 * hash bucket are collected ("batched") with a reference held on each so
 * they can be shown after the bucket lock has been dropped.
 */
struct bpf_unix_iter_state {
        struct seq_net_private p;
        /* Index into batch[] of the socket currently being shown. */
        unsigned int cur_sk;
        /* Number of valid (referenced) entries in batch[]. */
        unsigned int end_sk;
        /* Capacity of batch[]. */
        unsigned int max_sk;
        struct sock **batch;
        /* Set when the batch captured every socket from its starting point
         * to the end of the bucket, so the next batch starts at the
         * following bucket.
         */
        bool st_bucket_done;
};
3353
/* Context structure passed to the BPF program by unix_prog_seq_show(). */
struct bpf_iter__unix {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        /* Socket being visited; NULL on the call made from seq_stop. */
        __bpf_md_ptr(struct unix_sock *, unix_sk);
        uid_t uid __aligned(8);
};
3359
3360 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3361                               struct unix_sock *unix_sk, uid_t uid)
3362 {
3363         struct bpf_iter__unix ctx;
3364
3365         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3366         ctx.meta = meta;
3367         ctx.unix_sk = unix_sk;
3368         ctx.uid = uid;
3369         return bpf_iter_run_prog(prog, &ctx);
3370 }
3371
3372 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3373
3374 {
3375         struct bpf_unix_iter_state *iter = seq->private;
3376         unsigned int expected = 1;
3377         struct sock *sk;
3378
3379         sock_hold(start_sk);
3380         iter->batch[iter->end_sk++] = start_sk;
3381
3382         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3383                 if (iter->end_sk < iter->max_sk) {
3384                         sock_hold(sk);
3385                         iter->batch[iter->end_sk++] = sk;
3386                 }
3387
3388                 expected++;
3389         }
3390
3391         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3392
3393         return expected;
3394 }
3395
3396 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3397 {
3398         while (iter->cur_sk < iter->end_sk)
3399                 sock_put(iter->batch[iter->cur_sk++]);
3400 }
3401
3402 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3403                                        unsigned int new_batch_sz)
3404 {
3405         struct sock **new_batch;
3406
3407         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3408                              GFP_USER | __GFP_NOWARN);
3409         if (!new_batch)
3410                 return -ENOMEM;
3411
3412         bpf_iter_unix_put_batch(iter);
3413         kvfree(iter->batch);
3414         iter->batch = new_batch;
3415         iter->max_sk = new_batch_sz;
3416
3417         return 0;
3418 }
3419
3420 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3421                                         loff_t *pos)
3422 {
3423         struct bpf_unix_iter_state *iter = seq->private;
3424         unsigned int expected;
3425         bool resized = false;
3426         struct sock *sk;
3427
3428         if (iter->st_bucket_done)
3429                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3430
3431 again:
3432         /* Get a new batch */
3433         iter->cur_sk = 0;
3434         iter->end_sk = 0;
3435
3436         sk = unix_get_first(seq, pos);
3437         if (!sk)
3438                 return NULL; /* Done */
3439
3440         expected = bpf_iter_unix_hold_batch(seq, sk);
3441
3442         if (iter->end_sk == expected) {
3443                 iter->st_bucket_done = true;
3444                 return sk;
3445         }
3446
3447         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3448                 resized = true;
3449                 goto again;
3450         }
3451
3452         return sk;
3453 }
3454
3455 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3456 {
3457         if (!*pos)
3458                 return SEQ_START_TOKEN;
3459
3460         /* bpf iter does not support lseek, so it always
3461          * continue from where it was stop()-ped.
3462          */
3463         return bpf_iter_unix_batch(seq, pos);
3464 }
3465
3466 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3467 {
3468         struct bpf_unix_iter_state *iter = seq->private;
3469         struct sock *sk;
3470
3471         /* Whenever seq_next() is called, the iter->cur_sk is
3472          * done with seq_show(), so advance to the next sk in
3473          * the batch.
3474          */
3475         if (iter->cur_sk < iter->end_sk)
3476                 sock_put(iter->batch[iter->cur_sk++]);
3477
3478         ++*pos;
3479
3480         if (iter->cur_sk < iter->end_sk)
3481                 sk = iter->batch[iter->cur_sk];
3482         else
3483                 sk = bpf_iter_unix_batch(seq, pos);
3484
3485         return sk;
3486 }
3487
3488 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3489 {
3490         struct bpf_iter_meta meta;
3491         struct bpf_prog *prog;
3492         struct sock *sk = v;
3493         uid_t uid;
3494         bool slow;
3495         int ret;
3496
3497         if (v == SEQ_START_TOKEN)
3498                 return 0;
3499
3500         slow = lock_sock_fast(sk);
3501
3502         if (unlikely(sk_unhashed(sk))) {
3503                 ret = SEQ_SKIP;
3504                 goto unlock;
3505         }
3506
3507         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3508         meta.seq = seq;
3509         prog = bpf_iter_get_info(&meta, false);
3510         ret = unix_prog_seq_show(prog, &meta, v, uid);
3511 unlock:
3512         unlock_sock_fast(sk, slow);
3513         return ret;
3514 }
3515
3516 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3517 {
3518         struct bpf_unix_iter_state *iter = seq->private;
3519         struct bpf_iter_meta meta;
3520         struct bpf_prog *prog;
3521
3522         if (!v) {
3523                 meta.seq = seq;
3524                 prog = bpf_iter_get_info(&meta, true);
3525                 if (prog)
3526                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3527         }
3528
3529         if (iter->cur_sk < iter->end_sk)
3530                 bpf_iter_unix_put_batch(iter);
3531 }
3532
3533 static const struct seq_operations bpf_iter_unix_seq_ops = {
3534         .start  = bpf_iter_unix_seq_start,
3535         .next   = bpf_iter_unix_seq_next,
3536         .stop   = bpf_iter_unix_seq_stop,
3537         .show   = bpf_iter_unix_seq_show,
3538 };
3539 #endif
3540 #endif
3541
3542 static const struct net_proto_family unix_family_ops = {
3543         .family = PF_UNIX,
3544         .create = unix_create,
3545         .owner  = THIS_MODULE,
3546 };
3547
3548
3549 static int __net_init unix_net_init(struct net *net)
3550 {
3551         int i;
3552
3553         net->unx.sysctl_max_dgram_qlen = 10;
3554         if (unix_sysctl_register(net))
3555                 goto out;
3556
3557 #ifdef CONFIG_PROC_FS
3558         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3559                              sizeof(struct seq_net_private)))
3560                 goto err_sysctl;
3561 #endif
3562
3563         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3564                                               sizeof(spinlock_t), GFP_KERNEL);
3565         if (!net->unx.table.locks)
3566                 goto err_proc;
3567
3568         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3569                                                 sizeof(struct hlist_head),
3570                                                 GFP_KERNEL);
3571         if (!net->unx.table.buckets)
3572                 goto free_locks;
3573
3574         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3575                 spin_lock_init(&net->unx.table.locks[i]);
3576                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3577         }
3578
3579         return 0;
3580
3581 free_locks:
3582         kvfree(net->unx.table.locks);
3583 err_proc:
3584 #ifdef CONFIG_PROC_FS
3585         remove_proc_entry("unix", net->proc_net);
3586 err_sysctl:
3587 #endif
3588         unix_sysctl_unregister(net);
3589 out:
3590         return -ENOMEM;
3591 }
3592
/*
 * Per-network-namespace teardown for AF_UNIX: releases what
 * unix_net_init() set up for @net (hash buckets and locks, the sysctl
 * registration and the "unix" proc entry).
 */
static void __net_exit unix_net_exit(struct net *net)
{
        /* Free the per-netns hash table arrays. */
        kvfree(net->unx.table.buckets);
        kvfree(net->unx.table.locks);
        unix_sysctl_unregister(net);
        /* Called unconditionally, unlike the CONFIG_PROC_FS-guarded
         * creation in unix_net_init().
         */
        remove_proc_entry("unix", net->proc_net);
}
3600
3601 static struct pernet_operations unix_net_ops = {
3602         .init = unix_net_init,
3603         .exit = unix_net_exit,
3604 };
3605
3606 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "unix" bpf iterator with its program arguments: the iterator
 * meta data, the unix socket being visited and the uid passed by the
 * seq_show path.
 * NOTE(review): presumably this expands to the bpf_iter_unix() stub the
 * programs attach to - confirm against the DEFINE_BPF_ITER_FUNC definition.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
                     struct unix_sock *unix_sk, uid_t uid)
3609
/* Initial number of socket slots in the iterator batch array; passed to
 * bpf_iter_unix_realloc_batch() by bpf_iter_init_unix().
 */
#define INIT_BATCH_SZ 16
3611
3612 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3613 {
3614         struct bpf_unix_iter_state *iter = priv_data;
3615         int err;
3616
3617         err = bpf_iter_init_seq_net(priv_data, aux);
3618         if (err)
3619                 return err;
3620
3621         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3622         if (err) {
3623                 bpf_iter_fini_seq_net(priv_data);
3624                 return err;
3625         }
3626
3627         return 0;
3628 }
3629
3630 static void bpf_iter_fini_unix(void *priv_data)
3631 {
3632         struct bpf_unix_iter_state *iter = priv_data;
3633
3634         bpf_iter_fini_seq_net(priv_data);
3635         kvfree(iter->batch);
3636 }
3637
3638 static const struct bpf_iter_seq_info unix_seq_info = {
3639         .seq_ops                = &bpf_iter_unix_seq_ops,
3640         .init_seq_private       = bpf_iter_init_unix,
3641         .fini_seq_private       = bpf_iter_fini_unix,
3642         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3643 };
3644
3645 static const struct bpf_func_proto *
3646 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3647                              const struct bpf_prog *prog)
3648 {
3649         switch (func_id) {
3650         case BPF_FUNC_setsockopt:
3651                 return &bpf_sk_setsockopt_proto;
3652         case BPF_FUNC_getsockopt:
3653                 return &bpf_sk_getsockopt_proto;
3654         default:
3655                 return NULL;
3656         }
3657 }
3658
3659 static struct bpf_iter_reg unix_reg_info = {
3660         .target                 = "unix",
3661         .ctx_arg_info_size      = 1,
3662         .ctx_arg_info           = {
3663                 { offsetof(struct bpf_iter__unix, unix_sk),
3664                   PTR_TO_BTF_ID_OR_NULL },
3665         },
3666         .get_func_proto         = bpf_iter_unix_get_func_proto,
3667         .seq_info               = &unix_seq_info,
3668 };
3669
3670 static void __init bpf_iter_register(void)
3671 {
3672         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3673         if (bpf_iter_reg_target(&unix_reg_info))
3674                 pr_warn("Warning: could not register bpf iterator unix\n");
3675 }
3676 #endif
3677
3678 static int __init af_unix_init(void)
3679 {
3680         int i, rc = -1;
3681
3682         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3683
3684         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3685                 spin_lock_init(&bsd_socket_locks[i]);
3686                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3687         }
3688
3689         rc = proto_register(&unix_dgram_proto, 1);
3690         if (rc != 0) {
3691                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3692                 goto out;
3693         }
3694
3695         rc = proto_register(&unix_stream_proto, 1);
3696         if (rc != 0) {
3697                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3698                 proto_unregister(&unix_dgram_proto);
3699                 goto out;
3700         }
3701
3702         sock_register(&unix_family_ops);
3703         register_pernet_subsys(&unix_net_ops);
3704         unix_bpf_build_proto();
3705
3706 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3707         bpf_iter_register();
3708 #endif
3709
3710 out:
3711         return rc;
3712 }
3713
/* Run af_unix_init() at fs_initcall level: later than subsys_initcall()
 * because we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);