net/unix/af_unix.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * NET4:        Implementation of BSD Unix domain sockets.
   4  *
   5  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   6  *
   7  * Fixes:
   8  *              Linus Torvalds  :       Assorted bug cures.
   9  *              Niibe Yutaka    :       async I/O support.
  10  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  11  *              Alan Cox        :       Limit size of allocated blocks.
  12  *              Alan Cox        :       Fixed the stupid socketpair bug.
  13  *              Alan Cox        :       BSD compatibility fine tuning.
  14  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  15  *              Alan Cox        :       Sorted out a proper draft version of
  16  *                                      file descriptor passing hacked up from
  17  *                                      Mike Shaver's work.
  18  *              Marty Leisner   :       Fixes to fd passing
  19  *              Nick Nevin      :       recvmsg bugfix.
  20  *              Alan Cox        :       Started proper garbage collector
  21  *              Heiko EiBfeldt  :       Missing verify_area check
  22  *              Alan Cox        :       Started POSIXisms
  23  *              Andreas Schwab  :       Replace inode by dentry for proper
  24  *                                      reference counting
  25  *              Kirk Petersen   :       Made this a module
  26  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  27  *                                      Lots of bug fixes.
  28  *           Alexey Kuznetosv   :       Repaired (I hope) bugs introduces
  29  *                                      by above two patches.
  30  *           Andrea Arcangeli   :       If possible we block in connect(2)
  31  *                                      if the max backlog of the listen socket
  32  *                                      is been reached. This won't break
  33  *                                      old apps and it will avoid huge amount
  34  *                                      of socks hashed (this for unix_gc()
  35  *                                      performances reasons).
  36  *                                      Security fix that limits the max
  37  *                                      number of socks to 2*max_files and
  38  *                                      the number of skb queueable in the
  39  *                                      dgram receiver.
  40  *              Artur Skawina   :       Hash function optimizations
  41  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  42  *            Malcolm Beattie   :       Set peercred for socketpair
  43  *           Michal Ostrowski   :       Module initialization cleanup.
  44  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  45  *                                      the core infrastructure is doing that
  46  *                                      for all net proto families now (2.5.69+)
  47  *
  48  * Known differences from reference BSD that was tested:
  49  *
  50  *      [TO FIX]
  51  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  52  *              other the moment one end closes.
  53  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  54  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  55  *      [NOT TO FIX]
  56  *      accept() returns a path name even if the connecting socket has closed
  57  *              in the meantime (BSD loses the path and gives up).
  58  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  59  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  60  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  61  *      BSD af_unix apparently has connect forgetting to block properly.
  62  *              (need to check this with the POSIX spec in detail)
  63  *
  64  * Differences from 2.0.0-11-... (ANK)
  65  *      Bug fixes and improvements.
  66  *              - client shutdown killed server socket.
  67  *              - removed all useless cli/sti pairs.
  68  *
  69  *      Semantic changes/extensions.
  70  *              - generic control message passing.
  71  *              - SCM_CREDENTIALS control message.
  72  *              - "Abstract" (not FS based) socket bindings.
  73  *                Abstract names are sequences of bytes (not zero terminated)
  74  *                started by 0, so that this name space does not intersect
  75  *                with BSD names.
  76  */
  77
  78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  79
  80 #include <linux/module.h>
  81 #include <linux/kernel.h>
  82 #include <linux/signal.h>
  83 #include <linux/sched/signal.h>
  84 #include <linux/errno.h>
  85 #include <linux/string.h>
  86 #include <linux/stat.h>
  87 #include <linux/dcache.h>
  88 #include <linux/namei.h>
  89 #include <linux/socket.h>
  90 #include <linux/un.h>
  91 #include <linux/fcntl.h>
  92 #include <linux/filter.h>
  93 #include <linux/termios.h>
  94 #include <linux/sockios.h>
  95 #include <linux/net.h>
  96 #include <linux/in.h>
  97 #include <linux/fs.h>
  98 #include <linux/slab.h>
  99 #include <linux/uaccess.h>
 100 #include <linux/skbuff.h>
 101 #include <linux/netdevice.h>
 102 #include <net/net_namespace.h>
 103 #include <net/sock.h>
 104 #include <net/tcp_states.h>
 105 #include <net/af_unix.h>
 106 #include <linux/proc_fs.h>
 107 #include <linux/seq_file.h>
 108 #include <net/scm.h>
 109 #include <linux/init.h>
 110 #include <linux/poll.h>
 111 #include <linux/rtnetlink.h>
 112 #include <linux/mount.h>
 113 #include <net/checksum.h>
 114 #include <linux/security.h>
 115 #include <linux/splice.h>
 116 #include <linux/freezer.h>
 117 #include <linux/file.h>
 118 #include <linux/btf_ids.h>
 119 #include <linux/bpf-cgroup.h>
 120
 121 static atomic_long_t unix_nr_socks;
 122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
 123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
 124
 125 /* SMP locking strategy:
 126  *    hash table is protected with spinlock.
 127  *    each socket state is protected by separate spinlock.
 128  */
 129 #ifdef CONFIG_PROVE_LOCKING
 130 #define cmp_ptr(l, r)   (((l) > (r)) - ((l) < (r)))
 131
 132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
 133                                   const struct lockdep_map *b)
 134 {
 135         return cmp_ptr(a, b);
 136 }
 137
 138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
 139                                   const struct lockdep_map *_b)
 140 {
 141         const struct unix_sock *a, *b;
 142
 143         a = container_of(_a, struct unix_sock, lock.dep_map);
 144         b = container_of(_b, struct unix_sock, lock.dep_map);
 145
 146         if (a->sk.sk_state == TCP_LISTEN) {
 147                 /* unix_stream_connect(): Before the 2nd unix_state_lock(),
 148                  *
 149                  *   1. a is TCP_LISTEN.
 150                  *   2. b is not a.
 151                  *   3. concurrent connect(b -> a) must fail.
 152                  *
 153                  * Except for 2. & 3., the b's state can be any possible
 154                  * value due to concurrent connect() or listen().
 155                  *
 156                  * 2. is detected in debug_spin_lock_before(), and 3. cannot
 157                  * be expressed as lock_cmp_fn.
 158                  */
 159                 switch (b->sk.sk_state) {
 160                 case TCP_CLOSE:
 161                 case TCP_ESTABLISHED:
 162                 case TCP_LISTEN:
 163                         return -1;
 164                 default:
 165                         /* Invalid case. */
 166                         return 0;
 167                 }
 168         }
 169
 170         /* Should never happen.  Just to be symmetric. */
 171         if (b->sk.sk_state == TCP_LISTEN) {
 172                 switch (b->sk.sk_state) {
 173                 case TCP_CLOSE:
 174                 case TCP_ESTABLISHED:
 175                         return 1;
 176                 default:
 177                         return 0;
 178                 }
 179         }
 180
 181         /* unix_state_double_lock(): ascending address order. */
 182         return cmp_ptr(a, b);
 183 }
 184
 185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
 186                                   const struct lockdep_map *_b)
 187 {
 188         const struct sock *a, *b;
 189
 190         a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
 191         b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
 192
 193         /* unix_collect_skb(): listener -> embryo order. */
 194         if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
 195                 return -1;
 196
 197         /* Should never happen.  Just to be symmetric. */
 198         if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
 199                 return 1;
 200
 201         return 0;
 202 }
 203 #endif
 204
 205 static unsigned int unix_unbound_hash(struct sock *sk)
 206 {
 207         unsigned long hash = (unsigned long)sk;
 208
 209         hash ^= hash >> 16;
 210         hash ^= hash >> 8;
 211         hash ^= sk->sk_type;
 212
 213         return hash & UNIX_HASH_MOD;
 214 }
 215
 216 static unsigned int unix_bsd_hash(struct inode *i)
 217 {
 218         return i->i_ino & UNIX_HASH_MOD;
 219 }
 220
 221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 222                                        int addr_len, int type)
 223 {
 224         __wsum csum = csum_partial(sunaddr, addr_len, 0);
 225         unsigned int hash;
 226
 227         hash = (__force unsigned int)csum_fold(csum);
 228         hash ^= hash >> 8;
 229         hash ^= type;
 230
 231         return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 232 }
 233
 234 static void unix_table_double_lock(struct net *net,
 235                                    unsigned int hash1, unsigned int hash2)
 236 {
 237         if (hash1 == hash2) {
 238                 spin_lock(&net->unx.table.locks[hash1]);
 239                 return;
 240         }
 241
 242         if (hash1 > hash2)
 243                 swap(hash1, hash2);
 244
 245         spin_lock(&net->unx.table.locks[hash1]);
 246         spin_lock(&net->unx.table.locks[hash2]);
 247 }
 248
 249 static void unix_table_double_unlock(struct net *net,
 250                                      unsigned int hash1, unsigned int hash2)
 251 {
 252         if (hash1 == hash2) {
 253                 spin_unlock(&net->unx.table.locks[hash1]);
 254                 return;
 255         }
 256
 257         spin_unlock(&net->unx.table.locks[hash1]);
 258         spin_unlock(&net->unx.table.locks[hash2]);
 259 }
 260
 261 #ifdef CONFIG_SECURITY_NETWORK
 262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 263 {
 264         UNIXCB(skb).secid = scm->secid;
 265 }
 266
 267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 268 {
 269         scm->secid = UNIXCB(skb).secid;
 270 }
 271
 272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 273 {
 274         return (scm->secid == UNIXCB(skb).secid);
 275 }
 276 #else
 277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 278 { }
 279
 280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 281 { }
 282
 283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
 284 {
 285         return true;
 286 }
 287 #endif /* CONFIG_SECURITY_NETWORK */
 288
 289 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 290 {
 291         return unix_peer(osk) == sk;
 292 }
 293
 294 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 295 {
 296         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 297 }
 298
 299 static inline int unix_recvq_full_lockless(const struct sock *sk)
 300 {
 301         return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 302 }
 303
 304 struct sock *unix_peer_get(struct sock *s)
 305 {
 306         struct sock *peer;
 307
 308         unix_state_lock(s);
 309         peer = unix_peer(s);
 310         if (peer)
 311                 sock_hold(peer);
 312         unix_state_unlock(s);
 313         return peer;
 314 }
 315 EXPORT_SYMBOL_GPL(unix_peer_get);
 316
 317 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
 318                                              int addr_len)
 319 {
 320         struct unix_address *addr;
 321
 322         addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
 323         if (!addr)
 324                 return NULL;
 325
 326         refcount_set(&addr->refcnt, 1);
 327         addr->len = addr_len;
 328         memcpy(addr->name, sunaddr, addr_len);
 329
 330         return addr;
 331 }
 332
 333 static inline void unix_release_addr(struct unix_address *addr)
 334 {
 335         if (refcount_dec_and_test(&addr->refcnt))
 336                 kfree(addr);
 337 }
 338
 339 /*
 340  *      Check unix socket name:
 341  *              - should be not zero length.
 342  *              - if started by not zero, should be NULL terminated (FS object)
 343  *              - if started by zero, it is abstract name.
 344  */
 345
 346 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
 347 {
 348         if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
 349             addr_len > sizeof(*sunaddr))
 350                 return -EINVAL;
 351
 352         if (sunaddr->sun_family != AF_UNIX)
 353                 return -EINVAL;
 354
 355         return 0;
 356 }
 357
 358 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
 359 {
 360         struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
 361         short offset = offsetof(struct sockaddr_storage, __data);
 362
 363         BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
 364
 365         /* This may look like an off by one error but it is a bit more
 366          * subtle.  108 is the longest valid AF_UNIX path for a binding.
 367          * sun_path[108] doesn't as such exist.  However in kernel space
 368          * we are guaranteed that it is a valid memory location in our
 369          * kernel address buffer because syscall functions always pass
 370          * a pointer of struct sockaddr_storage which has a bigger buffer
 371          * than 108.  Also, we must terminate sun_path for strlen() in
 372          * getname_kernel().
 373          */
 374         addr->__data[addr_len - offset] = 0;
 375
 376         /* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
 377          * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
 378          * know the actual buffer.
 379          */
 380         return strlen(addr->__data) + offset + 1;
 381 }
 382
 383 static void __unix_remove_socket(struct sock *sk)
 384 {
 385         sk_del_node_init(sk);
 386 }
 387
 388 static void __unix_insert_socket(struct net *net, struct sock *sk)
 389 {
 390         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 391         sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
 392 }
 393
 394 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
 395                                  struct unix_address *addr, unsigned int hash)
 396 {
 397         __unix_remove_socket(sk);
 398         smp_store_release(&unix_sk(sk)->addr, addr);
 399
 400         sk->sk_hash = hash;
 401         __unix_insert_socket(net, sk);
 402 }
 403
 404 static void unix_remove_socket(struct net *net, struct sock *sk)
 405 {
 406         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 407         __unix_remove_socket(sk);
 408         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 409 }
 410
 411 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
 412 {
 413         spin_lock(&net->unx.table.locks[sk->sk_hash]);
 414         __unix_insert_socket(net, sk);
 415         spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 416 }
 417
 418 static void unix_insert_bsd_socket(struct sock *sk)
 419 {
 420         spin_lock(&bsd_socket_locks[sk->sk_hash]);
 421         sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
 422         spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 423 }
 424
 425 static void unix_remove_bsd_socket(struct sock *sk)
 426 {
 427         if (!hlist_unhashed(&sk->sk_bind_node)) {
 428                 spin_lock(&bsd_socket_locks[sk->sk_hash]);
 429                 __sk_del_bind_node(sk);
 430                 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
 431
 432                 sk_node_init(&sk->sk_bind_node);
 433         }
 434 }
 435
 436 static struct sock *__unix_find_socket_byname(struct net *net,
 437                                               struct sockaddr_un *sunname,
 438                                               int len, unsigned int hash)
 439 {
 440         struct sock *s;
 441
 442         sk_for_each(s, &net->unx.table.buckets[hash]) {
 443                 struct unix_sock *u = unix_sk(s);
 444
 445                 if (u->addr->len == len &&
 446                     !memcmp(u->addr->name, sunname, len))
 447                         return s;
 448         }
 449         return NULL;
 450 }
 451
 452 static inline struct sock *unix_find_socket_byname(struct net *net,
 453                                                    struct sockaddr_un *sunname,
 454                                                    int len, unsigned int hash)
 455 {
 456         struct sock *s;
 457
 458         spin_lock(&net->unx.table.locks[hash]);
 459         s = __unix_find_socket_byname(net, sunname, len, hash);
 460         if (s)
 461                 sock_hold(s);
 462         spin_unlock(&net->unx.table.locks[hash]);
 463         return s;
 464 }
 465
 466 static struct sock *unix_find_socket_byinode(struct inode *i)
 467 {
 468         unsigned int hash = unix_bsd_hash(i);
 469         struct sock *s;
 470
 471         spin_lock(&bsd_socket_locks[hash]);
 472         sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
 473                 struct dentry *dentry = unix_sk(s)->path.dentry;
 474
 475                 if (dentry && d_backing_inode(dentry) == i) {
 476                         sock_hold(s);
 477                         spin_unlock(&bsd_socket_locks[hash]);
 478                         return s;
 479                 }
 480         }
 481         spin_unlock(&bsd_socket_locks[hash]);
 482         return NULL;
 483 }
 484
 485 /* Support code for asymmetrically connected dgram sockets
 486  *
 487  * If a datagram socket is connected to a socket not itself connected
 488  * to the first socket (eg, /dev/log), clients may only enqueue more
 489  * messages if the present receive queue of the server socket is not
 490  * "too large". This means there's a second writeability condition
 491  * poll and sendmsg need to test. The dgram recv code will do a wake
 492  * up on the peer_wait wait queue of a socket upon reception of a
 493  * datagram which needs to be propagated to sleeping would-be writers
 494  * since these might not have sent anything so far. This can't be
 495  * accomplished via poll_wait because the lifetime of the server
 496  * socket might be less than that of its clients if these break their
 497  * association with it or if the server socket is closed while clients
 498  * are still connected to it and there's no way to inform "a polling
 499  * implementation" that it should let go of a certain wait queue
 500  *
 501  * In order to propagate a wake up, a wait_queue_entry_t of the client
 502  * socket is enqueued on the peer_wait queue of the server socket
 503  * whose wake function does a wake_up on the ordinary client socket
 504  * wait queue. This connection is established whenever a write (or
 505  * poll for write) hit the flow control condition and broken when the
 506  * association to the server socket is dissolved or after a wake up
 507  * was relayed.
 508  */
 509
 510 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 511                                       void *key)
 512 {
 513         struct unix_sock *u;
 514         wait_queue_head_t *u_sleep;
 515
 516         u = container_of(q, struct unix_sock, peer_wake);
 517
 518         __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
 519                             q);
 520         u->peer_wake.private = NULL;
 521
 522         /* relaying can only happen while the wq still exists */
 523         u_sleep = sk_sleep(&u->sk);
 524         if (u_sleep)
 525                 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
 526
 527         return 0;
 528 }
 529
 530 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
 531 {
 532         struct unix_sock *u, *u_other;
 533         int rc;
 534
 535         u = unix_sk(sk);
 536         u_other = unix_sk(other);
 537         rc = 0;
 538         spin_lock(&u_other->peer_wait.lock);
 539
 540         if (!u->peer_wake.private) {
 541                 u->peer_wake.private = other;
 542                 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
 543
 544                 rc = 1;
 545         }
 546
 547         spin_unlock(&u_other->peer_wait.lock);
 548         return rc;
 549 }
 550
 551 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
 552                                             struct sock *other)
 553 {
 554         struct unix_sock *u, *u_other;
 555
 556         u = unix_sk(sk);
 557         u_other = unix_sk(other);
 558         spin_lock(&u_other->peer_wait.lock);
 559
 560         if (u->peer_wake.private == other) {
 561                 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
 562                 u->peer_wake.private = NULL;
 563         }
 564
 565         spin_unlock(&u_other->peer_wait.lock);
 566 }
 567
 568 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 569                                                    struct sock *other)
 570 {
 571         unix_dgram_peer_wake_disconnect(sk, other);
 572         wake_up_interruptible_poll(sk_sleep(sk),
 573                                    EPOLLOUT |
 574                                    EPOLLWRNORM |
 575                                    EPOLLWRBAND);
 576 }
 577
 578 /* preconditions:
 579  *      - unix_peer(sk) == other
 580  *      - association is stable
 581  */
 582 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
 583 {
 584         int connected;
 585
 586         connected = unix_dgram_peer_wake_connect(sk, other);
 587
 588         /* If other is SOCK_DEAD, we want to make sure we signal
 589          * POLLOUT, such that a subsequent write() can get a
 590          * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 591          * to other and its full, we will hang waiting for POLLOUT.
 592          */
 593         if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
 594                 return 1;
 595
 596         if (connected)
 597                 unix_dgram_peer_wake_disconnect(sk, other);
 598
 599         return 0;
 600 }
 601
 602 static int unix_writable(const struct sock *sk, unsigned char state)
 603 {
 604         return state != TCP_LISTEN &&
 605                 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
 606 }
 607
 608 static void unix_write_space(struct sock *sk)
 609 {
 610         struct socket_wq *wq;
 611
 612         rcu_read_lock();
 613         if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
 614                 wq = rcu_dereference(sk->sk_wq);
 615                 if (skwq_has_sleeper(wq))
 616                         wake_up_interruptible_sync_poll(&wq->wait,
 617                                 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 618                 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 619         }
 620         rcu_read_unlock();
 621 }
 622
 623 /* When dgram socket disconnects (or changes its peer), we clear its receive
 624  * queue of packets arrived from previous peer. First, it allows to do
 625  * flow control based only on wmem_alloc; second, sk connected to peer
 626  * may receive messages only from that peer. */
 627 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 628 {
 629         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 630                 skb_queue_purge(&sk->sk_receive_queue);
 631                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 632
 633                 /* If one link of bidirectional dgram pipe is disconnected,
 634                  * we signal error. Messages are lost. Do not make this,
 635                  * when peer was not connected to us.
 636                  */
 637                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 638                         WRITE_ONCE(other->sk_err, ECONNRESET);
 639                         sk_error_report(other);
 640                 }
 641         }
 642 }
 643
 644 static void unix_sock_destructor(struct sock *sk)
 645 {
 646         struct unix_sock *u = unix_sk(sk);
 647
 648         skb_queue_purge(&sk->sk_receive_queue);
 649
 650         DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 651         DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 652         DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
 653         if (!sock_flag(sk, SOCK_DEAD)) {
 654                 pr_info("Attempt to release alive unix socket: %p\n", sk);
 655                 return;
 656         }
 657
 658         if (u->addr)
 659                 unix_release_addr(u->addr);
 660
 661         atomic_long_dec(&unix_nr_socks);
 662         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 663 #ifdef UNIX_REFCNT_DEBUG
 664         pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
 665                 atomic_long_read(&unix_nr_socks));
 666 #endif
 667 }
 668
 669 static void unix_release_sock(struct sock *sk, int embrion)
 670 {
 671         struct unix_sock *u = unix_sk(sk);
 672         struct sock *skpair;
 673         struct sk_buff *skb;
 674         struct path path;
 675         int state;
 676
 677         unix_remove_socket(sock_net(sk), sk);
 678         unix_remove_bsd_socket(sk);
 679
 680         /* Clear state */
 681         unix_state_lock(sk);
 682         sock_orphan(sk);
 683         WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
 684         path         = u->path;
 685         u->path.dentry = NULL;
 686         u->path.mnt = NULL;
 687         state = sk->sk_state;
 688         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
 689
 690         skpair = unix_peer(sk);
 691         unix_peer(sk) = NULL;
 692
 693         unix_state_unlock(sk);
 694
 695 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 696         u->oob_skb = NULL;
 697 #endif
 698
 699         wake_up_interruptible_all(&u->peer_wait);
 700
 701         if (skpair != NULL) {
 702                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 703                         unix_state_lock(skpair);
 704                         /* No more writes */
 705                         WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
 706                         if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
 707                                 WRITE_ONCE(skpair->sk_err, ECONNRESET);
 708                         unix_state_unlock(skpair);
 709                         skpair->sk_state_change(skpair);
 710                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 711                 }
 712
 713                 unix_dgram_peer_wake_disconnect(sk, skpair);
 714                 sock_put(skpair); /* It may now die */
 715         }
 716
 717         /* Try to flush out this socket. Throw out buffers at least */
 718
 719         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 720                 if (state == TCP_LISTEN)
 721                         unix_release_sock(skb->sk, 1);
 722
 723                 /* passed fds are erased in the kfree_skb hook        */
 724                 kfree_skb(skb);
 725         }
 726
 727         if (path.dentry)
 728                 path_put(&path);
 729
 730         sock_put(sk);
 731
 732         /* ---- Socket is dead now and most probably destroyed ---- */
 733
 734         /*
 735          * Fixme: BSD difference: In BSD all sockets connected to us get
 736          *        ECONNRESET and we die on the spot. In Linux we behave
 737          *        like files and pipes do and wait for the last
 738          *        dereference.
 739          *
 740          * Can't we simply set sock->err?
 741          *
 742          *        What the above comment does talk about? --ANK(980817)
 743          */
 744
 745         if (READ_ONCE(unix_tot_inflight))
 746                 unix_gc();              /* Garbage collect fds */
 747 }
 748
 749 static void init_peercred(struct sock *sk)
 750 {
 751         sk->sk_peer_pid = get_pid(task_tgid(current));
 752         sk->sk_peer_cred = get_current_cred();
 753 }
 754
 755 static void update_peercred(struct sock *sk)
 756 {
 757         const struct cred *old_cred;
 758         struct pid *old_pid;
 759
 760         spin_lock(&sk->sk_peer_lock);
 761         old_pid = sk->sk_peer_pid;
 762         old_cred = sk->sk_peer_cred;
 763         init_peercred(sk);
 764         spin_unlock(&sk->sk_peer_lock);
 765
 766         put_pid(old_pid);
 767         put_cred(old_cred);
 768 }
 769
 770 static void copy_peercred(struct sock *sk, struct sock *peersk)
 771 {
 772         lockdep_assert_held(&unix_sk(peersk)->lock);
 773
 774         spin_lock(&sk->sk_peer_lock);
 775         sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
 776         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 777         spin_unlock(&sk->sk_peer_lock);
 778 }
 779
 780 static int unix_listen(struct socket *sock, int backlog)
 781 {
 782         int err;
 783         struct sock *sk = sock->sk;
 784         struct unix_sock *u = unix_sk(sk);
 785
 786         err = -EOPNOTSUPP;
 787         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 788                 goto out;       /* Only stream/seqpacket sockets accept */
 789         err = -EINVAL;
 790         if (!READ_ONCE(u->addr))
 791                 goto out;       /* No listens on an unbound socket */
 792         unix_state_lock(sk);
 793         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 794                 goto out_unlock;
 795         if (backlog > sk->sk_max_ack_backlog)
 796                 wake_up_interruptible_all(&u->peer_wait);
 797         sk->sk_max_ack_backlog  = backlog;
 798         WRITE_ONCE(sk->sk_state, TCP_LISTEN);
 799
 800         /* set credentials so connect can copy them */
 801         update_peercred(sk);
 802         err = 0;
 803
 804 out_unlock:
 805         unix_state_unlock(sk);
 806 out:
 807         return err;
 808 }
 809
 810 static int unix_release(struct socket *);
 811 static int unix_bind(struct socket *, struct sockaddr *, int);
 812 static int unix_stream_connect(struct socket *, struct sockaddr *,
 813                                int addr_len, int flags);
 814 static int unix_socketpair(struct socket *, struct socket *);
 815 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
 816 static int unix_getname(struct socket *, struct sockaddr *, int);
 817 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 818 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 819                                     poll_table *);
 820 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 821 #ifdef CONFIG_COMPAT
 822 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 823 #endif
 824 static int unix_shutdown(struct socket *, int);
 825 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
 826 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
 827 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
 828                                        struct pipe_inode_info *, size_t size,
 829                                        unsigned int flags);
 830 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
 831 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
 832 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 833 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
 834 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 835                               int, int);
 836 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 837 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 838                                   int);
 839
 840 #ifdef CONFIG_PROC_FS
 841 static int unix_count_nr_fds(struct sock *sk)
 842 {
 843         struct sk_buff *skb;
 844         struct unix_sock *u;
 845         int nr_fds = 0;
 846
 847         spin_lock(&sk->sk_receive_queue.lock);
 848         skb = skb_peek(&sk->sk_receive_queue);
 849         while (skb) {
 850                 u = unix_sk(skb->sk);
 851                 nr_fds += atomic_read(&u->scm_stat.nr_fds);
 852                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
 853         }
 854         spin_unlock(&sk->sk_receive_queue.lock);
 855
 856         return nr_fds;
 857 }
 858
 859 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 860 {
 861         struct sock *sk = sock->sk;
 862         unsigned char s_state;
 863         struct unix_sock *u;
 864         int nr_fds = 0;
 865
 866         if (sk) {
 867                 s_state = READ_ONCE(sk->sk_state);
 868                 u = unix_sk(sk);
 869
 870                 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
 871                  * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
 872                  * SOCK_DGRAM is ordinary. So, no lock is needed.
 873                  */
 874                 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
 875                         nr_fds = atomic_read(&u->scm_stat.nr_fds);
 876                 else if (s_state == TCP_LISTEN)
 877                         nr_fds = unix_count_nr_fds(sk);
 878
 879                 seq_printf(m, "scm_fds: %u\n", nr_fds);
 880         }
 881 }
 882 #else
 883 #define unix_show_fdinfo NULL
 884 #endif
 885
 886 static const struct proto_ops unix_stream_ops = {
 887         .family =       PF_UNIX,
 888         .owner =        THIS_MODULE,
 889         .release =      unix_release,
 890         .bind =         unix_bind,
 891         .connect =      unix_stream_connect,
 892         .socketpair =   unix_socketpair,
 893         .accept =       unix_accept,
 894         .getname =      unix_getname,
 895         .poll =         unix_poll,
 896         .ioctl =        unix_ioctl,
 897 #ifdef CONFIG_COMPAT
 898         .compat_ioctl = unix_compat_ioctl,
 899 #endif
 900         .listen =       unix_listen,
 901         .shutdown =     unix_shutdown,
 902         .sendmsg =      unix_stream_sendmsg,
 903         .recvmsg =      unix_stream_recvmsg,
 904         .read_skb =     unix_stream_read_skb,
 905         .mmap =         sock_no_mmap,
 906         .splice_read =  unix_stream_splice_read,
 907         .set_peek_off = sk_set_peek_off,
 908         .show_fdinfo =  unix_show_fdinfo,
 909 };
 910
 911 static const struct proto_ops unix_dgram_ops = {
 912         .family =       PF_UNIX,
 913         .owner =        THIS_MODULE,
 914         .release =      unix_release,
 915         .bind =         unix_bind,
 916         .connect =      unix_dgram_connect,
 917         .socketpair =   unix_socketpair,
 918         .accept =       sock_no_accept,
 919         .getname =      unix_getname,
 920         .poll =         unix_dgram_poll,
 921         .ioctl =        unix_ioctl,
 922 #ifdef CONFIG_COMPAT
 923         .compat_ioctl = unix_compat_ioctl,
 924 #endif
 925         .listen =       sock_no_listen,
 926         .shutdown =     unix_shutdown,
 927         .sendmsg =      unix_dgram_sendmsg,
 928         .read_skb =     unix_read_skb,
 929         .recvmsg =      unix_dgram_recvmsg,
 930         .mmap =         sock_no_mmap,
 931         .set_peek_off = sk_set_peek_off,
 932         .show_fdinfo =  unix_show_fdinfo,
 933 };
 934
 935 static const struct proto_ops unix_seqpacket_ops = {
 936         .family =       PF_UNIX,
 937         .owner =        THIS_MODULE,
 938         .release =      unix_release,
 939         .bind =         unix_bind,
 940         .connect =      unix_stream_connect,
 941         .socketpair =   unix_socketpair,
 942         .accept =       unix_accept,
 943         .getname =      unix_getname,
 944         .poll =         unix_dgram_poll,
 945         .ioctl =        unix_ioctl,
 946 #ifdef CONFIG_COMPAT
 947         .compat_ioctl = unix_compat_ioctl,
 948 #endif
 949         .listen =       unix_listen,
 950         .shutdown =     unix_shutdown,
 951         .sendmsg =      unix_seqpacket_sendmsg,
 952         .recvmsg =      unix_seqpacket_recvmsg,
 953         .mmap =         sock_no_mmap,
 954         .set_peek_off = sk_set_peek_off,
 955         .show_fdinfo =  unix_show_fdinfo,
 956 };
 957
 958 static void unix_close(struct sock *sk, long timeout)
 959 {
 960         /* Nothing to do here, unix socket does not need a ->close().
 961          * This is merely for sockmap.
 962          */
 963 }
 964
 965 static void unix_unhash(struct sock *sk)
 966 {
 967         /* Nothing to do here, unix socket does not need a ->unhash().
 968          * This is merely for sockmap.
 969          */
 970 }
 971
 972 static bool unix_bpf_bypass_getsockopt(int level, int optname)
 973 {
 974         if (level == SOL_SOCKET) {
 975                 switch (optname) {
 976                 case SO_PEERPIDFD:
 977                         return true;
 978                 default:
 979                         return false;
 980                 }
 981         }
 982
 983         return false;
 984 }
 985
 986 struct proto unix_dgram_proto = {
 987         .name                   = "UNIX",
 988         .owner                  = THIS_MODULE,
 989         .obj_size               = sizeof(struct unix_sock),
 990         .close                  = unix_close,
 991         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
 992 #ifdef CONFIG_BPF_SYSCALL
 993         .psock_update_sk_prot   = unix_dgram_bpf_update_proto,
 994 #endif
 995 };
 996
 997 struct proto unix_stream_proto = {
 998         .name                   = "UNIX-STREAM",
 999         .owner                  = THIS_MODULE,
1000         .obj_size               = sizeof(struct unix_sock),
1001         .close                  = unix_close,
1002         .unhash                 = unix_unhash,
1003         .bpf_bypass_getsockopt  = unix_bpf_bypass_getsockopt,
1004 #ifdef CONFIG_BPF_SYSCALL
1005         .psock_update_sk_prot   = unix_stream_bpf_update_proto,
1006 #endif
1007 };
1008
1009 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1010 {
1011         struct unix_sock *u;
1012         struct sock *sk;
1013         int err;
1014
1015         atomic_long_inc(&unix_nr_socks);
1016         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1017                 err = -ENFILE;
1018                 goto err;
1019         }
1020
1021         if (type == SOCK_STREAM)
1022                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1023         else /*dgram and  seqpacket */
1024                 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1025
1026         if (!sk) {
1027                 err = -ENOMEM;
1028                 goto err;
1029         }
1030
1031         sock_init_data(sock, sk);
1032
1033         sk->sk_hash             = unix_unbound_hash(sk);
1034         sk->sk_allocation       = GFP_KERNEL_ACCOUNT;
1035         sk->sk_write_space      = unix_write_space;
1036         sk->sk_max_ack_backlog  = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1037         sk->sk_destruct         = unix_sock_destructor;
1038         lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1039
1040         u = unix_sk(sk);
1041         u->listener = NULL;
1042         u->vertex = NULL;
1043         u->path.dentry = NULL;
1044         u->path.mnt = NULL;
1045         spin_lock_init(&u->lock);
1046         lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1047         mutex_init(&u->iolock); /* single task reading lock */
1048         mutex_init(&u->bindlock); /* single task binding lock */
1049         init_waitqueue_head(&u->peer_wait);
1050         init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1051         memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1052         unix_insert_unbound_socket(net, sk);
1053
1054         sock_prot_inuse_add(net, sk->sk_prot, 1);
1055
1056         return sk;
1057
1058 err:
1059         atomic_long_dec(&unix_nr_socks);
1060         return ERR_PTR(err);
1061 }
1062
1063 static int unix_create(struct net *net, struct socket *sock, int protocol,
1064                        int kern)
1065 {
1066         struct sock *sk;
1067
1068         if (protocol && protocol != PF_UNIX)
1069                 return -EPROTONOSUPPORT;
1070
1071         sock->state = SS_UNCONNECTED;
1072
1073         switch (sock->type) {
1074         case SOCK_STREAM:
1075                 sock->ops = &unix_stream_ops;
1076                 break;
1077                 /*
1078                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
1079                  *      nothing uses it.
1080                  */
1081         case SOCK_RAW:
1082                 sock->type = SOCK_DGRAM;
1083                 fallthrough;
1084         case SOCK_DGRAM:
1085                 sock->ops = &unix_dgram_ops;
1086                 break;
1087         case SOCK_SEQPACKET:
1088                 sock->ops = &unix_seqpacket_ops;
1089                 break;
1090         default:
1091                 return -ESOCKTNOSUPPORT;
1092         }
1093
1094         sk = unix_create1(net, sock, kern, sock->type);
1095         if (IS_ERR(sk))
1096                 return PTR_ERR(sk);
1097
1098         return 0;
1099 }
1100
1101 static int unix_release(struct socket *sock)
1102 {
1103         struct sock *sk = sock->sk;
1104
1105         if (!sk)
1106                 return 0;
1107
1108         sk->sk_prot->close(sk, 0);
1109         unix_release_sock(sk, 0);
1110         sock->sk = NULL;
1111
1112         return 0;
1113 }
1114
1115 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1116                                   int type)
1117 {
1118         struct inode *inode;
1119         struct path path;
1120         struct sock *sk;
1121         int err;
1122
1123         unix_mkname_bsd(sunaddr, addr_len);
1124         err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1125         if (err)
1126                 goto fail;
1127
1128         err = path_permission(&path, MAY_WRITE);
1129         if (err)
1130                 goto path_put;
1131
1132         err = -ECONNREFUSED;
1133         inode = d_backing_inode(path.dentry);
1134         if (!S_ISSOCK(inode->i_mode))
1135                 goto path_put;
1136
1137         sk = unix_find_socket_byinode(inode);
1138         if (!sk)
1139                 goto path_put;
1140
1141         err = -EPROTOTYPE;
1142         if (sk->sk_type == type)
1143                 touch_atime(&path);
1144         else
1145                 goto sock_put;
1146
1147         path_put(&path);
1148
1149         return sk;
1150
1151 sock_put:
1152         sock_put(sk);
1153 path_put:
1154         path_put(&path);
1155 fail:
1156         return ERR_PTR(err);
1157 }
1158
1159 static struct sock *unix_find_abstract(struct net *net,
1160                                        struct sockaddr_un *sunaddr,
1161                                        int addr_len, int type)
1162 {
1163         unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1164         struct dentry *dentry;
1165         struct sock *sk;
1166
1167         sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1168         if (!sk)
1169                 return ERR_PTR(-ECONNREFUSED);
1170
1171         dentry = unix_sk(sk)->path.dentry;
1172         if (dentry)
1173                 touch_atime(&unix_sk(sk)->path);
1174
1175         return sk;
1176 }
1177
1178 static struct sock *unix_find_other(struct net *net,
1179                                     struct sockaddr_un *sunaddr,
1180                                     int addr_len, int type)
1181 {
1182         struct sock *sk;
1183
1184         if (sunaddr->sun_path[0])
1185                 sk = unix_find_bsd(sunaddr, addr_len, type);
1186         else
1187                 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1188
1189         return sk;
1190 }
1191
1192 static int unix_autobind(struct sock *sk)
1193 {
1194         struct unix_sock *u = unix_sk(sk);
1195         unsigned int new_hash, old_hash;
1196         struct net *net = sock_net(sk);
1197         struct unix_address *addr;
1198         u32 lastnum, ordernum;
1199         int err;
1200
1201         err = mutex_lock_interruptible(&u->bindlock);
1202         if (err)
1203                 return err;
1204
1205         if (u->addr)
1206                 goto out;
1207
1208         err = -ENOMEM;
1209         addr = kzalloc(sizeof(*addr) +
1210                        offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1211         if (!addr)
1212                 goto out;
1213
1214         addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1215         addr->name->sun_family = AF_UNIX;
1216         refcount_set(&addr->refcnt, 1);
1217
1218         old_hash = sk->sk_hash;
1219         ordernum = get_random_u32();
1220         lastnum = ordernum & 0xFFFFF;
1221 retry:
1222         ordernum = (ordernum + 1) & 0xFFFFF;
1223         sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1224
1225         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1226         unix_table_double_lock(net, old_hash, new_hash);
1227
1228         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1229                 unix_table_double_unlock(net, old_hash, new_hash);
1230
1231                 /* __unix_find_socket_byname() may take long time if many names
1232                  * are already in use.
1233                  */
1234                 cond_resched();
1235
1236                 if (ordernum == lastnum) {
1237                         /* Give up if all names seems to be in use. */
1238                         err = -ENOSPC;
1239                         unix_release_addr(addr);
1240                         goto out;
1241                 }
1242
1243                 goto retry;
1244         }
1245
1246         __unix_set_addr_hash(net, sk, addr, new_hash);
1247         unix_table_double_unlock(net, old_hash, new_hash);
1248         err = 0;
1249
1250 out:    mutex_unlock(&u->bindlock);
1251         return err;
1252 }
1253
1254 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1255                          int addr_len)
1256 {
1257         umode_t mode = S_IFSOCK |
1258                (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1259         struct unix_sock *u = unix_sk(sk);
1260         unsigned int new_hash, old_hash;
1261         struct net *net = sock_net(sk);
1262         struct mnt_idmap *idmap;
1263         struct unix_address *addr;
1264         struct dentry *dentry;
1265         struct path parent;
1266         int err;
1267
1268         addr_len = unix_mkname_bsd(sunaddr, addr_len);
1269         addr = unix_create_addr(sunaddr, addr_len);
1270         if (!addr)
1271                 return -ENOMEM;
1272
1273         /*
1274          * Get the parent directory, calculate the hash for last
1275          * component.
1276          */
1277         dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1278         if (IS_ERR(dentry)) {
1279                 err = PTR_ERR(dentry);
1280                 goto out;
1281         }
1282
1283         /*
1284          * All right, let's create it.
1285          */
1286         idmap = mnt_idmap(parent.mnt);
1287         err = security_path_mknod(&parent, dentry, mode, 0);
1288         if (!err)
1289                 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1290         if (err)
1291                 goto out_path;
1292         err = mutex_lock_interruptible(&u->bindlock);
1293         if (err)
1294                 goto out_unlink;
1295         if (u->addr)
1296                 goto out_unlock;
1297
1298         old_hash = sk->sk_hash;
1299         new_hash = unix_bsd_hash(d_backing_inode(dentry));
1300         unix_table_double_lock(net, old_hash, new_hash);
1301         u->path.mnt = mntget(parent.mnt);
1302         u->path.dentry = dget(dentry);
1303         __unix_set_addr_hash(net, sk, addr, new_hash);
1304         unix_table_double_unlock(net, old_hash, new_hash);
1305         unix_insert_bsd_socket(sk);
1306         mutex_unlock(&u->bindlock);
1307         done_path_create(&parent, dentry);
1308         return 0;
1309
1310 out_unlock:
1311         mutex_unlock(&u->bindlock);
1312         err = -EINVAL;
1313 out_unlink:
1314         /* failed after successful mknod?  unlink what we'd created... */
1315         vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1316 out_path:
1317         done_path_create(&parent, dentry);
1318 out:
1319         unix_release_addr(addr);
1320         return err == -EEXIST ? -EADDRINUSE : err;
1321 }
1322
1323 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1324                               int addr_len)
1325 {
1326         struct unix_sock *u = unix_sk(sk);
1327         unsigned int new_hash, old_hash;
1328         struct net *net = sock_net(sk);
1329         struct unix_address *addr;
1330         int err;
1331
1332         addr = unix_create_addr(sunaddr, addr_len);
1333         if (!addr)
1334                 return -ENOMEM;
1335
1336         err = mutex_lock_interruptible(&u->bindlock);
1337         if (err)
1338                 goto out;
1339
1340         if (u->addr) {
1341                 err = -EINVAL;
1342                 goto out_mutex;
1343         }
1344
1345         old_hash = sk->sk_hash;
1346         new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1347         unix_table_double_lock(net, old_hash, new_hash);
1348
1349         if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1350                 goto out_spin;
1351
1352         __unix_set_addr_hash(net, sk, addr, new_hash);
1353         unix_table_double_unlock(net, old_hash, new_hash);
1354         mutex_unlock(&u->bindlock);
1355         return 0;
1356
1357 out_spin:
1358         unix_table_double_unlock(net, old_hash, new_hash);
1359         err = -EADDRINUSE;
1360 out_mutex:
1361         mutex_unlock(&u->bindlock);
1362 out:
1363         unix_release_addr(addr);
1364         return err;
1365 }
1366
1367 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1368 {
1369         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1370         struct sock *sk = sock->sk;
1371         int err;
1372
1373         if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1374             sunaddr->sun_family == AF_UNIX)
1375                 return unix_autobind(sk);
1376
1377         err = unix_validate_addr(sunaddr, addr_len);
1378         if (err)
1379                 return err;
1380
1381         if (sunaddr->sun_path[0])
1382                 err = unix_bind_bsd(sk, sunaddr, addr_len);
1383         else
1384                 err = unix_bind_abstract(sk, sunaddr, addr_len);
1385
1386         return err;
1387 }
1388
1389 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1390 {
1391         if (unlikely(sk1 == sk2) || !sk2) {
1392                 unix_state_lock(sk1);
1393                 return;
1394         }
1395
1396         if (sk1 > sk2)
1397                 swap(sk1, sk2);
1398
1399         unix_state_lock(sk1);
1400         unix_state_lock(sk2);
1401 }
1402
1403 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1404 {
1405         if (unlikely(sk1 == sk2) || !sk2) {
1406                 unix_state_unlock(sk1);
1407                 return;
1408         }
1409         unix_state_unlock(sk1);
1410         unix_state_unlock(sk2);
1411 }
1412
1413 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1414                               int alen, int flags)
1415 {
1416         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1417         struct sock *sk = sock->sk;
1418         struct sock *other;
1419         int err;
1420
1421         err = -EINVAL;
1422         if (alen < offsetofend(struct sockaddr, sa_family))
1423                 goto out;
1424
1425         if (addr->sa_family != AF_UNSPEC) {
1426                 err = unix_validate_addr(sunaddr, alen);
1427                 if (err)
1428                         goto out;
1429
1430                 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1431                 if (err)
1432                         goto out;
1433
1434                 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1435                      test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1436                     !READ_ONCE(unix_sk(sk)->addr)) {
1437                         err = unix_autobind(sk);
1438                         if (err)
1439                                 goto out;
1440                 }
1441
1442 restart:
1443                 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1444                 if (IS_ERR(other)) {
1445                         err = PTR_ERR(other);
1446                         goto out;
1447                 }
1448
1449                 unix_state_double_lock(sk, other);
1450
1451                 /* Apparently VFS overslept socket death. Retry. */
1452                 if (sock_flag(other, SOCK_DEAD)) {
1453                         unix_state_double_unlock(sk, other);
1454                         sock_put(other);
1455                         goto restart;
1456                 }
1457
1458                 err = -EPERM;
1459                 if (!unix_may_send(sk, other))
1460                         goto out_unlock;
1461
1462                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1463                 if (err)
1464                         goto out_unlock;
1465
1466                 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1467                 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1468         } else {
1469                 /*
1470                  *      1003.1g breaking connected state with AF_UNSPEC
1471                  */
1472                 other = NULL;
1473                 unix_state_double_lock(sk, other);
1474         }
1475
1476         /*
1477          * If it was connected, reconnect.
1478          */
1479         if (unix_peer(sk)) {
1480                 struct sock *old_peer = unix_peer(sk);
1481
1482                 unix_peer(sk) = other;
1483                 if (!other)
1484                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1485                 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1486
1487                 unix_state_double_unlock(sk, other);
1488
1489                 if (other != old_peer) {
1490                         unix_dgram_disconnected(sk, old_peer);
1491
1492                         unix_state_lock(old_peer);
1493                         if (!unix_peer(old_peer))
1494                                 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1495                         unix_state_unlock(old_peer);
1496                 }
1497
1498                 sock_put(old_peer);
1499         } else {
1500                 unix_peer(sk) = other;
1501                 unix_state_double_unlock(sk, other);
1502         }
1503
1504         return 0;
1505
1506 out_unlock:
1507         unix_state_double_unlock(sk, other);
1508         sock_put(other);
1509 out:
1510         return err;
1511 }
1512
1513 static long unix_wait_for_peer(struct sock *other, long timeo)
1514         __releases(&unix_sk(other)->lock)
1515 {
1516         struct unix_sock *u = unix_sk(other);
1517         int sched;
1518         DEFINE_WAIT(wait);
1519
1520         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1521
1522         sched = !sock_flag(other, SOCK_DEAD) &&
1523                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1524                 unix_recvq_full_lockless(other);
1525
1526         unix_state_unlock(other);
1527
1528         if (sched)
1529                 timeo = schedule_timeout(timeo);
1530
1531         finish_wait(&u->peer_wait, &wait);
1532         return timeo;
1533 }
1534
1535 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1536                                int addr_len, int flags)
1537 {
1538         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1539         struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1540         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1541         struct net *net = sock_net(sk);
1542         struct sk_buff *skb = NULL;
1543         unsigned char state;
1544         long timeo;
1545         int err;
1546
1547         err = unix_validate_addr(sunaddr, addr_len);
1548         if (err)
1549                 goto out;
1550
1551         err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1552         if (err)
1553                 goto out;
1554
1555         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1556              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1557             !READ_ONCE(u->addr)) {
1558                 err = unix_autobind(sk);
1559                 if (err)
1560                         goto out;
1561         }
1562
1563         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1564
1565         /* First of all allocate resources.
1566            If we will make it after state is locked,
1567            we will have to recheck all again in any case.
1568          */
1569
1570         /* create new sock for complete connection */
1571         newsk = unix_create1(net, NULL, 0, sock->type);
1572         if (IS_ERR(newsk)) {
1573                 err = PTR_ERR(newsk);
1574                 newsk = NULL;
1575                 goto out;
1576         }
1577
1578         err = -ENOMEM;
1579
1580         /* Allocate skb for sending to listening sock */
1581         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1582         if (skb == NULL)
1583                 goto out;
1584
1585 restart:
1586         /*  Find listening sock. */
1587         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1588         if (IS_ERR(other)) {
1589                 err = PTR_ERR(other);
1590                 other = NULL;
1591                 goto out;
1592         }
1593
1594         unix_state_lock(other);
1595
1596         /* Apparently VFS overslept socket death. Retry. */
1597         if (sock_flag(other, SOCK_DEAD)) {
1598                 unix_state_unlock(other);
1599                 sock_put(other);
1600                 goto restart;
1601         }
1602
1603         err = -ECONNREFUSED;
1604         if (other->sk_state != TCP_LISTEN)
1605                 goto out_unlock;
1606         if (other->sk_shutdown & RCV_SHUTDOWN)
1607                 goto out_unlock;
1608
1609         if (unix_recvq_full_lockless(other)) {
1610                 err = -EAGAIN;
1611                 if (!timeo)
1612                         goto out_unlock;
1613
1614                 timeo = unix_wait_for_peer(other, timeo);
1615
1616                 err = sock_intr_errno(timeo);
1617                 if (signal_pending(current))
1618                         goto out;
1619                 sock_put(other);
1620                 goto restart;
1621         }
1622
1623         /* self connect and simultaneous connect are eliminated
1624          * by rejecting TCP_LISTEN socket to avoid deadlock.
1625          */
1626         state = READ_ONCE(sk->sk_state);
1627         if (unlikely(state != TCP_CLOSE)) {
1628                 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1629                 goto out_unlock;
1630         }
1631
1632         unix_state_lock(sk);
1633
1634         if (unlikely(sk->sk_state != TCP_CLOSE)) {
1635                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1636                 unix_state_unlock(sk);
1637                 goto out_unlock;
1638         }
1639
1640         err = security_unix_stream_connect(sk, other, newsk);
1641         if (err) {
1642                 unix_state_unlock(sk);
1643                 goto out_unlock;
1644         }
1645
1646         /* The way is open! Fastly set all the necessary fields... */
1647
1648         sock_hold(sk);
1649         unix_peer(newsk)        = sk;
1650         newsk->sk_state         = TCP_ESTABLISHED;
1651         newsk->sk_type          = sk->sk_type;
1652         init_peercred(newsk);
1653         newu = unix_sk(newsk);
1654         newu->listener = other;
1655         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1656         otheru = unix_sk(other);
1657
1658         /* copy address information from listening to new sock
1659          *
1660          * The contents of *(otheru->addr) and otheru->path
1661          * are seen fully set up here, since we have found
1662          * otheru in hash under its lock.  Insertion into the
1663          * hash chain we'd found it in had been done in an
1664          * earlier critical area protected by the chain's lock,
1665          * the same one where we'd set *(otheru->addr) contents,
1666          * as well as otheru->path and otheru->addr itself.
1667          *
1668          * Using smp_store_release() here to set newu->addr
1669          * is enough to make those stores, as well as stores
1670          * to newu->path visible to anyone who gets newu->addr
1671          * by smp_load_acquire().  IOW, the same warranties
1672          * as for unix_sock instances bound in unix_bind() or
1673          * in unix_autobind().
1674          */
1675         if (otheru->path.dentry) {
1676                 path_get(&otheru->path);
1677                 newu->path = otheru->path;
1678         }
1679         refcount_inc(&otheru->addr->refcnt);
1680         smp_store_release(&newu->addr, otheru->addr);
1681
1682         /* Set credentials */
1683         copy_peercred(sk, other);
1684
1685         sock->state     = SS_CONNECTED;
1686         WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1687         sock_hold(newsk);
1688
1689         smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1690         unix_peer(sk)   = newsk;
1691
1692         unix_state_unlock(sk);
1693
1694         /* take ten and send info to listening sock */
1695         spin_lock(&other->sk_receive_queue.lock);
1696         __skb_queue_tail(&other->sk_receive_queue, skb);
1697         spin_unlock(&other->sk_receive_queue.lock);
1698         unix_state_unlock(other);
1699         other->sk_data_ready(other);
1700         sock_put(other);
1701         return 0;
1702
1703 out_unlock:
1704         if (other)
1705                 unix_state_unlock(other);
1706
1707 out:
1708         kfree_skb(skb);
1709         if (newsk)
1710                 unix_release_sock(newsk, 0);
1711         if (other)
1712                 sock_put(other);
1713         return err;
1714 }
1715
1716 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1717 {
1718         struct sock *ska = socka->sk, *skb = sockb->sk;
1719
1720         /* Join our sockets back to back */
1721         sock_hold(ska);
1722         sock_hold(skb);
1723         unix_peer(ska) = skb;
1724         unix_peer(skb) = ska;
1725         init_peercred(ska);
1726         init_peercred(skb);
1727
1728         ska->sk_state = TCP_ESTABLISHED;
1729         skb->sk_state = TCP_ESTABLISHED;
1730         socka->state  = SS_CONNECTED;
1731         sockb->state  = SS_CONNECTED;
1732         return 0;
1733 }
1734
1735 static void unix_sock_inherit_flags(const struct socket *old,
1736                                     struct socket *new)
1737 {
1738         if (test_bit(SOCK_PASSCRED, &old->flags))
1739                 set_bit(SOCK_PASSCRED, &new->flags);
1740         if (test_bit(SOCK_PASSPIDFD, &old->flags))
1741                 set_bit(SOCK_PASSPIDFD, &new->flags);
1742         if (test_bit(SOCK_PASSSEC, &old->flags))
1743                 set_bit(SOCK_PASSSEC, &new->flags);
1744 }
1745
1746 static int unix_accept(struct socket *sock, struct socket *newsock,
1747                        struct proto_accept_arg *arg)
1748 {
1749         struct sock *sk = sock->sk;
1750         struct sk_buff *skb;
1751         struct sock *tsk;
1752
1753         arg->err = -EOPNOTSUPP;
1754         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1755                 goto out;
1756
1757         arg->err = -EINVAL;
1758         if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1759                 goto out;
1760
1761         /* If socket state is TCP_LISTEN it cannot change (for now...),
1762          * so that no locks are necessary.
1763          */
1764
1765         skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1766                                 &arg->err);
1767         if (!skb) {
1768                 /* This means receive shutdown. */
1769                 if (arg->err == 0)
1770                         arg->err = -EINVAL;
1771                 goto out;
1772         }
1773
1774         tsk = skb->sk;
1775         skb_free_datagram(sk, skb);
1776         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1777
1778         /* attach accepted sock to socket */
1779         unix_state_lock(tsk);
1780         unix_update_edges(unix_sk(tsk));
1781         newsock->state = SS_CONNECTED;
1782         unix_sock_inherit_flags(sock, newsock);
1783         sock_graft(tsk, newsock);
1784         unix_state_unlock(tsk);
1785         return 0;
1786
1787 out:
1788         return arg->err;
1789 }
1790
1791
1792 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1793 {
1794         struct sock *sk = sock->sk;
1795         struct unix_address *addr;
1796         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1797         int err = 0;
1798
1799         if (peer) {
1800                 sk = unix_peer_get(sk);
1801
1802                 err = -ENOTCONN;
1803                 if (!sk)
1804                         goto out;
1805                 err = 0;
1806         } else {
1807                 sock_hold(sk);
1808         }
1809
1810         addr = smp_load_acquire(&unix_sk(sk)->addr);
1811         if (!addr) {
1812                 sunaddr->sun_family = AF_UNIX;
1813                 sunaddr->sun_path[0] = 0;
1814                 err = offsetof(struct sockaddr_un, sun_path);
1815         } else {
1816                 err = addr->len;
1817                 memcpy(sunaddr, addr->name, addr->len);
1818
1819                 if (peer)
1820                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1821                                                CGROUP_UNIX_GETPEERNAME);
1822                 else
1823                         BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1824                                                CGROUP_UNIX_GETSOCKNAME);
1825         }
1826         sock_put(sk);
1827 out:
1828         return err;
1829 }
1830
1831 /* The "user->unix_inflight" variable is protected by the garbage
1832  * collection lock, and we just read it locklessly here. If you go
1833  * over the limit, there might be a tiny race in actually noticing
1834  * it across threads. Tough.
1835  */
1836 static inline bool too_many_unix_fds(struct task_struct *p)
1837 {
1838         struct user_struct *user = current_user();
1839
1840         if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1841                 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1842         return false;
1843 }
1844
1845 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1846 {
1847         if (too_many_unix_fds(current))
1848                 return -ETOOMANYREFS;
1849
1850         UNIXCB(skb).fp = scm->fp;
1851         scm->fp = NULL;
1852
1853         if (unix_prepare_fpl(UNIXCB(skb).fp))
1854                 return -ENOMEM;
1855
1856         return 0;
1857 }
1858
1859 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1860 {
1861         scm->fp = UNIXCB(skb).fp;
1862         UNIXCB(skb).fp = NULL;
1863
1864         unix_destroy_fpl(scm->fp);
1865 }
1866
1867 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1868 {
1869         scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1870 }
1871
1872 static void unix_destruct_scm(struct sk_buff *skb)
1873 {
1874         struct scm_cookie scm;
1875
1876         memset(&scm, 0, sizeof(scm));
1877         scm.pid  = UNIXCB(skb).pid;
1878         if (UNIXCB(skb).fp)
1879                 unix_detach_fds(&scm, skb);
1880
1881         /* Alas, it calls VFS */
1882         /* So fscking what? fput() had been SMP-safe since the last Summer */
1883         scm_destroy(&scm);
1884         sock_wfree(skb);
1885 }
1886
1887 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1888 {
1889         int err = 0;
1890
1891         UNIXCB(skb).pid  = get_pid(scm->pid);
1892         UNIXCB(skb).uid = scm->creds.uid;
1893         UNIXCB(skb).gid = scm->creds.gid;
1894         UNIXCB(skb).fp = NULL;
1895         unix_get_secdata(scm, skb);
1896         if (scm->fp && send_fds)
1897                 err = unix_attach_fds(scm, skb);
1898
1899         skb->destructor = unix_destruct_scm;
1900         return err;
1901 }
1902
1903 static bool unix_passcred_enabled(const struct socket *sock,
1904                                   const struct sock *other)
1905 {
1906         return test_bit(SOCK_PASSCRED, &sock->flags) ||
1907                test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1908                !other->sk_socket ||
1909                test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1910                test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1911 }
1912
1913 /*
1914  * Some apps rely on write() giving SCM_CREDENTIALS
1915  * We include credentials if source or destination socket
1916  * asserted SOCK_PASSCRED.
1917  */
1918 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1919                             const struct sock *other)
1920 {
1921         if (UNIXCB(skb).pid)
1922                 return;
1923         if (unix_passcred_enabled(sock, other)) {
1924                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1925                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1926         }
1927 }
1928
1929 static bool unix_skb_scm_eq(struct sk_buff *skb,
1930                             struct scm_cookie *scm)
1931 {
1932         return UNIXCB(skb).pid == scm->pid &&
1933                uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1934                gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1935                unix_secdata_eq(scm, skb);
1936 }
1937
1938 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1939 {
1940         struct scm_fp_list *fp = UNIXCB(skb).fp;
1941         struct unix_sock *u = unix_sk(sk);
1942
1943         if (unlikely(fp && fp->count)) {
1944                 atomic_add(fp->count, &u->scm_stat.nr_fds);
1945                 unix_add_edges(fp, u);
1946         }
1947 }
1948
1949 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1950 {
1951         struct scm_fp_list *fp = UNIXCB(skb).fp;
1952         struct unix_sock *u = unix_sk(sk);
1953
1954         if (unlikely(fp && fp->count)) {
1955                 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1956                 unix_del_edges(fp);
1957         }
1958 }
1959
1960 /*
1961  *      Send AF_UNIX data.
1962  */
1963
1964 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1965                               size_t len)
1966 {
1967         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1968         struct sock *sk = sock->sk, *other = NULL;
1969         struct unix_sock *u = unix_sk(sk);
1970         struct scm_cookie scm;
1971         struct sk_buff *skb;
1972         int data_len = 0;
1973         int sk_locked;
1974         long timeo;
1975         int err;
1976
1977         err = scm_send(sock, msg, &scm, false);
1978         if (err < 0)
1979                 return err;
1980
1981         wait_for_unix_gc(scm.fp);
1982
1983         err = -EOPNOTSUPP;
1984         if (msg->msg_flags&MSG_OOB)
1985                 goto out;
1986
1987         if (msg->msg_namelen) {
1988                 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1989                 if (err)
1990                         goto out;
1991
1992                 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1993                                                             msg->msg_name,
1994                                                             &msg->msg_namelen,
1995                                                             NULL);
1996                 if (err)
1997                         goto out;
1998         } else {
1999                 sunaddr = NULL;
2000                 err = -ENOTCONN;
2001                 other = unix_peer_get(sk);
2002                 if (!other)
2003                         goto out;
2004         }
2005
2006         if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2007              test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2008             !READ_ONCE(u->addr)) {
2009                 err = unix_autobind(sk);
2010                 if (err)
2011                         goto out;
2012         }
2013
2014         err = -EMSGSIZE;
2015         if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2016                 goto out;
2017
2018         if (len > SKB_MAX_ALLOC) {
2019                 data_len = min_t(size_t,
2020                                  len - SKB_MAX_ALLOC,
2021                                  MAX_SKB_FRAGS * PAGE_SIZE);
2022                 data_len = PAGE_ALIGN(data_len);
2023
2024                 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2025         }
2026
2027         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2028                                    msg->msg_flags & MSG_DONTWAIT, &err,
2029                                    PAGE_ALLOC_COSTLY_ORDER);
2030         if (skb == NULL)
2031                 goto out;
2032
2033         err = unix_scm_to_skb(&scm, skb, true);
2034         if (err < 0)
2035                 goto out_free;
2036
2037         skb_put(skb, len - data_len);
2038         skb->data_len = data_len;
2039         skb->len = len;
2040         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2041         if (err)
2042                 goto out_free;
2043
2044         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2045
2046 restart:
2047         if (!other) {
2048                 err = -ECONNRESET;
2049                 if (sunaddr == NULL)
2050                         goto out_free;
2051
2052                 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2053                                         sk->sk_type);
2054                 if (IS_ERR(other)) {
2055                         err = PTR_ERR(other);
2056                         other = NULL;
2057                         goto out_free;
2058                 }
2059         }
2060
2061         if (sk_filter(other, skb) < 0) {
2062                 /* Toss the packet but do not return any error to the sender */
2063                 err = len;
2064                 goto out_free;
2065         }
2066
2067         sk_locked = 0;
2068         unix_state_lock(other);
2069 restart_locked:
2070         err = -EPERM;
2071         if (!unix_may_send(sk, other))
2072                 goto out_unlock;
2073
2074         if (unlikely(sock_flag(other, SOCK_DEAD))) {
2075                 /*
2076                  *      Check with 1003.1g - what should
2077                  *      datagram error
2078                  */
2079                 unix_state_unlock(other);
2080                 sock_put(other);
2081
2082                 if (!sk_locked)
2083                         unix_state_lock(sk);
2084
2085                 err = 0;
2086                 if (sk->sk_type == SOCK_SEQPACKET) {
2087                         /* We are here only when racing with unix_release_sock()
2088                          * is clearing @other. Never change state to TCP_CLOSE
2089                          * unlike SOCK_DGRAM wants.
2090                          */
2091                         unix_state_unlock(sk);
2092                         err = -EPIPE;
2093                 } else if (unix_peer(sk) == other) {
2094                         unix_peer(sk) = NULL;
2095                         unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2096
2097                         WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2098                         unix_state_unlock(sk);
2099
2100                         unix_dgram_disconnected(sk, other);
2101                         sock_put(other);
2102                         err = -ECONNREFUSED;
2103                 } else {
2104                         unix_state_unlock(sk);
2105                 }
2106
2107                 other = NULL;
2108                 if (err)
2109                         goto out_free;
2110                 goto restart;
2111         }
2112
2113         err = -EPIPE;
2114         if (other->sk_shutdown & RCV_SHUTDOWN)
2115                 goto out_unlock;
2116
2117         if (sk->sk_type != SOCK_SEQPACKET) {
2118                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2119                 if (err)
2120                         goto out_unlock;
2121         }
2122
2123         /* other == sk && unix_peer(other) != sk if
2124          * - unix_peer(sk) == NULL, destination address bound to sk
2125          * - unix_peer(sk) == sk by time of get but disconnected before lock
2126          */
2127         if (other != sk &&
2128             unlikely(unix_peer(other) != sk &&
2129             unix_recvq_full_lockless(other))) {
2130                 if (timeo) {
2131                         timeo = unix_wait_for_peer(other, timeo);
2132
2133                         err = sock_intr_errno(timeo);
2134                         if (signal_pending(current))
2135                                 goto out_free;
2136
2137                         goto restart;
2138                 }
2139
2140                 if (!sk_locked) {
2141                         unix_state_unlock(other);
2142                         unix_state_double_lock(sk, other);
2143                 }
2144
2145                 if (unix_peer(sk) != other ||
2146                     unix_dgram_peer_wake_me(sk, other)) {
2147                         err = -EAGAIN;
2148                         sk_locked = 1;
2149                         goto out_unlock;
2150                 }
2151
2152                 if (!sk_locked) {
2153                         sk_locked = 1;
2154                         goto restart_locked;
2155                 }
2156         }
2157
2158         if (unlikely(sk_locked))
2159                 unix_state_unlock(sk);
2160
2161         if (sock_flag(other, SOCK_RCVTSTAMP))
2162                 __net_timestamp(skb);
2163         maybe_add_creds(skb, sock, other);
2164         scm_stat_add(other, skb);
2165         skb_queue_tail(&other->sk_receive_queue, skb);
2166         unix_state_unlock(other);
2167         other->sk_data_ready(other);
2168         sock_put(other);
2169         scm_destroy(&scm);
2170         return len;
2171
2172 out_unlock:
2173         if (sk_locked)
2174                 unix_state_unlock(sk);
2175         unix_state_unlock(other);
2176 out_free:
2177         kfree_skb(skb);
2178 out:
2179         if (other)
2180                 sock_put(other);
2181         scm_destroy(&scm);
2182         return err;
2183 }
2184
2185 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2186  * bytes, and a minimum of a full page.
2187  */
2188 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2189
2190 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2191 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2192                      struct scm_cookie *scm, bool fds_sent)
2193 {
2194         struct unix_sock *ousk = unix_sk(other);
2195         struct sk_buff *skb;
2196         int err = 0;
2197
2198         skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2199
2200         if (!skb)
2201                 return err;
2202
2203         err = unix_scm_to_skb(scm, skb, !fds_sent);
2204         if (err < 0) {
2205                 kfree_skb(skb);
2206                 return err;
2207         }
2208         skb_put(skb, 1);
2209         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2210
2211         if (err) {
2212                 kfree_skb(skb);
2213                 return err;
2214         }
2215
2216         unix_state_lock(other);
2217
2218         if (sock_flag(other, SOCK_DEAD) ||
2219             (other->sk_shutdown & RCV_SHUTDOWN)) {
2220                 unix_state_unlock(other);
2221                 kfree_skb(skb);
2222                 return -EPIPE;
2223         }
2224
2225         maybe_add_creds(skb, sock, other);
2226         scm_stat_add(other, skb);
2227
2228         spin_lock(&other->sk_receive_queue.lock);
2229         WRITE_ONCE(ousk->oob_skb, skb);
2230         __skb_queue_tail(&other->sk_receive_queue, skb);
2231         spin_unlock(&other->sk_receive_queue.lock);
2232
2233         sk_send_sigurg(other);
2234         unix_state_unlock(other);
2235         other->sk_data_ready(other);
2236
2237         return err;
2238 }
2239 #endif
2240
2241 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2242                                size_t len)
2243 {
2244         struct sock *sk = sock->sk;
2245         struct sock *other = NULL;
2246         int err, size;
2247         struct sk_buff *skb;
2248         int sent = 0;
2249         struct scm_cookie scm;
2250         bool fds_sent = false;
2251         int data_len;
2252
2253         err = scm_send(sock, msg, &scm, false);
2254         if (err < 0)
2255                 return err;
2256
2257         wait_for_unix_gc(scm.fp);
2258
2259         err = -EOPNOTSUPP;
2260         if (msg->msg_flags & MSG_OOB) {
2261 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2262                 if (len)
2263                         len--;
2264                 else
2265 #endif
2266                         goto out_err;
2267         }
2268
2269         if (msg->msg_namelen) {
2270                 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2271                 goto out_err;
2272         } else {
2273                 err = -ENOTCONN;
2274                 other = unix_peer(sk);
2275                 if (!other)
2276                         goto out_err;
2277         }
2278
2279         if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2280                 goto pipe_err;
2281
2282         while (sent < len) {
2283                 size = len - sent;
2284
2285                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2286                         skb = sock_alloc_send_pskb(sk, 0, 0,
2287                                                    msg->msg_flags & MSG_DONTWAIT,
2288                                                    &err, 0);
2289                 } else {
2290                         /* Keep two messages in the pipe so it schedules better */
2291                         size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2292
2293                         /* allow fallback to order-0 allocations */
2294                         size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2295
2296                         data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2297
2298                         data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2299
2300                         skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2301                                                    msg->msg_flags & MSG_DONTWAIT, &err,
2302                                                    get_order(UNIX_SKB_FRAGS_SZ));
2303                 }
2304                 if (!skb)
2305                         goto out_err;
2306
2307                 /* Only send the fds in the first buffer */
2308                 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2309                 if (err < 0) {
2310                         kfree_skb(skb);
2311                         goto out_err;
2312                 }
2313                 fds_sent = true;
2314
2315                 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2316                         err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2317                                                    sk->sk_allocation);
2318                         if (err < 0) {
2319                                 kfree_skb(skb);
2320                                 goto out_err;
2321                         }
2322                         size = err;
2323                         refcount_add(size, &sk->sk_wmem_alloc);
2324                 } else {
2325                         skb_put(skb, size - data_len);
2326                         skb->data_len = data_len;
2327                         skb->len = size;
2328                         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2329                         if (err) {
2330                                 kfree_skb(skb);
2331                                 goto out_err;
2332                         }
2333                 }
2334
2335                 unix_state_lock(other);
2336
2337                 if (sock_flag(other, SOCK_DEAD) ||
2338                     (other->sk_shutdown & RCV_SHUTDOWN))
2339                         goto pipe_err_free;
2340
2341                 maybe_add_creds(skb, sock, other);
2342                 scm_stat_add(other, skb);
2343                 skb_queue_tail(&other->sk_receive_queue, skb);
2344                 unix_state_unlock(other);
2345                 other->sk_data_ready(other);
2346                 sent += size;
2347         }
2348
2349 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2350         if (msg->msg_flags & MSG_OOB) {
2351                 err = queue_oob(sock, msg, other, &scm, fds_sent);
2352                 if (err)
2353                         goto out_err;
2354                 sent++;
2355         }
2356 #endif
2357
2358         scm_destroy(&scm);
2359
2360         return sent;
2361
2362 pipe_err_free:
2363         unix_state_unlock(other);
2364         kfree_skb(skb);
2365 pipe_err:
2366         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2367                 send_sig(SIGPIPE, current, 0);
2368         err = -EPIPE;
2369 out_err:
2370         scm_destroy(&scm);
2371         return sent ? : err;
2372 }
2373
2374 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2375                                   size_t len)
2376 {
2377         int err;
2378         struct sock *sk = sock->sk;
2379
2380         err = sock_error(sk);
2381         if (err)
2382                 return err;
2383
2384         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2385                 return -ENOTCONN;
2386
2387         if (msg->msg_namelen)
2388                 msg->msg_namelen = 0;
2389
2390         return unix_dgram_sendmsg(sock, msg, len);
2391 }
2392
2393 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2394                                   size_t size, int flags)
2395 {
2396         struct sock *sk = sock->sk;
2397
2398         if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2399                 return -ENOTCONN;
2400
2401         return unix_dgram_recvmsg(sock, msg, size, flags);
2402 }
2403
2404 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2405 {
2406         struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2407
2408         if (addr) {
2409                 msg->msg_namelen = addr->len;
2410                 memcpy(msg->msg_name, addr->name, addr->len);
2411         }
2412 }
2413
2414 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2415                          int flags)
2416 {
2417         struct scm_cookie scm;
2418         struct socket *sock = sk->sk_socket;
2419         struct unix_sock *u = unix_sk(sk);
2420         struct sk_buff *skb, *last;
2421         long timeo;
2422         int skip;
2423         int err;
2424
2425         err = -EOPNOTSUPP;
2426         if (flags&MSG_OOB)
2427                 goto out;
2428
2429         timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2430
2431         do {
2432                 mutex_lock(&u->iolock);
2433
2434                 skip = sk_peek_offset(sk, flags);
2435                 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2436                                               &skip, &err, &last);
2437                 if (skb) {
2438                         if (!(flags & MSG_PEEK))
2439                                 scm_stat_del(sk, skb);
2440                         break;
2441                 }
2442
2443                 mutex_unlock(&u->iolock);
2444
2445                 if (err != -EAGAIN)
2446                         break;
2447         } while (timeo &&
2448                  !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2449                                               &err, &timeo, last));
2450
2451         if (!skb) { /* implies iolock unlocked */
2452                 unix_state_lock(sk);
2453                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2454                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2455                     (sk->sk_shutdown & RCV_SHUTDOWN))
2456                         err = 0;
2457                 unix_state_unlock(sk);
2458                 goto out;
2459         }
2460
2461         if (wq_has_sleeper(&u->peer_wait))
2462                 wake_up_interruptible_sync_poll(&u->peer_wait,
2463                                                 EPOLLOUT | EPOLLWRNORM |
2464                                                 EPOLLWRBAND);
2465
2466         if (msg->msg_name) {
2467                 unix_copy_addr(msg, skb->sk);
2468
2469                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2470                                                       msg->msg_name,
2471                                                       &msg->msg_namelen);
2472         }
2473
2474         if (size > skb->len - skip)
2475                 size = skb->len - skip;
2476         else if (size < skb->len - skip)
2477                 msg->msg_flags |= MSG_TRUNC;
2478
2479         err = skb_copy_datagram_msg(skb, skip, msg, size);
2480         if (err)
2481                 goto out_free;
2482
2483         if (sock_flag(sk, SOCK_RCVTSTAMP))
2484                 __sock_recv_timestamp(msg, sk, skb);
2485
2486         memset(&scm, 0, sizeof(scm));
2487
2488         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2489         unix_set_secdata(&scm, skb);
2490
2491         if (!(flags & MSG_PEEK)) {
2492                 if (UNIXCB(skb).fp)
2493                         unix_detach_fds(&scm, skb);
2494
2495                 sk_peek_offset_bwd(sk, skb->len);
2496         } else {
2497                 /* It is questionable: on PEEK we could:
2498                    - do not return fds - good, but too simple 8)
2499                    - return fds, and do not return them on read (old strategy,
2500                      apparently wrong)
2501                    - clone fds (I chose it for now, it is the most universal
2502                      solution)
2503
2504                    POSIX 1003.1g does not actually define this clearly
2505                    at all. POSIX 1003.1g doesn't define a lot of things
2506                    clearly however!
2507
2508                 */
2509
2510                 sk_peek_offset_fwd(sk, size);
2511
2512                 if (UNIXCB(skb).fp)
2513                         unix_peek_fds(&scm, skb);
2514         }
2515         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2516
2517         scm_recv_unix(sock, msg, &scm, flags);
2518
2519 out_free:
2520         skb_free_datagram(sk, skb);
2521         mutex_unlock(&u->iolock);
2522 out:
2523         return err;
2524 }
2525
2526 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2527                               int flags)
2528 {
2529         struct sock *sk = sock->sk;
2530
2531 #ifdef CONFIG_BPF_SYSCALL
2532         const struct proto *prot = READ_ONCE(sk->sk_prot);
2533
2534         if (prot != &unix_dgram_proto)
2535                 return prot->recvmsg(sk, msg, size, flags, NULL);
2536 #endif
2537         return __unix_dgram_recvmsg(sk, msg, size, flags);
2538 }
2539
2540 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2541 {
2542         struct unix_sock *u = unix_sk(sk);
2543         struct sk_buff *skb;
2544         int err;
2545
2546         mutex_lock(&u->iolock);
2547         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2548         mutex_unlock(&u->iolock);
2549         if (!skb)
2550                 return err;
2551
2552         return recv_actor(sk, skb);
2553 }
2554
2555 /*
2556  *      Sleep until more data has arrived. But check for races..
2557  */
2558 static long unix_stream_data_wait(struct sock *sk, long timeo,
2559                                   struct sk_buff *last, unsigned int last_len,
2560                                   bool freezable)
2561 {
2562         unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2563         struct sk_buff *tail;
2564         DEFINE_WAIT(wait);
2565
2566         unix_state_lock(sk);
2567
2568         for (;;) {
2569                 prepare_to_wait(sk_sleep(sk), &wait, state);
2570
2571                 tail = skb_peek_tail(&sk->sk_receive_queue);
2572                 if (tail != last ||
2573                     (tail && tail->len != last_len) ||
2574                     sk->sk_err ||
2575                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
2576                     signal_pending(current) ||
2577                     !timeo)
2578                         break;
2579
2580                 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2581                 unix_state_unlock(sk);
2582                 timeo = schedule_timeout(timeo);
2583                 unix_state_lock(sk);
2584
2585                 if (sock_flag(sk, SOCK_DEAD))
2586                         break;
2587
2588                 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2589         }
2590
2591         finish_wait(sk_sleep(sk), &wait);
2592         unix_state_unlock(sk);
2593         return timeo;
2594 }
2595
2596 static unsigned int unix_skb_len(const struct sk_buff *skb)
2597 {
2598         return skb->len - UNIXCB(skb).consumed;
2599 }
2600
2601 struct unix_stream_read_state {
2602         int (*recv_actor)(struct sk_buff *, int, int,
2603                           struct unix_stream_read_state *);
2604         struct socket *socket;
2605         struct msghdr *msg;
2606         struct pipe_inode_info *pipe;
2607         size_t size;
2608         int flags;
2609         unsigned int splice_flags;
2610 };
2611
2612 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2613 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2614 {
2615         struct socket *sock = state->socket;
2616         struct sock *sk = sock->sk;
2617         struct unix_sock *u = unix_sk(sk);
2618         int chunk = 1;
2619         struct sk_buff *oob_skb;
2620
2621         mutex_lock(&u->iolock);
2622         unix_state_lock(sk);
2623         spin_lock(&sk->sk_receive_queue.lock);
2624
2625         if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2626                 spin_unlock(&sk->sk_receive_queue.lock);
2627                 unix_state_unlock(sk);
2628                 mutex_unlock(&u->iolock);
2629                 return -EINVAL;
2630         }
2631
2632         oob_skb = u->oob_skb;
2633
2634         if (!(state->flags & MSG_PEEK))
2635                 WRITE_ONCE(u->oob_skb, NULL);
2636
2637         spin_unlock(&sk->sk_receive_queue.lock);
2638         unix_state_unlock(sk);
2639
2640         chunk = state->recv_actor(oob_skb, 0, chunk, state);
2641
2642         if (!(state->flags & MSG_PEEK))
2643                 UNIXCB(oob_skb).consumed += 1;
2644
2645         mutex_unlock(&u->iolock);
2646
2647         if (chunk < 0)
2648                 return -EFAULT;
2649
2650         state->msg->msg_flags |= MSG_OOB;
2651         return 1;
2652 }
2653
2654 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2655                                   int flags, int copied)
2656 {
2657         struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2658         struct unix_sock *u = unix_sk(sk);
2659
2660         if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2661                 return skb;
2662
2663         spin_lock(&sk->sk_receive_queue.lock);
2664
2665         if (!unix_skb_len(skb)) {
2666                 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2667                         skb = NULL;
2668                 } else if (flags & MSG_PEEK) {
2669                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2670                 } else {
2671                         read_skb = skb;
2672                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2673                         __skb_unlink(read_skb, &sk->sk_receive_queue);
2674                 }
2675
2676                 if (!skb)
2677                         goto unlock;
2678         }
2679
2680         if (skb != u->oob_skb)
2681                 goto unlock;
2682
2683         if (copied) {
2684                 skb = NULL;
2685         } else if (!(flags & MSG_PEEK)) {
2686                 WRITE_ONCE(u->oob_skb, NULL);
2687
2688                 if (!sock_flag(sk, SOCK_URGINLINE)) {
2689                         __skb_unlink(skb, &sk->sk_receive_queue);
2690                         unread_skb = skb;
2691                         skb = skb_peek(&sk->sk_receive_queue);
2692                 }
2693         } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2694                 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2695         }
2696
2697 unlock:
2698         spin_unlock(&sk->sk_receive_queue.lock);
2699
2700         consume_skb(read_skb);
2701         kfree_skb(unread_skb);
2702
2703         return skb;
2704 }
2705 #endif
2706
2707 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2708 {
2709         struct unix_sock *u = unix_sk(sk);
2710         struct sk_buff *skb;
2711         int err;
2712
2713         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2714                 return -ENOTCONN;
2715
2716         mutex_lock(&u->iolock);
2717         skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2718         mutex_unlock(&u->iolock);
2719         if (!skb)
2720                 return err;
2721
2722 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2723         if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2724                 bool drop = false;
2725
2726                 unix_state_lock(sk);
2727
2728                 if (sock_flag(sk, SOCK_DEAD)) {
2729                         unix_state_unlock(sk);
2730                         kfree_skb(skb);
2731                         return -ECONNRESET;
2732                 }
2733
2734                 spin_lock(&sk->sk_receive_queue.lock);
2735                 if (likely(skb == u->oob_skb)) {
2736                         WRITE_ONCE(u->oob_skb, NULL);
2737                         drop = true;
2738                 }
2739                 spin_unlock(&sk->sk_receive_queue.lock);
2740
2741                 unix_state_unlock(sk);
2742
2743                 if (drop) {
2744                         kfree_skb(skb);
2745                         return -EAGAIN;
2746                 }
2747         }
2748 #endif
2749
2750         return recv_actor(sk, skb);
2751 }
2752
2753 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2754                                     bool freezable)
2755 {
2756         struct scm_cookie scm;
2757         struct socket *sock = state->socket;
2758         struct sock *sk = sock->sk;
2759         struct unix_sock *u = unix_sk(sk);
2760         int copied = 0;
2761         int flags = state->flags;
2762         int noblock = flags & MSG_DONTWAIT;
2763         bool check_creds = false;
2764         int target;
2765         int err = 0;
2766         long timeo;
2767         int skip;
2768         size_t size = state->size;
2769         unsigned int last_len;
2770
2771         if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2772                 err = -EINVAL;
2773                 goto out;
2774         }
2775
2776         if (unlikely(flags & MSG_OOB)) {
2777                 err = -EOPNOTSUPP;
2778 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2779                 err = unix_stream_recv_urg(state);
2780 #endif
2781                 goto out;
2782         }
2783
2784         target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2785         timeo = sock_rcvtimeo(sk, noblock);
2786
2787         memset(&scm, 0, sizeof(scm));
2788
2789         /* Lock the socket to prevent queue disordering
2790          * while sleeps in memcpy_tomsg
2791          */
2792         mutex_lock(&u->iolock);
2793
2794         skip = max(sk_peek_offset(sk, flags), 0);
2795
2796         do {
2797                 struct sk_buff *skb, *last;
2798                 int chunk;
2799
2800 redo:
2801                 unix_state_lock(sk);
2802                 if (sock_flag(sk, SOCK_DEAD)) {
2803                         err = -ECONNRESET;
2804                         goto unlock;
2805                 }
2806                 last = skb = skb_peek(&sk->sk_receive_queue);
2807                 last_len = last ? last->len : 0;
2808
2809 again:
2810 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2811                 if (skb) {
2812                         skb = manage_oob(skb, sk, flags, copied);
2813                         if (!skb && copied) {
2814                                 unix_state_unlock(sk);
2815                                 break;
2816                         }
2817                 }
2818 #endif
2819                 if (skb == NULL) {
2820                         if (copied >= target)
2821                                 goto unlock;
2822
2823                         /*
2824                          *      POSIX 1003.1g mandates this order.
2825                          */
2826
2827                         err = sock_error(sk);
2828                         if (err)
2829                                 goto unlock;
2830                         if (sk->sk_shutdown & RCV_SHUTDOWN)
2831                                 goto unlock;
2832
2833                         unix_state_unlock(sk);
2834                         if (!timeo) {
2835                                 err = -EAGAIN;
2836                                 break;
2837                         }
2838
2839                         mutex_unlock(&u->iolock);
2840
2841                         timeo = unix_stream_data_wait(sk, timeo, last,
2842                                                       last_len, freezable);
2843
2844                         if (signal_pending(current)) {
2845                                 err = sock_intr_errno(timeo);
2846                                 scm_destroy(&scm);
2847                                 goto out;
2848                         }
2849
2850                         mutex_lock(&u->iolock);
2851                         goto redo;
2852 unlock:
2853                         unix_state_unlock(sk);
2854                         break;
2855                 }
2856
2857                 while (skip >= unix_skb_len(skb)) {
2858                         skip -= unix_skb_len(skb);
2859                         last = skb;
2860                         last_len = skb->len;
2861                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2862                         if (!skb)
2863                                 goto again;
2864                 }
2865
2866                 unix_state_unlock(sk);
2867
2868                 if (check_creds) {
2869                         /* Never glue messages from different writers */
2870                         if (!unix_skb_scm_eq(skb, &scm))
2871                                 break;
2872                 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2873                            test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2874                         /* Copy credentials */
2875                         scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2876                         unix_set_secdata(&scm, skb);
2877                         check_creds = true;
2878                 }
2879
2880                 /* Copy address just once */
2881                 if (state->msg && state->msg->msg_name) {
2882                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2883                                          state->msg->msg_name);
2884                         unix_copy_addr(state->msg, skb->sk);
2885
2886                         BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2887                                                               state->msg->msg_name,
2888                                                               &state->msg->msg_namelen);
2889
2890                         sunaddr = NULL;
2891                 }
2892
2893                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2894                 chunk = state->recv_actor(skb, skip, chunk, state);
2895                 if (chunk < 0) {
2896                         if (copied == 0)
2897                                 copied = -EFAULT;
2898                         break;
2899                 }
2900                 copied += chunk;
2901                 size -= chunk;
2902
2903                 /* Mark read part of skb as used */
2904                 if (!(flags & MSG_PEEK)) {
2905                         UNIXCB(skb).consumed += chunk;
2906
2907                         sk_peek_offset_bwd(sk, chunk);
2908
2909                         if (UNIXCB(skb).fp) {
2910                                 scm_stat_del(sk, skb);
2911                                 unix_detach_fds(&scm, skb);
2912                         }
2913
2914                         if (unix_skb_len(skb))
2915                                 break;
2916
2917                         skb_unlink(skb, &sk->sk_receive_queue);
2918                         consume_skb(skb);
2919
2920                         if (scm.fp)
2921                                 break;
2922                 } else {
2923                         /* It is questionable, see note in unix_dgram_recvmsg.
2924                          */
2925                         if (UNIXCB(skb).fp)
2926                                 unix_peek_fds(&scm, skb);
2927
2928                         sk_peek_offset_fwd(sk, chunk);
2929
2930                         if (UNIXCB(skb).fp)
2931                                 break;
2932
2933                         skip = 0;
2934                         last = skb;
2935                         last_len = skb->len;
2936                         unix_state_lock(sk);
2937                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
2938                         if (skb)
2939                                 goto again;
2940                         unix_state_unlock(sk);
2941                         break;
2942                 }
2943         } while (size);
2944
2945         mutex_unlock(&u->iolock);
2946         if (state->msg)
2947                 scm_recv_unix(sock, state->msg, &scm, flags);
2948         else
2949                 scm_destroy(&scm);
2950 out:
2951         return copied ? : err;
2952 }
2953
2954 static int unix_stream_read_actor(struct sk_buff *skb,
2955                                   int skip, int chunk,
2956                                   struct unix_stream_read_state *state)
2957 {
2958         int ret;
2959
2960         ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2961                                     state->msg, chunk);
2962         return ret ?: chunk;
2963 }
2964
2965 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2966                           size_t size, int flags)
2967 {
2968         struct unix_stream_read_state state = {
2969                 .recv_actor = unix_stream_read_actor,
2970                 .socket = sk->sk_socket,
2971                 .msg = msg,
2972                 .size = size,
2973                 .flags = flags
2974         };
2975
2976         return unix_stream_read_generic(&state, true);
2977 }
2978
2979 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2980                                size_t size, int flags)
2981 {
2982         struct unix_stream_read_state state = {
2983                 .recv_actor = unix_stream_read_actor,
2984                 .socket = sock,
2985                 .msg = msg,
2986                 .size = size,
2987                 .flags = flags
2988         };
2989
2990 #ifdef CONFIG_BPF_SYSCALL
2991         struct sock *sk = sock->sk;
2992         const struct proto *prot = READ_ONCE(sk->sk_prot);
2993
2994         if (prot != &unix_stream_proto)
2995                 return prot->recvmsg(sk, msg, size, flags, NULL);
2996 #endif
2997         return unix_stream_read_generic(&state, true);
2998 }
2999
3000 static int unix_stream_splice_actor(struct sk_buff *skb,
3001                                     int skip, int chunk,
3002                                     struct unix_stream_read_state *state)
3003 {
3004         return skb_splice_bits(skb, state->socket->sk,
3005                                UNIXCB(skb).consumed + skip,
3006                                state->pipe, chunk, state->splice_flags);
3007 }
3008
3009 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3010                                        struct pipe_inode_info *pipe,
3011                                        size_t size, unsigned int flags)
3012 {
3013         struct unix_stream_read_state state = {
3014                 .recv_actor = unix_stream_splice_actor,
3015                 .socket = sock,
3016                 .pipe = pipe,
3017                 .size = size,
3018                 .splice_flags = flags,
3019         };
3020
3021         if (unlikely(*ppos))
3022                 return -ESPIPE;
3023
3024         if (sock->file->f_flags & O_NONBLOCK ||
3025             flags & SPLICE_F_NONBLOCK)
3026                 state.flags = MSG_DONTWAIT;
3027
3028         return unix_stream_read_generic(&state, false);
3029 }
3030
3031 static int unix_shutdown(struct socket *sock, int mode)
3032 {
3033         struct sock *sk = sock->sk;
3034         struct sock *other;
3035
3036         if (mode < SHUT_RD || mode > SHUT_RDWR)
3037                 return -EINVAL;
3038         /* This maps:
3039          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3040          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3041          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3042          */
3043         ++mode;
3044
3045         unix_state_lock(sk);
3046         WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3047         other = unix_peer(sk);
3048         if (other)
3049                 sock_hold(other);
3050         unix_state_unlock(sk);
3051         sk->sk_state_change(sk);
3052
3053         if (other &&
3054                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3055
3056                 int peer_mode = 0;
3057                 const struct proto *prot = READ_ONCE(other->sk_prot);
3058
3059                 if (prot->unhash)
3060                         prot->unhash(other);
3061                 if (mode&RCV_SHUTDOWN)
3062                         peer_mode |= SEND_SHUTDOWN;
3063                 if (mode&SEND_SHUTDOWN)
3064                         peer_mode |= RCV_SHUTDOWN;
3065                 unix_state_lock(other);
3066                 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3067                 unix_state_unlock(other);
3068                 other->sk_state_change(other);
3069                 if (peer_mode == SHUTDOWN_MASK)
3070                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3071                 else if (peer_mode & RCV_SHUTDOWN)
3072                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3073         }
3074         if (other)
3075                 sock_put(other);
3076
3077         return 0;
3078 }
3079
3080 long unix_inq_len(struct sock *sk)
3081 {
3082         struct sk_buff *skb;
3083         long amount = 0;
3084
3085         if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3086                 return -EINVAL;
3087
3088         spin_lock(&sk->sk_receive_queue.lock);
3089         if (sk->sk_type == SOCK_STREAM ||
3090             sk->sk_type == SOCK_SEQPACKET) {
3091                 skb_queue_walk(&sk->sk_receive_queue, skb)
3092                         amount += unix_skb_len(skb);
3093         } else {
3094                 skb = skb_peek(&sk->sk_receive_queue);
3095                 if (skb)
3096                         amount = skb->len;
3097         }
3098         spin_unlock(&sk->sk_receive_queue.lock);
3099
3100         return amount;
3101 }
3102 EXPORT_SYMBOL_GPL(unix_inq_len);
3103
3104 long unix_outq_len(struct sock *sk)
3105 {
3106         return sk_wmem_alloc_get(sk);
3107 }
3108 EXPORT_SYMBOL_GPL(unix_outq_len);
3109
3110 static int unix_open_file(struct sock *sk)
3111 {
3112         struct path path;
3113         struct file *f;
3114         int fd;
3115
3116         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3117                 return -EPERM;
3118
3119         if (!smp_load_acquire(&unix_sk(sk)->addr))
3120                 return -ENOENT;
3121
3122         path = unix_sk(sk)->path;
3123         if (!path.dentry)
3124                 return -ENOENT;
3125
3126         path_get(&path);
3127
3128         fd = get_unused_fd_flags(O_CLOEXEC);
3129         if (fd < 0)
3130                 goto out;
3131
3132         f = dentry_open(&path, O_PATH, current_cred());
3133         if (IS_ERR(f)) {
3134                 put_unused_fd(fd);
3135                 fd = PTR_ERR(f);
3136                 goto out;
3137         }
3138
3139         fd_install(fd, f);
3140 out:
3141         path_put(&path);
3142
3143         return fd;
3144 }
3145
3146 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3147 {
3148         struct sock *sk = sock->sk;
3149         long amount = 0;
3150         int err;
3151
3152         switch (cmd) {
3153         case SIOCOUTQ:
3154                 amount = unix_outq_len(sk);
3155                 err = put_user(amount, (int __user *)arg);
3156                 break;
3157         case SIOCINQ:
3158                 amount = unix_inq_len(sk);
3159                 if (amount < 0)
3160                         err = amount;
3161                 else
3162                         err = put_user(amount, (int __user *)arg);
3163                 break;
3164         case SIOCUNIXFILE:
3165                 err = unix_open_file(sk);
3166                 break;
3167 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3168         case SIOCATMARK:
3169                 {
3170                         struct unix_sock *u = unix_sk(sk);
3171                         struct sk_buff *skb;
3172                         int answ = 0;
3173
3174                         mutex_lock(&u->iolock);
3175
3176                         skb = skb_peek(&sk->sk_receive_queue);
3177                         if (skb) {
3178                                 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3179                                 struct sk_buff *next_skb;
3180
3181                                 next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3182
3183                                 if (skb == oob_skb ||
3184                                     (!unix_skb_len(skb) &&
3185                                      (!oob_skb || next_skb == oob_skb)))
3186                                         answ = 1;
3187                         }
3188
3189                         mutex_unlock(&u->iolock);
3190
3191                         err = put_user(answ, (int __user *)arg);
3192                 }
3193                 break;
3194 #endif
3195         default:
3196                 err = -ENOIOCTLCMD;
3197                 break;
3198         }
3199         return err;
3200 }
3201
3202 #ifdef CONFIG_COMPAT
3203 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3204 {
3205         return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3206 }
3207 #endif
3208
3209 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3210 {
3211         struct sock *sk = sock->sk;
3212         unsigned char state;
3213         __poll_t mask;
3214         u8 shutdown;
3215
3216         sock_poll_wait(file, sock, wait);
3217         mask = 0;
3218         shutdown = READ_ONCE(sk->sk_shutdown);
3219         state = READ_ONCE(sk->sk_state);
3220
3221         /* exceptional events? */
3222         if (READ_ONCE(sk->sk_err))
3223                 mask |= EPOLLERR;
3224         if (shutdown == SHUTDOWN_MASK)
3225                 mask |= EPOLLHUP;
3226         if (shutdown & RCV_SHUTDOWN)
3227                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3228
3229         /* readable? */
3230         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3231                 mask |= EPOLLIN | EPOLLRDNORM;
3232         if (sk_is_readable(sk))
3233                 mask |= EPOLLIN | EPOLLRDNORM;
3234 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3235         if (READ_ONCE(unix_sk(sk)->oob_skb))
3236                 mask |= EPOLLPRI;
3237 #endif
3238
3239         /* Connection-based need to check for termination and startup */
3240         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3241             state == TCP_CLOSE)
3242                 mask |= EPOLLHUP;
3243
3244         /*
3245          * we set writable also when the other side has shut down the
3246          * connection. This prevents stuck sockets.
3247          */
3248         if (unix_writable(sk, state))
3249                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3250
3251         return mask;
3252 }
3253
3254 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3255                                     poll_table *wait)
3256 {
3257         struct sock *sk = sock->sk, *other;
3258         unsigned int writable;
3259         unsigned char state;
3260         __poll_t mask;
3261         u8 shutdown;
3262
3263         sock_poll_wait(file, sock, wait);
3264         mask = 0;
3265         shutdown = READ_ONCE(sk->sk_shutdown);
3266         state = READ_ONCE(sk->sk_state);
3267
3268         /* exceptional events? */
3269         if (READ_ONCE(sk->sk_err) ||
3270             !skb_queue_empty_lockless(&sk->sk_error_queue))
3271                 mask |= EPOLLERR |
3272                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3273
3274         if (shutdown & RCV_SHUTDOWN)
3275                 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3276         if (shutdown == SHUTDOWN_MASK)
3277                 mask |= EPOLLHUP;
3278
3279         /* readable? */
3280         if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3281                 mask |= EPOLLIN | EPOLLRDNORM;
3282         if (sk_is_readable(sk))
3283                 mask |= EPOLLIN | EPOLLRDNORM;
3284
3285         /* Connection-based need to check for termination and startup */
3286         if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3287                 mask |= EPOLLHUP;
3288
3289         /* No write status requested, avoid expensive OUT tests. */
3290         if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3291                 return mask;
3292
3293         writable = unix_writable(sk, state);
3294         if (writable) {
3295                 unix_state_lock(sk);
3296
3297                 other = unix_peer(sk);
3298                 if (other && unix_peer(other) != sk &&
3299                     unix_recvq_full_lockless(other) &&
3300                     unix_dgram_peer_wake_me(sk, other))
3301                         writable = 0;
3302
3303                 unix_state_unlock(sk);
3304         }
3305
3306         if (writable)
3307                 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3308         else
3309                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3310
3311         return mask;
3312 }
3313
3314 #ifdef CONFIG_PROC_FS
3315
3316 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3317
3318 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3319 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3320 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3321
3322 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3323 {
3324         unsigned long offset = get_offset(*pos);
3325         unsigned long bucket = get_bucket(*pos);
3326         unsigned long count = 0;
3327         struct sock *sk;
3328
3329         for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3330              sk; sk = sk_next(sk)) {
3331                 if (++count == offset)
3332                         break;
3333         }
3334
3335         return sk;
3336 }
3337
3338 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3339 {
3340         unsigned long bucket = get_bucket(*pos);
3341         struct net *net = seq_file_net(seq);
3342         struct sock *sk;
3343
3344         while (bucket < UNIX_HASH_SIZE) {
3345                 spin_lock(&net->unx.table.locks[bucket]);
3346
3347                 sk = unix_from_bucket(seq, pos);
3348                 if (sk)
3349                         return sk;
3350
3351                 spin_unlock(&net->unx.table.locks[bucket]);
3352
3353                 *pos = set_bucket_offset(++bucket, 1);
3354         }
3355
3356         return NULL;
3357 }
3358
3359 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3360                                   loff_t *pos)
3361 {
3362         unsigned long bucket = get_bucket(*pos);
3363
3364         sk = sk_next(sk);
3365         if (sk)
3366                 return sk;
3367
3368
3369         spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3370
3371         *pos = set_bucket_offset(++bucket, 1);
3372
3373         return unix_get_first(seq, pos);
3374 }
3375
3376 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3377 {
3378         if (!*pos)
3379                 return SEQ_START_TOKEN;
3380
3381         return unix_get_first(seq, pos);
3382 }
3383
3384 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3385 {
3386         ++*pos;
3387
3388         if (v == SEQ_START_TOKEN)
3389                 return unix_get_first(seq, pos);
3390
3391         return unix_get_next(seq, v, pos);
3392 }
3393
3394 static void unix_seq_stop(struct seq_file *seq, void *v)
3395 {
3396         struct sock *sk = v;
3397
3398         if (sk)
3399                 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3400 }
3401
3402 static int unix_seq_show(struct seq_file *seq, void *v)
3403 {
3404
3405         if (v == SEQ_START_TOKEN)
3406                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3407                          "Inode Path\n");
3408         else {
3409                 struct sock *s = v;
3410                 struct unix_sock *u = unix_sk(s);
3411                 unix_state_lock(s);
3412
3413                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3414                         s,
3415                         refcount_read(&s->sk_refcnt),
3416                         0,
3417                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3418                         s->sk_type,
3419                         s->sk_socket ?
3420                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3421                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3422                         sock_i_ino(s));
3423
3424                 if (u->addr) {  // under a hash table lock here
3425                         int i, len;
3426                         seq_putc(seq, ' ');
3427
3428                         i = 0;
3429                         len = u->addr->len -
3430                                 offsetof(struct sockaddr_un, sun_path);
3431                         if (u->addr->name->sun_path[0]) {
3432                                 len--;
3433                         } else {
3434                                 seq_putc(seq, '@');
3435                                 i++;
3436                         }
3437                         for ( ; i < len; i++)
3438                                 seq_putc(seq, u->addr->name->sun_path[i] ?:
3439                                          '@');
3440                 }
3441                 unix_state_unlock(s);
3442                 seq_putc(seq, '\n');
3443         }
3444
3445         return 0;
3446 }
3447
3448 static const struct seq_operations unix_seq_ops = {
3449         .start  = unix_seq_start,
3450         .next   = unix_seq_next,
3451         .stop   = unix_seq_stop,
3452         .show   = unix_seq_show,
3453 };
3454
3455 #ifdef CONFIG_BPF_SYSCALL
3456 struct bpf_unix_iter_state {
3457         struct seq_net_private p;
3458         unsigned int cur_sk;
3459         unsigned int end_sk;
3460         unsigned int max_sk;
3461         struct sock **batch;
3462         bool st_bucket_done;
3463 };
3464
3465 struct bpf_iter__unix {
3466         __bpf_md_ptr(struct bpf_iter_meta *, meta);
3467         __bpf_md_ptr(struct unix_sock *, unix_sk);
3468         uid_t uid __aligned(8);
3469 };
3470
3471 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3472                               struct unix_sock *unix_sk, uid_t uid)
3473 {
3474         struct bpf_iter__unix ctx;
3475
3476         meta->seq_num--;  /* skip SEQ_START_TOKEN */
3477         ctx.meta = meta;
3478         ctx.unix_sk = unix_sk;
3479         ctx.uid = uid;
3480         return bpf_iter_run_prog(prog, &ctx);
3481 }
3482
3483 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3484
3485 {
3486         struct bpf_unix_iter_state *iter = seq->private;
3487         unsigned int expected = 1;
3488         struct sock *sk;
3489
3490         sock_hold(start_sk);
3491         iter->batch[iter->end_sk++] = start_sk;
3492
3493         for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3494                 if (iter->end_sk < iter->max_sk) {
3495                         sock_hold(sk);
3496                         iter->batch[iter->end_sk++] = sk;
3497                 }
3498
3499                 expected++;
3500         }
3501
3502         spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3503
3504         return expected;
3505 }
3506
3507 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3508 {
3509         while (iter->cur_sk < iter->end_sk)
3510                 sock_put(iter->batch[iter->cur_sk++]);
3511 }
3512
3513 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3514                                        unsigned int new_batch_sz)
3515 {
3516         struct sock **new_batch;
3517
3518         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3519                              GFP_USER | __GFP_NOWARN);
3520         if (!new_batch)
3521                 return -ENOMEM;
3522
3523         bpf_iter_unix_put_batch(iter);
3524         kvfree(iter->batch);
3525         iter->batch = new_batch;
3526         iter->max_sk = new_batch_sz;
3527
3528         return 0;
3529 }
3530
3531 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3532                                         loff_t *pos)
3533 {
3534         struct bpf_unix_iter_state *iter = seq->private;
3535         unsigned int expected;
3536         bool resized = false;
3537         struct sock *sk;
3538
3539         if (iter->st_bucket_done)
3540                 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3541
3542 again:
3543         /* Get a new batch */
3544         iter->cur_sk = 0;
3545         iter->end_sk = 0;
3546
3547         sk = unix_get_first(seq, pos);
3548         if (!sk)
3549                 return NULL; /* Done */
3550
3551         expected = bpf_iter_unix_hold_batch(seq, sk);
3552
3553         if (iter->end_sk == expected) {
3554                 iter->st_bucket_done = true;
3555                 return sk;
3556         }
3557
3558         if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3559                 resized = true;
3560                 goto again;
3561         }
3562
3563         return sk;
3564 }
3565
3566 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3567 {
3568         if (!*pos)
3569                 return SEQ_START_TOKEN;
3570
3571         /* bpf iter does not support lseek, so it always
3572          * continue from where it was stop()-ped.
3573          */
3574         return bpf_iter_unix_batch(seq, pos);
3575 }
3576
3577 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3578 {
3579         struct bpf_unix_iter_state *iter = seq->private;
3580         struct sock *sk;
3581
3582         /* Whenever seq_next() is called, the iter->cur_sk is
3583          * done with seq_show(), so advance to the next sk in
3584          * the batch.
3585          */
3586         if (iter->cur_sk < iter->end_sk)
3587                 sock_put(iter->batch[iter->cur_sk++]);
3588
3589         ++*pos;
3590
3591         if (iter->cur_sk < iter->end_sk)
3592                 sk = iter->batch[iter->cur_sk];
3593         else
3594                 sk = bpf_iter_unix_batch(seq, pos);
3595
3596         return sk;
3597 }
3598
3599 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3600 {
3601         struct bpf_iter_meta meta;
3602         struct bpf_prog *prog;
3603         struct sock *sk = v;
3604         uid_t uid;
3605         bool slow;
3606         int ret;
3607
3608         if (v == SEQ_START_TOKEN)
3609                 return 0;
3610
3611         slow = lock_sock_fast(sk);
3612
3613         if (unlikely(sk_unhashed(sk))) {
3614                 ret = SEQ_SKIP;
3615                 goto unlock;
3616         }
3617
3618         uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3619         meta.seq = seq;
3620         prog = bpf_iter_get_info(&meta, false);
3621         ret = unix_prog_seq_show(prog, &meta, v, uid);
3622 unlock:
3623         unlock_sock_fast(sk, slow);
3624         return ret;
3625 }
3626
3627 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3628 {
3629         struct bpf_unix_iter_state *iter = seq->private;
3630         struct bpf_iter_meta meta;
3631         struct bpf_prog *prog;
3632
3633         if (!v) {
3634                 meta.seq = seq;
3635                 prog = bpf_iter_get_info(&meta, true);
3636                 if (prog)
3637                         (void)unix_prog_seq_show(prog, &meta, v, 0);
3638         }
3639
3640         if (iter->cur_sk < iter->end_sk)
3641                 bpf_iter_unix_put_batch(iter);
3642 }
3643
3644 static const struct seq_operations bpf_iter_unix_seq_ops = {
3645         .start  = bpf_iter_unix_seq_start,
3646         .next   = bpf_iter_unix_seq_next,
3647         .stop   = bpf_iter_unix_seq_stop,
3648         .show   = bpf_iter_unix_seq_show,
3649 };
3650 #endif
3651 #endif
3652
3653 static const struct net_proto_family unix_family_ops = {
3654         .family = PF_UNIX,
3655         .create = unix_create,
3656         .owner  = THIS_MODULE,
3657 };
3658
3659
3660 static int __net_init unix_net_init(struct net *net)
3661 {
3662         int i;
3663
3664         net->unx.sysctl_max_dgram_qlen = 10;
3665         if (unix_sysctl_register(net))
3666                 goto out;
3667
3668 #ifdef CONFIG_PROC_FS
3669         if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3670                              sizeof(struct seq_net_private)))
3671                 goto err_sysctl;
3672 #endif
3673
3674         net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3675                                               sizeof(spinlock_t), GFP_KERNEL);
3676         if (!net->unx.table.locks)
3677                 goto err_proc;
3678
3679         net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3680                                                 sizeof(struct hlist_head),
3681                                                 GFP_KERNEL);
3682         if (!net->unx.table.buckets)
3683                 goto free_locks;
3684
3685         for (i = 0; i < UNIX_HASH_SIZE; i++) {
3686                 spin_lock_init(&net->unx.table.locks[i]);
3687                 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3688                 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3689         }
3690
3691         return 0;
3692
3693 free_locks:
3694         kvfree(net->unx.table.locks);
3695 err_proc:
3696 #ifdef CONFIG_PROC_FS
3697         remove_proc_entry("unix", net->proc_net);
3698 err_sysctl:
3699 #endif
3700         unix_sysctl_unregister(net);
3701 out:
3702         return -ENOMEM;
3703 }
3704
3705 static void __net_exit unix_net_exit(struct net *net)
3706 {
3707         kvfree(net->unx.table.buckets);
3708         kvfree(net->unx.table.locks);
3709         unix_sysctl_unregister(net);
3710         remove_proc_entry("unix", net->proc_net);
3711 }
3712
3713 static struct pernet_operations unix_net_ops = {
3714         .init = unix_net_init,
3715         .exit = unix_net_exit,
3716 };
3717
3718 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3719 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3720                      struct unix_sock *unix_sk, uid_t uid)
3721
3722 #define INIT_BATCH_SZ 16
3723
3724 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3725 {
3726         struct bpf_unix_iter_state *iter = priv_data;
3727         int err;
3728
3729         err = bpf_iter_init_seq_net(priv_data, aux);
3730         if (err)
3731                 return err;
3732
3733         err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3734         if (err) {
3735                 bpf_iter_fini_seq_net(priv_data);
3736                 return err;
3737         }
3738
3739         return 0;
3740 }
3741
3742 static void bpf_iter_fini_unix(void *priv_data)
3743 {
3744         struct bpf_unix_iter_state *iter = priv_data;
3745
3746         bpf_iter_fini_seq_net(priv_data);
3747         kvfree(iter->batch);
3748 }
3749
3750 static const struct bpf_iter_seq_info unix_seq_info = {
3751         .seq_ops                = &bpf_iter_unix_seq_ops,
3752         .init_seq_private       = bpf_iter_init_unix,
3753         .fini_seq_private       = bpf_iter_fini_unix,
3754         .seq_priv_size          = sizeof(struct bpf_unix_iter_state),
3755 };
3756
3757 static const struct bpf_func_proto *
3758 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3759                              const struct bpf_prog *prog)
3760 {
3761         switch (func_id) {
3762         case BPF_FUNC_setsockopt:
3763                 return &bpf_sk_setsockopt_proto;
3764         case BPF_FUNC_getsockopt:
3765                 return &bpf_sk_getsockopt_proto;
3766         default:
3767                 return NULL;
3768         }
3769 }
3770
3771 static struct bpf_iter_reg unix_reg_info = {
3772         .target                 = "unix",
3773         .ctx_arg_info_size      = 1,
3774         .ctx_arg_info           = {
3775                 { offsetof(struct bpf_iter__unix, unix_sk),
3776                   PTR_TO_BTF_ID_OR_NULL },
3777         },
3778         .get_func_proto         = bpf_iter_unix_get_func_proto,
3779         .seq_info               = &unix_seq_info,
3780 };
3781
3782 static void __init bpf_iter_register(void)
3783 {
3784         unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3785         if (bpf_iter_reg_target(&unix_reg_info))
3786                 pr_warn("Warning: could not register bpf iterator unix\n");
3787 }
3788 #endif
3789
3790 static int __init af_unix_init(void)
3791 {
3792         int i, rc = -1;
3793
3794         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3795
3796         for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3797                 spin_lock_init(&bsd_socket_locks[i]);
3798                 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3799         }
3800
3801         rc = proto_register(&unix_dgram_proto, 1);
3802         if (rc != 0) {
3803                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3804                 goto out;
3805         }
3806
3807         rc = proto_register(&unix_stream_proto, 1);
3808         if (rc != 0) {
3809                 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3810                 proto_unregister(&unix_dgram_proto);
3811                 goto out;
3812         }
3813
3814         sock_register(&unix_family_ops);
3815         register_pernet_subsys(&unix_net_ops);
3816         unix_bpf_build_proto();
3817
3818 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3819         bpf_iter_register();
3820 #endif
3821
3822 out:
3823         return rc;
3824 }
3825
3826 /* Later than subsys_initcall() because we depend on stuff initialised there */
3827 fs_initcall(af_unix_init);