cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

af_unix.c (87996B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * NET4:	Implementation of BSD Unix domain sockets.
      4 *
      5 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
      6 *
      7 * Fixes:
      8 *		Linus Torvalds	:	Assorted bug cures.
      9 *		Niibe Yutaka	:	async I/O support.
     10 *		Carsten Paeth	:	PF_UNIX check, address fixes.
     11 *		Alan Cox	:	Limit size of allocated blocks.
     12 *		Alan Cox	:	Fixed the stupid socketpair bug.
     13 *		Alan Cox	:	BSD compatibility fine tuning.
     14 *		Alan Cox	:	Fixed a bug in connect when interrupted.
     15 *		Alan Cox	:	Sorted out a proper draft version of
     16 *					file descriptor passing hacked up from
     17 *					Mike Shaver's work.
     18 *		Marty Leisner	:	Fixes to fd passing
     19 *		Nick Nevin	:	recvmsg bugfix.
     20 *		Alan Cox	:	Started proper garbage collector
     21 *		Heiko EiBfeldt	:	Missing verify_area check
     22 *		Alan Cox	:	Started POSIXisms
     23 *		Andreas Schwab	:	Replace inode by dentry for proper
     24 *					reference counting
     25 *		Kirk Petersen	:	Made this a module
     26 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
     27 *					Lots of bug fixes.
     28 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
     29 *					by the above two patches.
     30 *	     Andrea Arcangeli	:	If possible we block in connect(2)
     31 *					if the max backlog of the listen socket
     32 *					has been reached. This won't break
     33 *					old apps and it will avoid a huge amount
     34 *					of socks hashed (this is for unix_gc()
     35 *					performance reasons).
     36 *					Security fix that limits the max
     37 *					number of socks to 2*max_files and
     38 *					the number of skb queueable in the
     39 *					dgram receiver.
     40 *		Artur Skawina   :	Hash function optimizations
     41 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
     42 *	      Malcolm Beattie   :	Set peercred for socketpair
     43 *	     Michal Ostrowski   :       Module initialization cleanup.
     44 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
     45 *	     				the core infrastructure is doing that
     46 *	     				for all net proto families now (2.5.69+)
     47 *
     48 * Known differences from reference BSD that was tested:
     49 *
     50 *	[TO FIX]
     51 *	ECONNREFUSED is not returned from one end of a connected() socket to the
     52 *		other the moment one end closes.
     53 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
     54 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
     55 *	[NOT TO FIX]
     56 *	accept() returns a path name even if the connecting socket has closed
     57 *		in the meantime (BSD loses the path and gives up).
     58 *	accept() returns 0 length path for an unbound connector. BSD returns 16
     59 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
     60 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
     61 *	BSD af_unix apparently has connect forgetting to block properly.
     62 *		(need to check this with the POSIX spec in detail)
     63 *
     64 * Differences from 2.0.0-11-... (ANK)
     65 *	Bug fixes and improvements.
     66 *		- client shutdown killed server socket.
     67 *		- removed all useless cli/sti pairs.
     68 *
     69 *	Semantic changes/extensions.
     70 *		- generic control message passing.
     71 *		- SCM_CREDENTIALS control message.
     72 *		- "Abstract" (not FS based) socket bindings.
     73 *		  Abstract names are sequences of bytes (not zero terminated)
     74 *		  started by 0, so that this name space does not intersect
     75 *		  with BSD names.
     76 */
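
As a userspace illustration of the abstract namespace described above (an editor's sketch, not part of af_unix.c; the path and name are made up): a filesystem binding uses a NUL-terminated path in sun_path, while an abstract binding starts sun_path with a zero byte and conveys the name length only through the address length passed to bind().

#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int bind_pathname(int fd)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };

	strncpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path) - 1);
	return bind(fd, (struct sockaddr *)&sun, sizeof(sun));
}

static int bind_abstract(int fd)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	static const char name[] = "example";
	socklen_t len;

	sun.sun_path[0] = '\0';		/* leading zero byte: abstract namespace */
	memcpy(sun.sun_path + 1, name, sizeof(name) - 1);
	len = offsetof(struct sockaddr_un, sun_path) + 1 + (sizeof(name) - 1);
	return bind(fd, (struct sockaddr *)&sun, len);
}

Tools such as ss(8) display abstract names with a leading '@'.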
     77
     78#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     79
     80#include <linux/module.h>
     81#include <linux/kernel.h>
     82#include <linux/signal.h>
     83#include <linux/sched/signal.h>
     84#include <linux/errno.h>
     85#include <linux/string.h>
     86#include <linux/stat.h>
     87#include <linux/dcache.h>
     88#include <linux/namei.h>
     89#include <linux/socket.h>
     90#include <linux/un.h>
     91#include <linux/fcntl.h>
     92#include <linux/filter.h>
     93#include <linux/termios.h>
     94#include <linux/sockios.h>
     95#include <linux/net.h>
     96#include <linux/in.h>
     97#include <linux/fs.h>
     98#include <linux/slab.h>
     99#include <linux/uaccess.h>
    100#include <linux/skbuff.h>
    101#include <linux/netdevice.h>
    102#include <net/net_namespace.h>
    103#include <net/sock.h>
    104#include <net/tcp_states.h>
    105#include <net/af_unix.h>
    106#include <linux/proc_fs.h>
    107#include <linux/seq_file.h>
    108#include <net/scm.h>
    109#include <linux/init.h>
    110#include <linux/poll.h>
    111#include <linux/rtnetlink.h>
    112#include <linux/mount.h>
    113#include <net/checksum.h>
    114#include <linux/security.h>
    115#include <linux/freezer.h>
    116#include <linux/file.h>
    117#include <linux/btf_ids.h>
    118
    119#include "scm.h"
    120
    121spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
    122EXPORT_SYMBOL_GPL(unix_table_locks);
    123struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
    124EXPORT_SYMBOL_GPL(unix_socket_table);
    125static atomic_long_t unix_nr_socks;
    126
    127/* SMP locking strategy:
    128 *    hash table is protected with spinlock unix_table_locks
    129 *    each socket state is protected by separate spin lock.
    130 */
    131
    132static unsigned int unix_unbound_hash(struct sock *sk)
    133{
    134	unsigned long hash = (unsigned long)sk;
    135
    136	hash ^= hash >> 16;
    137	hash ^= hash >> 8;
    138	hash ^= sk->sk_type;
    139
    140	return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
    141}
    142
    143static unsigned int unix_bsd_hash(struct inode *i)
    144{
    145	return i->i_ino & (UNIX_HASH_SIZE - 1);
    146}
    147
    148static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
    149				       int addr_len, int type)
    150{
    151	__wsum csum = csum_partial(sunaddr, addr_len, 0);
    152	unsigned int hash;
    153
    154	hash = (__force unsigned int)csum_fold(csum);
    155	hash ^= hash >> 8;
    156	hash ^= type;
    157
    158	return hash & (UNIX_HASH_SIZE - 1);
    159}
    160
    161static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
    162{
    163	/* hash1 and hash2 are never the same because
    164	 * one is between 0 and UNIX_HASH_SIZE - 1, and
    165	 * the other is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2 - 1.
    166	 */
    167	if (hash1 > hash2)
    168		swap(hash1, hash2);
    169
    170	spin_lock(&unix_table_locks[hash1]);
    171	spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
    172}
    173
    174static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
    175{
    176	spin_unlock(&unix_table_locks[hash1]);
    177	spin_unlock(&unix_table_locks[hash2]);
    178}
    179
    180#ifdef CONFIG_SECURITY_NETWORK
    181static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
    182{
    183	UNIXCB(skb).secid = scm->secid;
    184}
    185
    186static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
    187{
    188	scm->secid = UNIXCB(skb).secid;
    189}
    190
    191static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
    192{
    193	return (scm->secid == UNIXCB(skb).secid);
    194}
    195#else
    196static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
    197{ }
    198
    199static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
    200{ }
    201
    202static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
    203{
    204	return true;
    205}
    206#endif /* CONFIG_SECURITY_NETWORK */
    207
    208#define unix_peer(sk) (unix_sk(sk)->peer)
    209
    210static inline int unix_our_peer(struct sock *sk, struct sock *osk)
    211{
    212	return unix_peer(osk) == sk;
    213}
    214
    215static inline int unix_may_send(struct sock *sk, struct sock *osk)
    216{
    217	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
    218}
    219
    220static inline int unix_recvq_full(const struct sock *sk)
    221{
    222	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
    223}
    224
    225static inline int unix_recvq_full_lockless(const struct sock *sk)
    226{
    227	return skb_queue_len_lockless(&sk->sk_receive_queue) >
    228		READ_ONCE(sk->sk_max_ack_backlog);
    229}
    230
    231struct sock *unix_peer_get(struct sock *s)
    232{
    233	struct sock *peer;
    234
    235	unix_state_lock(s);
    236	peer = unix_peer(s);
    237	if (peer)
    238		sock_hold(peer);
    239	unix_state_unlock(s);
    240	return peer;
    241}
    242EXPORT_SYMBOL_GPL(unix_peer_get);
    243
    244static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
    245					     int addr_len)
    246{
    247	struct unix_address *addr;
    248
    249	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
    250	if (!addr)
    251		return NULL;
    252
    253	refcount_set(&addr->refcnt, 1);
    254	addr->len = addr_len;
    255	memcpy(addr->name, sunaddr, addr_len);
    256
    257	return addr;
    258}
    259
    260static inline void unix_release_addr(struct unix_address *addr)
    261{
    262	if (refcount_dec_and_test(&addr->refcnt))
    263		kfree(addr);
    264}
    265
    266/*
    267 *	Check unix socket name:
    268 *		- it should not be zero length.
    269 *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
    270 *		- if it starts with a zero byte, it is an abstract name.
    271 */
    272
    273static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
    274{
    275	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
    276	    addr_len > sizeof(*sunaddr))
    277		return -EINVAL;
    278
    279	if (sunaddr->sun_family != AF_UNIX)
    280		return -EINVAL;
    281
    282	return 0;
    283}
    284
    285static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
    286{
    287	/* This may look like an off by one error but it is a bit more
    288	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
    289	 * sun_path[108] doesn't as such exist.  However in kernel space
    290	 * we are guaranteed that it is a valid memory location in our
    291	 * kernel address buffer because syscall functions always pass
    292	 * a pointer to a struct sockaddr_storage, which has a bigger buffer
    293	 * than 108 bytes.
    294	 */
    295	((char *)sunaddr)[addr_len] = 0;
    296}
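
The size relationship the comment above relies on can be checked from userspace. A minimal sketch (not part of this file), assuming glibc's usual layouts:

#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
	/* Typically 108, 110 and 128 on Linux: addr_len is capped at
	 * sizeof(struct sockaddr_un), so writing sunaddr[addr_len] stays
	 * inside the kernel's sockaddr_storage-sized buffer.
	 */
	printf("sun_path:         %zu\n", sizeof(((struct sockaddr_un *)0)->sun_path));
	printf("sockaddr_un:      %zu\n", sizeof(struct sockaddr_un));
	printf("sockaddr_storage: %zu\n", sizeof(struct sockaddr_storage));
	return 0;
}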
    297
    298static void __unix_remove_socket(struct sock *sk)
    299{
    300	sk_del_node_init(sk);
    301}
    302
    303static void __unix_insert_socket(struct sock *sk)
    304{
    305	WARN_ON(!sk_unhashed(sk));
    306	sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
    307}
    308
    309static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
    310				 unsigned int hash)
    311{
    312	__unix_remove_socket(sk);
    313	smp_store_release(&unix_sk(sk)->addr, addr);
    314
    315	sk->sk_hash = hash;
    316	__unix_insert_socket(sk);
    317}
    318
    319static void unix_remove_socket(struct sock *sk)
    320{
    321	spin_lock(&unix_table_locks[sk->sk_hash]);
    322	__unix_remove_socket(sk);
    323	spin_unlock(&unix_table_locks[sk->sk_hash]);
    324}
    325
    326static void unix_insert_unbound_socket(struct sock *sk)
    327{
    328	spin_lock(&unix_table_locks[sk->sk_hash]);
    329	__unix_insert_socket(sk);
    330	spin_unlock(&unix_table_locks[sk->sk_hash]);
    331}
    332
    333static struct sock *__unix_find_socket_byname(struct net *net,
    334					      struct sockaddr_un *sunname,
    335					      int len, unsigned int hash)
    336{
    337	struct sock *s;
    338
    339	sk_for_each(s, &unix_socket_table[hash]) {
    340		struct unix_sock *u = unix_sk(s);
    341
    342		if (!net_eq(sock_net(s), net))
    343			continue;
    344
    345		if (u->addr->len == len &&
    346		    !memcmp(u->addr->name, sunname, len))
    347			return s;
    348	}
    349	return NULL;
    350}
    351
    352static inline struct sock *unix_find_socket_byname(struct net *net,
    353						   struct sockaddr_un *sunname,
    354						   int len, unsigned int hash)
    355{
    356	struct sock *s;
    357
    358	spin_lock(&unix_table_locks[hash]);
    359	s = __unix_find_socket_byname(net, sunname, len, hash);
    360	if (s)
    361		sock_hold(s);
    362	spin_unlock(&unix_table_locks[hash]);
    363	return s;
    364}
    365
    366static struct sock *unix_find_socket_byinode(struct inode *i)
    367{
    368	unsigned int hash = unix_bsd_hash(i);
    369	struct sock *s;
    370
    371	spin_lock(&unix_table_locks[hash]);
    372	sk_for_each(s, &unix_socket_table[hash]) {
    373		struct dentry *dentry = unix_sk(s)->path.dentry;
    374
    375		if (dentry && d_backing_inode(dentry) == i) {
    376			sock_hold(s);
    377			spin_unlock(&unix_table_locks[hash]);
    378			return s;
    379		}
    380	}
    381	spin_unlock(&unix_table_locks[hash]);
    382	return NULL;
    383}
    384
    385/* Support code for asymmetrically connected dgram sockets
    386 *
    387 * If a datagram socket is connected to a socket not itself connected
    388 * to the first socket (eg, /dev/log), clients may only enqueue more
    389 * messages if the present receive queue of the server socket is not
    390 * "too large". This means there's a second writeability condition
    391 * poll and sendmsg need to test. The dgram recv code will do a wake
    392 * up on the peer_wait wait queue of a socket upon reception of a
    393 * datagram which needs to be propagated to sleeping would-be writers
    394 * since these might not have sent anything so far. This can't be
    395 * accomplished via poll_wait because the lifetime of the server
    396 * socket might be less than that of its clients if these break their
    397 * association with it or if the server socket is closed while clients
    398 * are still connected to it and there's no way to inform "a polling
    399 * implementation" that it should let go of a certain wait queue.
    400 *
    401 * In order to propagate a wake up, a wait_queue_entry_t of the client
    402 * socket is enqueued on the peer_wait queue of the server socket
    403 * whose wake function does a wake_up on the ordinary client socket
    404 * wait queue. This connection is established whenever a write (or
    405 * poll for write) hits the flow control condition and is broken when the
    406 * association to the server socket is dissolved or after a wake up
    407 * was relayed.
    408 */
    409
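From userspace, the "second writeability condition" described above looks like the following hedged sketch (the helper is illustrative, not part of this file): a connected, non-blocking SOCK_DGRAM sender sees EAGAIN once the receiver's queue is full, then polls for POLLOUT, which is the event the relay code below propagates when the receiver drains its queue.

#include <errno.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Hypothetical helper: send a datagram, sleeping in poll() whenever the
 * connected peer's receive queue is currently full.
 */
static ssize_t send_or_wait(int fd, const void *buf, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };

	for (;;) {
		ssize_t n = send(fd, buf, len, MSG_DONTWAIT);

		if (n >= 0 || errno != EAGAIN)
			return n;
		if (poll(&pfd, 1, -1) < 0)
			return -1;
	}
}
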
    410static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
    411				      void *key)
    412{
    413	struct unix_sock *u;
    414	wait_queue_head_t *u_sleep;
    415
    416	u = container_of(q, struct unix_sock, peer_wake);
    417
    418	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
    419			    q);
    420	u->peer_wake.private = NULL;
    421
    422	/* relaying can only happen while the wq still exists */
    423	u_sleep = sk_sleep(&u->sk);
    424	if (u_sleep)
    425		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
    426
    427	return 0;
    428}
    429
    430static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
    431{
    432	struct unix_sock *u, *u_other;
    433	int rc;
    434
    435	u = unix_sk(sk);
    436	u_other = unix_sk(other);
    437	rc = 0;
    438	spin_lock(&u_other->peer_wait.lock);
    439
    440	if (!u->peer_wake.private) {
    441		u->peer_wake.private = other;
    442		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
    443
    444		rc = 1;
    445	}
    446
    447	spin_unlock(&u_other->peer_wait.lock);
    448	return rc;
    449}
    450
    451static void unix_dgram_peer_wake_disconnect(struct sock *sk,
    452					    struct sock *other)
    453{
    454	struct unix_sock *u, *u_other;
    455
    456	u = unix_sk(sk);
    457	u_other = unix_sk(other);
    458	spin_lock(&u_other->peer_wait.lock);
    459
    460	if (u->peer_wake.private == other) {
    461		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
    462		u->peer_wake.private = NULL;
    463	}
    464
    465	spin_unlock(&u_other->peer_wait.lock);
    466}
    467
    468static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
    469						   struct sock *other)
    470{
    471	unix_dgram_peer_wake_disconnect(sk, other);
    472	wake_up_interruptible_poll(sk_sleep(sk),
    473				   EPOLLOUT |
    474				   EPOLLWRNORM |
    475				   EPOLLWRBAND);
    476}
    477
    478/* preconditions:
    479 *	- unix_peer(sk) == other
    480 *	- association is stable
    481 */
    482static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
    483{
    484	int connected;
    485
    486	connected = unix_dgram_peer_wake_connect(sk, other);
    487
    488	/* If other is SOCK_DEAD, we want to make sure we signal
    489	 * POLLOUT, such that a subsequent write() can get a
    490	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
    491	 * to other and it's full, we will hang waiting for POLLOUT.
    492	 */
    493	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
    494		return 1;
    495
    496	if (connected)
    497		unix_dgram_peer_wake_disconnect(sk, other);
    498
    499	return 0;
    500}
    501
    502static int unix_writable(const struct sock *sk)
    503{
    504	return sk->sk_state != TCP_LISTEN &&
    505	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
    506}
    507
    508static void unix_write_space(struct sock *sk)
    509{
    510	struct socket_wq *wq;
    511
    512	rcu_read_lock();
    513	if (unix_writable(sk)) {
    514		wq = rcu_dereference(sk->sk_wq);
    515		if (skwq_has_sleeper(wq))
    516			wake_up_interruptible_sync_poll(&wq->wait,
    517				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
    518		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
    519	}
    520	rcu_read_unlock();
    521}
    522
    523/* When dgram socket disconnects (or changes its peer), we clear its receive
    524 * queue of packets that arrived from the previous peer. First, it allows us to do
    525 * flow control based only on wmem_alloc; second, an sk connected to a peer
    526 * may receive messages only from that peer. */
    527static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
    528{
    529	if (!skb_queue_empty(&sk->sk_receive_queue)) {
    530		skb_queue_purge(&sk->sk_receive_queue);
    531		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
    532
    533		/* If one link of bidirectional dgram pipe is disconnected,
    534		 * we signal an error. Messages are lost. Do not do this
    535		 * when the peer was not connected to us.
    536		 */
    537		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
    538			other->sk_err = ECONNRESET;
    539			sk_error_report(other);
    540		}
    541	}
    542	other->sk_state = TCP_CLOSE;
    543}
    544
    545static void unix_sock_destructor(struct sock *sk)
    546{
    547	struct unix_sock *u = unix_sk(sk);
    548
    549	skb_queue_purge(&sk->sk_receive_queue);
    550
    551#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
    552	if (u->oob_skb) {
    553		kfree_skb(u->oob_skb);
    554		u->oob_skb = NULL;
    555	}
    556#endif
    557	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
    558	WARN_ON(!sk_unhashed(sk));
    559	WARN_ON(sk->sk_socket);
    560	if (!sock_flag(sk, SOCK_DEAD)) {
    561		pr_info("Attempt to release alive unix socket: %p\n", sk);
    562		return;
    563	}
    564
    565	if (u->addr)
    566		unix_release_addr(u->addr);
    567
    568	atomic_long_dec(&unix_nr_socks);
    569	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
    570#ifdef UNIX_REFCNT_DEBUG
    571	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
    572		atomic_long_read(&unix_nr_socks));
    573#endif
    574}
    575
    576static void unix_release_sock(struct sock *sk, int embrion)
    577{
    578	struct unix_sock *u = unix_sk(sk);
    579	struct path path;
    580	struct sock *skpair;
    581	struct sk_buff *skb;
    582	int state;
    583
    584	unix_remove_socket(sk);
    585
    586	/* Clear state */
    587	unix_state_lock(sk);
    588	sock_orphan(sk);
    589	sk->sk_shutdown = SHUTDOWN_MASK;
    590	path	     = u->path;
    591	u->path.dentry = NULL;
    592	u->path.mnt = NULL;
    593	state = sk->sk_state;
    594	sk->sk_state = TCP_CLOSE;
    595
    596	skpair = unix_peer(sk);
    597	unix_peer(sk) = NULL;
    598
    599	unix_state_unlock(sk);
    600
    601	wake_up_interruptible_all(&u->peer_wait);
    602
    603	if (skpair != NULL) {
    604		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
    605			unix_state_lock(skpair);
    606			/* No more writes */
    607			skpair->sk_shutdown = SHUTDOWN_MASK;
    608			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
    609				skpair->sk_err = ECONNRESET;
    610			unix_state_unlock(skpair);
    611			skpair->sk_state_change(skpair);
    612			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
    613		}
    614
    615		unix_dgram_peer_wake_disconnect(sk, skpair);
    616		sock_put(skpair); /* It may now die */
    617	}
    618
    619	/* Try to flush out this socket. Throw out buffers at least */
    620
    621	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
    622		if (state == TCP_LISTEN)
    623			unix_release_sock(skb->sk, 1);
    624		/* passed fds are erased in the kfree_skb hook	      */
    625		UNIXCB(skb).consumed = skb->len;
    626		kfree_skb(skb);
    627	}
    628
    629	if (path.dentry)
    630		path_put(&path);
    631
    632	sock_put(sk);
    633
    634	/* ---- Socket is dead now and most probably destroyed ---- */
    635
    636	/*
    637	 * Fixme: BSD difference: In BSD all sockets connected to us get
    638	 *	  ECONNRESET and we die on the spot. In Linux we behave
    639	 *	  like files and pipes do and wait for the last
    640	 *	  dereference.
    641	 *
    642	 * Can't we simply set sock->err?
    643	 *
    644	 *	  What does the above comment talk about? --ANK(980817)
    645	 */
    646
    647	if (unix_tot_inflight)
    648		unix_gc();		/* Garbage collect fds */
    649}
    650
    651static void init_peercred(struct sock *sk)
    652{
    653	const struct cred *old_cred;
    654	struct pid *old_pid;
    655
    656	spin_lock(&sk->sk_peer_lock);
    657	old_pid = sk->sk_peer_pid;
    658	old_cred = sk->sk_peer_cred;
    659	sk->sk_peer_pid  = get_pid(task_tgid(current));
    660	sk->sk_peer_cred = get_current_cred();
    661	spin_unlock(&sk->sk_peer_lock);
    662
    663	put_pid(old_pid);
    664	put_cred(old_cred);
    665}
    666
    667static void copy_peercred(struct sock *sk, struct sock *peersk)
    668{
    669	const struct cred *old_cred;
    670	struct pid *old_pid;
    671
    672	if (sk < peersk) {
    673		spin_lock(&sk->sk_peer_lock);
    674		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
    675	} else {
    676		spin_lock(&peersk->sk_peer_lock);
    677		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
    678	}
    679	old_pid = sk->sk_peer_pid;
    680	old_cred = sk->sk_peer_cred;
    681	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
    682	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
    683
    684	spin_unlock(&sk->sk_peer_lock);
    685	spin_unlock(&peersk->sk_peer_lock);
    686
    687	put_pid(old_pid);
    688	put_cred(old_cred);
    689}
    690
    691static int unix_listen(struct socket *sock, int backlog)
    692{
    693	int err;
    694	struct sock *sk = sock->sk;
    695	struct unix_sock *u = unix_sk(sk);
    696
    697	err = -EOPNOTSUPP;
    698	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
    699		goto out;	/* Only stream/seqpacket sockets accept */
    700	err = -EINVAL;
    701	if (!u->addr)
    702		goto out;	/* No listens on an unbound socket */
    703	unix_state_lock(sk);
    704	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
    705		goto out_unlock;
    706	if (backlog > sk->sk_max_ack_backlog)
    707		wake_up_interruptible_all(&u->peer_wait);
    708	sk->sk_max_ack_backlog	= backlog;
    709	sk->sk_state		= TCP_LISTEN;
    710	/* set credentials so connect can copy them */
    711	init_peercred(sk);
    712	err = 0;
    713
    714out_unlock:
    715	unix_state_unlock(sk);
    716out:
    717	return err;
    718}
    719
    720static int unix_release(struct socket *);
    721static int unix_bind(struct socket *, struct sockaddr *, int);
    722static int unix_stream_connect(struct socket *, struct sockaddr *,
    723			       int addr_len, int flags);
    724static int unix_socketpair(struct socket *, struct socket *);
    725static int unix_accept(struct socket *, struct socket *, int, bool);
    726static int unix_getname(struct socket *, struct sockaddr *, int);
    727static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
    728static __poll_t unix_dgram_poll(struct file *, struct socket *,
    729				    poll_table *);
    730static int unix_ioctl(struct socket *, unsigned int, unsigned long);
    731#ifdef CONFIG_COMPAT
    732static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
    733#endif
    734static int unix_shutdown(struct socket *, int);
    735static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
    736static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
    737static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
    738				    size_t size, int flags);
    739static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
    740				       struct pipe_inode_info *, size_t size,
    741				       unsigned int flags);
    742static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
    743static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
    744static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
    745			  sk_read_actor_t recv_actor);
    746static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
    747				 sk_read_actor_t recv_actor);
    748static int unix_dgram_connect(struct socket *, struct sockaddr *,
    749			      int, int);
    750static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
    751static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
    752				  int);
    753
    754static int unix_set_peek_off(struct sock *sk, int val)
    755{
    756	struct unix_sock *u = unix_sk(sk);
    757
    758	if (mutex_lock_interruptible(&u->iolock))
    759		return -EINTR;
    760
    761	sk->sk_peek_off = val;
    762	mutex_unlock(&u->iolock);
    763
    764	return 0;
    765}
    766
    767#ifdef CONFIG_PROC_FS
    768static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
    769{
    770	struct sock *sk = sock->sk;
    771	struct unix_sock *u;
    772
    773	if (sk) {
    774		u = unix_sk(sock->sk);
    775		seq_printf(m, "scm_fds: %u\n",
    776			   atomic_read(&u->scm_stat.nr_fds));
    777	}
    778}
    779#else
    780#define unix_show_fdinfo NULL
    781#endif
    782
    783static const struct proto_ops unix_stream_ops = {
    784	.family =	PF_UNIX,
    785	.owner =	THIS_MODULE,
    786	.release =	unix_release,
    787	.bind =		unix_bind,
    788	.connect =	unix_stream_connect,
    789	.socketpair =	unix_socketpair,
    790	.accept =	unix_accept,
    791	.getname =	unix_getname,
    792	.poll =		unix_poll,
    793	.ioctl =	unix_ioctl,
    794#ifdef CONFIG_COMPAT
    795	.compat_ioctl =	unix_compat_ioctl,
    796#endif
    797	.listen =	unix_listen,
    798	.shutdown =	unix_shutdown,
    799	.sendmsg =	unix_stream_sendmsg,
    800	.recvmsg =	unix_stream_recvmsg,
    801	.read_sock =	unix_stream_read_sock,
    802	.mmap =		sock_no_mmap,
    803	.sendpage =	unix_stream_sendpage,
    804	.splice_read =	unix_stream_splice_read,
    805	.set_peek_off =	unix_set_peek_off,
    806	.show_fdinfo =	unix_show_fdinfo,
    807};
    808
    809static const struct proto_ops unix_dgram_ops = {
    810	.family =	PF_UNIX,
    811	.owner =	THIS_MODULE,
    812	.release =	unix_release,
    813	.bind =		unix_bind,
    814	.connect =	unix_dgram_connect,
    815	.socketpair =	unix_socketpair,
    816	.accept =	sock_no_accept,
    817	.getname =	unix_getname,
    818	.poll =		unix_dgram_poll,
    819	.ioctl =	unix_ioctl,
    820#ifdef CONFIG_COMPAT
    821	.compat_ioctl =	unix_compat_ioctl,
    822#endif
    823	.listen =	sock_no_listen,
    824	.shutdown =	unix_shutdown,
    825	.sendmsg =	unix_dgram_sendmsg,
    826	.read_sock =	unix_read_sock,
    827	.recvmsg =	unix_dgram_recvmsg,
    828	.mmap =		sock_no_mmap,
    829	.sendpage =	sock_no_sendpage,
    830	.set_peek_off =	unix_set_peek_off,
    831	.show_fdinfo =	unix_show_fdinfo,
    832};
    833
    834static const struct proto_ops unix_seqpacket_ops = {
    835	.family =	PF_UNIX,
    836	.owner =	THIS_MODULE,
    837	.release =	unix_release,
    838	.bind =		unix_bind,
    839	.connect =	unix_stream_connect,
    840	.socketpair =	unix_socketpair,
    841	.accept =	unix_accept,
    842	.getname =	unix_getname,
    843	.poll =		unix_dgram_poll,
    844	.ioctl =	unix_ioctl,
    845#ifdef CONFIG_COMPAT
    846	.compat_ioctl =	unix_compat_ioctl,
    847#endif
    848	.listen =	unix_listen,
    849	.shutdown =	unix_shutdown,
    850	.sendmsg =	unix_seqpacket_sendmsg,
    851	.recvmsg =	unix_seqpacket_recvmsg,
    852	.mmap =		sock_no_mmap,
    853	.sendpage =	sock_no_sendpage,
    854	.set_peek_off =	unix_set_peek_off,
    855	.show_fdinfo =	unix_show_fdinfo,
    856};
    857
    858static void unix_close(struct sock *sk, long timeout)
    859{
    860	/* Nothing to do here, unix socket does not need a ->close().
    861	 * This is merely for sockmap.
    862	 */
    863}
    864
    865static void unix_unhash(struct sock *sk)
    866{
    867	/* Nothing to do here, unix socket does not need a ->unhash().
    868	 * This is merely for sockmap.
    869	 */
    870}
    871
    872struct proto unix_dgram_proto = {
    873	.name			= "UNIX",
    874	.owner			= THIS_MODULE,
    875	.obj_size		= sizeof(struct unix_sock),
    876	.close			= unix_close,
    877#ifdef CONFIG_BPF_SYSCALL
    878	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
    879#endif
    880};
    881
    882struct proto unix_stream_proto = {
    883	.name			= "UNIX-STREAM",
    884	.owner			= THIS_MODULE,
    885	.obj_size		= sizeof(struct unix_sock),
    886	.close			= unix_close,
    887	.unhash			= unix_unhash,
    888#ifdef CONFIG_BPF_SYSCALL
    889	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
    890#endif
    891};
    892
    893static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
    894{
    895	struct unix_sock *u;
    896	struct sock *sk;
    897	int err;
    898
    899	atomic_long_inc(&unix_nr_socks);
    900	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
    901		err = -ENFILE;
    902		goto err;
    903	}
    904
    905	if (type == SOCK_STREAM)
    906		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
    907	else /*dgram and  seqpacket */
    908		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
    909
    910	if (!sk) {
    911		err = -ENOMEM;
    912		goto err;
    913	}
    914
    915	sock_init_data(sock, sk);
    916
    917	sk->sk_hash		= unix_unbound_hash(sk);
    918	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
    919	sk->sk_write_space	= unix_write_space;
    920	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
    921	sk->sk_destruct		= unix_sock_destructor;
    922	u	  = unix_sk(sk);
    923	u->path.dentry = NULL;
    924	u->path.mnt = NULL;
    925	spin_lock_init(&u->lock);
    926	atomic_long_set(&u->inflight, 0);
    927	INIT_LIST_HEAD(&u->link);
    928	mutex_init(&u->iolock); /* single task reading lock */
    929	mutex_init(&u->bindlock); /* single task binding lock */
    930	init_waitqueue_head(&u->peer_wait);
    931	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
    932	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
    933	unix_insert_unbound_socket(sk);
    934
    935	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
    936
    937	return sk;
    938
    939err:
    940	atomic_long_dec(&unix_nr_socks);
    941	return ERR_PTR(err);
    942}
    943
    944static int unix_create(struct net *net, struct socket *sock, int protocol,
    945		       int kern)
    946{
    947	struct sock *sk;
    948
    949	if (protocol && protocol != PF_UNIX)
    950		return -EPROTONOSUPPORT;
    951
    952	sock->state = SS_UNCONNECTED;
    953
    954	switch (sock->type) {
    955	case SOCK_STREAM:
    956		sock->ops = &unix_stream_ops;
    957		break;
    958		/*
    959		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
    960		 *	nothing uses it.
    961		 */
    962	case SOCK_RAW:
    963		sock->type = SOCK_DGRAM;
    964		fallthrough;
    965	case SOCK_DGRAM:
    966		sock->ops = &unix_dgram_ops;
    967		break;
    968	case SOCK_SEQPACKET:
    969		sock->ops = &unix_seqpacket_ops;
    970		break;
    971	default:
    972		return -ESOCKTNOSUPPORT;
    973	}
    974
    975	sk = unix_create1(net, sock, kern, sock->type);
    976	if (IS_ERR(sk))
    977		return PTR_ERR(sk);
    978
    979	return 0;
    980}
    981
    982static int unix_release(struct socket *sock)
    983{
    984	struct sock *sk = sock->sk;
    985
    986	if (!sk)
    987		return 0;
    988
    989	sk->sk_prot->close(sk, 0);
    990	unix_release_sock(sk, 0);
    991	sock->sk = NULL;
    992
    993	return 0;
    994}
    995
    996static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
    997				  int addr_len, int type)
    998{
    999	struct inode *inode;
   1000	struct path path;
   1001	struct sock *sk;
   1002	int err;
   1003
   1004	unix_mkname_bsd(sunaddr, addr_len);
   1005	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
   1006	if (err)
   1007		goto fail;
   1008
   1009	err = path_permission(&path, MAY_WRITE);
   1010	if (err)
   1011		goto path_put;
   1012
   1013	err = -ECONNREFUSED;
   1014	inode = d_backing_inode(path.dentry);
   1015	if (!S_ISSOCK(inode->i_mode))
   1016		goto path_put;
   1017
   1018	sk = unix_find_socket_byinode(inode);
   1019	if (!sk)
   1020		goto path_put;
   1021
   1022	err = -EPROTOTYPE;
   1023	if (sk->sk_type == type)
   1024		touch_atime(&path);
   1025	else
   1026		goto sock_put;
   1027
   1028	path_put(&path);
   1029
   1030	return sk;
   1031
   1032sock_put:
   1033	sock_put(sk);
   1034path_put:
   1035	path_put(&path);
   1036fail:
   1037	return ERR_PTR(err);
   1038}
   1039
   1040static struct sock *unix_find_abstract(struct net *net,
   1041				       struct sockaddr_un *sunaddr,
   1042				       int addr_len, int type)
   1043{
   1044	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
   1045	struct dentry *dentry;
   1046	struct sock *sk;
   1047
   1048	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
   1049	if (!sk)
   1050		return ERR_PTR(-ECONNREFUSED);
   1051
   1052	dentry = unix_sk(sk)->path.dentry;
   1053	if (dentry)
   1054		touch_atime(&unix_sk(sk)->path);
   1055
   1056	return sk;
   1057}
   1058
   1059static struct sock *unix_find_other(struct net *net,
   1060				    struct sockaddr_un *sunaddr,
   1061				    int addr_len, int type)
   1062{
   1063	struct sock *sk;
   1064
   1065	if (sunaddr->sun_path[0])
   1066		sk = unix_find_bsd(net, sunaddr, addr_len, type);
   1067	else
   1068		sk = unix_find_abstract(net, sunaddr, addr_len, type);
   1069
   1070	return sk;
   1071}
   1072
   1073static int unix_autobind(struct sock *sk)
   1074{
   1075	unsigned int new_hash, old_hash = sk->sk_hash;
   1076	struct unix_sock *u = unix_sk(sk);
   1077	struct unix_address *addr;
   1078	u32 lastnum, ordernum;
   1079	int err;
   1080
   1081	err = mutex_lock_interruptible(&u->bindlock);
   1082	if (err)
   1083		return err;
   1084
   1085	if (u->addr)
   1086		goto out;
   1087
   1088	err = -ENOMEM;
   1089	addr = kzalloc(sizeof(*addr) +
   1090		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
   1091	if (!addr)
   1092		goto out;
   1093
   1094	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
   1095	addr->name->sun_family = AF_UNIX;
   1096	refcount_set(&addr->refcnt, 1);
   1097
   1098	ordernum = prandom_u32();
   1099	lastnum = ordernum & 0xFFFFF;
   1100retry:
   1101	ordernum = (ordernum + 1) & 0xFFFFF;
   1102	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
   1103
   1104	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
   1105	unix_table_double_lock(old_hash, new_hash);
   1106
   1107	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
   1108				      new_hash)) {
   1109		unix_table_double_unlock(old_hash, new_hash);
   1110
   1111		/* __unix_find_socket_byname() may take a long time if many names
   1112		 * are already in use.
   1113		 */
   1114		cond_resched();
   1115
   1116		if (ordernum == lastnum) {
   1117			/* Give up if all names seem to be in use. */
   1118			err = -ENOSPC;
   1119			unix_release_addr(addr);
   1120			goto out;
   1121		}
   1122
   1123		goto retry;
   1124	}
   1125
   1126	__unix_set_addr_hash(sk, addr, new_hash);
   1127	unix_table_double_unlock(old_hash, new_hash);
   1128	err = 0;
   1129
   1130out:	mutex_unlock(&u->bindlock);
   1131	return err;
   1132}
   1133
   1134static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
   1135			 int addr_len)
   1136{
   1137	umode_t mode = S_IFSOCK |
   1138	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
   1139	unsigned int new_hash, old_hash = sk->sk_hash;
   1140	struct unix_sock *u = unix_sk(sk);
   1141	struct user_namespace *ns; // barf...
   1142	struct unix_address *addr;
   1143	struct dentry *dentry;
   1144	struct path parent;
   1145	int err;
   1146
   1147	unix_mkname_bsd(sunaddr, addr_len);
   1148	addr_len = strlen(sunaddr->sun_path) +
   1149		offsetof(struct sockaddr_un, sun_path) + 1;
   1150
   1151	addr = unix_create_addr(sunaddr, addr_len);
   1152	if (!addr)
   1153		return -ENOMEM;
   1154
   1155	/*
   1156	 * Get the parent directory, calculate the hash for last
   1157	 * component.
   1158	 */
   1159	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
   1160	if (IS_ERR(dentry)) {
   1161		err = PTR_ERR(dentry);
   1162		goto out;
   1163	}
   1164
   1165	/*
   1166	 * All right, let's create it.
   1167	 */
   1168	ns = mnt_user_ns(parent.mnt);
   1169	err = security_path_mknod(&parent, dentry, mode, 0);
   1170	if (!err)
   1171		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
   1172	if (err)
   1173		goto out_path;
   1174	err = mutex_lock_interruptible(&u->bindlock);
   1175	if (err)
   1176		goto out_unlink;
   1177	if (u->addr)
   1178		goto out_unlock;
   1179
   1180	new_hash = unix_bsd_hash(d_backing_inode(dentry));
   1181	unix_table_double_lock(old_hash, new_hash);
   1182	u->path.mnt = mntget(parent.mnt);
   1183	u->path.dentry = dget(dentry);
   1184	__unix_set_addr_hash(sk, addr, new_hash);
   1185	unix_table_double_unlock(old_hash, new_hash);
   1186	mutex_unlock(&u->bindlock);
   1187	done_path_create(&parent, dentry);
   1188	return 0;
   1189
   1190out_unlock:
   1191	mutex_unlock(&u->bindlock);
   1192	err = -EINVAL;
   1193out_unlink:
   1194	/* failed after successful mknod?  unlink what we'd created... */
   1195	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
   1196out_path:
   1197	done_path_create(&parent, dentry);
   1198out:
   1199	unix_release_addr(addr);
   1200	return err == -EEXIST ? -EADDRINUSE : err;
   1201}
   1202
   1203static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
   1204			      int addr_len)
   1205{
   1206	unsigned int new_hash, old_hash = sk->sk_hash;
   1207	struct unix_sock *u = unix_sk(sk);
   1208	struct unix_address *addr;
   1209	int err;
   1210
   1211	addr = unix_create_addr(sunaddr, addr_len);
   1212	if (!addr)
   1213		return -ENOMEM;
   1214
   1215	err = mutex_lock_interruptible(&u->bindlock);
   1216	if (err)
   1217		goto out;
   1218
   1219	if (u->addr) {
   1220		err = -EINVAL;
   1221		goto out_mutex;
   1222	}
   1223
   1224	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
   1225	unix_table_double_lock(old_hash, new_hash);
   1226
   1227	if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
   1228				      new_hash))
   1229		goto out_spin;
   1230
   1231	__unix_set_addr_hash(sk, addr, new_hash);
   1232	unix_table_double_unlock(old_hash, new_hash);
   1233	mutex_unlock(&u->bindlock);
   1234	return 0;
   1235
   1236out_spin:
   1237	unix_table_double_unlock(old_hash, new_hash);
   1238	err = -EADDRINUSE;
   1239out_mutex:
   1240	mutex_unlock(&u->bindlock);
   1241out:
   1242	unix_release_addr(addr);
   1243	return err;
   1244}
   1245
   1246static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
   1247{
   1248	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
   1249	struct sock *sk = sock->sk;
   1250	int err;
   1251
   1252	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
   1253	    sunaddr->sun_family == AF_UNIX)
   1254		return unix_autobind(sk);
   1255
   1256	err = unix_validate_addr(sunaddr, addr_len);
   1257	if (err)
   1258		return err;
   1259
   1260	if (sunaddr->sun_path[0])
   1261		err = unix_bind_bsd(sk, sunaddr, addr_len);
   1262	else
   1263		err = unix_bind_abstract(sk, sunaddr, addr_len);
   1264
   1265	return err;
   1266}
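
A hedged userspace sketch of the autobind branch above (illustrative, not part of this file): calling bind() with only the address family makes the kernel pick a unique abstract name of five hex digits, which getsockname() then reports.

#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	socklen_t len = sizeof(sun);
	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);

	/* addr_len == sizeof(sa_family_t) requests autobind. */
	if (fd >= 0 &&
	    bind(fd, (struct sockaddr *)&sun, sizeof(sa_family_t)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&sun, &len) == 0)
		printf("autobound to abstract \"%.5s\" (addr len %d)\n",
		       sun.sun_path + 1, (int)len);
	return 0;
}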
   1267
   1268static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
   1269{
   1270	if (unlikely(sk1 == sk2) || !sk2) {
   1271		unix_state_lock(sk1);
   1272		return;
   1273	}
   1274	if (sk1 < sk2) {
   1275		unix_state_lock(sk1);
   1276		unix_state_lock_nested(sk2);
   1277	} else {
   1278		unix_state_lock(sk2);
   1279		unix_state_lock_nested(sk1);
   1280	}
   1281}
   1282
   1283static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
   1284{
   1285	if (unlikely(sk1 == sk2) || !sk2) {
   1286		unix_state_unlock(sk1);
   1287		return;
   1288	}
   1289	unix_state_unlock(sk1);
   1290	unix_state_unlock(sk2);
   1291}
   1292
   1293static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
   1294			      int alen, int flags)
   1295{
   1296	struct sock *sk = sock->sk;
   1297	struct net *net = sock_net(sk);
   1298	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
   1299	struct sock *other;
   1300	int err;
   1301
   1302	err = -EINVAL;
   1303	if (alen < offsetofend(struct sockaddr, sa_family))
   1304		goto out;
   1305
   1306	if (addr->sa_family != AF_UNSPEC) {
   1307		err = unix_validate_addr(sunaddr, alen);
   1308		if (err)
   1309			goto out;
   1310
   1311		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
   1312		    !unix_sk(sk)->addr) {
   1313			err = unix_autobind(sk);
   1314			if (err)
   1315				goto out;
   1316		}
   1317
   1318restart:
   1319		other = unix_find_other(net, sunaddr, alen, sock->type);
   1320		if (IS_ERR(other)) {
   1321			err = PTR_ERR(other);
   1322			goto out;
   1323		}
   1324
   1325		unix_state_double_lock(sk, other);
   1326
   1327		/* Apparently VFS overslept socket death. Retry. */
   1328		if (sock_flag(other, SOCK_DEAD)) {
   1329			unix_state_double_unlock(sk, other);
   1330			sock_put(other);
   1331			goto restart;
   1332		}
   1333
   1334		err = -EPERM;
   1335		if (!unix_may_send(sk, other))
   1336			goto out_unlock;
   1337
   1338		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
   1339		if (err)
   1340			goto out_unlock;
   1341
   1342		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
   1343	} else {
   1344		/*
   1345		 *	1003.1g breaking connected state with AF_UNSPEC
   1346		 */
   1347		other = NULL;
   1348		unix_state_double_lock(sk, other);
   1349	}
   1350
   1351	/*
   1352	 * If it was connected, reconnect.
   1353	 */
   1354	if (unix_peer(sk)) {
   1355		struct sock *old_peer = unix_peer(sk);
   1356
   1357		unix_peer(sk) = other;
   1358		if (!other)
   1359			sk->sk_state = TCP_CLOSE;
   1360		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
   1361
   1362		unix_state_double_unlock(sk, other);
   1363
   1364		if (other != old_peer)
   1365			unix_dgram_disconnected(sk, old_peer);
   1366		sock_put(old_peer);
   1367	} else {
   1368		unix_peer(sk) = other;
   1369		unix_state_double_unlock(sk, other);
   1370	}
   1371
   1372	return 0;
   1373
   1374out_unlock:
   1375	unix_state_double_unlock(sk, other);
   1376	sock_put(other);
   1377out:
   1378	return err;
   1379}
   1380
   1381static long unix_wait_for_peer(struct sock *other, long timeo)
   1382	__releases(&unix_sk(other)->lock)
   1383{
   1384	struct unix_sock *u = unix_sk(other);
   1385	int sched;
   1386	DEFINE_WAIT(wait);
   1387
   1388	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
   1389
   1390	sched = !sock_flag(other, SOCK_DEAD) &&
   1391		!(other->sk_shutdown & RCV_SHUTDOWN) &&
   1392		unix_recvq_full(other);
   1393
   1394	unix_state_unlock(other);
   1395
   1396	if (sched)
   1397		timeo = schedule_timeout(timeo);
   1398
   1399	finish_wait(&u->peer_wait, &wait);
   1400	return timeo;
   1401}
   1402
   1403static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
   1404			       int addr_len, int flags)
   1405{
   1406	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
   1407	struct sock *sk = sock->sk;
   1408	struct net *net = sock_net(sk);
   1409	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
   1410	struct sock *newsk = NULL;
   1411	struct sock *other = NULL;
   1412	struct sk_buff *skb = NULL;
   1413	int st;
   1414	int err;
   1415	long timeo;
   1416
   1417	err = unix_validate_addr(sunaddr, addr_len);
   1418	if (err)
   1419		goto out;
   1420
   1421	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
   1422		err = unix_autobind(sk);
   1423		if (err)
   1424			goto out;
   1425	}
   1426
   1427	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
   1428
   1429	/* First of all allocate resources.
   1430	   If we do it after the state is locked,
   1431	   we will have to recheck everything again in any case.
   1432	 */
   1433
   1434	/* create new sock for complete connection */
   1435	newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
   1436	if (IS_ERR(newsk)) {
   1437		err = PTR_ERR(newsk);
   1438		newsk = NULL;
   1439		goto out;
   1440	}
   1441
   1442	err = -ENOMEM;
   1443
   1444	/* Allocate skb for sending to listening sock */
   1445	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
   1446	if (skb == NULL)
   1447		goto out;
   1448
   1449restart:
   1450	/*  Find listening sock. */
   1451	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
   1452	if (IS_ERR(other)) {
   1453		err = PTR_ERR(other);
   1454		other = NULL;
   1455		goto out;
   1456	}
   1457
   1458	/* Latch state of peer */
   1459	unix_state_lock(other);
   1460
   1461	/* Apparently VFS overslept socket death. Retry. */
   1462	if (sock_flag(other, SOCK_DEAD)) {
   1463		unix_state_unlock(other);
   1464		sock_put(other);
   1465		goto restart;
   1466	}
   1467
   1468	err = -ECONNREFUSED;
   1469	if (other->sk_state != TCP_LISTEN)
   1470		goto out_unlock;
   1471	if (other->sk_shutdown & RCV_SHUTDOWN)
   1472		goto out_unlock;
   1473
   1474	if (unix_recvq_full(other)) {
   1475		err = -EAGAIN;
   1476		if (!timeo)
   1477			goto out_unlock;
   1478
   1479		timeo = unix_wait_for_peer(other, timeo);
   1480
   1481		err = sock_intr_errno(timeo);
   1482		if (signal_pending(current))
   1483			goto out;
   1484		sock_put(other);
   1485		goto restart;
   1486	}
   1487
   1488	/* Latch our state.
   1489
   1490	   This is a tricky place. We need to grab our state lock and cannot
   1491	   drop the lock on the peer. It is dangerous because a deadlock is
   1492	   possible. The connect-to-self case and simultaneous
   1493	   attempts to connect are eliminated by checking the socket
   1494	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
   1495	   check this before attempting to grab the lock.
   1496
   1497	   Well, and we have to recheck the state after the socket is locked.
   1498	 */
   1499	st = sk->sk_state;
   1500
   1501	switch (st) {
   1502	case TCP_CLOSE:
   1503		/* This is ok... continue with connect */
   1504		break;
   1505	case TCP_ESTABLISHED:
   1506		/* Socket is already connected */
   1507		err = -EISCONN;
   1508		goto out_unlock;
   1509	default:
   1510		err = -EINVAL;
   1511		goto out_unlock;
   1512	}
   1513
   1514	unix_state_lock_nested(sk);
   1515
   1516	if (sk->sk_state != st) {
   1517		unix_state_unlock(sk);
   1518		unix_state_unlock(other);
   1519		sock_put(other);
   1520		goto restart;
   1521	}
   1522
   1523	err = security_unix_stream_connect(sk, other, newsk);
   1524	if (err) {
   1525		unix_state_unlock(sk);
   1526		goto out_unlock;
   1527	}
   1528
   1529	/* The way is open! Quickly set all the necessary fields... */
   1530
   1531	sock_hold(sk);
   1532	unix_peer(newsk)	= sk;
   1533	newsk->sk_state		= TCP_ESTABLISHED;
   1534	newsk->sk_type		= sk->sk_type;
   1535	init_peercred(newsk);
   1536	newu = unix_sk(newsk);
   1537	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
   1538	otheru = unix_sk(other);
   1539
   1540	/* copy address information from listening to new sock
   1541	 *
   1542	 * The contents of *(otheru->addr) and otheru->path
   1543	 * are seen fully set up here, since we have found
   1544	 * otheru in hash under unix_table_locks.  Insertion
   1545	 * into the hash chain we'd found it in had been done
   1546	 * in an earlier critical area protected by unix_table_locks,
   1547	 * the same one where we'd set *(otheru->addr) contents,
   1548	 * as well as otheru->path and otheru->addr itself.
   1549	 *
   1550	 * Using smp_store_release() here to set newu->addr
   1551	 * is enough to make those stores, as well as stores
   1552	 * to newu->path visible to anyone who gets newu->addr
   1553	 * by smp_load_acquire().  IOW, the same guarantees
   1554	 * as for unix_sock instances bound in unix_bind() or
   1555	 * in unix_autobind().
   1556	 */
   1557	if (otheru->path.dentry) {
   1558		path_get(&otheru->path);
   1559		newu->path = otheru->path;
   1560	}
   1561	refcount_inc(&otheru->addr->refcnt);
   1562	smp_store_release(&newu->addr, otheru->addr);
   1563
   1564	/* Set credentials */
   1565	copy_peercred(sk, other);
   1566
   1567	sock->state	= SS_CONNECTED;
   1568	sk->sk_state	= TCP_ESTABLISHED;
   1569	sock_hold(newsk);
   1570
   1571	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
   1572	unix_peer(sk)	= newsk;
   1573
   1574	unix_state_unlock(sk);
   1575
   1576	/* take ten and send info to listening sock */
   1577	spin_lock(&other->sk_receive_queue.lock);
   1578	__skb_queue_tail(&other->sk_receive_queue, skb);
   1579	spin_unlock(&other->sk_receive_queue.lock);
   1580	unix_state_unlock(other);
   1581	other->sk_data_ready(other);
   1582	sock_put(other);
   1583	return 0;
   1584
   1585out_unlock:
   1586	if (other)
   1587		unix_state_unlock(other);
   1588
   1589out:
   1590	kfree_skb(skb);
   1591	if (newsk)
   1592		unix_release_sock(newsk, 0);
   1593	if (other)
   1594		sock_put(other);
   1595	return err;
   1596}
   1597
   1598static int unix_socketpair(struct socket *socka, struct socket *sockb)
   1599{
   1600	struct sock *ska = socka->sk, *skb = sockb->sk;
   1601
   1602	/* Join our sockets back to back */
   1603	sock_hold(ska);
   1604	sock_hold(skb);
   1605	unix_peer(ska) = skb;
   1606	unix_peer(skb) = ska;
   1607	init_peercred(ska);
   1608	init_peercred(skb);
   1609
   1610	ska->sk_state = TCP_ESTABLISHED;
   1611	skb->sk_state = TCP_ESTABLISHED;
   1612	socka->state  = SS_CONNECTED;
   1613	sockb->state  = SS_CONNECTED;
   1614	return 0;
   1615}
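
unix_socketpair() is what backs socketpair(2) for AF_UNIX; a minimal usage sketch from userspace (illustrative, not part of this file):

#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	char buf[4];

	/* Two already-connected AF_UNIX sockets; no bind/listen/accept needed. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;

	if (write(sv[0], "ping", 4) == 4)
		read(sv[1], buf, sizeof(buf));

	close(sv[0]);
	close(sv[1]);
	return 0;
}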
   1616
   1617static void unix_sock_inherit_flags(const struct socket *old,
   1618				    struct socket *new)
   1619{
   1620	if (test_bit(SOCK_PASSCRED, &old->flags))
   1621		set_bit(SOCK_PASSCRED, &new->flags);
   1622	if (test_bit(SOCK_PASSSEC, &old->flags))
   1623		set_bit(SOCK_PASSSEC, &new->flags);
   1624}
   1625
   1626static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
   1627		       bool kern)
   1628{
   1629	struct sock *sk = sock->sk;
   1630	struct sock *tsk;
   1631	struct sk_buff *skb;
   1632	int err;
   1633
   1634	err = -EOPNOTSUPP;
   1635	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
   1636		goto out;
   1637
   1638	err = -EINVAL;
   1639	if (sk->sk_state != TCP_LISTEN)
   1640		goto out;
   1641
   1642	/* If socket state is TCP_LISTEN it cannot change (for now...),
   1643	 * so that no locks are necessary.
   1644	 */
   1645
   1646	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
   1647				&err);
   1648	if (!skb) {
   1649		/* This means receive shutdown. */
   1650		if (err == 0)
   1651			err = -EINVAL;
   1652		goto out;
   1653	}
   1654
   1655	tsk = skb->sk;
   1656	skb_free_datagram(sk, skb);
   1657	wake_up_interruptible(&unix_sk(sk)->peer_wait);
   1658
   1659	/* attach accepted sock to socket */
   1660	unix_state_lock(tsk);
   1661	newsock->state = SS_CONNECTED;
   1662	unix_sock_inherit_flags(sock, newsock);
   1663	sock_graft(tsk, newsock);
   1664	unix_state_unlock(tsk);
   1665	return 0;
   1666
   1667out:
   1668	return err;
   1669}
   1670
   1671
   1672static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
   1673{
   1674	struct sock *sk = sock->sk;
   1675	struct unix_address *addr;
   1676	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
   1677	int err = 0;
   1678
   1679	if (peer) {
   1680		sk = unix_peer_get(sk);
   1681
   1682		err = -ENOTCONN;
   1683		if (!sk)
   1684			goto out;
   1685		err = 0;
   1686	} else {
   1687		sock_hold(sk);
   1688	}
   1689
   1690	addr = smp_load_acquire(&unix_sk(sk)->addr);
   1691	if (!addr) {
   1692		sunaddr->sun_family = AF_UNIX;
   1693		sunaddr->sun_path[0] = 0;
   1694		err = offsetof(struct sockaddr_un, sun_path);
   1695	} else {
   1696		err = addr->len;
   1697		memcpy(sunaddr, addr->name, addr->len);
   1698	}
   1699	sock_put(sk);
   1700out:
   1701	return err;
   1702}
   1703
   1704static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
   1705{
   1706	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
   1707
   1708	/*
   1709	 * Garbage collection of unix sockets starts by selecting a set of
   1710	 * candidate sockets which have reference only from being in flight
   1711	 * (total_refs == inflight_refs).  This condition is checked once during
   1712	 * the candidate collection phase, and candidates are marked as such, so
   1713	 * that non-candidates can later be ignored.  While inflight_refs is
   1714	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
   1715	 * is an instantaneous decision.
   1716	 *
   1717	 * Once a candidate, however, the socket must not be reinstalled into a
   1718	 * file descriptor while the garbage collection is in progress.
   1719	 *
   1720	 * If the above conditions are met, then the directed graph of
   1721	 * candidates (*) does not change while unix_gc_lock is held.
   1722	 *
   1723	 * Any operation that changes the file count through file descriptors
   1724	 * (dup, close, sendmsg) does not change the graph since candidates are
   1725	 * not installed in fds.
   1726	 *
   1727	 * Dequeuing a candidate via recvmsg would install it into an fd, but
   1728	 * that takes unix_gc_lock to decrement the inflight count, so it's
   1729	 * serialized with garbage collection.
   1730	 *
   1731	 * MSG_PEEK is special in that it does not change the inflight count,
   1732	 * yet does install the socket into an fd.  The following lock/unlock
   1733	 * pair is to ensure serialization with garbage collection.  It must be
   1734	 * done between incrementing the file count and installing the file into
   1735	 * an fd.
   1736	 *
   1737	 * If garbage collection starts after the barrier provided by the
   1738	 * lock/unlock, then it will see the elevated refcount and not mark this
   1739	 * as a candidate.  If a garbage collection is already in progress
   1740	 * before the file count was incremented, then the lock/unlock pair will
   1741	 * ensure that garbage collection is finished before progressing to
   1742	 * installing the fd.
   1743	 *
   1744	 * (*) A -> B where B is on the queue of A or B is on the queue of C
   1745	 * which is on the queue of listening socket A.
   1746	 */
   1747	spin_lock(&unix_gc_lock);
   1748	spin_unlock(&unix_gc_lock);
   1749}
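/*
 * Userspace view of the MSG_PEEK case discussed above, as a hedged sketch
 * (not part of this file): a peeking recvmsg() duplicates any SCM_RIGHTS
 * descriptors into the caller while leaving the message queued, which is
 * exactly the window the lock/unlock pair serializes against.  The helper
 * name is hypothetical.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static int peek_passed_fd(int sock)
 *	{
 *		char data[1], cbuf[CMSG_SPACE(sizeof(int))];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int fd = -1;
 *
 *		if (recvmsg(sock, &msg, MSG_PEEK) < 0)
 *			return -1;
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_RIGHTS)
 *				memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
 *		return fd;	// original message stays queued
 *	}
 */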
   1750
   1751static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
   1752{
   1753	int err = 0;
   1754
   1755	UNIXCB(skb).pid  = get_pid(scm->pid);
   1756	UNIXCB(skb).uid = scm->creds.uid;
   1757	UNIXCB(skb).gid = scm->creds.gid;
   1758	UNIXCB(skb).fp = NULL;
   1759	unix_get_secdata(scm, skb);
   1760	if (scm->fp && send_fds)
   1761		err = unix_attach_fds(scm, skb);
   1762
   1763	skb->destructor = unix_destruct_scm;
   1764	return err;
   1765}
   1766
   1767static bool unix_passcred_enabled(const struct socket *sock,
   1768				  const struct sock *other)
   1769{
   1770	return test_bit(SOCK_PASSCRED, &sock->flags) ||
   1771	       !other->sk_socket ||
   1772	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
   1773}
   1774
   1775/*
    1776 * Some apps rely on write() giving SCM_CREDENTIALS.
    1777 * We include credentials if the source or destination socket
   1778 * asserted SOCK_PASSCRED.
   1779 */
   1780static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
   1781			    const struct sock *other)
   1782{
   1783	if (UNIXCB(skb).pid)
   1784		return;
   1785	if (unix_passcred_enabled(sock, other)) {
   1786		UNIXCB(skb).pid  = get_pid(task_tgid(current));
   1787		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
   1788	}
   1789}
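/*
 * Hedged userspace sketch (not part of this file): when SO_PASSCRED is set,
 * the credentials attached by maybe_add_creds() above arrive as an
 * SCM_CREDENTIALS control message on the receiving side.  The helper name
 * is hypothetical; SO_PASSCRED must be enabled before the peer's data is
 * queued for the credentials to be attached.
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static void read_peer_creds(int sock)
 *	{
 *		char buf[64], cbuf[CMSG_SPACE(sizeof(struct ucred))];
 *		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int one = 1;
 *
 *		setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *		if (recvmsg(sock, &msg, 0) < 0)
 *			return;
 *		cmsg = CMSG_FIRSTHDR(&msg);
 *		if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(cmsg);
 *
 *			printf("pid=%d uid=%u gid=%u\n", uc->pid, uc->uid, uc->gid);
 *		}
 *	}
 */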
   1790
   1791static int maybe_init_creds(struct scm_cookie *scm,
   1792			    struct socket *socket,
   1793			    const struct sock *other)
   1794{
   1795	int err;
   1796	struct msghdr msg = { .msg_controllen = 0 };
   1797
   1798	err = scm_send(socket, &msg, scm, false);
   1799	if (err)
   1800		return err;
   1801
   1802	if (unix_passcred_enabled(socket, other)) {
   1803		scm->pid = get_pid(task_tgid(current));
   1804		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
   1805	}
   1806	return err;
   1807}
   1808
   1809static bool unix_skb_scm_eq(struct sk_buff *skb,
   1810			    struct scm_cookie *scm)
   1811{
   1812	return UNIXCB(skb).pid == scm->pid &&
   1813	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
   1814	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
   1815	       unix_secdata_eq(scm, skb);
   1816}
   1817
   1818static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
   1819{
   1820	struct scm_fp_list *fp = UNIXCB(skb).fp;
   1821	struct unix_sock *u = unix_sk(sk);
   1822
   1823	if (unlikely(fp && fp->count))
   1824		atomic_add(fp->count, &u->scm_stat.nr_fds);
   1825}
   1826
   1827static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
   1828{
   1829	struct scm_fp_list *fp = UNIXCB(skb).fp;
   1830	struct unix_sock *u = unix_sk(sk);
   1831
   1832	if (unlikely(fp && fp->count))
   1833		atomic_sub(fp->count, &u->scm_stat.nr_fds);
   1834}
   1835
   1836/*
   1837 *	Send AF_UNIX data.
   1838 */
   1839
   1840static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
   1841			      size_t len)
   1842{
   1843	struct sock *sk = sock->sk;
   1844	struct net *net = sock_net(sk);
   1845	struct unix_sock *u = unix_sk(sk);
   1846	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
   1847	struct sock *other = NULL;
   1848	int err;
   1849	struct sk_buff *skb;
   1850	long timeo;
   1851	struct scm_cookie scm;
   1852	int data_len = 0;
   1853	int sk_locked;
   1854
   1855	wait_for_unix_gc();
   1856	err = scm_send(sock, msg, &scm, false);
   1857	if (err < 0)
   1858		return err;
   1859
   1860	err = -EOPNOTSUPP;
   1861	if (msg->msg_flags&MSG_OOB)
   1862		goto out;
   1863
   1864	if (msg->msg_namelen) {
   1865		err = unix_validate_addr(sunaddr, msg->msg_namelen);
   1866		if (err)
   1867			goto out;
   1868	} else {
   1869		sunaddr = NULL;
   1870		err = -ENOTCONN;
   1871		other = unix_peer_get(sk);
   1872		if (!other)
   1873			goto out;
   1874	}
   1875
   1876	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
   1877		err = unix_autobind(sk);
   1878		if (err)
   1879			goto out;
   1880	}
   1881
   1882	err = -EMSGSIZE;
   1883	if (len > sk->sk_sndbuf - 32)
   1884		goto out;
   1885
   1886	if (len > SKB_MAX_ALLOC) {
   1887		data_len = min_t(size_t,
   1888				 len - SKB_MAX_ALLOC,
   1889				 MAX_SKB_FRAGS * PAGE_SIZE);
   1890		data_len = PAGE_ALIGN(data_len);
   1891
   1892		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
   1893	}
   1894
   1895	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
   1896				   msg->msg_flags & MSG_DONTWAIT, &err,
   1897				   PAGE_ALLOC_COSTLY_ORDER);
   1898	if (skb == NULL)
   1899		goto out;
   1900
   1901	err = unix_scm_to_skb(&scm, skb, true);
   1902	if (err < 0)
   1903		goto out_free;
   1904
   1905	skb_put(skb, len - data_len);
   1906	skb->data_len = data_len;
   1907	skb->len = len;
   1908	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
   1909	if (err)
   1910		goto out_free;
   1911
   1912	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
   1913
   1914restart:
   1915	if (!other) {
   1916		err = -ECONNRESET;
   1917		if (sunaddr == NULL)
   1918			goto out_free;
   1919
   1920		other = unix_find_other(net, sunaddr, msg->msg_namelen,
   1921					sk->sk_type);
   1922		if (IS_ERR(other)) {
   1923			err = PTR_ERR(other);
   1924			other = NULL;
   1925			goto out_free;
   1926		}
   1927	}
   1928
   1929	if (sk_filter(other, skb) < 0) {
   1930		/* Toss the packet but do not return any error to the sender */
   1931		err = len;
   1932		goto out_free;
   1933	}
   1934
   1935	sk_locked = 0;
   1936	unix_state_lock(other);
   1937restart_locked:
   1938	err = -EPERM;
   1939	if (!unix_may_send(sk, other))
   1940		goto out_unlock;
   1941
   1942	if (unlikely(sock_flag(other, SOCK_DEAD))) {
    1943		/*
    1944		 *	Check with POSIX 1003.1g - what should
    1945		 *	a datagram error return here?
    1946		 */
   1947		unix_state_unlock(other);
   1948		sock_put(other);
   1949
   1950		if (!sk_locked)
   1951			unix_state_lock(sk);
   1952
   1953		err = 0;
   1954		if (unix_peer(sk) == other) {
   1955			unix_peer(sk) = NULL;
   1956			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
   1957
   1958			unix_state_unlock(sk);
   1959
   1960			sk->sk_state = TCP_CLOSE;
   1961			unix_dgram_disconnected(sk, other);
   1962			sock_put(other);
   1963			err = -ECONNREFUSED;
   1964		} else {
   1965			unix_state_unlock(sk);
   1966		}
   1967
   1968		other = NULL;
   1969		if (err)
   1970			goto out_free;
   1971		goto restart;
   1972	}
   1973
   1974	err = -EPIPE;
   1975	if (other->sk_shutdown & RCV_SHUTDOWN)
   1976		goto out_unlock;
   1977
   1978	if (sk->sk_type != SOCK_SEQPACKET) {
   1979		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
   1980		if (err)
   1981			goto out_unlock;
   1982	}
   1983
   1984	/* other == sk && unix_peer(other) != sk if
   1985	 * - unix_peer(sk) == NULL, destination address bound to sk
   1986	 * - unix_peer(sk) == sk by time of get but disconnected before lock
   1987	 */
   1988	if (other != sk &&
   1989	    unlikely(unix_peer(other) != sk &&
   1990	    unix_recvq_full_lockless(other))) {
   1991		if (timeo) {
   1992			timeo = unix_wait_for_peer(other, timeo);
   1993
   1994			err = sock_intr_errno(timeo);
   1995			if (signal_pending(current))
   1996				goto out_free;
   1997
   1998			goto restart;
   1999		}
   2000
   2001		if (!sk_locked) {
   2002			unix_state_unlock(other);
   2003			unix_state_double_lock(sk, other);
   2004		}
   2005
   2006		if (unix_peer(sk) != other ||
   2007		    unix_dgram_peer_wake_me(sk, other)) {
   2008			err = -EAGAIN;
   2009			sk_locked = 1;
   2010			goto out_unlock;
   2011		}
   2012
   2013		if (!sk_locked) {
   2014			sk_locked = 1;
   2015			goto restart_locked;
   2016		}
   2017	}
   2018
   2019	if (unlikely(sk_locked))
   2020		unix_state_unlock(sk);
   2021
   2022	if (sock_flag(other, SOCK_RCVTSTAMP))
   2023		__net_timestamp(skb);
   2024	maybe_add_creds(skb, sock, other);
   2025	scm_stat_add(other, skb);
   2026	skb_queue_tail(&other->sk_receive_queue, skb);
   2027	unix_state_unlock(other);
   2028	other->sk_data_ready(other);
   2029	sock_put(other);
   2030	scm_destroy(&scm);
   2031	return len;
   2032
   2033out_unlock:
   2034	if (sk_locked)
   2035		unix_state_unlock(sk);
   2036	unix_state_unlock(other);
   2037out_free:
   2038	kfree_skb(skb);
   2039out:
   2040	if (other)
   2041		sock_put(other);
   2042	scm_destroy(&scm);
   2043	return err;
   2044}
   2045
    2046/* We use paged skbs for stream sockets, and limit occupancy to 32768
    2047 * bytes, with a minimum of a full page.
   2048 */
   2049#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
   2050
   2051#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2052static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
   2053{
   2054	struct unix_sock *ousk = unix_sk(other);
   2055	struct sk_buff *skb;
   2056	int err = 0;
   2057
   2058	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
   2059
   2060	if (!skb)
   2061		return err;
   2062
   2063	skb_put(skb, 1);
   2064	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
   2065
   2066	if (err) {
   2067		kfree_skb(skb);
   2068		return err;
   2069	}
   2070
   2071	unix_state_lock(other);
   2072
   2073	if (sock_flag(other, SOCK_DEAD) ||
   2074	    (other->sk_shutdown & RCV_SHUTDOWN)) {
   2075		unix_state_unlock(other);
   2076		kfree_skb(skb);
   2077		return -EPIPE;
   2078	}
   2079
   2080	maybe_add_creds(skb, sock, other);
   2081	skb_get(skb);
   2082
   2083	if (ousk->oob_skb)
   2084		consume_skb(ousk->oob_skb);
   2085
   2086	WRITE_ONCE(ousk->oob_skb, skb);
   2087
   2088	scm_stat_add(other, skb);
   2089	skb_queue_tail(&other->sk_receive_queue, skb);
   2090	sk_send_sigurg(other);
   2091	unix_state_unlock(other);
   2092	other->sk_data_ready(other);
   2093
   2094	return err;
   2095}
   2096#endif
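/*
 * Userspace-visible behaviour of the OOB path above, as an illustrative
 * sketch (not part of this file): a single MSG_OOB byte is queued as
 * oob_skb and can be fetched with MSG_OOB, with SIOCATMARK reporting
 * whether the next read sits at the mark.  The helper name is hypothetical
 * and a and b are assumed to be a connected SOCK_STREAM pair.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *
 *	static void oob_roundtrip(int a, int b)
 *	{
 *		int atmark = 0;
 *		char c;
 *
 *		send(a, "x", 1, MSG_OOB);	// lands in ousk->oob_skb on b
 *		ioctl(b, SIOCATMARK, &atmark);	// 1: next read is the mark
 *		recv(b, &c, 1, MSG_OOB);	// fetches the urgent byte
 *	}
 */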
   2097
   2098static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
   2099			       size_t len)
   2100{
   2101	struct sock *sk = sock->sk;
   2102	struct sock *other = NULL;
   2103	int err, size;
   2104	struct sk_buff *skb;
   2105	int sent = 0;
   2106	struct scm_cookie scm;
   2107	bool fds_sent = false;
   2108	int data_len;
   2109
   2110	wait_for_unix_gc();
   2111	err = scm_send(sock, msg, &scm, false);
   2112	if (err < 0)
   2113		return err;
   2114
   2115	err = -EOPNOTSUPP;
   2116	if (msg->msg_flags & MSG_OOB) {
   2117#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2118		if (len)
   2119			len--;
   2120		else
   2121#endif
   2122			goto out_err;
   2123	}
   2124
   2125	if (msg->msg_namelen) {
   2126		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
   2127		goto out_err;
   2128	} else {
   2129		err = -ENOTCONN;
   2130		other = unix_peer(sk);
   2131		if (!other)
   2132			goto out_err;
   2133	}
   2134
   2135	if (sk->sk_shutdown & SEND_SHUTDOWN)
   2136		goto pipe_err;
   2137
   2138	while (sent < len) {
   2139		size = len - sent;
   2140
   2141		/* Keep two messages in the pipe so it schedules better */
   2142		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
   2143
   2144		/* allow fallback to order-0 allocations */
   2145		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
   2146
   2147		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
   2148
   2149		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
   2150
   2151		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
   2152					   msg->msg_flags & MSG_DONTWAIT, &err,
   2153					   get_order(UNIX_SKB_FRAGS_SZ));
   2154		if (!skb)
   2155			goto out_err;
   2156
   2157		/* Only send the fds in the first buffer */
   2158		err = unix_scm_to_skb(&scm, skb, !fds_sent);
   2159		if (err < 0) {
   2160			kfree_skb(skb);
   2161			goto out_err;
   2162		}
   2163		fds_sent = true;
   2164
   2165		skb_put(skb, size - data_len);
   2166		skb->data_len = data_len;
   2167		skb->len = size;
   2168		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
   2169		if (err) {
   2170			kfree_skb(skb);
   2171			goto out_err;
   2172		}
   2173
   2174		unix_state_lock(other);
   2175
   2176		if (sock_flag(other, SOCK_DEAD) ||
   2177		    (other->sk_shutdown & RCV_SHUTDOWN))
   2178			goto pipe_err_free;
   2179
   2180		maybe_add_creds(skb, sock, other);
   2181		scm_stat_add(other, skb);
   2182		skb_queue_tail(&other->sk_receive_queue, skb);
   2183		unix_state_unlock(other);
   2184		other->sk_data_ready(other);
   2185		sent += size;
   2186	}
   2187
   2188#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2189	if (msg->msg_flags & MSG_OOB) {
   2190		err = queue_oob(sock, msg, other);
   2191		if (err)
   2192			goto out_err;
   2193		sent++;
   2194	}
   2195#endif
   2196
   2197	scm_destroy(&scm);
   2198
   2199	return sent;
   2200
   2201pipe_err_free:
   2202	unix_state_unlock(other);
   2203	kfree_skb(skb);
   2204pipe_err:
   2205	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
   2206		send_sig(SIGPIPE, current, 0);
   2207	err = -EPIPE;
   2208out_err:
   2209	scm_destroy(&scm);
   2210	return sent ? : err;
   2211}
   2212
   2213static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
   2214				    int offset, size_t size, int flags)
   2215{
   2216	int err;
   2217	bool send_sigpipe = false;
   2218	bool init_scm = true;
   2219	struct scm_cookie scm;
   2220	struct sock *other, *sk = socket->sk;
   2221	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
   2222
   2223	if (flags & MSG_OOB)
   2224		return -EOPNOTSUPP;
   2225
   2226	other = unix_peer(sk);
   2227	if (!other || sk->sk_state != TCP_ESTABLISHED)
   2228		return -ENOTCONN;
   2229
   2230	if (false) {
   2231alloc_skb:
   2232		unix_state_unlock(other);
   2233		mutex_unlock(&unix_sk(other)->iolock);
   2234		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
   2235					      &err, 0);
   2236		if (!newskb)
   2237			goto err;
   2238	}
   2239
   2240	/* we must acquire iolock as we modify already present
   2241	 * skbs in the sk_receive_queue and mess with skb->len
   2242	 */
   2243	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
   2244	if (err) {
   2245		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
   2246		goto err;
   2247	}
   2248
   2249	if (sk->sk_shutdown & SEND_SHUTDOWN) {
   2250		err = -EPIPE;
   2251		send_sigpipe = true;
   2252		goto err_unlock;
   2253	}
   2254
   2255	unix_state_lock(other);
   2256
   2257	if (sock_flag(other, SOCK_DEAD) ||
   2258	    other->sk_shutdown & RCV_SHUTDOWN) {
   2259		err = -EPIPE;
   2260		send_sigpipe = true;
   2261		goto err_state_unlock;
   2262	}
   2263
   2264	if (init_scm) {
   2265		err = maybe_init_creds(&scm, socket, other);
   2266		if (err)
   2267			goto err_state_unlock;
   2268		init_scm = false;
   2269	}
   2270
   2271	skb = skb_peek_tail(&other->sk_receive_queue);
   2272	if (tail && tail == skb) {
   2273		skb = newskb;
   2274	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
   2275		if (newskb) {
   2276			skb = newskb;
   2277		} else {
   2278			tail = skb;
   2279			goto alloc_skb;
   2280		}
   2281	} else if (newskb) {
    2282		/* this is the fast path, we don't necessarily need to
    2283		 * call kfree_skb; even with newskb == NULL
    2284		 * this does no harm
    2285		 */
   2286		consume_skb(newskb);
   2287		newskb = NULL;
   2288	}
   2289
   2290	if (skb_append_pagefrags(skb, page, offset, size)) {
   2291		tail = skb;
   2292		goto alloc_skb;
   2293	}
   2294
   2295	skb->len += size;
   2296	skb->data_len += size;
   2297	skb->truesize += size;
   2298	refcount_add(size, &sk->sk_wmem_alloc);
   2299
   2300	if (newskb) {
   2301		err = unix_scm_to_skb(&scm, skb, false);
   2302		if (err)
   2303			goto err_state_unlock;
   2304		spin_lock(&other->sk_receive_queue.lock);
   2305		__skb_queue_tail(&other->sk_receive_queue, newskb);
   2306		spin_unlock(&other->sk_receive_queue.lock);
   2307	}
   2308
   2309	unix_state_unlock(other);
   2310	mutex_unlock(&unix_sk(other)->iolock);
   2311
   2312	other->sk_data_ready(other);
   2313	scm_destroy(&scm);
   2314	return size;
   2315
   2316err_state_unlock:
   2317	unix_state_unlock(other);
   2318err_unlock:
   2319	mutex_unlock(&unix_sk(other)->iolock);
   2320err:
   2321	kfree_skb(newskb);
   2322	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
   2323		send_sig(SIGPIPE, current, 0);
   2324	if (!init_scm)
   2325		scm_destroy(&scm);
   2326	return err;
   2327}
   2328
   2329static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
   2330				  size_t len)
   2331{
   2332	int err;
   2333	struct sock *sk = sock->sk;
   2334
   2335	err = sock_error(sk);
   2336	if (err)
   2337		return err;
   2338
   2339	if (sk->sk_state != TCP_ESTABLISHED)
   2340		return -ENOTCONN;
   2341
   2342	if (msg->msg_namelen)
   2343		msg->msg_namelen = 0;
   2344
   2345	return unix_dgram_sendmsg(sock, msg, len);
   2346}
   2347
   2348static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
   2349				  size_t size, int flags)
   2350{
   2351	struct sock *sk = sock->sk;
   2352
   2353	if (sk->sk_state != TCP_ESTABLISHED)
   2354		return -ENOTCONN;
   2355
   2356	return unix_dgram_recvmsg(sock, msg, size, flags);
   2357}
   2358
   2359static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
   2360{
   2361	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
   2362
   2363	if (addr) {
   2364		msg->msg_namelen = addr->len;
   2365		memcpy(msg->msg_name, addr->name, addr->len);
   2366	}
   2367}
   2368
   2369int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
   2370			 int flags)
   2371{
   2372	struct scm_cookie scm;
   2373	struct socket *sock = sk->sk_socket;
   2374	struct unix_sock *u = unix_sk(sk);
   2375	struct sk_buff *skb, *last;
   2376	long timeo;
   2377	int skip;
   2378	int err;
   2379
   2380	err = -EOPNOTSUPP;
   2381	if (flags&MSG_OOB)
   2382		goto out;
   2383
   2384	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
   2385
   2386	do {
   2387		mutex_lock(&u->iolock);
   2388
   2389		skip = sk_peek_offset(sk, flags);
   2390		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
   2391					      &skip, &err, &last);
   2392		if (skb) {
   2393			if (!(flags & MSG_PEEK))
   2394				scm_stat_del(sk, skb);
   2395			break;
   2396		}
   2397
   2398		mutex_unlock(&u->iolock);
   2399
   2400		if (err != -EAGAIN)
   2401			break;
   2402	} while (timeo &&
   2403		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
   2404					      &err, &timeo, last));
   2405
   2406	if (!skb) { /* implies iolock unlocked */
   2407		unix_state_lock(sk);
   2408		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
   2409		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
   2410		    (sk->sk_shutdown & RCV_SHUTDOWN))
   2411			err = 0;
   2412		unix_state_unlock(sk);
   2413		goto out;
   2414	}
   2415
   2416	if (wq_has_sleeper(&u->peer_wait))
   2417		wake_up_interruptible_sync_poll(&u->peer_wait,
   2418						EPOLLOUT | EPOLLWRNORM |
   2419						EPOLLWRBAND);
   2420
   2421	if (msg->msg_name)
   2422		unix_copy_addr(msg, skb->sk);
   2423
   2424	if (size > skb->len - skip)
   2425		size = skb->len - skip;
   2426	else if (size < skb->len - skip)
   2427		msg->msg_flags |= MSG_TRUNC;
   2428
   2429	err = skb_copy_datagram_msg(skb, skip, msg, size);
   2430	if (err)
   2431		goto out_free;
   2432
   2433	if (sock_flag(sk, SOCK_RCVTSTAMP))
   2434		__sock_recv_timestamp(msg, sk, skb);
   2435
   2436	memset(&scm, 0, sizeof(scm));
   2437
   2438	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
   2439	unix_set_secdata(&scm, skb);
   2440
   2441	if (!(flags & MSG_PEEK)) {
   2442		if (UNIXCB(skb).fp)
   2443			unix_detach_fds(&scm, skb);
   2444
   2445		sk_peek_offset_bwd(sk, skb->len);
   2446	} else {
    2447		/* It is questionable: on PEEK we could:
    2448		   - not return fds - good, but too simple 8)
    2449		   - return fds, but not return them on read (old strategy,
    2450		     apparently wrong)
   2451		   - clone fds (I chose it for now, it is the most universal
   2452		     solution)
   2453
   2454		   POSIX 1003.1g does not actually define this clearly
   2455		   at all. POSIX 1003.1g doesn't define a lot of things
   2456		   clearly however!
   2457
   2458		*/
   2459
   2460		sk_peek_offset_fwd(sk, size);
   2461
   2462		if (UNIXCB(skb).fp)
   2463			unix_peek_fds(&scm, skb);
   2464	}
   2465	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
   2466
   2467	scm_recv(sock, msg, &scm, flags);
   2468
   2469out_free:
   2470	skb_free_datagram(sk, skb);
   2471	mutex_unlock(&u->iolock);
   2472out:
   2473	return err;
   2474}
   2475
   2476static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
   2477			      int flags)
   2478{
   2479	struct sock *sk = sock->sk;
   2480
   2481#ifdef CONFIG_BPF_SYSCALL
   2482	const struct proto *prot = READ_ONCE(sk->sk_prot);
   2483
   2484	if (prot != &unix_dgram_proto)
   2485		return prot->recvmsg(sk, msg, size, flags, NULL);
   2486#endif
   2487	return __unix_dgram_recvmsg(sk, msg, size, flags);
   2488}
   2489
   2490static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
   2491			  sk_read_actor_t recv_actor)
   2492{
   2493	int copied = 0;
   2494
   2495	while (1) {
   2496		struct unix_sock *u = unix_sk(sk);
   2497		struct sk_buff *skb;
   2498		int used, err;
   2499
   2500		mutex_lock(&u->iolock);
   2501		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
   2502		mutex_unlock(&u->iolock);
   2503		if (!skb)
   2504			return err;
   2505
   2506		used = recv_actor(desc, skb, 0, skb->len);
   2507		if (used <= 0) {
   2508			if (!copied)
   2509				copied = used;
   2510			kfree_skb(skb);
   2511			break;
   2512		} else if (used <= skb->len) {
   2513			copied += used;
   2514		}
   2515
   2516		kfree_skb(skb);
   2517		if (!desc->count)
   2518			break;
   2519	}
   2520
   2521	return copied;
   2522}
   2523
   2524/*
    2525 *	Sleep until more data has arrived. But check for races.
   2526 */
   2527static long unix_stream_data_wait(struct sock *sk, long timeo,
   2528				  struct sk_buff *last, unsigned int last_len,
   2529				  bool freezable)
   2530{
   2531	struct sk_buff *tail;
   2532	DEFINE_WAIT(wait);
   2533
   2534	unix_state_lock(sk);
   2535
   2536	for (;;) {
   2537		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
   2538
   2539		tail = skb_peek_tail(&sk->sk_receive_queue);
   2540		if (tail != last ||
   2541		    (tail && tail->len != last_len) ||
   2542		    sk->sk_err ||
   2543		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
   2544		    signal_pending(current) ||
   2545		    !timeo)
   2546			break;
   2547
   2548		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
   2549		unix_state_unlock(sk);
   2550		if (freezable)
   2551			timeo = freezable_schedule_timeout(timeo);
   2552		else
   2553			timeo = schedule_timeout(timeo);
   2554		unix_state_lock(sk);
   2555
   2556		if (sock_flag(sk, SOCK_DEAD))
   2557			break;
   2558
   2559		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
   2560	}
   2561
   2562	finish_wait(sk_sleep(sk), &wait);
   2563	unix_state_unlock(sk);
   2564	return timeo;
   2565}
   2566
   2567static unsigned int unix_skb_len(const struct sk_buff *skb)
   2568{
   2569	return skb->len - UNIXCB(skb).consumed;
   2570}
   2571
   2572struct unix_stream_read_state {
   2573	int (*recv_actor)(struct sk_buff *, int, int,
   2574			  struct unix_stream_read_state *);
   2575	struct socket *socket;
   2576	struct msghdr *msg;
   2577	struct pipe_inode_info *pipe;
   2578	size_t size;
   2579	int flags;
   2580	unsigned int splice_flags;
   2581};
   2582
   2583#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2584static int unix_stream_recv_urg(struct unix_stream_read_state *state)
   2585{
   2586	struct socket *sock = state->socket;
   2587	struct sock *sk = sock->sk;
   2588	struct unix_sock *u = unix_sk(sk);
   2589	int chunk = 1;
   2590	struct sk_buff *oob_skb;
   2591
   2592	mutex_lock(&u->iolock);
   2593	unix_state_lock(sk);
   2594
   2595	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
   2596		unix_state_unlock(sk);
   2597		mutex_unlock(&u->iolock);
   2598		return -EINVAL;
   2599	}
   2600
   2601	oob_skb = u->oob_skb;
   2602
   2603	if (!(state->flags & MSG_PEEK))
   2604		WRITE_ONCE(u->oob_skb, NULL);
   2605
   2606	unix_state_unlock(sk);
   2607
   2608	chunk = state->recv_actor(oob_skb, 0, chunk, state);
   2609
   2610	if (!(state->flags & MSG_PEEK)) {
   2611		UNIXCB(oob_skb).consumed += 1;
   2612		kfree_skb(oob_skb);
   2613	}
   2614
   2615	mutex_unlock(&u->iolock);
   2616
   2617	if (chunk < 0)
   2618		return -EFAULT;
   2619
   2620	state->msg->msg_flags |= MSG_OOB;
   2621	return 1;
   2622}
   2623
   2624static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
   2625				  int flags, int copied)
   2626{
   2627	struct unix_sock *u = unix_sk(sk);
   2628
   2629	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
   2630		skb_unlink(skb, &sk->sk_receive_queue);
   2631		consume_skb(skb);
   2632		skb = NULL;
   2633	} else {
   2634		if (skb == u->oob_skb) {
   2635			if (copied) {
   2636				skb = NULL;
   2637			} else if (sock_flag(sk, SOCK_URGINLINE)) {
   2638				if (!(flags & MSG_PEEK)) {
   2639					WRITE_ONCE(u->oob_skb, NULL);
   2640					consume_skb(skb);
   2641				}
   2642			} else if (!(flags & MSG_PEEK)) {
   2643				skb_unlink(skb, &sk->sk_receive_queue);
   2644				consume_skb(skb);
   2645				skb = skb_peek(&sk->sk_receive_queue);
   2646			}
   2647		}
   2648	}
   2649	return skb;
   2650}
   2651#endif
   2652
   2653static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
   2654				 sk_read_actor_t recv_actor)
   2655{
   2656	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
   2657		return -ENOTCONN;
   2658
   2659	return unix_read_sock(sk, desc, recv_actor);
   2660}
   2661
   2662static int unix_stream_read_generic(struct unix_stream_read_state *state,
   2663				    bool freezable)
   2664{
   2665	struct scm_cookie scm;
   2666	struct socket *sock = state->socket;
   2667	struct sock *sk = sock->sk;
   2668	struct unix_sock *u = unix_sk(sk);
   2669	int copied = 0;
   2670	int flags = state->flags;
   2671	int noblock = flags & MSG_DONTWAIT;
   2672	bool check_creds = false;
   2673	int target;
   2674	int err = 0;
   2675	long timeo;
   2676	int skip;
   2677	size_t size = state->size;
   2678	unsigned int last_len;
   2679
   2680	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
   2681		err = -EINVAL;
   2682		goto out;
   2683	}
   2684
   2685	if (unlikely(flags & MSG_OOB)) {
   2686		err = -EOPNOTSUPP;
   2687#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2688		err = unix_stream_recv_urg(state);
   2689#endif
   2690		goto out;
   2691	}
   2692
   2693	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
   2694	timeo = sock_rcvtimeo(sk, noblock);
   2695
   2696	memset(&scm, 0, sizeof(scm));
   2697
    2698	/* Lock the socket to prevent queue disordering
    2699	 * while we sleep copying data out to the message
    2700	 */
   2701	mutex_lock(&u->iolock);
   2702
   2703	skip = max(sk_peek_offset(sk, flags), 0);
   2704
   2705	do {
   2706		int chunk;
   2707		bool drop_skb;
   2708		struct sk_buff *skb, *last;
   2709
   2710redo:
   2711		unix_state_lock(sk);
   2712		if (sock_flag(sk, SOCK_DEAD)) {
   2713			err = -ECONNRESET;
   2714			goto unlock;
   2715		}
   2716		last = skb = skb_peek(&sk->sk_receive_queue);
   2717		last_len = last ? last->len : 0;
   2718
   2719#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   2720		if (skb) {
   2721			skb = manage_oob(skb, sk, flags, copied);
   2722			if (!skb) {
   2723				unix_state_unlock(sk);
   2724				if (copied)
   2725					break;
   2726				goto redo;
   2727			}
   2728		}
   2729#endif
   2730again:
   2731		if (skb == NULL) {
   2732			if (copied >= target)
   2733				goto unlock;
   2734
   2735			/*
   2736			 *	POSIX 1003.1g mandates this order.
   2737			 */
   2738
   2739			err = sock_error(sk);
   2740			if (err)
   2741				goto unlock;
   2742			if (sk->sk_shutdown & RCV_SHUTDOWN)
   2743				goto unlock;
   2744
   2745			unix_state_unlock(sk);
   2746			if (!timeo) {
   2747				err = -EAGAIN;
   2748				break;
   2749			}
   2750
   2751			mutex_unlock(&u->iolock);
   2752
   2753			timeo = unix_stream_data_wait(sk, timeo, last,
   2754						      last_len, freezable);
   2755
   2756			if (signal_pending(current)) {
   2757				err = sock_intr_errno(timeo);
   2758				scm_destroy(&scm);
   2759				goto out;
   2760			}
   2761
   2762			mutex_lock(&u->iolock);
   2763			goto redo;
   2764unlock:
   2765			unix_state_unlock(sk);
   2766			break;
   2767		}
   2768
   2769		while (skip >= unix_skb_len(skb)) {
   2770			skip -= unix_skb_len(skb);
   2771			last = skb;
   2772			last_len = skb->len;
   2773			skb = skb_peek_next(skb, &sk->sk_receive_queue);
   2774			if (!skb)
   2775				goto again;
   2776		}
   2777
   2778		unix_state_unlock(sk);
   2779
   2780		if (check_creds) {
   2781			/* Never glue messages from different writers */
   2782			if (!unix_skb_scm_eq(skb, &scm))
   2783				break;
   2784		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
   2785			/* Copy credentials */
   2786			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
   2787			unix_set_secdata(&scm, skb);
   2788			check_creds = true;
   2789		}
   2790
   2791		/* Copy address just once */
   2792		if (state->msg && state->msg->msg_name) {
   2793			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
   2794					 state->msg->msg_name);
   2795			unix_copy_addr(state->msg, skb->sk);
   2796			sunaddr = NULL;
   2797		}
   2798
   2799		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
   2800		skb_get(skb);
   2801		chunk = state->recv_actor(skb, skip, chunk, state);
   2802		drop_skb = !unix_skb_len(skb);
   2803		/* skb is only safe to use if !drop_skb */
   2804		consume_skb(skb);
   2805		if (chunk < 0) {
   2806			if (copied == 0)
   2807				copied = -EFAULT;
   2808			break;
   2809		}
   2810		copied += chunk;
   2811		size -= chunk;
   2812
   2813		if (drop_skb) {
   2814			/* the skb was touched by a concurrent reader;
   2815			 * we should not expect anything from this skb
    2816			 * anymore and assume it is invalid - we can be
   2817			 * sure it was dropped from the socket queue
   2818			 *
   2819			 * let's report a short read
   2820			 */
   2821			err = 0;
   2822			break;
   2823		}
   2824
   2825		/* Mark read part of skb as used */
   2826		if (!(flags & MSG_PEEK)) {
   2827			UNIXCB(skb).consumed += chunk;
   2828
   2829			sk_peek_offset_bwd(sk, chunk);
   2830
   2831			if (UNIXCB(skb).fp) {
   2832				scm_stat_del(sk, skb);
   2833				unix_detach_fds(&scm, skb);
   2834			}
   2835
   2836			if (unix_skb_len(skb))
   2837				break;
   2838
   2839			skb_unlink(skb, &sk->sk_receive_queue);
   2840			consume_skb(skb);
   2841
   2842			if (scm.fp)
   2843				break;
   2844		} else {
   2845			/* It is questionable, see note in unix_dgram_recvmsg.
   2846			 */
   2847			if (UNIXCB(skb).fp)
   2848				unix_peek_fds(&scm, skb);
   2849
   2850			sk_peek_offset_fwd(sk, chunk);
   2851
   2852			if (UNIXCB(skb).fp)
   2853				break;
   2854
   2855			skip = 0;
   2856			last = skb;
   2857			last_len = skb->len;
   2858			unix_state_lock(sk);
   2859			skb = skb_peek_next(skb, &sk->sk_receive_queue);
   2860			if (skb)
   2861				goto again;
   2862			unix_state_unlock(sk);
   2863			break;
   2864		}
   2865	} while (size);
   2866
   2867	mutex_unlock(&u->iolock);
   2868	if (state->msg)
   2869		scm_recv(sock, state->msg, &scm, flags);
   2870	else
   2871		scm_destroy(&scm);
   2872out:
   2873	return copied ? : err;
   2874}
   2875
   2876static int unix_stream_read_actor(struct sk_buff *skb,
   2877				  int skip, int chunk,
   2878				  struct unix_stream_read_state *state)
   2879{
   2880	int ret;
   2881
   2882	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
   2883				    state->msg, chunk);
   2884	return ret ?: chunk;
   2885}
   2886
   2887int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
   2888			  size_t size, int flags)
   2889{
   2890	struct unix_stream_read_state state = {
   2891		.recv_actor = unix_stream_read_actor,
   2892		.socket = sk->sk_socket,
   2893		.msg = msg,
   2894		.size = size,
   2895		.flags = flags
   2896	};
   2897
   2898	return unix_stream_read_generic(&state, true);
   2899}
   2900
   2901static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
   2902			       size_t size, int flags)
   2903{
   2904	struct unix_stream_read_state state = {
   2905		.recv_actor = unix_stream_read_actor,
   2906		.socket = sock,
   2907		.msg = msg,
   2908		.size = size,
   2909		.flags = flags
   2910	};
   2911
   2912#ifdef CONFIG_BPF_SYSCALL
   2913	struct sock *sk = sock->sk;
   2914	const struct proto *prot = READ_ONCE(sk->sk_prot);
   2915
   2916	if (prot != &unix_stream_proto)
   2917		return prot->recvmsg(sk, msg, size, flags, NULL);
   2918#endif
   2919	return unix_stream_read_generic(&state, true);
   2920}
   2921
   2922static int unix_stream_splice_actor(struct sk_buff *skb,
   2923				    int skip, int chunk,
   2924				    struct unix_stream_read_state *state)
   2925{
   2926	return skb_splice_bits(skb, state->socket->sk,
   2927			       UNIXCB(skb).consumed + skip,
   2928			       state->pipe, chunk, state->splice_flags);
   2929}
   2930
   2931static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
   2932				       struct pipe_inode_info *pipe,
   2933				       size_t size, unsigned int flags)
   2934{
   2935	struct unix_stream_read_state state = {
   2936		.recv_actor = unix_stream_splice_actor,
   2937		.socket = sock,
   2938		.pipe = pipe,
   2939		.size = size,
   2940		.splice_flags = flags,
   2941	};
   2942
   2943	if (unlikely(*ppos))
   2944		return -ESPIPE;
   2945
   2946	if (sock->file->f_flags & O_NONBLOCK ||
   2947	    flags & SPLICE_F_NONBLOCK)
   2948		state.flags = MSG_DONTWAIT;
   2949
   2950	return unix_stream_read_generic(&state, false);
   2951}
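/*
 * Illustrative only (not part of this file): the splice path above is what
 * a userspace splice() from a unix stream socket into a pipe exercises;
 * SPLICE_F_NONBLOCK maps onto MSG_DONTWAIT in state.flags.  The helper
 * name is hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sys/types.h>
 *
 *	static ssize_t sock_to_pipe(int sock, int pipe_wr, size_t len)
 *	{
 *		return splice(sock, NULL, pipe_wr, NULL, len, SPLICE_F_NONBLOCK);
 *	}
 */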
   2952
   2953static int unix_shutdown(struct socket *sock, int mode)
   2954{
   2955	struct sock *sk = sock->sk;
   2956	struct sock *other;
   2957
   2958	if (mode < SHUT_RD || mode > SHUT_RDWR)
   2959		return -EINVAL;
   2960	/* This maps:
   2961	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
   2962	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
   2963	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
   2964	 */
   2965	++mode;
   2966
   2967	unix_state_lock(sk);
   2968	sk->sk_shutdown |= mode;
   2969	other = unix_peer(sk);
   2970	if (other)
   2971		sock_hold(other);
   2972	unix_state_unlock(sk);
   2973	sk->sk_state_change(sk);
   2974
   2975	if (other &&
   2976		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
   2977
   2978		int peer_mode = 0;
   2979		const struct proto *prot = READ_ONCE(other->sk_prot);
   2980
   2981		if (prot->unhash)
   2982			prot->unhash(other);
   2983		if (mode&RCV_SHUTDOWN)
   2984			peer_mode |= SEND_SHUTDOWN;
   2985		if (mode&SEND_SHUTDOWN)
   2986			peer_mode |= RCV_SHUTDOWN;
   2987		unix_state_lock(other);
   2988		other->sk_shutdown |= peer_mode;
   2989		unix_state_unlock(other);
   2990		other->sk_state_change(other);
   2991		if (peer_mode == SHUTDOWN_MASK)
   2992			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
   2993		else if (peer_mode & RCV_SHUTDOWN)
   2994			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
   2995	}
   2996	if (other)
   2997		sock_put(other);
   2998
   2999	return 0;
   3000}
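/*
 * Hedged userspace sketch (not part of this file): on a stream socketpair,
 * shutting down one direction propagates the mirrored shutdown bits set
 * above, so the peer immediately reads EOF.  The helper name is
 * hypothetical.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void half_close_demo(void)
 *	{
 *		int sv[2];
 *		char c;
 *
 *		socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *		shutdown(sv[0], SHUT_WR);	// peer gets RCV_SHUTDOWN
 *		read(sv[1], &c, 1);		// returns 0 (EOF) right away
 *		close(sv[0]);
 *		close(sv[1]);
 *	}
 */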
   3001
   3002long unix_inq_len(struct sock *sk)
   3003{
   3004	struct sk_buff *skb;
   3005	long amount = 0;
   3006
   3007	if (sk->sk_state == TCP_LISTEN)
   3008		return -EINVAL;
   3009
   3010	spin_lock(&sk->sk_receive_queue.lock);
   3011	if (sk->sk_type == SOCK_STREAM ||
   3012	    sk->sk_type == SOCK_SEQPACKET) {
   3013		skb_queue_walk(&sk->sk_receive_queue, skb)
   3014			amount += unix_skb_len(skb);
   3015	} else {
   3016		skb = skb_peek(&sk->sk_receive_queue);
   3017		if (skb)
   3018			amount = skb->len;
   3019	}
   3020	spin_unlock(&sk->sk_receive_queue.lock);
   3021
   3022	return amount;
   3023}
   3024EXPORT_SYMBOL_GPL(unix_inq_len);
   3025
   3026long unix_outq_len(struct sock *sk)
   3027{
   3028	return sk_wmem_alloc_get(sk);
   3029}
   3030EXPORT_SYMBOL_GPL(unix_outq_len);
   3031
   3032static int unix_open_file(struct sock *sk)
   3033{
   3034	struct path path;
   3035	struct file *f;
   3036	int fd;
   3037
   3038	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
   3039		return -EPERM;
   3040
   3041	if (!smp_load_acquire(&unix_sk(sk)->addr))
   3042		return -ENOENT;
   3043
   3044	path = unix_sk(sk)->path;
   3045	if (!path.dentry)
   3046		return -ENOENT;
   3047
   3048	path_get(&path);
   3049
   3050	fd = get_unused_fd_flags(O_CLOEXEC);
   3051	if (fd < 0)
   3052		goto out;
   3053
   3054	f = dentry_open(&path, O_PATH, current_cred());
   3055	if (IS_ERR(f)) {
   3056		put_unused_fd(fd);
   3057		fd = PTR_ERR(f);
   3058		goto out;
   3059	}
   3060
   3061	fd_install(fd, f);
   3062out:
   3063	path_put(&path);
   3064
   3065	return fd;
   3066}
   3067
   3068static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
   3069{
   3070	struct sock *sk = sock->sk;
   3071	long amount = 0;
   3072	int err;
   3073
   3074	switch (cmd) {
   3075	case SIOCOUTQ:
   3076		amount = unix_outq_len(sk);
   3077		err = put_user(amount, (int __user *)arg);
   3078		break;
   3079	case SIOCINQ:
   3080		amount = unix_inq_len(sk);
   3081		if (amount < 0)
   3082			err = amount;
   3083		else
   3084			err = put_user(amount, (int __user *)arg);
   3085		break;
   3086	case SIOCUNIXFILE:
   3087		err = unix_open_file(sk);
   3088		break;
   3089#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   3090	case SIOCATMARK:
   3091		{
   3092			struct sk_buff *skb;
   3093			int answ = 0;
   3094
   3095			skb = skb_peek(&sk->sk_receive_queue);
   3096			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
   3097				answ = 1;
   3098			err = put_user(answ, (int __user *)arg);
   3099		}
   3100		break;
   3101#endif
   3102	default:
   3103		err = -ENOIOCTLCMD;
   3104		break;
   3105	}
   3106	return err;
   3107}
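/*
 * Illustrative only (not part of this file): the ioctls handled above map
 * to simple userspace queries, roughly as below.  SIOCUNIXFILE (which
 * requires CAP_NET_ADMIN) instead returns a new O_PATH fd for the bound
 * filesystem path.  The helper name is hypothetical.
 *
 *	#include <linux/sockios.h>
 *	#include <sys/ioctl.h>
 *
 *	static void queue_sizes(int sock)
 *	{
 *		int inq = 0, outq = 0;
 *
 *		ioctl(sock, SIOCINQ, &inq);	// bytes queued for reading
 *		ioctl(sock, SIOCOUTQ, &outq);	// bytes sent, not yet read by peer
 *	}
 */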
   3108
   3109#ifdef CONFIG_COMPAT
   3110static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
   3111{
   3112	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
   3113}
   3114#endif
   3115
   3116static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
   3117{
   3118	struct sock *sk = sock->sk;
   3119	__poll_t mask;
   3120
   3121	sock_poll_wait(file, sock, wait);
   3122	mask = 0;
   3123
   3124	/* exceptional events? */
   3125	if (sk->sk_err)
   3126		mask |= EPOLLERR;
   3127	if (sk->sk_shutdown == SHUTDOWN_MASK)
   3128		mask |= EPOLLHUP;
   3129	if (sk->sk_shutdown & RCV_SHUTDOWN)
   3130		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
   3131
   3132	/* readable? */
   3133	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
   3134		mask |= EPOLLIN | EPOLLRDNORM;
   3135	if (sk_is_readable(sk))
   3136		mask |= EPOLLIN | EPOLLRDNORM;
   3137#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
   3138	if (READ_ONCE(unix_sk(sk)->oob_skb))
   3139		mask |= EPOLLPRI;
   3140#endif
   3141
    3142	/* Connection-based sockets need to check for termination and startup */
   3143	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
   3144	    sk->sk_state == TCP_CLOSE)
   3145		mask |= EPOLLHUP;
   3146
   3147	/*
    3148	 * We also report the socket as writable when the other side has shut
    3149	 * down the connection. This prevents stuck sockets.
   3150	 */
   3151	if (unix_writable(sk))
   3152		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
   3153
   3154	return mask;
   3155}
   3156
   3157static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
   3158				    poll_table *wait)
   3159{
   3160	struct sock *sk = sock->sk, *other;
   3161	unsigned int writable;
   3162	__poll_t mask;
   3163
   3164	sock_poll_wait(file, sock, wait);
   3165	mask = 0;
   3166
   3167	/* exceptional events? */
   3168	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
   3169		mask |= EPOLLERR |
   3170			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
   3171
   3172	if (sk->sk_shutdown & RCV_SHUTDOWN)
   3173		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
   3174	if (sk->sk_shutdown == SHUTDOWN_MASK)
   3175		mask |= EPOLLHUP;
   3176
   3177	/* readable? */
   3178	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
   3179		mask |= EPOLLIN | EPOLLRDNORM;
   3180	if (sk_is_readable(sk))
   3181		mask |= EPOLLIN | EPOLLRDNORM;
   3182
    3183	/* Connection-based sockets need to check for termination and startup */
   3184	if (sk->sk_type == SOCK_SEQPACKET) {
   3185		if (sk->sk_state == TCP_CLOSE)
   3186			mask |= EPOLLHUP;
   3187		/* connection hasn't started yet? */
   3188		if (sk->sk_state == TCP_SYN_SENT)
   3189			return mask;
   3190	}
   3191
   3192	/* No write status requested, avoid expensive OUT tests. */
   3193	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
   3194		return mask;
   3195
   3196	writable = unix_writable(sk);
   3197	if (writable) {
   3198		unix_state_lock(sk);
   3199
   3200		other = unix_peer(sk);
   3201		if (other && unix_peer(other) != sk &&
   3202		    unix_recvq_full_lockless(other) &&
   3203		    unix_dgram_peer_wake_me(sk, other))
   3204			writable = 0;
   3205
   3206		unix_state_unlock(sk);
   3207	}
   3208
   3209	if (writable)
   3210		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
   3211	else
   3212		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
   3213
   3214	return mask;
   3215}
   3216
   3217#ifdef CONFIG_PROC_FS
   3218
   3219#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
   3220
   3221#define get_bucket(x) ((x) >> BUCKET_SPACE)
   3222#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
   3223#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
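/*
 * Worked example (illustrative): set_bucket_offset(3, 7) packs bucket 3
 * into the high bits of *pos and offset 7 into the low BUCKET_SPACE bits;
 * get_bucket() and get_offset() then recover 3 and 7 respectively.
 */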
   3224
   3225static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
   3226{
   3227	unsigned long offset = get_offset(*pos);
   3228	unsigned long bucket = get_bucket(*pos);
   3229	struct sock *sk;
   3230	unsigned long count = 0;
   3231
   3232	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
   3233		if (sock_net(sk) != seq_file_net(seq))
   3234			continue;
   3235		if (++count == offset)
   3236			break;
   3237	}
   3238
   3239	return sk;
   3240}
   3241
   3242static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
   3243{
   3244	unsigned long bucket = get_bucket(*pos);
   3245	struct sock *sk;
   3246
   3247	while (bucket < ARRAY_SIZE(unix_socket_table)) {
   3248		spin_lock(&unix_table_locks[bucket]);
   3249
   3250		sk = unix_from_bucket(seq, pos);
   3251		if (sk)
   3252			return sk;
   3253
   3254		spin_unlock(&unix_table_locks[bucket]);
   3255
   3256		*pos = set_bucket_offset(++bucket, 1);
   3257	}
   3258
   3259	return NULL;
   3260}
   3261
   3262static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
   3263				  loff_t *pos)
   3264{
   3265	unsigned long bucket = get_bucket(*pos);
   3266
   3267	for (sk = sk_next(sk); sk; sk = sk_next(sk))
   3268		if (sock_net(sk) == seq_file_net(seq))
   3269			return sk;
   3270
   3271	spin_unlock(&unix_table_locks[bucket]);
   3272
   3273	*pos = set_bucket_offset(++bucket, 1);
   3274
   3275	return unix_get_first(seq, pos);
   3276}
   3277
   3278static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
   3279{
   3280	if (!*pos)
   3281		return SEQ_START_TOKEN;
   3282
   3283	return unix_get_first(seq, pos);
   3284}
   3285
   3286static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   3287{
   3288	++*pos;
   3289
   3290	if (v == SEQ_START_TOKEN)
   3291		return unix_get_first(seq, pos);
   3292
   3293	return unix_get_next(seq, v, pos);
   3294}
   3295
   3296static void unix_seq_stop(struct seq_file *seq, void *v)
   3297{
   3298	struct sock *sk = v;
   3299
   3300	if (sk)
   3301		spin_unlock(&unix_table_locks[sk->sk_hash]);
   3302}
   3303
   3304static int unix_seq_show(struct seq_file *seq, void *v)
   3305{
   3306
   3307	if (v == SEQ_START_TOKEN)
   3308		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
   3309			 "Inode Path\n");
   3310	else {
   3311		struct sock *s = v;
   3312		struct unix_sock *u = unix_sk(s);
   3313		unix_state_lock(s);
   3314
   3315		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
   3316			s,
   3317			refcount_read(&s->sk_refcnt),
   3318			0,
   3319			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
   3320			s->sk_type,
   3321			s->sk_socket ?
   3322			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
   3323			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
   3324			sock_i_ino(s));
   3325
    3326		if (u->addr) {	/* under unix_table_locks here */
   3327			int i, len;
   3328			seq_putc(seq, ' ');
   3329
   3330			i = 0;
   3331			len = u->addr->len -
   3332				offsetof(struct sockaddr_un, sun_path);
   3333			if (u->addr->name->sun_path[0]) {
   3334				len--;
   3335			} else {
   3336				seq_putc(seq, '@');
   3337				i++;
   3338			}
   3339			for ( ; i < len; i++)
   3340				seq_putc(seq, u->addr->name->sun_path[i] ?:
   3341					 '@');
   3342		}
   3343		unix_state_unlock(s);
   3344		seq_putc(seq, '\n');
   3345	}
   3346
   3347	return 0;
   3348}
   3349
   3350static const struct seq_operations unix_seq_ops = {
   3351	.start  = unix_seq_start,
   3352	.next   = unix_seq_next,
   3353	.stop   = unix_seq_stop,
   3354	.show   = unix_seq_show,
   3355};
   3356
   3357#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
   3358struct bpf_unix_iter_state {
   3359	struct seq_net_private p;
   3360	unsigned int cur_sk;
   3361	unsigned int end_sk;
   3362	unsigned int max_sk;
   3363	struct sock **batch;
   3364	bool st_bucket_done;
   3365};
   3366
   3367struct bpf_iter__unix {
   3368	__bpf_md_ptr(struct bpf_iter_meta *, meta);
   3369	__bpf_md_ptr(struct unix_sock *, unix_sk);
   3370	uid_t uid __aligned(8);
   3371};
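/*
 * A minimal consumer sketch (illustrative only, not part of this file),
 * assuming the vmlinux.h/libbpf helper environment used by the kernel's
 * BPF selftests; it simply prints the uid of every unix socket visited:
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		if (!ctx->unix_sk)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid %u\n", ctx->uid);
 *		return 0;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */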
   3372
   3373static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
   3374			      struct unix_sock *unix_sk, uid_t uid)
   3375{
   3376	struct bpf_iter__unix ctx;
   3377
   3378	meta->seq_num--;  /* skip SEQ_START_TOKEN */
   3379	ctx.meta = meta;
   3380	ctx.unix_sk = unix_sk;
   3381	ctx.uid = uid;
   3382	return bpf_iter_run_prog(prog, &ctx);
   3383}
   3384
   3385static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
   3386
   3387{
   3388	struct bpf_unix_iter_state *iter = seq->private;
   3389	unsigned int expected = 1;
   3390	struct sock *sk;
   3391
   3392	sock_hold(start_sk);
   3393	iter->batch[iter->end_sk++] = start_sk;
   3394
   3395	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
   3396		if (sock_net(sk) != seq_file_net(seq))
   3397			continue;
   3398
   3399		if (iter->end_sk < iter->max_sk) {
   3400			sock_hold(sk);
   3401			iter->batch[iter->end_sk++] = sk;
   3402		}
   3403
   3404		expected++;
   3405	}
   3406
   3407	spin_unlock(&unix_table_locks[start_sk->sk_hash]);
   3408
   3409	return expected;
   3410}
   3411
   3412static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
   3413{
   3414	while (iter->cur_sk < iter->end_sk)
   3415		sock_put(iter->batch[iter->cur_sk++]);
   3416}
   3417
   3418static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
   3419				       unsigned int new_batch_sz)
   3420{
   3421	struct sock **new_batch;
   3422
   3423	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
   3424			     GFP_USER | __GFP_NOWARN);
   3425	if (!new_batch)
   3426		return -ENOMEM;
   3427
   3428	bpf_iter_unix_put_batch(iter);
   3429	kvfree(iter->batch);
   3430	iter->batch = new_batch;
   3431	iter->max_sk = new_batch_sz;
   3432
   3433	return 0;
   3434}
   3435
   3436static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
   3437					loff_t *pos)
   3438{
   3439	struct bpf_unix_iter_state *iter = seq->private;
   3440	unsigned int expected;
   3441	bool resized = false;
   3442	struct sock *sk;
   3443
   3444	if (iter->st_bucket_done)
   3445		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
   3446
   3447again:
   3448	/* Get a new batch */
   3449	iter->cur_sk = 0;
   3450	iter->end_sk = 0;
   3451
   3452	sk = unix_get_first(seq, pos);
   3453	if (!sk)
   3454		return NULL; /* Done */
   3455
   3456	expected = bpf_iter_unix_hold_batch(seq, sk);
   3457
   3458	if (iter->end_sk == expected) {
   3459		iter->st_bucket_done = true;
   3460		return sk;
   3461	}
   3462
   3463	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
   3464		resized = true;
   3465		goto again;
   3466	}
   3467
   3468	return sk;
   3469}
   3470
   3471static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
   3472{
   3473	if (!*pos)
   3474		return SEQ_START_TOKEN;
   3475
    3476	/* bpf iter does not support lseek, so it always
    3477	 * continues from where it was stop()-ped.
   3478	 */
   3479	return bpf_iter_unix_batch(seq, pos);
   3480}
   3481
   3482static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   3483{
   3484	struct bpf_unix_iter_state *iter = seq->private;
   3485	struct sock *sk;
   3486
   3487	/* Whenever seq_next() is called, the iter->cur_sk is
   3488	 * done with seq_show(), so advance to the next sk in
   3489	 * the batch.
   3490	 */
   3491	if (iter->cur_sk < iter->end_sk)
   3492		sock_put(iter->batch[iter->cur_sk++]);
   3493
   3494	++*pos;
   3495
   3496	if (iter->cur_sk < iter->end_sk)
   3497		sk = iter->batch[iter->cur_sk];
   3498	else
   3499		sk = bpf_iter_unix_batch(seq, pos);
   3500
   3501	return sk;
   3502}
   3503
   3504static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
   3505{
   3506	struct bpf_iter_meta meta;
   3507	struct bpf_prog *prog;
   3508	struct sock *sk = v;
   3509	uid_t uid;
   3510	bool slow;
   3511	int ret;
   3512
   3513	if (v == SEQ_START_TOKEN)
   3514		return 0;
   3515
   3516	slow = lock_sock_fast(sk);
   3517
   3518	if (unlikely(sk_unhashed(sk))) {
   3519		ret = SEQ_SKIP;
   3520		goto unlock;
   3521	}
   3522
   3523	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
   3524	meta.seq = seq;
   3525	prog = bpf_iter_get_info(&meta, false);
   3526	ret = unix_prog_seq_show(prog, &meta, v, uid);
   3527unlock:
   3528	unlock_sock_fast(sk, slow);
   3529	return ret;
   3530}
   3531
   3532static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
   3533{
   3534	struct bpf_unix_iter_state *iter = seq->private;
   3535	struct bpf_iter_meta meta;
   3536	struct bpf_prog *prog;
   3537
   3538	if (!v) {
   3539		meta.seq = seq;
   3540		prog = bpf_iter_get_info(&meta, true);
   3541		if (prog)
   3542			(void)unix_prog_seq_show(prog, &meta, v, 0);
   3543	}
   3544
   3545	if (iter->cur_sk < iter->end_sk)
   3546		bpf_iter_unix_put_batch(iter);
   3547}
   3548
   3549static const struct seq_operations bpf_iter_unix_seq_ops = {
   3550	.start	= bpf_iter_unix_seq_start,
   3551	.next	= bpf_iter_unix_seq_next,
   3552	.stop	= bpf_iter_unix_seq_stop,
   3553	.show	= bpf_iter_unix_seq_show,
   3554};
   3555#endif
   3556#endif
   3557
   3558static const struct net_proto_family unix_family_ops = {
   3559	.family = PF_UNIX,
   3560	.create = unix_create,
   3561	.owner	= THIS_MODULE,
   3562};
   3563
   3564
   3565static int __net_init unix_net_init(struct net *net)
   3566{
   3567	int error = -ENOMEM;
   3568
   3569	net->unx.sysctl_max_dgram_qlen = 10;
   3570	if (unix_sysctl_register(net))
   3571		goto out;
   3572
   3573#ifdef CONFIG_PROC_FS
   3574	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
   3575			sizeof(struct seq_net_private))) {
   3576		unix_sysctl_unregister(net);
   3577		goto out;
   3578	}
   3579#endif
   3580	error = 0;
   3581out:
   3582	return error;
   3583}
   3584
   3585static void __net_exit unix_net_exit(struct net *net)
   3586{
   3587	unix_sysctl_unregister(net);
   3588	remove_proc_entry("unix", net->proc_net);
   3589}
   3590
   3591static struct pernet_operations unix_net_ops = {
   3592	.init = unix_net_init,
   3593	.exit = unix_net_exit,
   3594};
   3595
   3596#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   3597DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
   3598		     struct unix_sock *unix_sk, uid_t uid)
   3599
   3600#define INIT_BATCH_SZ 16
   3601
   3602static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
   3603{
   3604	struct bpf_unix_iter_state *iter = priv_data;
   3605	int err;
   3606
   3607	err = bpf_iter_init_seq_net(priv_data, aux);
   3608	if (err)
   3609		return err;
   3610
   3611	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
   3612	if (err) {
   3613		bpf_iter_fini_seq_net(priv_data);
   3614		return err;
   3615	}
   3616
   3617	return 0;
   3618}
   3619
   3620static void bpf_iter_fini_unix(void *priv_data)
   3621{
   3622	struct bpf_unix_iter_state *iter = priv_data;
   3623
   3624	bpf_iter_fini_seq_net(priv_data);
   3625	kvfree(iter->batch);
   3626}
   3627
   3628static const struct bpf_iter_seq_info unix_seq_info = {
   3629	.seq_ops		= &bpf_iter_unix_seq_ops,
   3630	.init_seq_private	= bpf_iter_init_unix,
   3631	.fini_seq_private	= bpf_iter_fini_unix,
   3632	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
   3633};
   3634
   3635static const struct bpf_func_proto *
   3636bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
   3637			     const struct bpf_prog *prog)
   3638{
   3639	switch (func_id) {
   3640	case BPF_FUNC_setsockopt:
   3641		return &bpf_sk_setsockopt_proto;
   3642	case BPF_FUNC_getsockopt:
   3643		return &bpf_sk_getsockopt_proto;
   3644	default:
   3645		return NULL;
   3646	}
   3647}
   3648
   3649static struct bpf_iter_reg unix_reg_info = {
   3650	.target			= "unix",
   3651	.ctx_arg_info_size	= 1,
   3652	.ctx_arg_info		= {
   3653		{ offsetof(struct bpf_iter__unix, unix_sk),
   3654		  PTR_TO_BTF_ID_OR_NULL },
   3655	},
   3656	.get_func_proto         = bpf_iter_unix_get_func_proto,
   3657	.seq_info		= &unix_seq_info,
   3658};
   3659
   3660static void __init bpf_iter_register(void)
   3661{
   3662	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
   3663	if (bpf_iter_reg_target(&unix_reg_info))
   3664		pr_warn("Warning: could not register bpf iterator unix\n");
   3665}
   3666#endif
   3667
   3668static int __init af_unix_init(void)
   3669{
   3670	int i, rc = -1;
   3671
   3672	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
   3673
   3674	for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
   3675		spin_lock_init(&unix_table_locks[i]);
   3676
   3677	rc = proto_register(&unix_dgram_proto, 1);
   3678	if (rc != 0) {
   3679		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
   3680		goto out;
   3681	}
   3682
   3683	rc = proto_register(&unix_stream_proto, 1);
   3684	if (rc != 0) {
   3685		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
   3686		goto out;
   3687	}
   3688
   3689	sock_register(&unix_family_ops);
   3690	register_pernet_subsys(&unix_net_ops);
   3691	unix_bpf_build_proto();
   3692
   3693#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   3694	bpf_iter_register();
   3695#endif
   3696
   3697out:
   3698	return rc;
   3699}
   3700
   3701static void __exit af_unix_exit(void)
   3702{
   3703	sock_unregister(PF_UNIX);
   3704	proto_unregister(&unix_dgram_proto);
   3705	proto_unregister(&unix_stream_proto);
   3706	unregister_pernet_subsys(&unix_net_ops);
   3707}
   3708
   3709/* Earlier than device_initcall() so that other drivers invoking
   3710   request_module() don't end up in a loop when modprobe tries
   3711   to use a UNIX socket. But later than subsys_initcall() because
   3712   we depend on stuff initialised there */
   3713fs_initcall(af_unix_init);
   3714module_exit(af_unix_exit);
   3715
   3716MODULE_LICENSE("GPL");
   3717MODULE_ALIAS_NETPROTO(PF_UNIX);