cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sock.c (98056B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		Generic socket support routines. Memory allocators, socket lock/release
      8 *		handler for protocols to use and generic option handler.
      9 *
     10 * Authors:	Ross Biro
     11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     12 *		Florian La Roche, <flla@stud.uni-sb.de>
     13 *		Alan Cox, <A.Cox@swansea.ac.uk>
     14 *
     15 * Fixes:
     16 *		Alan Cox	: 	Numerous verify_area() problems
     17 *		Alan Cox	:	Connecting on a connecting socket
     18 *					now returns an error for tcp.
     19 *		Alan Cox	:	sock->protocol is set correctly.
     20 *					and is not sometimes left as 0.
     21 *		Alan Cox	:	connect handles icmp errors on a
     22 *					connect properly. Unfortunately there
     23 *					is a restart syscall nasty there. I
     24 *					can't match BSD without hacking the C
     25 *					library. Ideas urgently sought!
     26 *		Alan Cox	:	Disallow bind() to addresses that are
     27 *					not ours - especially broadcast ones!!
     28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
     29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
     30 *					instead they leave that for the DESTROY timer.
     31 *		Alan Cox	:	Clean up error flag in accept
     32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
     33 *					was buggy. Put a remove_sock() in the handler
     34 *					for memory when we hit 0. Also altered the timer
     35 *					code. The ACK stuff can wait and needs major
     36 *					TCP layer surgery.
     37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
     38 *					and fixed timer/inet_bh race.
     39 *		Alan Cox	:	Added zapped flag for TCP
     40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
     41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
     42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
     43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
     44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
     45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
     46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
     47 *	Pauline Middelink	:	identd support
     48 *		Alan Cox	:	Fixed connect() taking signals I think.
     49 *		Alan Cox	:	SO_LINGER supported
     50 *		Alan Cox	:	Error reporting fixes
     51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
     52 *		Alan Cox	:	inet sockets don't set sk->type!
     53 *		Alan Cox	:	Split socket option code
     54 *		Alan Cox	:	Callbacks
     55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
     56 *		Alex		:	Removed restriction on inet fioctl
     57 *		Alan Cox	:	Splitting INET from NET core
     58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
     59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
     60 *		Alan Cox	:	Split IP from generic code
     61 *		Alan Cox	:	New kfree_skbmem()
     62 *		Alan Cox	:	Make SO_DEBUG superuser only.
     63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
     64 *					(compatibility fix)
     65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
     66 *		Alan Cox	:	Allocator for a socket is settable.
     67 *		Alan Cox	:	SO_ERROR includes soft errors.
     68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
     69 *		Alan Cox	: 	Generic socket allocation to make hooks
     70 *					easier (suggested by Craig Metz).
     71 *		Michael Pall	:	SO_ERROR returns positive errno again
     72 *              Steve Whitehouse:       Added default destructor to free
     73 *                                      protocol private data.
     74 *              Steve Whitehouse:       Added various other default routines
     75 *                                      common to several socket families.
     76 *              Chris Evans     :       Call suser() check last on F_SETOWN
     77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
     78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
     79 *		Andi Kleen	:	Fix write_space callback
     80 *		Chris Evans	:	Security fixes - signedness again
     81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
     82 *
     83 * To Fix:
     84 */
     85
     86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     87
     88#include <asm/unaligned.h>
     89#include <linux/capability.h>
     90#include <linux/errno.h>
     91#include <linux/errqueue.h>
     92#include <linux/types.h>
     93#include <linux/socket.h>
     94#include <linux/in.h>
     95#include <linux/kernel.h>
     96#include <linux/module.h>
     97#include <linux/proc_fs.h>
     98#include <linux/seq_file.h>
     99#include <linux/sched.h>
    100#include <linux/sched/mm.h>
    101#include <linux/timer.h>
    102#include <linux/string.h>
    103#include <linux/sockios.h>
    104#include <linux/net.h>
    105#include <linux/mm.h>
    106#include <linux/slab.h>
    107#include <linux/interrupt.h>
    108#include <linux/poll.h>
    109#include <linux/tcp.h>
    110#include <linux/init.h>
    111#include <linux/highmem.h>
    112#include <linux/user_namespace.h>
    113#include <linux/static_key.h>
    114#include <linux/memcontrol.h>
    115#include <linux/prefetch.h>
    116#include <linux/compat.h>
    117
    118#include <linux/uaccess.h>
    119
    120#include <linux/netdevice.h>
    121#include <net/protocol.h>
    122#include <linux/skbuff.h>
    123#include <net/net_namespace.h>
    124#include <net/request_sock.h>
    125#include <net/sock.h>
    126#include <linux/net_tstamp.h>
    127#include <net/xfrm.h>
    128#include <linux/ipsec.h>
    129#include <net/cls_cgroup.h>
    130#include <net/netprio_cgroup.h>
    131#include <linux/sock_diag.h>
    132
    133#include <linux/filter.h>
    134#include <net/sock_reuseport.h>
    135#include <net/bpf_sk_storage.h>
    136
    137#include <trace/events/sock.h>
    138
    139#include <net/tcp.h>
    140#include <net/busy_poll.h>
    141
    142#include <linux/ethtool.h>
    143
    144#include "dev.h"
    145
    146static DEFINE_MUTEX(proto_list_mutex);
    147static LIST_HEAD(proto_list);
    148
    149static void sock_def_write_space_wfree(struct sock *sk);
    150static void sock_def_write_space(struct sock *sk);
    151
    152/**
    153 * sk_ns_capable - General socket capability test
    154 * @sk: Socket to use a capability on or through
    155 * @user_ns: The user namespace of the capability to use
    156 * @cap: The capability to use
    157 *
     158 * Test to see if the opener of the socket had the capability @cap when
     159 * the socket was created and that the current process has the capability
     160 * @cap in the user namespace @user_ns.
    161 */
    162bool sk_ns_capable(const struct sock *sk,
    163		   struct user_namespace *user_ns, int cap)
    164{
    165	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
    166		ns_capable(user_ns, cap);
    167}
    168EXPORT_SYMBOL(sk_ns_capable);
    169
    170/**
    171 * sk_capable - Socket global capability test
    172 * @sk: Socket to use a capability on or through
    173 * @cap: The global capability to use
    174 *
     175 * Test to see if the opener of the socket had the capability @cap when
     176 * the socket was created and that the current process has the capability
     177 * @cap in all user namespaces.
    178 */
    179bool sk_capable(const struct sock *sk, int cap)
    180{
    181	return sk_ns_capable(sk, &init_user_ns, cap);
    182}
    183EXPORT_SYMBOL(sk_capable);
    184
    185/**
    186 * sk_net_capable - Network namespace socket capability test
    187 * @sk: Socket to use a capability on or through
    188 * @cap: The capability to use
    189 *
     190 * Test to see if the opener of the socket had the capability @cap when the
     191 * socket was created and that the current process has the capability @cap
     192 * over the network namespace the socket is a member of.
    193 */
    194bool sk_net_capable(const struct sock *sk, int cap)
    195{
    196	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
    197}
    198EXPORT_SYMBOL(sk_net_capable);
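
/* Editor's note: illustrative sketch, not part of the original file.
 * A protocol's option handler might gate a privileged setting on the
 * socket's own network namespace like this; example_set_priv_opt() is a
 * hypothetical helper, not a kernel API.
 */
#if 0
static int example_set_priv_opt(struct sock *sk, int val)
{
	/* require CAP_NET_ADMIN in the netns this socket belongs to */
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	sk->sk_priority = val;
	return 0;
}
#endif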
    199
    200/*
    201 * Each address family might have different locking rules, so we have
    202 * one slock key per address family and separate keys for internal and
    203 * userspace sockets.
    204 */
    205static struct lock_class_key af_family_keys[AF_MAX];
    206static struct lock_class_key af_family_kern_keys[AF_MAX];
    207static struct lock_class_key af_family_slock_keys[AF_MAX];
    208static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
    209
    210/*
     211 * Make lock validator output more readable. (We pre-construct these
     212 * strings at build time, so that runtime initialization of socket
    213 * locks is fast):
    214 */
    215
    216#define _sock_locks(x)						  \
    217  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
    218  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
    219  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
    220  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
    221  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
    222  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
    223  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
    224  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
    225  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
    226  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
    227  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
    228  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
    229  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
    230  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
    231  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
    232  x "AF_MCTP"  , \
    233  x "AF_MAX"
    234
    235static const char *const af_family_key_strings[AF_MAX+1] = {
    236	_sock_locks("sk_lock-")
    237};
    238static const char *const af_family_slock_key_strings[AF_MAX+1] = {
    239	_sock_locks("slock-")
    240};
    241static const char *const af_family_clock_key_strings[AF_MAX+1] = {
    242	_sock_locks("clock-")
    243};
    244
    245static const char *const af_family_kern_key_strings[AF_MAX+1] = {
    246	_sock_locks("k-sk_lock-")
    247};
    248static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
    249	_sock_locks("k-slock-")
    250};
    251static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
    252	_sock_locks("k-clock-")
    253};
    254static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
    255	_sock_locks("rlock-")
    256};
    257static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
    258	_sock_locks("wlock-")
    259};
    260static const char *const af_family_elock_key_strings[AF_MAX+1] = {
    261	_sock_locks("elock-")
    262};
    263
    264/*
    265 * sk_callback_lock and sk queues locking rules are per-address-family,
    266 * so split the lock classes by using a per-AF key:
    267 */
    268static struct lock_class_key af_callback_keys[AF_MAX];
    269static struct lock_class_key af_rlock_keys[AF_MAX];
    270static struct lock_class_key af_wlock_keys[AF_MAX];
    271static struct lock_class_key af_elock_keys[AF_MAX];
    272static struct lock_class_key af_kern_callback_keys[AF_MAX];
    273
    274/* Run time adjustable parameters. */
    275__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
    276EXPORT_SYMBOL(sysctl_wmem_max);
    277__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
    278EXPORT_SYMBOL(sysctl_rmem_max);
    279__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
    280__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
    281
    282/* Maximal space eaten by iovec or ancillary data plus some space */
    283int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
    284EXPORT_SYMBOL(sysctl_optmem_max);
    285
    286int sysctl_tstamp_allow_data __read_mostly = 1;
    287
    288DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
    289EXPORT_SYMBOL_GPL(memalloc_socks_key);
    290
    291/**
    292 * sk_set_memalloc - sets %SOCK_MEMALLOC
    293 * @sk: socket to set it on
    294 *
    295 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
    296 * It's the responsibility of the admin to adjust min_free_kbytes
     297 * to meet the requirements.
    298 */
    299void sk_set_memalloc(struct sock *sk)
    300{
    301	sock_set_flag(sk, SOCK_MEMALLOC);
    302	sk->sk_allocation |= __GFP_MEMALLOC;
    303	static_branch_inc(&memalloc_socks_key);
    304}
    305EXPORT_SYMBOL_GPL(sk_set_memalloc);
    306
    307void sk_clear_memalloc(struct sock *sk)
    308{
    309	sock_reset_flag(sk, SOCK_MEMALLOC);
    310	sk->sk_allocation &= ~__GFP_MEMALLOC;
    311	static_branch_dec(&memalloc_socks_key);
    312
    313	/*
    314	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
    315	 * progress of swapping. SOCK_MEMALLOC may be cleared while
    316	 * it has rmem allocations due to the last swapfile being deactivated
    317	 * but there is a risk that the socket is unusable due to exceeding
    318	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
    319	 */
    320	sk_mem_reclaim(sk);
    321}
    322EXPORT_SYMBOL_GPL(sk_clear_memalloc);
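
/* Editor's note: illustrative sketch, not part of the original file.
 * A subsystem that swaps over the network pairs these helpers around the
 * lifetime of its backing socket; the example_* names are hypothetical.
 */
#if 0
static void example_swap_sock_activate(struct sock *sk)
{
	/* allow this socket to dip into emergency reserves under pressure */
	sk_set_memalloc(sk);
}

static void example_swap_sock_deactivate(struct sock *sk)
{
	/* drop the privilege; sk_clear_memalloc() also reclaims any memory
	 * charged while rmem limits were being ignored
	 */
	sk_clear_memalloc(sk);
}
#endif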
    323
    324int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
    325{
    326	int ret;
    327	unsigned int noreclaim_flag;
    328
    329	/* these should have been dropped before queueing */
    330	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
    331
    332	noreclaim_flag = memalloc_noreclaim_save();
    333	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
    334				 tcp_v6_do_rcv,
    335				 tcp_v4_do_rcv,
    336				 sk, skb);
    337	memalloc_noreclaim_restore(noreclaim_flag);
    338
    339	return ret;
    340}
    341EXPORT_SYMBOL(__sk_backlog_rcv);
    342
    343void sk_error_report(struct sock *sk)
    344{
    345	sk->sk_error_report(sk);
    346
    347	switch (sk->sk_family) {
    348	case AF_INET:
    349		fallthrough;
    350	case AF_INET6:
    351		trace_inet_sk_error_report(sk);
    352		break;
    353	default:
    354		break;
    355	}
    356}
    357EXPORT_SYMBOL(sk_error_report);
    358
    359int sock_get_timeout(long timeo, void *optval, bool old_timeval)
    360{
    361	struct __kernel_sock_timeval tv;
    362
    363	if (timeo == MAX_SCHEDULE_TIMEOUT) {
    364		tv.tv_sec = 0;
    365		tv.tv_usec = 0;
    366	} else {
    367		tv.tv_sec = timeo / HZ;
    368		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
    369	}
    370
    371	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
    372		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
    373		*(struct old_timeval32 *)optval = tv32;
    374		return sizeof(tv32);
    375	}
    376
    377	if (old_timeval) {
    378		struct __kernel_old_timeval old_tv;
    379		old_tv.tv_sec = tv.tv_sec;
    380		old_tv.tv_usec = tv.tv_usec;
    381		*(struct __kernel_old_timeval *)optval = old_tv;
    382		return sizeof(old_tv);
    383	}
    384
    385	*(struct __kernel_sock_timeval *)optval = tv;
    386	return sizeof(tv);
    387}
    388EXPORT_SYMBOL(sock_get_timeout);
    389
    390int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
    391			   sockptr_t optval, int optlen, bool old_timeval)
    392{
    393	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
    394		struct old_timeval32 tv32;
    395
    396		if (optlen < sizeof(tv32))
    397			return -EINVAL;
    398
    399		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
    400			return -EFAULT;
    401		tv->tv_sec = tv32.tv_sec;
    402		tv->tv_usec = tv32.tv_usec;
    403	} else if (old_timeval) {
    404		struct __kernel_old_timeval old_tv;
    405
    406		if (optlen < sizeof(old_tv))
    407			return -EINVAL;
    408		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
    409			return -EFAULT;
    410		tv->tv_sec = old_tv.tv_sec;
    411		tv->tv_usec = old_tv.tv_usec;
    412	} else {
    413		if (optlen < sizeof(*tv))
    414			return -EINVAL;
    415		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
    416			return -EFAULT;
    417	}
    418
    419	return 0;
    420}
    421EXPORT_SYMBOL(sock_copy_user_timeval);
    422
    423static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
    424			    bool old_timeval)
    425{
    426	struct __kernel_sock_timeval tv;
    427	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
    428
    429	if (err)
    430		return err;
    431
    432	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
    433		return -EDOM;
    434
    435	if (tv.tv_sec < 0) {
    436		static int warned __read_mostly;
    437
    438		*timeo_p = 0;
    439		if (warned < 10 && net_ratelimit()) {
    440			warned++;
    441			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
    442				__func__, current->comm, task_pid_nr(current));
    443		}
    444		return 0;
    445	}
    446	*timeo_p = MAX_SCHEDULE_TIMEOUT;
    447	if (tv.tv_sec == 0 && tv.tv_usec == 0)
    448		return 0;
    449	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
    450		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
    451	return 0;
    452}
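
/* Editor's note: worked example added for clarity, not part of the original
 * file. With HZ == 250 (so USEC_PER_SEC / HZ == 4000), a user timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } is converted by the code above to
 *	2 * 250 + DIV_ROUND_UP(500000, 4000) = 500 + 125 = 625 jiffies,
 * i.e. exactly 2.5 seconds; fractional ticks are rounded up by DIV_ROUND_UP().
 */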
    453
    454static bool sock_needs_netstamp(const struct sock *sk)
    455{
    456	switch (sk->sk_family) {
    457	case AF_UNSPEC:
    458	case AF_UNIX:
    459		return false;
    460	default:
    461		return true;
    462	}
    463}
    464
    465static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
    466{
    467	if (sk->sk_flags & flags) {
    468		sk->sk_flags &= ~flags;
    469		if (sock_needs_netstamp(sk) &&
    470		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
    471			net_disable_timestamp();
    472	}
    473}
    474
    475
    476int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
    477{
    478	unsigned long flags;
    479	struct sk_buff_head *list = &sk->sk_receive_queue;
    480
    481	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
    482		atomic_inc(&sk->sk_drops);
    483		trace_sock_rcvqueue_full(sk, skb);
    484		return -ENOMEM;
    485	}
    486
    487	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
    488		atomic_inc(&sk->sk_drops);
    489		return -ENOBUFS;
    490	}
    491
    492	skb->dev = NULL;
    493	skb_set_owner_r(skb, sk);
    494
     495	/* we escape from the RCU-protected region, make sure we don't leak
     496	 * a non-refcounted dst
    497	 */
    498	skb_dst_force(skb);
    499
    500	spin_lock_irqsave(&list->lock, flags);
    501	sock_skb_set_dropcount(sk, skb);
    502	__skb_queue_tail(list, skb);
    503	spin_unlock_irqrestore(&list->lock, flags);
    504
    505	if (!sock_flag(sk, SOCK_DEAD))
    506		sk->sk_data_ready(sk);
    507	return 0;
    508}
    509EXPORT_SYMBOL(__sock_queue_rcv_skb);
    510
    511int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
    512			      enum skb_drop_reason *reason)
    513{
    514	enum skb_drop_reason drop_reason;
    515	int err;
    516
    517	err = sk_filter(sk, skb);
    518	if (err) {
    519		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
    520		goto out;
    521	}
    522	err = __sock_queue_rcv_skb(sk, skb);
    523	switch (err) {
    524	case -ENOMEM:
    525		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
    526		break;
    527	case -ENOBUFS:
    528		drop_reason = SKB_DROP_REASON_PROTO_MEM;
    529		break;
    530	default:
    531		drop_reason = SKB_NOT_DROPPED_YET;
    532		break;
    533	}
    534out:
    535	if (reason)
    536		*reason = drop_reason;
    537	return err;
    538}
    539EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
    540
    541int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
    542		     const int nested, unsigned int trim_cap, bool refcounted)
    543{
    544	int rc = NET_RX_SUCCESS;
    545
    546	if (sk_filter_trim_cap(sk, skb, trim_cap))
    547		goto discard_and_relse;
    548
    549	skb->dev = NULL;
    550
    551	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
    552		atomic_inc(&sk->sk_drops);
    553		goto discard_and_relse;
    554	}
    555	if (nested)
    556		bh_lock_sock_nested(sk);
    557	else
    558		bh_lock_sock(sk);
    559	if (!sock_owned_by_user(sk)) {
    560		/*
    561		 * trylock + unlock semantics:
    562		 */
    563		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
    564
    565		rc = sk_backlog_rcv(sk, skb);
    566
    567		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
    568	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
    569		bh_unlock_sock(sk);
    570		atomic_inc(&sk->sk_drops);
    571		goto discard_and_relse;
    572	}
    573
    574	bh_unlock_sock(sk);
    575out:
    576	if (refcounted)
    577		sock_put(sk);
    578	return rc;
    579discard_and_relse:
    580	kfree_skb(skb);
    581	goto out;
    582}
    583EXPORT_SYMBOL(__sk_receive_skb);
    584
    585INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
    586							  u32));
    587INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
    588							   u32));
    589struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
    590{
    591	struct dst_entry *dst = __sk_dst_get(sk);
    592
    593	if (dst && dst->obsolete &&
    594	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
    595			       dst, cookie) == NULL) {
    596		sk_tx_queue_clear(sk);
    597		sk->sk_dst_pending_confirm = 0;
    598		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
    599		dst_release(dst);
    600		return NULL;
    601	}
    602
    603	return dst;
    604}
    605EXPORT_SYMBOL(__sk_dst_check);
    606
    607struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
    608{
    609	struct dst_entry *dst = sk_dst_get(sk);
    610
    611	if (dst && dst->obsolete &&
    612	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
    613			       dst, cookie) == NULL) {
    614		sk_dst_reset(sk);
    615		dst_release(dst);
    616		return NULL;
    617	}
    618
    619	return dst;
    620}
    621EXPORT_SYMBOL(sk_dst_check);
    622
    623static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
    624{
    625	int ret = -ENOPROTOOPT;
    626#ifdef CONFIG_NETDEVICES
    627	struct net *net = sock_net(sk);
    628
    629	/* Sorry... */
    630	ret = -EPERM;
    631	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
    632		goto out;
    633
    634	ret = -EINVAL;
    635	if (ifindex < 0)
    636		goto out;
    637
    638	/* Paired with all READ_ONCE() done locklessly. */
    639	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
    640
    641	if (sk->sk_prot->rehash)
    642		sk->sk_prot->rehash(sk);
    643	sk_dst_reset(sk);
    644
    645	ret = 0;
    646
    647out:
    648#endif
    649
    650	return ret;
    651}
    652
    653int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
    654{
    655	int ret;
    656
    657	if (lock_sk)
    658		lock_sock(sk);
    659	ret = sock_bindtoindex_locked(sk, ifindex);
    660	if (lock_sk)
    661		release_sock(sk);
    662
    663	return ret;
    664}
    665EXPORT_SYMBOL(sock_bindtoindex);
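
/* Editor's note: illustrative sketch, not part of the original file.
 * In-kernel users (e.g. tunnels or other kernel sockets) bind by interface
 * index rather than by name; example_bind_kernel_sock() is a hypothetical
 * helper.
 */
#if 0
static int example_bind_kernel_sock(struct sock *sk, int ifindex)
{
	/* lock_sk == true: sock_bindtoindex() takes the socket lock itself */
	return sock_bindtoindex(sk, ifindex, true);
}
#endif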
    666
    667static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
    668{
    669	int ret = -ENOPROTOOPT;
    670#ifdef CONFIG_NETDEVICES
    671	struct net *net = sock_net(sk);
    672	char devname[IFNAMSIZ];
    673	int index;
    674
    675	ret = -EINVAL;
    676	if (optlen < 0)
    677		goto out;
    678
    679	/* Bind this socket to a particular device like "eth0",
    680	 * as specified in the passed interface name. If the
    681	 * name is "" or the option length is zero the socket
    682	 * is not bound.
    683	 */
    684	if (optlen > IFNAMSIZ - 1)
    685		optlen = IFNAMSIZ - 1;
    686	memset(devname, 0, sizeof(devname));
    687
    688	ret = -EFAULT;
    689	if (copy_from_sockptr(devname, optval, optlen))
    690		goto out;
    691
    692	index = 0;
    693	if (devname[0] != '\0') {
    694		struct net_device *dev;
    695
    696		rcu_read_lock();
    697		dev = dev_get_by_name_rcu(net, devname);
    698		if (dev)
    699			index = dev->ifindex;
    700		rcu_read_unlock();
    701		ret = -ENODEV;
    702		if (!dev)
    703			goto out;
    704	}
    705
    706	return sock_bindtoindex(sk, index, true);
    707out:
    708#endif
    709
    710	return ret;
    711}
    712
    713static int sock_getbindtodevice(struct sock *sk, char __user *optval,
    714				int __user *optlen, int len)
    715{
    716	int ret = -ENOPROTOOPT;
    717#ifdef CONFIG_NETDEVICES
    718	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
    719	struct net *net = sock_net(sk);
    720	char devname[IFNAMSIZ];
    721
    722	if (bound_dev_if == 0) {
    723		len = 0;
    724		goto zero;
    725	}
    726
    727	ret = -EINVAL;
    728	if (len < IFNAMSIZ)
    729		goto out;
    730
    731	ret = netdev_get_name(net, devname, bound_dev_if);
    732	if (ret)
    733		goto out;
    734
    735	len = strlen(devname) + 1;
    736
    737	ret = -EFAULT;
    738	if (copy_to_user(optval, devname, len))
    739		goto out;
    740
    741zero:
    742	ret = -EFAULT;
    743	if (put_user(len, optlen))
    744		goto out;
    745
    746	ret = 0;
    747
    748out:
    749#endif
    750
    751	return ret;
    752}
    753
    754bool sk_mc_loop(struct sock *sk)
    755{
    756	if (dev_recursion_level())
    757		return false;
    758	if (!sk)
    759		return true;
    760	switch (sk->sk_family) {
    761	case AF_INET:
    762		return inet_sk(sk)->mc_loop;
    763#if IS_ENABLED(CONFIG_IPV6)
    764	case AF_INET6:
    765		return inet6_sk(sk)->mc_loop;
    766#endif
    767	}
    768	WARN_ON_ONCE(1);
    769	return true;
    770}
    771EXPORT_SYMBOL(sk_mc_loop);
    772
    773void sock_set_reuseaddr(struct sock *sk)
    774{
    775	lock_sock(sk);
    776	sk->sk_reuse = SK_CAN_REUSE;
    777	release_sock(sk);
    778}
    779EXPORT_SYMBOL(sock_set_reuseaddr);
    780
    781void sock_set_reuseport(struct sock *sk)
    782{
    783	lock_sock(sk);
    784	sk->sk_reuseport = true;
    785	release_sock(sk);
    786}
    787EXPORT_SYMBOL(sock_set_reuseport);
    788
    789void sock_no_linger(struct sock *sk)
    790{
    791	lock_sock(sk);
    792	sk->sk_lingertime = 0;
    793	sock_set_flag(sk, SOCK_LINGER);
    794	release_sock(sk);
    795}
    796EXPORT_SYMBOL(sock_no_linger);
    797
    798void sock_set_priority(struct sock *sk, u32 priority)
    799{
    800	lock_sock(sk);
    801	sk->sk_priority = priority;
    802	release_sock(sk);
    803}
    804EXPORT_SYMBOL(sock_set_priority);
    805
    806void sock_set_sndtimeo(struct sock *sk, s64 secs)
    807{
    808	lock_sock(sk);
    809	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
    810		sk->sk_sndtimeo = secs * HZ;
    811	else
    812		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
    813	release_sock(sk);
    814}
    815EXPORT_SYMBOL(sock_set_sndtimeo);
    816
    817static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
    818{
    819	if (val)  {
    820		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
    821		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
    822		sock_set_flag(sk, SOCK_RCVTSTAMP);
    823		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
    824	} else {
    825		sock_reset_flag(sk, SOCK_RCVTSTAMP);
    826		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
    827	}
    828}
    829
    830void sock_enable_timestamps(struct sock *sk)
    831{
    832	lock_sock(sk);
    833	__sock_set_timestamps(sk, true, false, true);
    834	release_sock(sk);
    835}
    836EXPORT_SYMBOL(sock_enable_timestamps);
    837
    838void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
    839{
    840	switch (optname) {
    841	case SO_TIMESTAMP_OLD:
    842		__sock_set_timestamps(sk, valbool, false, false);
    843		break;
    844	case SO_TIMESTAMP_NEW:
    845		__sock_set_timestamps(sk, valbool, true, false);
    846		break;
    847	case SO_TIMESTAMPNS_OLD:
    848		__sock_set_timestamps(sk, valbool, false, true);
    849		break;
    850	case SO_TIMESTAMPNS_NEW:
    851		__sock_set_timestamps(sk, valbool, true, true);
    852		break;
    853	}
    854}
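
/* Editor's note: summary added for clarity, not part of the original file.
 * Mapping applied by sock_set_timestamp() above when enabling:
 *	SO_TIMESTAMP_OLD	-> new = false, ns = false
 *	SO_TIMESTAMP_NEW	-> new = true,  ns = false
 *	SO_TIMESTAMPNS_OLD	-> new = false, ns = true
 *	SO_TIMESTAMPNS_NEW	-> new = true,  ns = true
 * where "new" drives SOCK_TSTAMP_NEW (the new 64-bit time layout) and "ns"
 * drives SOCK_RCVTSTAMPNS (nanosecond-resolution receive timestamps).
 */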
    855
    856static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
    857{
    858	struct net *net = sock_net(sk);
    859	struct net_device *dev = NULL;
    860	bool match = false;
    861	int *vclock_index;
    862	int i, num;
    863
    864	if (sk->sk_bound_dev_if)
    865		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
    866
    867	if (!dev) {
     868		pr_err("%s: socket not bound to a device\n", __func__);
    869		return -EOPNOTSUPP;
    870	}
    871
    872	num = ethtool_get_phc_vclocks(dev, &vclock_index);
    873	dev_put(dev);
    874
    875	for (i = 0; i < num; i++) {
    876		if (*(vclock_index + i) == phc_index) {
    877			match = true;
    878			break;
    879		}
    880	}
    881
    882	if (num > 0)
    883		kfree(vclock_index);
    884
    885	if (!match)
    886		return -EINVAL;
    887
    888	sk->sk_bind_phc = phc_index;
    889
    890	return 0;
    891}
    892
    893int sock_set_timestamping(struct sock *sk, int optname,
    894			  struct so_timestamping timestamping)
    895{
    896	int val = timestamping.flags;
    897	int ret;
    898
    899	if (val & ~SOF_TIMESTAMPING_MASK)
    900		return -EINVAL;
    901
    902	if (val & SOF_TIMESTAMPING_OPT_ID &&
    903	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
    904		if (sk_is_tcp(sk)) {
    905			if ((1 << sk->sk_state) &
    906			    (TCPF_CLOSE | TCPF_LISTEN))
    907				return -EINVAL;
    908			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
    909		} else {
    910			atomic_set(&sk->sk_tskey, 0);
    911		}
    912	}
    913
    914	if (val & SOF_TIMESTAMPING_OPT_STATS &&
    915	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
    916		return -EINVAL;
    917
    918	if (val & SOF_TIMESTAMPING_BIND_PHC) {
    919		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
    920		if (ret)
    921			return ret;
    922	}
    923
    924	sk->sk_tsflags = val;
    925	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
    926
    927	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
    928		sock_enable_timestamp(sk,
    929				      SOCK_TIMESTAMPING_RX_SOFTWARE);
    930	else
    931		sock_disable_timestamp(sk,
    932				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
    933	return 0;
    934}
    935
    936void sock_set_keepalive(struct sock *sk)
    937{
    938	lock_sock(sk);
    939	if (sk->sk_prot->keepalive)
    940		sk->sk_prot->keepalive(sk, true);
    941	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
    942	release_sock(sk);
    943}
    944EXPORT_SYMBOL(sock_set_keepalive);
    945
    946static void __sock_set_rcvbuf(struct sock *sk, int val)
    947{
    948	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
    949	 * as a negative value.
    950	 */
    951	val = min_t(int, val, INT_MAX / 2);
    952	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
    953
    954	/* We double it on the way in to account for "struct sk_buff" etc.
    955	 * overhead.   Applications assume that the SO_RCVBUF setting they make
    956	 * will allow that much actual data to be received on that socket.
    957	 *
    958	 * Applications are unaware that "struct sk_buff" and other overheads
    959	 * allocate from the receive buffer during socket buffer allocation.
    960	 *
    961	 * And after considering the possible alternatives, returning the value
    962	 * we actually used in getsockopt is the most desirable behavior.
    963	 */
    964	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
    965}
    966
    967void sock_set_rcvbuf(struct sock *sk, int val)
    968{
    969	lock_sock(sk);
    970	__sock_set_rcvbuf(sk, val);
    971	release_sock(sk);
    972}
    973EXPORT_SYMBOL(sock_set_rcvbuf);
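
/* Editor's note: illustrative example, not part of the original file.
 * Because of the doubling above, a userspace
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 }, sizeof(int))
 * is later reported back by getsockopt(SO_RCVBUF) as 131072 (assuming the
 * requested value is within sysctl_rmem_max and the doubled value exceeds
 * SOCK_MIN_RCVBUF), since sk_rcvbuf stores the doubled value.
 */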
    974
    975static void __sock_set_mark(struct sock *sk, u32 val)
    976{
    977	if (val != sk->sk_mark) {
    978		sk->sk_mark = val;
    979		sk_dst_reset(sk);
    980	}
    981}
    982
    983void sock_set_mark(struct sock *sk, u32 val)
    984{
    985	lock_sock(sk);
    986	__sock_set_mark(sk, val);
    987	release_sock(sk);
    988}
    989EXPORT_SYMBOL(sock_set_mark);
    990
    991static void sock_release_reserved_memory(struct sock *sk, int bytes)
    992{
    993	/* Round down bytes to multiple of pages */
    994	bytes &= ~(SK_MEM_QUANTUM - 1);
    995
    996	WARN_ON(bytes > sk->sk_reserved_mem);
    997	sk->sk_reserved_mem -= bytes;
    998	sk_mem_reclaim(sk);
    999}
   1000
   1001static int sock_reserve_memory(struct sock *sk, int bytes)
   1002{
   1003	long allocated;
   1004	bool charged;
   1005	int pages;
   1006
   1007	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
   1008		return -EOPNOTSUPP;
   1009
   1010	if (!bytes)
   1011		return 0;
   1012
   1013	pages = sk_mem_pages(bytes);
   1014
   1015	/* pre-charge to memcg */
   1016	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
   1017					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
   1018	if (!charged)
   1019		return -ENOMEM;
   1020
   1021	/* pre-charge to forward_alloc */
   1022	allocated = sk_memory_allocated_add(sk, pages);
   1023	/* If the system goes into memory pressure with this
   1024	 * precharge, give up and return error.
   1025	 */
   1026	if (allocated > sk_prot_mem_limits(sk, 1)) {
   1027		sk_memory_allocated_sub(sk, pages);
   1028		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
   1029		return -ENOMEM;
   1030	}
   1031	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
   1032
   1033	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
   1034
   1035	return 0;
   1036}
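
/* Editor's note: worked example added for clarity, not part of the original
 * file, assuming SK_MEM_QUANTUM is one 4 KiB page. A SO_RESERVE_MEM request
 * of 10000 bytes gives sk_mem_pages(10000) == 3, so on success both
 * sk_forward_alloc and sk_reserved_mem grow by 3 << SK_MEM_QUANTUM_SHIFT ==
 * 12288 bytes, i.e. the reservation is rounded up to whole quanta.
 */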
   1037
   1038/*
   1039 *	This is meant for all protocols to use and covers goings on
   1040 *	at the socket level. Everything here is generic.
   1041 */
   1042
   1043int sock_setsockopt(struct socket *sock, int level, int optname,
   1044		    sockptr_t optval, unsigned int optlen)
   1045{
   1046	struct so_timestamping timestamping;
   1047	struct sock_txtime sk_txtime;
   1048	struct sock *sk = sock->sk;
   1049	int val;
   1050	int valbool;
   1051	struct linger ling;
   1052	int ret = 0;
   1053
   1054	/*
   1055	 *	Options without arguments
   1056	 */
   1057
   1058	if (optname == SO_BINDTODEVICE)
   1059		return sock_setbindtodevice(sk, optval, optlen);
   1060
   1061	if (optlen < sizeof(int))
   1062		return -EINVAL;
   1063
   1064	if (copy_from_sockptr(&val, optval, sizeof(val)))
   1065		return -EFAULT;
   1066
   1067	valbool = val ? 1 : 0;
   1068
   1069	lock_sock(sk);
   1070
   1071	switch (optname) {
   1072	case SO_DEBUG:
   1073		if (val && !capable(CAP_NET_ADMIN))
   1074			ret = -EACCES;
   1075		else
   1076			sock_valbool_flag(sk, SOCK_DBG, valbool);
   1077		break;
   1078	case SO_REUSEADDR:
   1079		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
   1080		break;
   1081	case SO_REUSEPORT:
   1082		sk->sk_reuseport = valbool;
   1083		break;
   1084	case SO_TYPE:
   1085	case SO_PROTOCOL:
   1086	case SO_DOMAIN:
   1087	case SO_ERROR:
   1088		ret = -ENOPROTOOPT;
   1089		break;
   1090	case SO_DONTROUTE:
   1091		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
   1092		sk_dst_reset(sk);
   1093		break;
   1094	case SO_BROADCAST:
   1095		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
   1096		break;
   1097	case SO_SNDBUF:
    1098		/* Don't error on this; BSD doesn't, and if you think
   1099		 * about it this is right. Otherwise apps have to
   1100		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
    1101		 * are treated in BSD as hints.
   1102		 */
   1103		val = min_t(u32, val, sysctl_wmem_max);
   1104set_sndbuf:
   1105		/* Ensure val * 2 fits into an int, to prevent max_t()
   1106		 * from treating it as a negative value.
   1107		 */
   1108		val = min_t(int, val, INT_MAX / 2);
   1109		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
   1110		WRITE_ONCE(sk->sk_sndbuf,
   1111			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
   1112		/* Wake up sending tasks if we upped the value. */
   1113		sk->sk_write_space(sk);
   1114		break;
   1115
   1116	case SO_SNDBUFFORCE:
   1117		if (!capable(CAP_NET_ADMIN)) {
   1118			ret = -EPERM;
   1119			break;
   1120		}
   1121
   1122		/* No negative values (to prevent underflow, as val will be
   1123		 * multiplied by 2).
   1124		 */
   1125		if (val < 0)
   1126			val = 0;
   1127		goto set_sndbuf;
   1128
   1129	case SO_RCVBUF:
    1130		/* Don't error on this; BSD doesn't, and if you think
   1131		 * about it this is right. Otherwise apps have to
   1132		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
    1133		 * are treated in BSD as hints.
   1134		 */
   1135		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
   1136		break;
   1137
   1138	case SO_RCVBUFFORCE:
   1139		if (!capable(CAP_NET_ADMIN)) {
   1140			ret = -EPERM;
   1141			break;
   1142		}
   1143
   1144		/* No negative values (to prevent underflow, as val will be
   1145		 * multiplied by 2).
   1146		 */
   1147		__sock_set_rcvbuf(sk, max(val, 0));
   1148		break;
   1149
   1150	case SO_KEEPALIVE:
   1151		if (sk->sk_prot->keepalive)
   1152			sk->sk_prot->keepalive(sk, valbool);
   1153		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
   1154		break;
   1155
   1156	case SO_OOBINLINE:
   1157		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
   1158		break;
   1159
   1160	case SO_NO_CHECK:
   1161		sk->sk_no_check_tx = valbool;
   1162		break;
   1163
   1164	case SO_PRIORITY:
   1165		if ((val >= 0 && val <= 6) ||
   1166		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
   1167		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
   1168			sk->sk_priority = val;
   1169		else
   1170			ret = -EPERM;
   1171		break;
   1172
   1173	case SO_LINGER:
   1174		if (optlen < sizeof(ling)) {
   1175			ret = -EINVAL;	/* 1003.1g */
   1176			break;
   1177		}
   1178		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
   1179			ret = -EFAULT;
   1180			break;
   1181		}
   1182		if (!ling.l_onoff)
   1183			sock_reset_flag(sk, SOCK_LINGER);
   1184		else {
   1185#if (BITS_PER_LONG == 32)
   1186			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
   1187				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
   1188			else
   1189#endif
   1190				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
   1191			sock_set_flag(sk, SOCK_LINGER);
   1192		}
   1193		break;
   1194
   1195	case SO_BSDCOMPAT:
   1196		break;
   1197
   1198	case SO_PASSCRED:
   1199		if (valbool)
   1200			set_bit(SOCK_PASSCRED, &sock->flags);
   1201		else
   1202			clear_bit(SOCK_PASSCRED, &sock->flags);
   1203		break;
   1204
   1205	case SO_TIMESTAMP_OLD:
   1206	case SO_TIMESTAMP_NEW:
   1207	case SO_TIMESTAMPNS_OLD:
   1208	case SO_TIMESTAMPNS_NEW:
   1209		sock_set_timestamp(sk, optname, valbool);
   1210		break;
   1211
   1212	case SO_TIMESTAMPING_NEW:
   1213	case SO_TIMESTAMPING_OLD:
   1214		if (optlen == sizeof(timestamping)) {
   1215			if (copy_from_sockptr(&timestamping, optval,
   1216					      sizeof(timestamping))) {
   1217				ret = -EFAULT;
   1218				break;
   1219			}
   1220		} else {
   1221			memset(&timestamping, 0, sizeof(timestamping));
   1222			timestamping.flags = val;
   1223		}
   1224		ret = sock_set_timestamping(sk, optname, timestamping);
   1225		break;
   1226
   1227	case SO_RCVLOWAT:
   1228		if (val < 0)
   1229			val = INT_MAX;
   1230		if (sock->ops->set_rcvlowat)
   1231			ret = sock->ops->set_rcvlowat(sk, val);
   1232		else
   1233			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
   1234		break;
   1235
   1236	case SO_RCVTIMEO_OLD:
   1237	case SO_RCVTIMEO_NEW:
   1238		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
   1239				       optlen, optname == SO_RCVTIMEO_OLD);
   1240		break;
   1241
   1242	case SO_SNDTIMEO_OLD:
   1243	case SO_SNDTIMEO_NEW:
   1244		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
   1245				       optlen, optname == SO_SNDTIMEO_OLD);
   1246		break;
   1247
   1248	case SO_ATTACH_FILTER: {
   1249		struct sock_fprog fprog;
   1250
   1251		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
   1252		if (!ret)
   1253			ret = sk_attach_filter(&fprog, sk);
   1254		break;
   1255	}
   1256	case SO_ATTACH_BPF:
   1257		ret = -EINVAL;
   1258		if (optlen == sizeof(u32)) {
   1259			u32 ufd;
   1260
   1261			ret = -EFAULT;
   1262			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
   1263				break;
   1264
   1265			ret = sk_attach_bpf(ufd, sk);
   1266		}
   1267		break;
   1268
   1269	case SO_ATTACH_REUSEPORT_CBPF: {
   1270		struct sock_fprog fprog;
   1271
   1272		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
   1273		if (!ret)
   1274			ret = sk_reuseport_attach_filter(&fprog, sk);
   1275		break;
   1276	}
   1277	case SO_ATTACH_REUSEPORT_EBPF:
   1278		ret = -EINVAL;
   1279		if (optlen == sizeof(u32)) {
   1280			u32 ufd;
   1281
   1282			ret = -EFAULT;
   1283			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
   1284				break;
   1285
   1286			ret = sk_reuseport_attach_bpf(ufd, sk);
   1287		}
   1288		break;
   1289
   1290	case SO_DETACH_REUSEPORT_BPF:
   1291		ret = reuseport_detach_prog(sk);
   1292		break;
   1293
   1294	case SO_DETACH_FILTER:
   1295		ret = sk_detach_filter(sk);
   1296		break;
   1297
   1298	case SO_LOCK_FILTER:
   1299		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
   1300			ret = -EPERM;
   1301		else
   1302			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
   1303		break;
   1304
   1305	case SO_PASSSEC:
   1306		if (valbool)
   1307			set_bit(SOCK_PASSSEC, &sock->flags);
   1308		else
   1309			clear_bit(SOCK_PASSSEC, &sock->flags);
   1310		break;
   1311	case SO_MARK:
   1312		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
   1313		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
   1314			ret = -EPERM;
   1315			break;
   1316		}
   1317
   1318		__sock_set_mark(sk, val);
   1319		break;
   1320	case SO_RCVMARK:
   1321		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
   1322		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
   1323			ret = -EPERM;
   1324			break;
   1325		}
   1326
   1327		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
   1328		break;
   1329
   1330	case SO_RXQ_OVFL:
   1331		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
   1332		break;
   1333
   1334	case SO_WIFI_STATUS:
   1335		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
   1336		break;
   1337
   1338	case SO_PEEK_OFF:
   1339		if (sock->ops->set_peek_off)
   1340			ret = sock->ops->set_peek_off(sk, val);
   1341		else
   1342			ret = -EOPNOTSUPP;
   1343		break;
   1344
   1345	case SO_NOFCS:
   1346		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
   1347		break;
   1348
   1349	case SO_SELECT_ERR_QUEUE:
   1350		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
   1351		break;
   1352
   1353#ifdef CONFIG_NET_RX_BUSY_POLL
   1354	case SO_BUSY_POLL:
   1355		/* allow unprivileged users to decrease the value */
   1356		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
   1357			ret = -EPERM;
   1358		else {
   1359			if (val < 0)
   1360				ret = -EINVAL;
   1361			else
   1362				WRITE_ONCE(sk->sk_ll_usec, val);
   1363		}
   1364		break;
   1365	case SO_PREFER_BUSY_POLL:
   1366		if (valbool && !capable(CAP_NET_ADMIN))
   1367			ret = -EPERM;
   1368		else
   1369			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
   1370		break;
   1371	case SO_BUSY_POLL_BUDGET:
   1372		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
   1373			ret = -EPERM;
   1374		} else {
   1375			if (val < 0 || val > U16_MAX)
   1376				ret = -EINVAL;
   1377			else
   1378				WRITE_ONCE(sk->sk_busy_poll_budget, val);
   1379		}
   1380		break;
   1381#endif
   1382
   1383	case SO_MAX_PACING_RATE:
   1384		{
   1385		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
   1386
   1387		if (sizeof(ulval) != sizeof(val) &&
   1388		    optlen >= sizeof(ulval) &&
   1389		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
   1390			ret = -EFAULT;
   1391			break;
   1392		}
   1393		if (ulval != ~0UL)
   1394			cmpxchg(&sk->sk_pacing_status,
   1395				SK_PACING_NONE,
   1396				SK_PACING_NEEDED);
   1397		sk->sk_max_pacing_rate = ulval;
   1398		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
   1399		break;
   1400		}
   1401	case SO_INCOMING_CPU:
   1402		WRITE_ONCE(sk->sk_incoming_cpu, val);
   1403		break;
   1404
   1405	case SO_CNX_ADVICE:
   1406		if (val == 1)
   1407			dst_negative_advice(sk);
   1408		break;
   1409
   1410	case SO_ZEROCOPY:
   1411		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
   1412			if (!(sk_is_tcp(sk) ||
   1413			      (sk->sk_type == SOCK_DGRAM &&
   1414			       sk->sk_protocol == IPPROTO_UDP)))
   1415				ret = -EOPNOTSUPP;
   1416		} else if (sk->sk_family != PF_RDS) {
   1417			ret = -EOPNOTSUPP;
   1418		}
   1419		if (!ret) {
   1420			if (val < 0 || val > 1)
   1421				ret = -EINVAL;
   1422			else
   1423				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
   1424		}
   1425		break;
   1426
   1427	case SO_TXTIME:
   1428		if (optlen != sizeof(struct sock_txtime)) {
   1429			ret = -EINVAL;
   1430			break;
   1431		} else if (copy_from_sockptr(&sk_txtime, optval,
   1432			   sizeof(struct sock_txtime))) {
   1433			ret = -EFAULT;
   1434			break;
   1435		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
   1436			ret = -EINVAL;
   1437			break;
   1438		}
   1439		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
    1440		 * scheduler has enough safeguards.
   1441		 */
   1442		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
   1443		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
   1444			ret = -EPERM;
   1445			break;
   1446		}
   1447		sock_valbool_flag(sk, SOCK_TXTIME, true);
   1448		sk->sk_clockid = sk_txtime.clockid;
   1449		sk->sk_txtime_deadline_mode =
   1450			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
   1451		sk->sk_txtime_report_errors =
   1452			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
   1453		break;
   1454
   1455	case SO_BINDTOIFINDEX:
   1456		ret = sock_bindtoindex_locked(sk, val);
   1457		break;
   1458
   1459	case SO_BUF_LOCK:
   1460		if (val & ~SOCK_BUF_LOCK_MASK) {
   1461			ret = -EINVAL;
   1462			break;
   1463		}
   1464		sk->sk_userlocks = val | (sk->sk_userlocks &
   1465					  ~SOCK_BUF_LOCK_MASK);
   1466		break;
   1467
   1468	case SO_RESERVE_MEM:
   1469	{
   1470		int delta;
   1471
   1472		if (val < 0) {
   1473			ret = -EINVAL;
   1474			break;
   1475		}
   1476
   1477		delta = val - sk->sk_reserved_mem;
   1478		if (delta < 0)
   1479			sock_release_reserved_memory(sk, -delta);
   1480		else
   1481			ret = sock_reserve_memory(sk, delta);
   1482		break;
   1483	}
   1484
   1485	case SO_TXREHASH:
   1486		if (val < -1 || val > 1) {
   1487			ret = -EINVAL;
   1488			break;
   1489		}
   1490		/* Paired with READ_ONCE() in tcp_rtx_synack() */
   1491		WRITE_ONCE(sk->sk_txrehash, (u8)val);
   1492		break;
   1493
   1494	default:
   1495		ret = -ENOPROTOOPT;
   1496		break;
   1497	}
   1498	release_sock(sk);
   1499	return ret;
   1500}
   1501EXPORT_SYMBOL(sock_setsockopt);
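
/* Editor's note: illustrative sketch, not part of the original file.
 * In-kernel callers wrap kernel pointers in sockptr_t; e.g. setting SO_MARK
 * on a kernel socket might look like this (example_mark_kernel_sock() is a
 * hypothetical helper).
 */
#if 0
static int example_mark_kernel_sock(struct socket *sock, u32 mark)
{
	return sock_setsockopt(sock, SOL_SOCKET, SO_MARK,
			       KERNEL_SOCKPTR(&mark), sizeof(mark));
}
#endif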
   1502
   1503static const struct cred *sk_get_peer_cred(struct sock *sk)
   1504{
   1505	const struct cred *cred;
   1506
   1507	spin_lock(&sk->sk_peer_lock);
   1508	cred = get_cred(sk->sk_peer_cred);
   1509	spin_unlock(&sk->sk_peer_lock);
   1510
   1511	return cred;
   1512}
   1513
   1514static void cred_to_ucred(struct pid *pid, const struct cred *cred,
   1515			  struct ucred *ucred)
   1516{
   1517	ucred->pid = pid_vnr(pid);
   1518	ucred->uid = ucred->gid = -1;
   1519	if (cred) {
   1520		struct user_namespace *current_ns = current_user_ns();
   1521
   1522		ucred->uid = from_kuid_munged(current_ns, cred->euid);
   1523		ucred->gid = from_kgid_munged(current_ns, cred->egid);
   1524	}
   1525}
   1526
   1527static int groups_to_user(gid_t __user *dst, const struct group_info *src)
   1528{
   1529	struct user_namespace *user_ns = current_user_ns();
   1530	int i;
   1531
   1532	for (i = 0; i < src->ngroups; i++)
   1533		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
   1534			return -EFAULT;
   1535
   1536	return 0;
   1537}
   1538
   1539int sock_getsockopt(struct socket *sock, int level, int optname,
   1540		    char __user *optval, int __user *optlen)
   1541{
   1542	struct sock *sk = sock->sk;
   1543
   1544	union {
   1545		int val;
   1546		u64 val64;
   1547		unsigned long ulval;
   1548		struct linger ling;
   1549		struct old_timeval32 tm32;
   1550		struct __kernel_old_timeval tm;
   1551		struct  __kernel_sock_timeval stm;
   1552		struct sock_txtime txtime;
   1553		struct so_timestamping timestamping;
   1554	} v;
   1555
   1556	int lv = sizeof(int);
   1557	int len;
   1558
   1559	if (get_user(len, optlen))
   1560		return -EFAULT;
   1561	if (len < 0)
   1562		return -EINVAL;
   1563
   1564	memset(&v, 0, sizeof(v));
   1565
   1566	switch (optname) {
   1567	case SO_DEBUG:
   1568		v.val = sock_flag(sk, SOCK_DBG);
   1569		break;
   1570
   1571	case SO_DONTROUTE:
   1572		v.val = sock_flag(sk, SOCK_LOCALROUTE);
   1573		break;
   1574
   1575	case SO_BROADCAST:
   1576		v.val = sock_flag(sk, SOCK_BROADCAST);
   1577		break;
   1578
   1579	case SO_SNDBUF:
   1580		v.val = sk->sk_sndbuf;
   1581		break;
   1582
   1583	case SO_RCVBUF:
   1584		v.val = sk->sk_rcvbuf;
   1585		break;
   1586
   1587	case SO_REUSEADDR:
   1588		v.val = sk->sk_reuse;
   1589		break;
   1590
   1591	case SO_REUSEPORT:
   1592		v.val = sk->sk_reuseport;
   1593		break;
   1594
   1595	case SO_KEEPALIVE:
   1596		v.val = sock_flag(sk, SOCK_KEEPOPEN);
   1597		break;
   1598
   1599	case SO_TYPE:
   1600		v.val = sk->sk_type;
   1601		break;
   1602
   1603	case SO_PROTOCOL:
   1604		v.val = sk->sk_protocol;
   1605		break;
   1606
   1607	case SO_DOMAIN:
   1608		v.val = sk->sk_family;
   1609		break;
   1610
   1611	case SO_ERROR:
   1612		v.val = -sock_error(sk);
   1613		if (v.val == 0)
   1614			v.val = xchg(&sk->sk_err_soft, 0);
   1615		break;
   1616
   1617	case SO_OOBINLINE:
   1618		v.val = sock_flag(sk, SOCK_URGINLINE);
   1619		break;
   1620
   1621	case SO_NO_CHECK:
   1622		v.val = sk->sk_no_check_tx;
   1623		break;
   1624
   1625	case SO_PRIORITY:
   1626		v.val = sk->sk_priority;
   1627		break;
   1628
   1629	case SO_LINGER:
   1630		lv		= sizeof(v.ling);
   1631		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
   1632		v.ling.l_linger	= sk->sk_lingertime / HZ;
   1633		break;
   1634
   1635	case SO_BSDCOMPAT:
   1636		break;
   1637
   1638	case SO_TIMESTAMP_OLD:
   1639		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
   1640				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
   1641				!sock_flag(sk, SOCK_RCVTSTAMPNS);
   1642		break;
   1643
   1644	case SO_TIMESTAMPNS_OLD:
   1645		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
   1646		break;
   1647
   1648	case SO_TIMESTAMP_NEW:
   1649		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
   1650		break;
   1651
   1652	case SO_TIMESTAMPNS_NEW:
   1653		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
   1654		break;
   1655
   1656	case SO_TIMESTAMPING_OLD:
   1657		lv = sizeof(v.timestamping);
   1658		v.timestamping.flags = sk->sk_tsflags;
   1659		v.timestamping.bind_phc = sk->sk_bind_phc;
   1660		break;
   1661
   1662	case SO_RCVTIMEO_OLD:
   1663	case SO_RCVTIMEO_NEW:
   1664		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
   1665		break;
   1666
   1667	case SO_SNDTIMEO_OLD:
   1668	case SO_SNDTIMEO_NEW:
   1669		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
   1670		break;
   1671
   1672	case SO_RCVLOWAT:
   1673		v.val = sk->sk_rcvlowat;
   1674		break;
   1675
   1676	case SO_SNDLOWAT:
   1677		v.val = 1;
   1678		break;
   1679
   1680	case SO_PASSCRED:
   1681		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
   1682		break;
   1683
   1684	case SO_PEERCRED:
   1685	{
   1686		struct ucred peercred;
   1687		if (len > sizeof(peercred))
   1688			len = sizeof(peercred);
   1689
   1690		spin_lock(&sk->sk_peer_lock);
   1691		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
   1692		spin_unlock(&sk->sk_peer_lock);
   1693
   1694		if (copy_to_user(optval, &peercred, len))
   1695			return -EFAULT;
   1696		goto lenout;
   1697	}
   1698
   1699	case SO_PEERGROUPS:
   1700	{
   1701		const struct cred *cred;
   1702		int ret, n;
   1703
   1704		cred = sk_get_peer_cred(sk);
   1705		if (!cred)
   1706			return -ENODATA;
   1707
   1708		n = cred->group_info->ngroups;
   1709		if (len < n * sizeof(gid_t)) {
   1710			len = n * sizeof(gid_t);
   1711			put_cred(cred);
   1712			return put_user(len, optlen) ? -EFAULT : -ERANGE;
   1713		}
   1714		len = n * sizeof(gid_t);
   1715
   1716		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
   1717		put_cred(cred);
   1718		if (ret)
   1719			return ret;
   1720		goto lenout;
   1721	}
   1722
   1723	case SO_PEERNAME:
   1724	{
   1725		char address[128];
   1726
   1727		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
   1728		if (lv < 0)
   1729			return -ENOTCONN;
   1730		if (lv < len)
   1731			return -EINVAL;
   1732		if (copy_to_user(optval, address, len))
   1733			return -EFAULT;
   1734		goto lenout;
   1735	}
   1736
   1737	/* Dubious BSD thing... Probably nobody even uses it, but
   1738	 * the UNIX standard wants it for whatever reason... -DaveM
   1739	 */
   1740	case SO_ACCEPTCONN:
   1741		v.val = sk->sk_state == TCP_LISTEN;
   1742		break;
   1743
   1744	case SO_PASSSEC:
   1745		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
   1746		break;
   1747
   1748	case SO_PEERSEC:
   1749		return security_socket_getpeersec_stream(sock, optval, optlen, len);
   1750
   1751	case SO_MARK:
   1752		v.val = sk->sk_mark;
   1753		break;
   1754
   1755	case SO_RCVMARK:
   1756		v.val = sock_flag(sk, SOCK_RCVMARK);
   1757		break;
   1758
   1759	case SO_RXQ_OVFL:
   1760		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
   1761		break;
   1762
   1763	case SO_WIFI_STATUS:
   1764		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
   1765		break;
   1766
   1767	case SO_PEEK_OFF:
   1768		if (!sock->ops->set_peek_off)
   1769			return -EOPNOTSUPP;
   1770
   1771		v.val = sk->sk_peek_off;
   1772		break;
   1773	case SO_NOFCS:
   1774		v.val = sock_flag(sk, SOCK_NOFCS);
   1775		break;
   1776
   1777	case SO_BINDTODEVICE:
   1778		return sock_getbindtodevice(sk, optval, optlen, len);
   1779
   1780	case SO_GET_FILTER:
   1781		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
   1782		if (len < 0)
   1783			return len;
   1784
   1785		goto lenout;
   1786
   1787	case SO_LOCK_FILTER:
   1788		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
   1789		break;
   1790
   1791	case SO_BPF_EXTENSIONS:
   1792		v.val = bpf_tell_extensions();
   1793		break;
   1794
   1795	case SO_SELECT_ERR_QUEUE:
   1796		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
   1797		break;
   1798
   1799#ifdef CONFIG_NET_RX_BUSY_POLL
   1800	case SO_BUSY_POLL:
   1801		v.val = sk->sk_ll_usec;
   1802		break;
   1803	case SO_PREFER_BUSY_POLL:
   1804		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
   1805		break;
   1806#endif
   1807
   1808	case SO_MAX_PACING_RATE:
   1809		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
   1810			lv = sizeof(v.ulval);
   1811			v.ulval = sk->sk_max_pacing_rate;
   1812		} else {
   1813			/* 32bit version */
   1814			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
   1815		}
   1816		break;
   1817
   1818	case SO_INCOMING_CPU:
   1819		v.val = READ_ONCE(sk->sk_incoming_cpu);
   1820		break;
   1821
   1822	case SO_MEMINFO:
   1823	{
   1824		u32 meminfo[SK_MEMINFO_VARS];
   1825
   1826		sk_get_meminfo(sk, meminfo);
   1827
   1828		len = min_t(unsigned int, len, sizeof(meminfo));
   1829		if (copy_to_user(optval, &meminfo, len))
   1830			return -EFAULT;
   1831
   1832		goto lenout;
   1833	}
   1834
   1835#ifdef CONFIG_NET_RX_BUSY_POLL
   1836	case SO_INCOMING_NAPI_ID:
   1837		v.val = READ_ONCE(sk->sk_napi_id);
   1838
   1839		/* aggregate non-NAPI IDs down to 0 */
   1840		if (v.val < MIN_NAPI_ID)
   1841			v.val = 0;
   1842
   1843		break;
   1844#endif
   1845
   1846	case SO_COOKIE:
   1847		lv = sizeof(u64);
   1848		if (len < lv)
   1849			return -EINVAL;
   1850		v.val64 = sock_gen_cookie(sk);
   1851		break;
   1852
   1853	case SO_ZEROCOPY:
   1854		v.val = sock_flag(sk, SOCK_ZEROCOPY);
   1855		break;
   1856
   1857	case SO_TXTIME:
   1858		lv = sizeof(v.txtime);
   1859		v.txtime.clockid = sk->sk_clockid;
   1860		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
   1861				  SOF_TXTIME_DEADLINE_MODE : 0;
   1862		v.txtime.flags |= sk->sk_txtime_report_errors ?
   1863				  SOF_TXTIME_REPORT_ERRORS : 0;
   1864		break;
   1865
   1866	case SO_BINDTOIFINDEX:
   1867		v.val = READ_ONCE(sk->sk_bound_dev_if);
   1868		break;
   1869
   1870	case SO_NETNS_COOKIE:
   1871		lv = sizeof(u64);
   1872		if (len != lv)
   1873			return -EINVAL;
   1874		v.val64 = sock_net(sk)->net_cookie;
   1875		break;
   1876
   1877	case SO_BUF_LOCK:
   1878		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
   1879		break;
   1880
   1881	case SO_RESERVE_MEM:
   1882		v.val = sk->sk_reserved_mem;
   1883		break;
   1884
   1885	case SO_TXREHASH:
   1886		v.val = sk->sk_txrehash;
   1887		break;
   1888
   1889	default:
   1890		/* We implement the SO_SNDLOWAT etc to not be settable
   1891		 * (1003.1g 7).
   1892		 */
   1893		return -ENOPROTOOPT;
   1894	}
   1895
   1896	if (len > lv)
   1897		len = lv;
   1898	if (copy_to_user(optval, &v, len))
   1899		return -EFAULT;
   1900lenout:
   1901	if (put_user(len, optlen))
   1902		return -EFAULT;
   1903	return 0;
   1904}
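
/* A minimal userspace sketch (not part of this file) of how the SO_ACCEPTCONN
 * branch handled above is typically exercised; "print_acceptconn" and "fd"
 * are hypothetical names used only for illustration.
 */
#include <sys/socket.h>
#include <stdio.h>

static void print_acceptconn(int fd)
{
	int val = 0;
	socklen_t len = sizeof(val);

	/* SOL_SOCKET options are answered by sk_getsockopt() above */
	if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &val, &len) == 0)
		printf("listening: %d\n", val);
}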
   1905
   1906/*
   1907 * Initialize an sk_lock.
   1908 *
   1909 * (We also register the sk_lock with the lock validator.)
   1910 */
   1911static inline void sock_lock_init(struct sock *sk)
   1912{
   1913	if (sk->sk_kern_sock)
   1914		sock_lock_init_class_and_name(
   1915			sk,
   1916			af_family_kern_slock_key_strings[sk->sk_family],
   1917			af_family_kern_slock_keys + sk->sk_family,
   1918			af_family_kern_key_strings[sk->sk_family],
   1919			af_family_kern_keys + sk->sk_family);
   1920	else
   1921		sock_lock_init_class_and_name(
   1922			sk,
   1923			af_family_slock_key_strings[sk->sk_family],
   1924			af_family_slock_keys + sk->sk_family,
   1925			af_family_key_strings[sk->sk_family],
   1926			af_family_keys + sk->sk_family);
   1927}
   1928
   1929/*
   1930 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
    1931	 * even temporarily, because of RCU lookups. sk_node should also be left as is.
   1932 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
   1933 */
   1934static void sock_copy(struct sock *nsk, const struct sock *osk)
   1935{
   1936	const struct proto *prot = READ_ONCE(osk->sk_prot);
   1937#ifdef CONFIG_SECURITY_NETWORK
   1938	void *sptr = nsk->sk_security;
   1939#endif
   1940
   1941	/* If we move sk_tx_queue_mapping out of the private section,
   1942	 * we must check if sk_tx_queue_clear() is called after
   1943	 * sock_copy() in sk_clone_lock().
   1944	 */
   1945	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
   1946		     offsetof(struct sock, sk_dontcopy_begin) ||
   1947		     offsetof(struct sock, sk_tx_queue_mapping) >=
   1948		     offsetof(struct sock, sk_dontcopy_end));
   1949
   1950	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
   1951
   1952	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
   1953	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
   1954
   1955#ifdef CONFIG_SECURITY_NETWORK
   1956	nsk->sk_security = sptr;
   1957	security_sk_clone(osk, nsk);
   1958#endif
   1959}
   1960
   1961static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
   1962		int family)
   1963{
   1964	struct sock *sk;
   1965	struct kmem_cache *slab;
   1966
   1967	slab = prot->slab;
   1968	if (slab != NULL) {
   1969		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
   1970		if (!sk)
   1971			return sk;
   1972		if (want_init_on_alloc(priority))
   1973			sk_prot_clear_nulls(sk, prot->obj_size);
   1974	} else
   1975		sk = kmalloc(prot->obj_size, priority);
   1976
   1977	if (sk != NULL) {
   1978		if (security_sk_alloc(sk, family, priority))
   1979			goto out_free;
   1980
   1981		if (!try_module_get(prot->owner))
   1982			goto out_free_sec;
   1983	}
   1984
   1985	return sk;
   1986
   1987out_free_sec:
   1988	security_sk_free(sk);
   1989out_free:
   1990	if (slab != NULL)
   1991		kmem_cache_free(slab, sk);
   1992	else
   1993		kfree(sk);
   1994	return NULL;
   1995}
   1996
   1997static void sk_prot_free(struct proto *prot, struct sock *sk)
   1998{
   1999	struct kmem_cache *slab;
   2000	struct module *owner;
   2001
   2002	owner = prot->owner;
   2003	slab = prot->slab;
   2004
   2005	cgroup_sk_free(&sk->sk_cgrp_data);
   2006	mem_cgroup_sk_free(sk);
   2007	security_sk_free(sk);
   2008	if (slab != NULL)
   2009		kmem_cache_free(slab, sk);
   2010	else
   2011		kfree(sk);
   2012	module_put(owner);
   2013}
   2014
   2015/**
   2016 *	sk_alloc - All socket objects are allocated here
   2017 *	@net: the applicable net namespace
   2018 *	@family: protocol family
   2019 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
   2020 *	@prot: struct proto associated with this new sock instance
   2021 *	@kern: is this to be a kernel socket?
   2022 */
   2023struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
   2024		      struct proto *prot, int kern)
   2025{
   2026	struct sock *sk;
   2027
   2028	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
   2029	if (sk) {
   2030		sk->sk_family = family;
   2031		/*
   2032		 * See comment in struct sock definition to understand
   2033		 * why we need sk_prot_creator -acme
   2034		 */
   2035		sk->sk_prot = sk->sk_prot_creator = prot;
   2036		sk->sk_kern_sock = kern;
   2037		sock_lock_init(sk);
   2038		sk->sk_net_refcnt = kern ? 0 : 1;
   2039		if (likely(sk->sk_net_refcnt)) {
   2040			get_net_track(net, &sk->ns_tracker, priority);
   2041			sock_inuse_add(net, 1);
   2042		}
   2043
   2044		sock_net_set(sk, net);
   2045		refcount_set(&sk->sk_wmem_alloc, 1);
   2046
   2047		mem_cgroup_sk_alloc(sk);
   2048		cgroup_sk_alloc(&sk->sk_cgrp_data);
   2049		sock_update_classid(&sk->sk_cgrp_data);
   2050		sock_update_netprioidx(&sk->sk_cgrp_data);
   2051		sk_tx_queue_clear(sk);
   2052	}
   2053
   2054	return sk;
   2055}
   2056EXPORT_SYMBOL(sk_alloc);
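
/* A hedged sketch (not part of this file) of how an address family's
 * ->create() handler typically uses sk_alloc() together with
 * sock_init_data() below; "my_create", "my_proto" and MY_PROTO_FAMILY are
 * hypothetical placeholders for a real handler, struct proto and PF_* value.
 */
static int my_create(struct net *net, struct socket *sock, int protocol,
		     int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, MY_PROTO_FAMILY, GFP_KERNEL, &my_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);	/* attaches sk to sock, sets defaults */
	sk->sk_protocol = protocol;
	return 0;
}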
   2057
   2058/* Sockets having SOCK_RCU_FREE will call this function after one RCU
   2059 * grace period. This is the case for UDP sockets and TCP listeners.
   2060 */
   2061static void __sk_destruct(struct rcu_head *head)
   2062{
   2063	struct sock *sk = container_of(head, struct sock, sk_rcu);
   2064	struct sk_filter *filter;
   2065
   2066	if (sk->sk_destruct)
   2067		sk->sk_destruct(sk);
   2068
   2069	filter = rcu_dereference_check(sk->sk_filter,
   2070				       refcount_read(&sk->sk_wmem_alloc) == 0);
   2071	if (filter) {
   2072		sk_filter_uncharge(sk, filter);
   2073		RCU_INIT_POINTER(sk->sk_filter, NULL);
   2074	}
   2075
   2076	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
   2077
   2078#ifdef CONFIG_BPF_SYSCALL
   2079	bpf_sk_storage_free(sk);
   2080#endif
   2081
   2082	if (atomic_read(&sk->sk_omem_alloc))
   2083		pr_debug("%s: optmem leakage (%d bytes) detected\n",
   2084			 __func__, atomic_read(&sk->sk_omem_alloc));
   2085
   2086	if (sk->sk_frag.page) {
   2087		put_page(sk->sk_frag.page);
   2088		sk->sk_frag.page = NULL;
   2089	}
   2090
    2091	/* We do not need to acquire sk->sk_peer_lock, as we are the last user. */
   2092	put_cred(sk->sk_peer_cred);
   2093	put_pid(sk->sk_peer_pid);
   2094
   2095	if (likely(sk->sk_net_refcnt))
   2096		put_net_track(sock_net(sk), &sk->ns_tracker);
   2097	sk_prot_free(sk->sk_prot_creator, sk);
   2098}
   2099
   2100void sk_destruct(struct sock *sk)
   2101{
   2102	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
   2103
   2104	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
   2105		reuseport_detach_sock(sk);
   2106		use_call_rcu = true;
   2107	}
   2108
   2109	if (use_call_rcu)
   2110		call_rcu(&sk->sk_rcu, __sk_destruct);
   2111	else
   2112		__sk_destruct(&sk->sk_rcu);
   2113}
   2114
   2115static void __sk_free(struct sock *sk)
   2116{
   2117	if (likely(sk->sk_net_refcnt))
   2118		sock_inuse_add(sock_net(sk), -1);
   2119
   2120	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
   2121		sock_diag_broadcast_destroy(sk);
   2122	else
   2123		sk_destruct(sk);
   2124}
   2125
   2126void sk_free(struct sock *sk)
   2127{
   2128	/*
    2129	 * We subtract one from sk_wmem_alloc to learn whether
    2130	 * some packets are still in a tx queue.
    2131	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
   2132	 */
   2133	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
   2134		__sk_free(sk);
   2135}
   2136EXPORT_SYMBOL(sk_free);
   2137
   2138static void sk_init_common(struct sock *sk)
   2139{
   2140	skb_queue_head_init(&sk->sk_receive_queue);
   2141	skb_queue_head_init(&sk->sk_write_queue);
   2142	skb_queue_head_init(&sk->sk_error_queue);
   2143
   2144	rwlock_init(&sk->sk_callback_lock);
   2145	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
   2146			af_rlock_keys + sk->sk_family,
   2147			af_family_rlock_key_strings[sk->sk_family]);
   2148	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
   2149			af_wlock_keys + sk->sk_family,
   2150			af_family_wlock_key_strings[sk->sk_family]);
   2151	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
   2152			af_elock_keys + sk->sk_family,
   2153			af_family_elock_key_strings[sk->sk_family]);
   2154	lockdep_set_class_and_name(&sk->sk_callback_lock,
   2155			af_callback_keys + sk->sk_family,
   2156			af_family_clock_key_strings[sk->sk_family]);
   2157}
   2158
   2159/**
   2160 *	sk_clone_lock - clone a socket, and lock its clone
   2161 *	@sk: the socket to clone
   2162 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
   2163 *
   2164 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
   2165 */
   2166struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
   2167{
   2168	struct proto *prot = READ_ONCE(sk->sk_prot);
   2169	struct sk_filter *filter;
   2170	bool is_charged = true;
   2171	struct sock *newsk;
   2172
   2173	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
   2174	if (!newsk)
   2175		goto out;
   2176
   2177	sock_copy(newsk, sk);
   2178
   2179	newsk->sk_prot_creator = prot;
   2180
   2181	/* SANITY */
   2182	if (likely(newsk->sk_net_refcnt)) {
   2183		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
   2184		sock_inuse_add(sock_net(newsk), 1);
   2185	}
   2186	sk_node_init(&newsk->sk_node);
   2187	sock_lock_init(newsk);
   2188	bh_lock_sock(newsk);
   2189	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
   2190	newsk->sk_backlog.len = 0;
   2191
   2192	atomic_set(&newsk->sk_rmem_alloc, 0);
   2193
   2194	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
   2195	refcount_set(&newsk->sk_wmem_alloc, 1);
   2196
   2197	atomic_set(&newsk->sk_omem_alloc, 0);
   2198	sk_init_common(newsk);
   2199
   2200	newsk->sk_dst_cache	= NULL;
   2201	newsk->sk_dst_pending_confirm = 0;
   2202	newsk->sk_wmem_queued	= 0;
   2203	newsk->sk_forward_alloc = 0;
   2204	newsk->sk_reserved_mem  = 0;
   2205	atomic_set(&newsk->sk_drops, 0);
   2206	newsk->sk_send_head	= NULL;
   2207	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
   2208	atomic_set(&newsk->sk_zckey, 0);
   2209
   2210	sock_reset_flag(newsk, SOCK_DONE);
   2211
   2212	/* sk->sk_memcg will be populated at accept() time */
   2213	newsk->sk_memcg = NULL;
   2214
   2215	cgroup_sk_clone(&newsk->sk_cgrp_data);
   2216
   2217	rcu_read_lock();
   2218	filter = rcu_dereference(sk->sk_filter);
   2219	if (filter != NULL)
   2220		/* though it's an empty new sock, the charging may fail
    2221		 * if sysctl_optmem_max was changed between creation of the
    2222		 * original socket and cloning
   2223		 */
   2224		is_charged = sk_filter_charge(newsk, filter);
   2225	RCU_INIT_POINTER(newsk->sk_filter, filter);
   2226	rcu_read_unlock();
   2227
   2228	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
   2229		/* We need to make sure that we don't uncharge the new
   2230		 * socket if we couldn't charge it in the first place
   2231		 * as otherwise we uncharge the parent's filter.
   2232		 */
   2233		if (!is_charged)
   2234			RCU_INIT_POINTER(newsk->sk_filter, NULL);
   2235		sk_free_unlock_clone(newsk);
   2236		newsk = NULL;
   2237		goto out;
   2238	}
   2239	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
   2240
   2241	if (bpf_sk_storage_clone(sk, newsk)) {
   2242		sk_free_unlock_clone(newsk);
   2243		newsk = NULL;
   2244		goto out;
   2245	}
   2246
   2247	/* Clear sk_user_data if parent had the pointer tagged
   2248	 * as not suitable for copying when cloning.
   2249	 */
   2250	if (sk_user_data_is_nocopy(newsk))
   2251		newsk->sk_user_data = NULL;
   2252
   2253	newsk->sk_err	   = 0;
   2254	newsk->sk_err_soft = 0;
   2255	newsk->sk_priority = 0;
   2256	newsk->sk_incoming_cpu = raw_smp_processor_id();
   2257
   2258	/* Before updating sk_refcnt, we must commit prior changes to memory
   2259	 * (Documentation/RCU/rculist_nulls.rst for details)
   2260	 */
   2261	smp_wmb();
   2262	refcount_set(&newsk->sk_refcnt, 2);
   2263
   2264	/* Increment the counter in the same struct proto as the master
   2265	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
   2266	 * is the same as sk->sk_prot->socks, as this field was copied
   2267	 * with memcpy).
   2268	 *
   2269	 * This _changes_ the previous behaviour, where
    2270	 * tcp_create_openreq_child always incremented the
    2271	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
    2272	 * to be taken into account by all callers. -acme
   2273	 */
   2274	sk_refcnt_debug_inc(newsk);
   2275	sk_set_socket(newsk, NULL);
   2276	sk_tx_queue_clear(newsk);
   2277	RCU_INIT_POINTER(newsk->sk_wq, NULL);
   2278
   2279	if (newsk->sk_prot->sockets_allocated)
   2280		sk_sockets_allocated_inc(newsk);
   2281
   2282	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
   2283		net_enable_timestamp();
   2284out:
   2285	return newsk;
   2286}
   2287EXPORT_SYMBOL_GPL(sk_clone_lock);
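
/* A hedged sketch of the usual sk_clone_lock() calling convention (compare
 * inet_csk_clone_lock()): the clone is returned bh-locked with sk_refcnt == 2,
 * and the caller must bh_unlock_sock() it once its fixups are done.
 * "my_clone" is a hypothetical wrapper, not a real kernel function.
 */
static struct sock *my_clone(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	/* ... protocol-specific fixups on the still-locked clone ... */

	bh_unlock_sock(newsk);
	return newsk;
}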
   2288
   2289void sk_free_unlock_clone(struct sock *sk)
   2290{
    2291	/* It is still a raw copy of the parent, so invalidate
    2292	 * the destructor and do a plain sk_free() */
   2293	sk->sk_destruct = NULL;
   2294	bh_unlock_sock(sk);
   2295	sk_free(sk);
   2296}
   2297EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
   2298
   2299static void sk_trim_gso_size(struct sock *sk)
   2300{
   2301	if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
   2302		return;
   2303#if IS_ENABLED(CONFIG_IPV6)
   2304	if (sk->sk_family == AF_INET6 &&
   2305	    sk_is_tcp(sk) &&
   2306	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
   2307		return;
   2308#endif
   2309	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
   2310}
   2311
   2312void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
   2313{
   2314	u32 max_segs = 1;
   2315
   2316	sk_dst_set(sk, dst);
   2317	sk->sk_route_caps = dst->dev->features;
   2318	if (sk_is_tcp(sk))
   2319		sk->sk_route_caps |= NETIF_F_GSO;
   2320	if (sk->sk_route_caps & NETIF_F_GSO)
   2321		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
   2322	if (unlikely(sk->sk_gso_disabled))
   2323		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
   2324	if (sk_can_gso(sk)) {
   2325		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
   2326			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
   2327		} else {
   2328			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
   2329			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
   2330			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
   2331			sk_trim_gso_size(sk);
   2332			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
   2333			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
   2334			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
   2335		}
   2336	}
   2337	sk->sk_gso_max_segs = max_segs;
   2338}
   2339EXPORT_SYMBOL_GPL(sk_setup_caps);
   2340
   2341/*
   2342 *	Simple resource managers for sockets.
   2343 */
   2344
   2345
   2346/*
   2347 * Write buffer destructor automatically called from kfree_skb.
   2348 */
   2349void sock_wfree(struct sk_buff *skb)
   2350{
   2351	struct sock *sk = skb->sk;
   2352	unsigned int len = skb->truesize;
   2353	bool free;
   2354
   2355	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
   2356		if (sock_flag(sk, SOCK_RCU_FREE) &&
   2357		    sk->sk_write_space == sock_def_write_space) {
   2358			rcu_read_lock();
   2359			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
   2360			sock_def_write_space_wfree(sk);
   2361			rcu_read_unlock();
   2362			if (unlikely(free))
   2363				__sk_free(sk);
   2364			return;
   2365		}
   2366
   2367		/*
    2368		 * Keep a reference on sk_wmem_alloc; it will be released
    2369		 * after the sk_write_space() call.
   2370		 */
   2371		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
   2372		sk->sk_write_space(sk);
   2373		len = 1;
   2374	}
   2375	/*
   2376	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
   2377	 * could not do because of in-flight packets
   2378	 */
   2379	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
   2380		__sk_free(sk);
   2381}
   2382EXPORT_SYMBOL(sock_wfree);
   2383
   2384/* This variant of sock_wfree() is used by TCP,
   2385 * since it sets SOCK_USE_WRITE_QUEUE.
   2386 */
   2387void __sock_wfree(struct sk_buff *skb)
   2388{
   2389	struct sock *sk = skb->sk;
   2390
   2391	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
   2392		__sk_free(sk);
   2393}
   2394
   2395void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
   2396{
   2397	skb_orphan(skb);
   2398	skb->sk = sk;
   2399#ifdef CONFIG_INET
   2400	if (unlikely(!sk_fullsock(sk))) {
   2401		skb->destructor = sock_edemux;
   2402		sock_hold(sk);
   2403		return;
   2404	}
   2405#endif
   2406	skb->destructor = sock_wfree;
   2407	skb_set_hash_from_sk(skb, sk);
   2408	/*
    2409	 * We used to take a refcount on sk, but the following operation
    2410	 * is enough to guarantee sk_free() won't free this sock until
    2411	 * all in-flight packets have completed.
   2412	 */
   2413	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
   2414}
   2415EXPORT_SYMBOL(skb_set_owner_w);
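
/* A minimal sketch of the ownership protocol skb_set_owner_w() implements:
 * skb->truesize is charged to sk_wmem_alloc, and kfree_skb() releases the
 * charge again through the sock_wfree() destructor installed above.
 * "my_charge_example" is hypothetical.
 */
static int my_charge_example(struct sock *sk)
{
	struct sk_buff *skb = alloc_skb(128, GFP_KERNEL);

	if (!skb)
		return -ENOMEM;

	skb_set_owner_w(skb, sk);	/* charges skb->truesize to sk */
	kfree_skb(skb);			/* sock_wfree() uncharges it again */
	return 0;
}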
   2416
   2417static bool can_skb_orphan_partial(const struct sk_buff *skb)
   2418{
   2419#ifdef CONFIG_TLS_DEVICE
   2420	/* Drivers depend on in-order delivery for crypto offload,
   2421	 * partial orphan breaks out-of-order-OK logic.
   2422	 */
   2423	if (skb->decrypted)
   2424		return false;
   2425#endif
   2426	return (skb->destructor == sock_wfree ||
   2427		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
   2428}
   2429
   2430/* This helper is used by netem, as it can hold packets in its
   2431 * delay queue. We want to allow the owner socket to send more
   2432 * packets, as if they were already TX completed by a typical driver.
   2433 * But we also want to keep skb->sk set because some packet schedulers
   2434 * rely on it (sch_fq for example).
   2435 */
   2436void skb_orphan_partial(struct sk_buff *skb)
   2437{
   2438	if (skb_is_tcp_pure_ack(skb))
   2439		return;
   2440
   2441	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
   2442		return;
   2443
   2444	skb_orphan(skb);
   2445}
   2446EXPORT_SYMBOL(skb_orphan_partial);
   2447
   2448/*
   2449 * Read buffer destructor automatically called from kfree_skb.
   2450 */
   2451void sock_rfree(struct sk_buff *skb)
   2452{
   2453	struct sock *sk = skb->sk;
   2454	unsigned int len = skb->truesize;
   2455
   2456	atomic_sub(len, &sk->sk_rmem_alloc);
   2457	sk_mem_uncharge(sk, len);
   2458}
   2459EXPORT_SYMBOL(sock_rfree);
   2460
   2461/*
   2462 * Buffer destructor for skbs that are not used directly in read or write
   2463 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
   2464 */
   2465void sock_efree(struct sk_buff *skb)
   2466{
   2467	sock_put(skb->sk);
   2468}
   2469EXPORT_SYMBOL(sock_efree);
   2470
   2471/* Buffer destructor for prefetch/receive path where reference count may
   2472 * not be held, e.g. for listen sockets.
   2473 */
   2474#ifdef CONFIG_INET
   2475void sock_pfree(struct sk_buff *skb)
   2476{
   2477	if (sk_is_refcounted(skb->sk))
   2478		sock_gen_put(skb->sk);
   2479}
   2480EXPORT_SYMBOL(sock_pfree);
   2481#endif /* CONFIG_INET */
   2482
   2483kuid_t sock_i_uid(struct sock *sk)
   2484{
   2485	kuid_t uid;
   2486
   2487	read_lock_bh(&sk->sk_callback_lock);
   2488	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
   2489	read_unlock_bh(&sk->sk_callback_lock);
   2490	return uid;
   2491}
   2492EXPORT_SYMBOL(sock_i_uid);
   2493
   2494unsigned long sock_i_ino(struct sock *sk)
   2495{
   2496	unsigned long ino;
   2497
   2498	read_lock_bh(&sk->sk_callback_lock);
   2499	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
   2500	read_unlock_bh(&sk->sk_callback_lock);
   2501	return ino;
   2502}
   2503EXPORT_SYMBOL(sock_i_ino);
   2504
   2505/*
   2506 * Allocate a skb from the socket's send buffer.
   2507 */
   2508struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
   2509			     gfp_t priority)
   2510{
   2511	if (force ||
   2512	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
   2513		struct sk_buff *skb = alloc_skb(size, priority);
   2514
   2515		if (skb) {
   2516			skb_set_owner_w(skb, sk);
   2517			return skb;
   2518		}
   2519	}
   2520	return NULL;
   2521}
   2522EXPORT_SYMBOL(sock_wmalloc);
   2523
   2524static void sock_ofree(struct sk_buff *skb)
   2525{
   2526	struct sock *sk = skb->sk;
   2527
   2528	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
   2529}
   2530
   2531struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
   2532			     gfp_t priority)
   2533{
   2534	struct sk_buff *skb;
   2535
   2536	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
   2537	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
   2538	    sysctl_optmem_max)
   2539		return NULL;
   2540
   2541	skb = alloc_skb(size, priority);
   2542	if (!skb)
   2543		return NULL;
   2544
   2545	atomic_add(skb->truesize, &sk->sk_omem_alloc);
   2546	skb->sk = sk;
   2547	skb->destructor = sock_ofree;
   2548	return skb;
   2549}
   2550
   2551/*
   2552 * Allocate a memory block from the socket's option memory buffer.
   2553 */
   2554void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
   2555{
   2556	if ((unsigned int)size <= sysctl_optmem_max &&
   2557	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
   2558		void *mem;
   2559		/* First do the add, to avoid the race if kmalloc
   2560		 * might sleep.
   2561		 */
   2562		atomic_add(size, &sk->sk_omem_alloc);
   2563		mem = kmalloc(size, priority);
   2564		if (mem)
   2565			return mem;
   2566		atomic_sub(size, &sk->sk_omem_alloc);
   2567	}
   2568	return NULL;
   2569}
   2570EXPORT_SYMBOL(sock_kmalloc);
   2571
   2572/* Free an option memory block. Note, we actually want the inline
   2573 * here as this allows gcc to detect the nullify and fold away the
   2574 * condition entirely.
   2575 */
   2576static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
   2577				  const bool nullify)
   2578{
   2579	if (WARN_ON_ONCE(!mem))
   2580		return;
   2581	if (nullify)
   2582		kfree_sensitive(mem);
   2583	else
   2584		kfree(mem);
   2585	atomic_sub(size, &sk->sk_omem_alloc);
   2586}
   2587
   2588void sock_kfree_s(struct sock *sk, void *mem, int size)
   2589{
   2590	__sock_kfree_s(sk, mem, size, false);
   2591}
   2592EXPORT_SYMBOL(sock_kfree_s);
   2593
   2594void sock_kzfree_s(struct sock *sk, void *mem, int size)
   2595{
   2596	__sock_kfree_s(sk, mem, size, true);
   2597}
   2598EXPORT_SYMBOL(sock_kzfree_s);
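
/* A hedged sketch of the sock_kmalloc()/sock_kfree_s() pairing used for
 * option memory: the same size must be passed to both calls so that
 * sk_omem_alloc balances out. "my_set_option" and "my_opt_size" are
 * hypothetical names.
 */
static int my_set_option(struct sock *sk, sockptr_t optval, int my_opt_size)
{
	void *buf;

	buf = sock_kmalloc(sk, my_opt_size, GFP_KERNEL);
	if (!buf)
		return -ENOBUFS;

	if (copy_from_sockptr(buf, optval, my_opt_size)) {
		sock_kfree_s(sk, buf, my_opt_size);
		return -EFAULT;
	}

	/* ... use or store buf; free with sock_kfree_s()/sock_kzfree_s() ... */
	sock_kfree_s(sk, buf, my_opt_size);
	return 0;
}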
   2599
   2600/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
    2601   I think these locks should be removed for datagram sockets.
   2602 */
   2603static long sock_wait_for_wmem(struct sock *sk, long timeo)
   2604{
   2605	DEFINE_WAIT(wait);
   2606
   2607	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
   2608	for (;;) {
   2609		if (!timeo)
   2610			break;
   2611		if (signal_pending(current))
   2612			break;
   2613		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
   2614		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
   2615		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
   2616			break;
   2617		if (sk->sk_shutdown & SEND_SHUTDOWN)
   2618			break;
   2619		if (sk->sk_err)
   2620			break;
   2621		timeo = schedule_timeout(timeo);
   2622	}
   2623	finish_wait(sk_sleep(sk), &wait);
   2624	return timeo;
   2625}
   2626
   2627
   2628/*
   2629 *	Generic send/receive buffer handlers
   2630 */
   2631
   2632struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
   2633				     unsigned long data_len, int noblock,
   2634				     int *errcode, int max_page_order)
   2635{
   2636	struct sk_buff *skb;
   2637	long timeo;
   2638	int err;
   2639
   2640	timeo = sock_sndtimeo(sk, noblock);
   2641	for (;;) {
   2642		err = sock_error(sk);
   2643		if (err != 0)
   2644			goto failure;
   2645
   2646		err = -EPIPE;
   2647		if (sk->sk_shutdown & SEND_SHUTDOWN)
   2648			goto failure;
   2649
   2650		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
   2651			break;
   2652
   2653		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
   2654		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
   2655		err = -EAGAIN;
   2656		if (!timeo)
   2657			goto failure;
   2658		if (signal_pending(current))
   2659			goto interrupted;
   2660		timeo = sock_wait_for_wmem(sk, timeo);
   2661	}
   2662	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
   2663				   errcode, sk->sk_allocation);
   2664	if (skb)
   2665		skb_set_owner_w(skb, sk);
   2666	return skb;
   2667
   2668interrupted:
   2669	err = sock_intr_errno(timeo);
   2670failure:
   2671	*errcode = err;
   2672	return NULL;
   2673}
   2674EXPORT_SYMBOL(sock_alloc_send_pskb);
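
/* A hedged sketch of a datagram send path built on sock_alloc_send_pskb():
 * it waits (subject to the send timeout) until the send buffer has room and
 * returns an skb already owned by the socket. "my_alloc_dgram" and
 * "my_hdr_len" are hypothetical.
 */
static struct sk_buff *my_alloc_dgram(struct sock *sk, size_t len,
				      int my_hdr_len, int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_pskb(sk, my_hdr_len + len, 0, noblock, err, 0);
	if (!skb)
		return NULL;		/* *err carries -EAGAIN, -EPIPE, ... */

	skb_reserve(skb, my_hdr_len);	/* leave room for protocol headers */
	return skb;
}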
   2675
   2676int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
   2677		     struct sockcm_cookie *sockc)
   2678{
   2679	u32 tsflags;
   2680
   2681	switch (cmsg->cmsg_type) {
   2682	case SO_MARK:
   2683		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
   2684		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
   2685			return -EPERM;
   2686		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
   2687			return -EINVAL;
   2688		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
   2689		break;
   2690	case SO_TIMESTAMPING_OLD:
   2691		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
   2692			return -EINVAL;
   2693
   2694		tsflags = *(u32 *)CMSG_DATA(cmsg);
   2695		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
   2696			return -EINVAL;
   2697
   2698		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
   2699		sockc->tsflags |= tsflags;
   2700		break;
   2701	case SCM_TXTIME:
   2702		if (!sock_flag(sk, SOCK_TXTIME))
   2703			return -EINVAL;
   2704		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
   2705			return -EINVAL;
   2706		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
   2707		break;
   2708	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
   2709	case SCM_RIGHTS:
   2710	case SCM_CREDENTIALS:
   2711		break;
   2712	default:
   2713		return -EINVAL;
   2714	}
   2715	return 0;
   2716}
   2717EXPORT_SYMBOL(__sock_cmsg_send);
   2718
   2719int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
   2720		   struct sockcm_cookie *sockc)
   2721{
   2722	struct cmsghdr *cmsg;
   2723	int ret;
   2724
   2725	for_each_cmsghdr(cmsg, msg) {
   2726		if (!CMSG_OK(msg, cmsg))
   2727			return -EINVAL;
   2728		if (cmsg->cmsg_level != SOL_SOCKET)
   2729			continue;
   2730		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
   2731		if (ret)
   2732			return ret;
   2733	}
   2734	return 0;
   2735}
   2736EXPORT_SYMBOL(sock_cmsg_send);
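
/* A hedged sketch of how sendmsg() implementations typically consume
 * SOL_SOCKET control messages through sock_cmsg_send(); the cookie then
 * carries the per-call mark, timestamping flags and transmit time.
 * "my_parse_cmsgs" is hypothetical.
 */
static int my_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			  struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);		/* seed with sk->sk_tsflags */

	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);

	return 0;
}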
   2737
   2738static void sk_enter_memory_pressure(struct sock *sk)
   2739{
   2740	if (!sk->sk_prot->enter_memory_pressure)
   2741		return;
   2742
   2743	sk->sk_prot->enter_memory_pressure(sk);
   2744}
   2745
   2746static void sk_leave_memory_pressure(struct sock *sk)
   2747{
   2748	if (sk->sk_prot->leave_memory_pressure) {
   2749		sk->sk_prot->leave_memory_pressure(sk);
   2750	} else {
   2751		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
   2752
   2753		if (memory_pressure && READ_ONCE(*memory_pressure))
   2754			WRITE_ONCE(*memory_pressure, 0);
   2755	}
   2756}
   2757
   2758DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
   2759
   2760/**
   2761 * skb_page_frag_refill - check that a page_frag contains enough room
   2762 * @sz: minimum size of the fragment we want to get
   2763 * @pfrag: pointer to page_frag
   2764 * @gfp: priority for memory allocation
   2765 *
   2766 * Note: While this allocator tries to use high order pages, there is
   2767 * no guarantee that allocations succeed. Therefore, @sz MUST be
    2768 * less than or equal to PAGE_SIZE.
   2769 */
   2770bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
   2771{
   2772	if (pfrag->page) {
   2773		if (page_ref_count(pfrag->page) == 1) {
   2774			pfrag->offset = 0;
   2775			return true;
   2776		}
   2777		if (pfrag->offset + sz <= pfrag->size)
   2778			return true;
   2779		put_page(pfrag->page);
   2780	}
   2781
   2782	pfrag->offset = 0;
   2783	if (SKB_FRAG_PAGE_ORDER &&
   2784	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
   2785		/* Avoid direct reclaim but allow kswapd to wake */
   2786		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
   2787					  __GFP_COMP | __GFP_NOWARN |
   2788					  __GFP_NORETRY,
   2789					  SKB_FRAG_PAGE_ORDER);
   2790		if (likely(pfrag->page)) {
   2791			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
   2792			return true;
   2793		}
   2794	}
   2795	pfrag->page = alloc_page(gfp);
   2796	if (likely(pfrag->page)) {
   2797		pfrag->size = PAGE_SIZE;
   2798		return true;
   2799	}
   2800	return false;
   2801}
   2802EXPORT_SYMBOL(skb_page_frag_refill);
   2803
   2804bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
   2805{
   2806	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
   2807		return true;
   2808
   2809	sk_enter_memory_pressure(sk);
   2810	sk_stream_moderate_sndbuf(sk);
   2811	return false;
   2812}
   2813EXPORT_SYMBOL(sk_page_frag_refill);
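
/* A hedged sketch of the per-socket page_frag usage that
 * sk_page_frag_refill() supports: refill, copy into the current offset, then
 * advance pfrag->offset by the amount consumed. "my_copy_to_frag" is
 * hypothetical.
 */
static int my_copy_to_frag(struct sock *sk, const void *data, unsigned int len)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* socket is under memory pressure */

	len = min_t(unsigned int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, data, len);
	pfrag->offset += len;
	return len;
}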
   2814
   2815void __lock_sock(struct sock *sk)
   2816	__releases(&sk->sk_lock.slock)
   2817	__acquires(&sk->sk_lock.slock)
   2818{
   2819	DEFINE_WAIT(wait);
   2820
   2821	for (;;) {
   2822		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
   2823					TASK_UNINTERRUPTIBLE);
   2824		spin_unlock_bh(&sk->sk_lock.slock);
   2825		schedule();
   2826		spin_lock_bh(&sk->sk_lock.slock);
   2827		if (!sock_owned_by_user(sk))
   2828			break;
   2829	}
   2830	finish_wait(&sk->sk_lock.wq, &wait);
   2831}
   2832
   2833void __release_sock(struct sock *sk)
   2834	__releases(&sk->sk_lock.slock)
   2835	__acquires(&sk->sk_lock.slock)
   2836{
   2837	struct sk_buff *skb, *next;
   2838
   2839	while ((skb = sk->sk_backlog.head) != NULL) {
   2840		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
   2841
   2842		spin_unlock_bh(&sk->sk_lock.slock);
   2843
   2844		do {
   2845			next = skb->next;
   2846			prefetch(next);
   2847			WARN_ON_ONCE(skb_dst_is_noref(skb));
   2848			skb_mark_not_on_list(skb);
   2849			sk_backlog_rcv(sk, skb);
   2850
   2851			cond_resched();
   2852
   2853			skb = next;
   2854		} while (skb != NULL);
   2855
   2856		spin_lock_bh(&sk->sk_lock.slock);
   2857	}
   2858
   2859	/*
    2860	 * Doing the zeroing here guarantees we cannot loop forever
   2861	 * while a wild producer attempts to flood us.
   2862	 */
   2863	sk->sk_backlog.len = 0;
   2864}
   2865
   2866void __sk_flush_backlog(struct sock *sk)
   2867{
   2868	spin_lock_bh(&sk->sk_lock.slock);
   2869	__release_sock(sk);
   2870	spin_unlock_bh(&sk->sk_lock.slock);
   2871}
   2872
   2873/**
   2874 * sk_wait_data - wait for data to arrive at sk_receive_queue
   2875 * @sk:    sock to wait on
   2876 * @timeo: for how long
   2877 * @skb:   last skb seen on sk_receive_queue
   2878 *
   2879 * Now socket state including sk->sk_err is changed only under lock,
   2880 * hence we may omit checks after joining wait queue.
   2881 * We check receive queue before schedule() only as optimization;
   2882 * it is very likely that release_sock() added new data.
   2883 */
   2884int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
   2885{
   2886	DEFINE_WAIT_FUNC(wait, woken_wake_function);
   2887	int rc;
   2888
   2889	add_wait_queue(sk_sleep(sk), &wait);
   2890	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
   2891	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
   2892	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
   2893	remove_wait_queue(sk_sleep(sk), &wait);
   2894	return rc;
   2895}
   2896EXPORT_SYMBOL(sk_wait_data);
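
/* A hedged sketch of the classic recvmsg() wait loop built on sk_wait_data():
 * hold the socket lock, re-check the receive queue, and sleep for at most the
 * remaining receive timeout. "my_wait_for_skb" is hypothetical and omits the
 * usual signal/shutdown checks for brevity.
 */
static struct sk_buff *my_wait_for_skb(struct sock *sk, int flags, int *err)
{
	struct sk_buff *skb;
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	*err = 0;
	lock_sock(sk);
	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			break;
		}
		/* drops and re-takes the socket lock while sleeping */
		sk_wait_data(sk, &timeo, NULL);
	}
	release_sock(sk);
	return skb;
}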
   2897
   2898/**
   2899 *	__sk_mem_raise_allocated - increase memory_allocated
   2900 *	@sk: socket
   2901 *	@size: memory size to allocate
   2902 *	@amt: pages to allocate
   2903 *	@kind: allocation type
   2904 *
   2905 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
   2906 */
   2907int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
   2908{
   2909	struct proto *prot = sk->sk_prot;
   2910	long allocated = sk_memory_allocated_add(sk, amt);
   2911	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
   2912	bool charged = true;
   2913
   2914	if (memcg_charge &&
   2915	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
   2916						gfp_memcg_charge())))
   2917		goto suppress_allocation;
   2918
   2919	/* Under limit. */
   2920	if (allocated <= sk_prot_mem_limits(sk, 0)) {
   2921		sk_leave_memory_pressure(sk);
   2922		return 1;
   2923	}
   2924
   2925	/* Under pressure. */
   2926	if (allocated > sk_prot_mem_limits(sk, 1))
   2927		sk_enter_memory_pressure(sk);
   2928
   2929	/* Over hard limit. */
   2930	if (allocated > sk_prot_mem_limits(sk, 2))
   2931		goto suppress_allocation;
   2932
   2933	/* guarantee minimum buffer size under pressure */
   2934	if (kind == SK_MEM_RECV) {
   2935		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
   2936			return 1;
   2937
   2938	} else { /* SK_MEM_SEND */
   2939		int wmem0 = sk_get_wmem0(sk, prot);
   2940
   2941		if (sk->sk_type == SOCK_STREAM) {
   2942			if (sk->sk_wmem_queued < wmem0)
   2943				return 1;
   2944		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
   2945				return 1;
   2946		}
   2947	}
   2948
   2949	if (sk_has_memory_pressure(sk)) {
   2950		u64 alloc;
   2951
   2952		if (!sk_under_memory_pressure(sk))
   2953			return 1;
   2954		alloc = sk_sockets_allocated_read_positive(sk);
   2955		if (sk_prot_mem_limits(sk, 2) > alloc *
   2956		    sk_mem_pages(sk->sk_wmem_queued +
   2957				 atomic_read(&sk->sk_rmem_alloc) +
   2958				 sk->sk_forward_alloc))
   2959			return 1;
   2960	}
   2961
   2962suppress_allocation:
   2963
   2964	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
   2965		sk_stream_moderate_sndbuf(sk);
   2966
   2967		/* Fail only if socket is _under_ its sndbuf.
    2968		 * In this case we cannot block, so we have to fail.
   2969		 */
   2970		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
   2971			/* Force charge with __GFP_NOFAIL */
   2972			if (memcg_charge && !charged) {
   2973				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
   2974					gfp_memcg_charge() | __GFP_NOFAIL);
   2975			}
   2976			return 1;
   2977		}
   2978	}
   2979
   2980	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
   2981		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
   2982
   2983	sk_memory_allocated_sub(sk, amt);
   2984
   2985	if (memcg_charge && charged)
   2986		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
   2987
   2988	return 0;
   2989}
   2990EXPORT_SYMBOL(__sk_mem_raise_allocated);
   2991
   2992/**
   2993 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
   2994 *	@sk: socket
   2995 *	@size: memory size to allocate
   2996 *	@kind: allocation type
   2997 *
   2998 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
   2999 *	rmem allocation. This function assumes that protocols which have
   3000 *	memory_pressure use sk_wmem_queued as write buffer accounting.
   3001 */
   3002int __sk_mem_schedule(struct sock *sk, int size, int kind)
   3003{
   3004	int ret, amt = sk_mem_pages(size);
   3005
   3006	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
   3007	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
   3008	if (!ret)
   3009		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
   3010	return ret;
   3011}
   3012EXPORT_SYMBOL(__sk_mem_schedule);
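
/* A hedged sketch of the receive-side charging that ends up in
 * __sk_mem_schedule(): sk_rmem_schedule() reserves forward-alloc quanta and
 * skb_set_owner_r() then accounts the skb against sk_rmem_alloc, with
 * sock_rfree() as the destructor. "my_queue_rcv_skb" is hypothetical.
 */
static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);	/* destructor: sock_rfree() */
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}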
   3013
   3014/**
   3015 *	__sk_mem_reduce_allocated - reclaim memory_allocated
   3016 *	@sk: socket
   3017 *	@amount: number of quanta
   3018 *
   3019 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
   3020 */
   3021void __sk_mem_reduce_allocated(struct sock *sk, int amount)
   3022{
   3023	sk_memory_allocated_sub(sk, amount);
   3024
   3025	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
   3026		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
   3027
   3028	if (sk_under_memory_pressure(sk) &&
   3029	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
   3030		sk_leave_memory_pressure(sk);
   3031}
   3032EXPORT_SYMBOL(__sk_mem_reduce_allocated);
   3033
   3034/**
   3035 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
   3036 *	@sk: socket
   3037 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
   3038 */
   3039void __sk_mem_reclaim(struct sock *sk, int amount)
   3040{
   3041	amount >>= SK_MEM_QUANTUM_SHIFT;
   3042	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
   3043	__sk_mem_reduce_allocated(sk, amount);
   3044}
   3045EXPORT_SYMBOL(__sk_mem_reclaim);
   3046
   3047int sk_set_peek_off(struct sock *sk, int val)
   3048{
   3049	sk->sk_peek_off = val;
   3050	return 0;
   3051}
   3052EXPORT_SYMBOL_GPL(sk_set_peek_off);
   3053
   3054/*
   3055 * Set of default routines for initialising struct proto_ops when
   3056 * the protocol does not support a particular function. In certain
   3057 * cases where it makes no sense for a protocol to have a "do nothing"
   3058 * function, some default processing is provided.
   3059 */
   3060
   3061int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
   3062{
   3063	return -EOPNOTSUPP;
   3064}
   3065EXPORT_SYMBOL(sock_no_bind);
   3066
   3067int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
   3068		    int len, int flags)
   3069{
   3070	return -EOPNOTSUPP;
   3071}
   3072EXPORT_SYMBOL(sock_no_connect);
   3073
   3074int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
   3075{
   3076	return -EOPNOTSUPP;
   3077}
   3078EXPORT_SYMBOL(sock_no_socketpair);
   3079
   3080int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
   3081		   bool kern)
   3082{
   3083	return -EOPNOTSUPP;
   3084}
   3085EXPORT_SYMBOL(sock_no_accept);
   3086
   3087int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
   3088		    int peer)
   3089{
   3090	return -EOPNOTSUPP;
   3091}
   3092EXPORT_SYMBOL(sock_no_getname);
   3093
   3094int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
   3095{
   3096	return -EOPNOTSUPP;
   3097}
   3098EXPORT_SYMBOL(sock_no_ioctl);
   3099
   3100int sock_no_listen(struct socket *sock, int backlog)
   3101{
   3102	return -EOPNOTSUPP;
   3103}
   3104EXPORT_SYMBOL(sock_no_listen);
   3105
   3106int sock_no_shutdown(struct socket *sock, int how)
   3107{
   3108	return -EOPNOTSUPP;
   3109}
   3110EXPORT_SYMBOL(sock_no_shutdown);
   3111
   3112int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
   3113{
   3114	return -EOPNOTSUPP;
   3115}
   3116EXPORT_SYMBOL(sock_no_sendmsg);
   3117
   3118int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
   3119{
   3120	return -EOPNOTSUPP;
   3121}
   3122EXPORT_SYMBOL(sock_no_sendmsg_locked);
   3123
   3124int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
   3125		    int flags)
   3126{
   3127	return -EOPNOTSUPP;
   3128}
   3129EXPORT_SYMBOL(sock_no_recvmsg);
   3130
   3131int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
   3132{
   3133	/* Mirror missing mmap method error code */
   3134	return -ENODEV;
   3135}
   3136EXPORT_SYMBOL(sock_no_mmap);
   3137
   3138/*
   3139 * When a file is received (via SCM_RIGHTS, etc), we must bump the
   3140 * various sock-based usage counts.
   3141 */
   3142void __receive_sock(struct file *file)
   3143{
   3144	struct socket *sock;
   3145
   3146	sock = sock_from_file(file);
   3147	if (sock) {
   3148		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
   3149		sock_update_classid(&sock->sk->sk_cgrp_data);
   3150	}
   3151}
   3152
   3153ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
   3154{
   3155	ssize_t res;
   3156	struct msghdr msg = {.msg_flags = flags};
   3157	struct kvec iov;
   3158	char *kaddr = kmap(page);
   3159	iov.iov_base = kaddr + offset;
   3160	iov.iov_len = size;
   3161	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
   3162	kunmap(page);
   3163	return res;
   3164}
   3165EXPORT_SYMBOL(sock_no_sendpage);
   3166
   3167ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
   3168				int offset, size_t size, int flags)
   3169{
   3170	ssize_t res;
   3171	struct msghdr msg = {.msg_flags = flags};
   3172	struct kvec iov;
   3173	char *kaddr = kmap(page);
   3174
   3175	iov.iov_base = kaddr + offset;
   3176	iov.iov_len = size;
   3177	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
   3178	kunmap(page);
   3179	return res;
   3180}
   3181EXPORT_SYMBOL(sock_no_sendpage_locked);
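
/* A hedged sketch of how the sock_no_*() stubs above are typically wired
 * into a struct proto_ops for a protocol that supports only part of the
 * socket API; "my_ops", the my_*() handlers and MY_PROTO_FAMILY are
 * hypothetical.
 */
static int my_release(struct socket *sock);
static int my_sendmsg(struct socket *sock, struct msghdr *m, size_t len);
static int my_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		      int flags);

static const struct proto_ops my_ops = {
	.family		= MY_PROTO_FAMILY,	/* hypothetical PF_* value */
	.owner		= THIS_MODULE,
	.release	= my_release,
	.bind		= sock_no_bind,		/* unsupported operations */
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
	.sendmsg	= my_sendmsg,		/* real handlers elsewhere */
	.recvmsg	= my_recvmsg,
};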
   3182
   3183/*
   3184 *	Default Socket Callbacks
   3185 */
   3186
   3187static void sock_def_wakeup(struct sock *sk)
   3188{
   3189	struct socket_wq *wq;
   3190
   3191	rcu_read_lock();
   3192	wq = rcu_dereference(sk->sk_wq);
   3193	if (skwq_has_sleeper(wq))
   3194		wake_up_interruptible_all(&wq->wait);
   3195	rcu_read_unlock();
   3196}
   3197
   3198static void sock_def_error_report(struct sock *sk)
   3199{
   3200	struct socket_wq *wq;
   3201
   3202	rcu_read_lock();
   3203	wq = rcu_dereference(sk->sk_wq);
   3204	if (skwq_has_sleeper(wq))
   3205		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
   3206	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
   3207	rcu_read_unlock();
   3208}
   3209
   3210void sock_def_readable(struct sock *sk)
   3211{
   3212	struct socket_wq *wq;
   3213
   3214	rcu_read_lock();
   3215	wq = rcu_dereference(sk->sk_wq);
   3216	if (skwq_has_sleeper(wq))
   3217		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
   3218						EPOLLRDNORM | EPOLLRDBAND);
   3219	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
   3220	rcu_read_unlock();
   3221}
   3222
   3223static void sock_def_write_space(struct sock *sk)
   3224{
   3225	struct socket_wq *wq;
   3226
   3227	rcu_read_lock();
   3228
   3229	/* Do not wake up a writer until he can make "significant"
   3230	 * progress.  --DaveM
   3231	 */
   3232	if (sock_writeable(sk)) {
   3233		wq = rcu_dereference(sk->sk_wq);
   3234		if (skwq_has_sleeper(wq))
   3235			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
   3236						EPOLLWRNORM | EPOLLWRBAND);
   3237
   3238		/* Should agree with poll, otherwise some programs break */
   3239		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
   3240	}
   3241
   3242	rcu_read_unlock();
   3243}
   3244
   3245/* An optimised version of sock_def_write_space(), should only be called
   3246 * for SOCK_RCU_FREE sockets under RCU read section and after putting
   3247 * ->sk_wmem_alloc.
   3248 */
   3249static void sock_def_write_space_wfree(struct sock *sk)
   3250{
   3251	/* Do not wake up a writer until he can make "significant"
   3252	 * progress.  --DaveM
   3253	 */
   3254	if (sock_writeable(sk)) {
   3255		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
   3256
   3257		/* rely on refcount_sub from sock_wfree() */
   3258		smp_mb__after_atomic();
   3259		if (wq && waitqueue_active(&wq->wait))
   3260			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
   3261						EPOLLWRNORM | EPOLLWRBAND);
   3262
   3263		/* Should agree with poll, otherwise some programs break */
   3264		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
   3265	}
   3266}
   3267
   3268static void sock_def_destruct(struct sock *sk)
   3269{
   3270}
   3271
   3272void sk_send_sigurg(struct sock *sk)
   3273{
   3274	if (sk->sk_socket && sk->sk_socket->file)
   3275		if (send_sigurg(&sk->sk_socket->file->f_owner))
   3276			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
   3277}
   3278EXPORT_SYMBOL(sk_send_sigurg);
   3279
   3280void sk_reset_timer(struct sock *sk, struct timer_list* timer,
   3281		    unsigned long expires)
   3282{
   3283	if (!mod_timer(timer, expires))
   3284		sock_hold(sk);
   3285}
   3286EXPORT_SYMBOL(sk_reset_timer);
   3287
   3288void sk_stop_timer(struct sock *sk, struct timer_list* timer)
   3289{
   3290	if (del_timer(timer))
   3291		__sock_put(sk);
   3292}
   3293EXPORT_SYMBOL(sk_stop_timer);
   3294
   3295void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
   3296{
   3297	if (del_timer_sync(timer))
   3298		__sock_put(sk);
   3299}
   3300EXPORT_SYMBOL(sk_stop_timer_sync);
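
/* A hedged sketch of the reference discipline behind sk_reset_timer() /
 * sk_stop_timer(): arming an inactive timer takes a sock_hold(), so the
 * handler (or the stop path) must drop it again. "my_retransmit_timer" and
 * "my_arm_timer" are hypothetical.
 */
static void my_retransmit_timer(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	bh_lock_sock(sk);
	/* ... protocol timeout handling ... */
	bh_unlock_sock(sk);
	sock_put(sk);	/* balances the hold taken by sk_reset_timer() */
}

static void my_arm_timer(struct sock *sk)
{
	timer_setup(&sk->sk_timer, my_retransmit_timer, 0);
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
	/* later: sk_stop_timer(sk, &sk->sk_timer) drops the hold if pending */
}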
   3301
   3302void sock_init_data(struct socket *sock, struct sock *sk)
   3303{
   3304	sk_init_common(sk);
   3305	sk->sk_send_head	=	NULL;
   3306
   3307	timer_setup(&sk->sk_timer, NULL, 0);
   3308
   3309	sk->sk_allocation	=	GFP_KERNEL;
   3310	sk->sk_rcvbuf		=	sysctl_rmem_default;
   3311	sk->sk_sndbuf		=	sysctl_wmem_default;
   3312	sk->sk_state		=	TCP_CLOSE;
   3313	sk_set_socket(sk, sock);
   3314
   3315	sock_set_flag(sk, SOCK_ZAPPED);
   3316
   3317	if (sock) {
   3318		sk->sk_type	=	sock->type;
   3319		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
   3320		sock->sk	=	sk;
   3321		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
   3322	} else {
   3323		RCU_INIT_POINTER(sk->sk_wq, NULL);
   3324		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
   3325	}
   3326
   3327	rwlock_init(&sk->sk_callback_lock);
   3328	if (sk->sk_kern_sock)
   3329		lockdep_set_class_and_name(
   3330			&sk->sk_callback_lock,
   3331			af_kern_callback_keys + sk->sk_family,
   3332			af_family_kern_clock_key_strings[sk->sk_family]);
   3333	else
   3334		lockdep_set_class_and_name(
   3335			&sk->sk_callback_lock,
   3336			af_callback_keys + sk->sk_family,
   3337			af_family_clock_key_strings[sk->sk_family]);
   3338
   3339	sk->sk_state_change	=	sock_def_wakeup;
   3340	sk->sk_data_ready	=	sock_def_readable;
   3341	sk->sk_write_space	=	sock_def_write_space;
   3342	sk->sk_error_report	=	sock_def_error_report;
   3343	sk->sk_destruct		=	sock_def_destruct;
   3344
   3345	sk->sk_frag.page	=	NULL;
   3346	sk->sk_frag.offset	=	0;
   3347	sk->sk_peek_off		=	-1;
   3348
   3349	sk->sk_peer_pid 	=	NULL;
   3350	sk->sk_peer_cred	=	NULL;
   3351	spin_lock_init(&sk->sk_peer_lock);
   3352
   3353	sk->sk_write_pending	=	0;
   3354	sk->sk_rcvlowat		=	1;
   3355	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
   3356	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
   3357
   3358	sk->sk_stamp = SK_DEFAULT_STAMP;
   3359#if BITS_PER_LONG==32
   3360	seqlock_init(&sk->sk_stamp_seq);
   3361#endif
   3362	atomic_set(&sk->sk_zckey, 0);
   3363
   3364#ifdef CONFIG_NET_RX_BUSY_POLL
   3365	sk->sk_napi_id		=	0;
   3366	sk->sk_ll_usec		=	sysctl_net_busy_read;
   3367#endif
   3368
   3369	sk->sk_max_pacing_rate = ~0UL;
   3370	sk->sk_pacing_rate = ~0UL;
   3371	WRITE_ONCE(sk->sk_pacing_shift, 10);
   3372	sk->sk_incoming_cpu = -1;
   3373	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
   3374
   3375	sk_rx_queue_clear(sk);
   3376	/*
   3377	 * Before updating sk_refcnt, we must commit prior changes to memory
   3378	 * (Documentation/RCU/rculist_nulls.rst for details)
   3379	 */
   3380	smp_wmb();
   3381	refcount_set(&sk->sk_refcnt, 1);
   3382	atomic_set(&sk->sk_drops, 0);
   3383}
   3384EXPORT_SYMBOL(sock_init_data);
   3385
   3386void lock_sock_nested(struct sock *sk, int subclass)
   3387{
   3388	/* The sk_lock has mutex_lock() semantics here. */
   3389	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
   3390
   3391	might_sleep();
   3392	spin_lock_bh(&sk->sk_lock.slock);
   3393	if (sock_owned_by_user_nocheck(sk))
   3394		__lock_sock(sk);
   3395	sk->sk_lock.owned = 1;
   3396	spin_unlock_bh(&sk->sk_lock.slock);
   3397}
   3398EXPORT_SYMBOL(lock_sock_nested);
   3399
   3400void release_sock(struct sock *sk)
   3401{
   3402	spin_lock_bh(&sk->sk_lock.slock);
   3403	if (sk->sk_backlog.tail)
   3404		__release_sock(sk);
   3405
   3406	/* Warning : release_cb() might need to release sk ownership,
   3407	 * ie call sock_release_ownership(sk) before us.
   3408	 */
   3409	if (sk->sk_prot->release_cb)
   3410		sk->sk_prot->release_cb(sk);
   3411
   3412	sock_release_ownership(sk);
   3413	if (waitqueue_active(&sk->sk_lock.wq))
   3414		wake_up(&sk->sk_lock.wq);
   3415	spin_unlock_bh(&sk->sk_lock.slock);
   3416}
   3417EXPORT_SYMBOL(release_sock);
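
/* A hedged sketch of the standard process-context locking pattern that
 * lock_sock()/release_sock() implement: packets backlogged while the socket
 * was owned are processed by __release_sock() on release.
 * "my_locked_update" is hypothetical.
 */
static void my_locked_update(struct sock *sk, int val)
{
	lock_sock(sk);		/* may sleep; see lock_sock_nested() above */
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
	release_sock(sk);	/* runs the backlog, then wakes lock waiters */
}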
   3418
   3419bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
   3420{
   3421	might_sleep();
   3422	spin_lock_bh(&sk->sk_lock.slock);
   3423
   3424	if (!sock_owned_by_user_nocheck(sk)) {
   3425		/*
   3426		 * Fast path return with bottom halves disabled and
   3427		 * sock::sk_lock.slock held.
   3428		 *
   3429		 * The 'mutex' is not contended and holding
    3430		 * sock::sk_lock.slock prevents all other lockers from
    3431		 * proceeding, so the corresponding unlock_sock_fast() can
    3432		 * avoid the slow path of release_sock() completely and
    3433		 * just release slock.
    3434		 *
    3435		 * From a semantic POV this is equivalent to 'acquiring'
   3436		 * the 'mutex', hence the corresponding lockdep
   3437		 * mutex_release() has to happen in the fast path of
   3438		 * unlock_sock_fast().
   3439		 */
   3440		return false;
   3441	}
   3442
   3443	__lock_sock(sk);
   3444	sk->sk_lock.owned = 1;
   3445	__acquire(&sk->sk_lock.slock);
   3446	spin_unlock_bh(&sk->sk_lock.slock);
   3447	return true;
   3448}
   3449EXPORT_SYMBOL(__lock_sock_fast);
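
/* A hedged sketch of the lock_sock_fast() / unlock_sock_fast() pairing built
 * on __lock_sock_fast(): the caller must pass the returned "slow" flag back
 * so the matching unlock path (spin unlock vs. release_sock()) is taken.
 * "my_read_err" is hypothetical.
 */
static int my_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int val = sk->sk_err;

	unlock_sock_fast(sk, slow);
	return val;
}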
   3450
   3451int sock_gettstamp(struct socket *sock, void __user *userstamp,
   3452		   bool timeval, bool time32)
   3453{
   3454	struct sock *sk = sock->sk;
   3455	struct timespec64 ts;
   3456
   3457	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
   3458	ts = ktime_to_timespec64(sock_read_timestamp(sk));
   3459	if (ts.tv_sec == -1)
   3460		return -ENOENT;
   3461	if (ts.tv_sec == 0) {
   3462		ktime_t kt = ktime_get_real();
   3463		sock_write_timestamp(sk, kt);
   3464		ts = ktime_to_timespec64(kt);
   3465	}
   3466
   3467	if (timeval)
   3468		ts.tv_nsec /= 1000;
   3469
   3470#ifdef CONFIG_COMPAT_32BIT_TIME
   3471	if (time32)
   3472		return put_old_timespec32(&ts, userstamp);
   3473#endif
   3474#ifdef CONFIG_SPARC64
   3475	/* beware of padding in sparc64 timeval */
   3476	if (timeval && !in_compat_syscall()) {
   3477		struct __kernel_old_timeval __user tv = {
   3478			.tv_sec = ts.tv_sec,
   3479			.tv_usec = ts.tv_nsec,
   3480		};
   3481		if (copy_to_user(userstamp, &tv, sizeof(tv)))
   3482			return -EFAULT;
   3483		return 0;
   3484	}
   3485#endif
   3486	return put_timespec64(&ts, userstamp);
   3487}
   3488EXPORT_SYMBOL(sock_gettstamp);
   3489
   3490void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
   3491{
   3492	if (!sock_flag(sk, flag)) {
   3493		unsigned long previous_flags = sk->sk_flags;
   3494
   3495		sock_set_flag(sk, flag);
   3496		/*
   3497		 * we just set one of the two flags which require net
   3498		 * time stamping, but time stamping might have been on
   3499		 * already because of the other one
   3500		 */
   3501		if (sock_needs_netstamp(sk) &&
   3502		    !(previous_flags & SK_FLAGS_TIMESTAMP))
   3503			net_enable_timestamp();
   3504	}
   3505}
   3506
   3507int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
   3508		       int level, int type)
   3509{
   3510	struct sock_exterr_skb *serr;
   3511	struct sk_buff *skb;
   3512	int copied, err;
   3513
   3514	err = -EAGAIN;
   3515	skb = sock_dequeue_err_skb(sk);
   3516	if (skb == NULL)
   3517		goto out;
   3518
   3519	copied = skb->len;
   3520	if (copied > len) {
   3521		msg->msg_flags |= MSG_TRUNC;
   3522		copied = len;
   3523	}
   3524	err = skb_copy_datagram_msg(skb, 0, msg, copied);
   3525	if (err)
   3526		goto out_free_skb;
   3527
   3528	sock_recv_timestamp(msg, sk, skb);
   3529
   3530	serr = SKB_EXT_ERR(skb);
   3531	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
   3532
   3533	msg->msg_flags |= MSG_ERRQUEUE;
   3534	err = copied;
   3535
   3536out_free_skb:
   3537	kfree_skb(skb);
   3538out:
   3539	return err;
   3540}
   3541EXPORT_SYMBOL(sock_recv_errqueue);
   3542
   3543/*
    3544 *	Get a socket option on a socket.
    3545 *
    3546 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
    3547 *	asynchronous errors should be reported by getsockopt. We assume
    3548 *	this means if you specify SO_ERROR (otherwise what's the point of it).
   3549 */
   3550int sock_common_getsockopt(struct socket *sock, int level, int optname,
   3551			   char __user *optval, int __user *optlen)
   3552{
   3553	struct sock *sk = sock->sk;
   3554
   3555	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
   3556}
   3557EXPORT_SYMBOL(sock_common_getsockopt);
   3558
   3559int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
   3560			int flags)
   3561{
   3562	struct sock *sk = sock->sk;
   3563	int addr_len = 0;
   3564	int err;
   3565
   3566	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
   3567	if (err >= 0)
   3568		msg->msg_namelen = addr_len;
   3569	return err;
   3570}
   3571EXPORT_SYMBOL(sock_common_recvmsg);
   3572
   3573/*
   3574 *	Set socket options on an inet socket.
   3575 */
   3576int sock_common_setsockopt(struct socket *sock, int level, int optname,
   3577			   sockptr_t optval, unsigned int optlen)
   3578{
   3579	struct sock *sk = sock->sk;
   3580
   3581	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
   3582}
   3583EXPORT_SYMBOL(sock_common_setsockopt);
   3584
   3585void sk_common_release(struct sock *sk)
   3586{
   3587	if (sk->sk_prot->destroy)
   3588		sk->sk_prot->destroy(sk);
   3589
   3590	/*
    3591	 * Observation: when sk_common_release is called, processes have
    3592	 * no access to the socket, but the network stack still does.
    3593	 * Step one: detach it from networking:
   3594	 *
   3595	 * A. Remove from hash tables.
   3596	 */
   3597
   3598	sk->sk_prot->unhash(sk);
   3599
   3600	/*
    3601	 * At this point the socket cannot receive new packets, but it is
    3602	 * possible that some packets are still in flight, because some CPU
    3603	 * was running the receiver and did a hash table lookup before we
    3604	 * unhashed the socket. They will reach the receive queue and be
    3605	 * purged by the socket destructor.
    3606	 *
    3607	 * We may also still have packets pending on the receive queue, and
    3608	 * probably our own packets waiting in device queues. sock_destroy
    3609	 * will drain the receive queue, but transmitted packets will delay
    3610	 * socket destruction until the last reference is released.
   3611
   3612	sock_orphan(sk);
   3613
   3614	xfrm_sk_free_policy(sk);
   3615
   3616	sk_refcnt_debug_release(sk);
   3617
   3618	sock_put(sk);
   3619}
   3620EXPORT_SYMBOL(sk_common_release);
   3621
   3622void sk_get_meminfo(const struct sock *sk, u32 *mem)
   3623{
   3624	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
   3625
   3626	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
   3627	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
   3628	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
   3629	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
   3630	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
   3631	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
   3632	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
   3633	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
   3634	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
   3635}
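/*
 * Illustrative sketch of the typical consumer: the sock_diag code fills the
 * same fixed-size array and emits it as one netlink attribute (this mirrors
 * sock_diag_put_meminfo() in net/core/sock_diag.c; the function name below
 * is hypothetical and assumes <net/netlink.h> is available for nla_put()).
 */
static int __maybe_unused example_put_meminfo(struct sock *sk,
					      struct sk_buff *skb, int attrtype)
{
	u32 mem[SK_MEMINFO_VARS];

	sk_get_meminfo(sk, mem);
	return nla_put(skb, attrtype, sizeof(mem), &mem);
}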
   3636
   3637#ifdef CONFIG_PROC_FS
   3638static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
   3639
   3640int sock_prot_inuse_get(struct net *net, struct proto *prot)
   3641{
   3642	int cpu, idx = prot->inuse_idx;
   3643	int res = 0;
   3644
   3645	for_each_possible_cpu(cpu)
   3646		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
   3647
   3648	return res >= 0 ? res : 0;
   3649}
   3650EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
   3651
   3652int sock_inuse_get(struct net *net)
   3653{
   3654	int cpu, res = 0;
   3655
   3656	for_each_possible_cpu(cpu)
   3657		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
   3658
   3659	return res;
   3660}
   3661
   3662EXPORT_SYMBOL_GPL(sock_inuse_get);
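/*
 * Illustrative sketch: the per-protocol counters summed above are maintained
 * by the protocols themselves, normally from their ->hash()/->unhash() paths
 * via sock_prot_inuse_add() (the ->all counter read by sock_inuse_get() is
 * bumped separately when sockets are allocated and freed).  Callback names
 * here are hypothetical.
 */
static int __maybe_unused example_proto_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return 0;
}

static void __maybe_unused example_proto_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}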
   3663
   3664static int __net_init sock_inuse_init_net(struct net *net)
   3665{
   3666	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
   3667	if (net->core.prot_inuse == NULL)
   3668		return -ENOMEM;
   3669	return 0;
   3670}
   3671
   3672static void __net_exit sock_inuse_exit_net(struct net *net)
   3673{
   3674	free_percpu(net->core.prot_inuse);
   3675}
   3676
   3677static struct pernet_operations net_inuse_ops = {
   3678	.init = sock_inuse_init_net,
   3679	.exit = sock_inuse_exit_net,
   3680};
   3681
   3682static __init int net_inuse_init(void)
   3683{
   3684	if (register_pernet_subsys(&net_inuse_ops))
   3685		panic("Cannot initialize net inuse counters");
   3686
   3687	return 0;
   3688}
   3689
   3690core_initcall(net_inuse_init);
   3691
   3692static int assign_proto_idx(struct proto *prot)
   3693{
   3694	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
   3695
   3696	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
   3697		pr_err("PROTO_INUSE_NR exhausted\n");
   3698		return -ENOSPC;
   3699	}
   3700
   3701	set_bit(prot->inuse_idx, proto_inuse_idx);
   3702	return 0;
   3703}
   3704
   3705static void release_proto_idx(struct proto *prot)
   3706{
   3707	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
   3708		clear_bit(prot->inuse_idx, proto_inuse_idx);
   3709}
   3710#else
   3711static inline int assign_proto_idx(struct proto *prot)
   3712{
   3713	return 0;
   3714}
   3715
   3716static inline void release_proto_idx(struct proto *prot)
   3717{
   3718}
   3719
   3720#endif
   3721
   3722static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
   3723{
   3724	if (!twsk_prot)
   3725		return;
   3726	kfree(twsk_prot->twsk_slab_name);
   3727	twsk_prot->twsk_slab_name = NULL;
   3728	kmem_cache_destroy(twsk_prot->twsk_slab);
   3729	twsk_prot->twsk_slab = NULL;
   3730}
   3731
   3732static int tw_prot_init(const struct proto *prot)
   3733{
   3734	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
   3735
   3736	if (!twsk_prot)
   3737		return 0;
   3738
   3739	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
   3740					      prot->name);
   3741	if (!twsk_prot->twsk_slab_name)
   3742		return -ENOMEM;
   3743
   3744	twsk_prot->twsk_slab =
   3745		kmem_cache_create(twsk_prot->twsk_slab_name,
   3746				  twsk_prot->twsk_obj_size, 0,
   3747				  SLAB_ACCOUNT | prot->slab_flags,
   3748				  NULL);
   3749	if (!twsk_prot->twsk_slab) {
   3750		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
   3751			prot->name);
   3752		return -ENOMEM;
   3753	}
   3754
   3755	return 0;
   3756}
   3757
   3758static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
   3759{
   3760	if (!rsk_prot)
   3761		return;
   3762	kfree(rsk_prot->slab_name);
   3763	rsk_prot->slab_name = NULL;
   3764	kmem_cache_destroy(rsk_prot->slab);
   3765	rsk_prot->slab = NULL;
   3766}
   3767
   3768static int req_prot_init(const struct proto *prot)
   3769{
   3770	struct request_sock_ops *rsk_prot = prot->rsk_prot;
   3771
   3772	if (!rsk_prot)
   3773		return 0;
   3774
   3775	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
   3776					prot->name);
   3777	if (!rsk_prot->slab_name)
   3778		return -ENOMEM;
   3779
   3780	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
   3781					   rsk_prot->obj_size, 0,
   3782					   SLAB_ACCOUNT | prot->slab_flags,
   3783					   NULL);
   3784
   3785	if (!rsk_prot->slab) {
   3786		pr_crit("%s: Can't create request sock SLAB cache!\n",
   3787			prot->name);
   3788		return -ENOMEM;
   3789	}
   3790	return 0;
   3791}
   3792
   3793int proto_register(struct proto *prot, int alloc_slab)
   3794{
   3795	int ret = -ENOBUFS;
   3796
   3797	if (prot->memory_allocated && !prot->sysctl_mem) {
   3798		pr_err("%s: missing sysctl_mem\n", prot->name);
   3799		return -EINVAL;
   3800	}
   3801	if (alloc_slab) {
   3802		prot->slab = kmem_cache_create_usercopy(prot->name,
   3803					prot->obj_size, 0,
   3804					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
   3805					prot->slab_flags,
   3806					prot->useroffset, prot->usersize,
   3807					NULL);
   3808
   3809		if (prot->slab == NULL) {
   3810			pr_crit("%s: Can't create sock SLAB cache!\n",
   3811				prot->name);
   3812			goto out;
   3813		}
   3814
   3815		if (req_prot_init(prot))
   3816			goto out_free_request_sock_slab;
   3817
   3818		if (tw_prot_init(prot))
   3819			goto out_free_timewait_sock_slab;
   3820	}
   3821
   3822	mutex_lock(&proto_list_mutex);
   3823	ret = assign_proto_idx(prot);
   3824	if (ret) {
   3825		mutex_unlock(&proto_list_mutex);
   3826		goto out_free_timewait_sock_slab;
   3827	}
   3828	list_add(&prot->node, &proto_list);
   3829	mutex_unlock(&proto_list_mutex);
   3830	return ret;
   3831
   3832out_free_timewait_sock_slab:
   3833	if (alloc_slab)
   3834		tw_prot_cleanup(prot->twsk_prot);
   3835out_free_request_sock_slab:
   3836	if (alloc_slab) {
   3837		req_prot_cleanup(prot->rsk_prot);
   3838
   3839		kmem_cache_destroy(prot->slab);
   3840		prot->slab = NULL;
   3841	}
   3842out:
   3843	return ret;
   3844}
   3845EXPORT_SYMBOL(proto_register);
   3846
   3847void proto_unregister(struct proto *prot)
   3848{
   3849	mutex_lock(&proto_list_mutex);
   3850	release_proto_idx(prot);
   3851	list_del(&prot->node);
   3852	mutex_unlock(&proto_list_mutex);
   3853
   3854	kmem_cache_destroy(prot->slab);
   3855	prot->slab = NULL;
   3856
   3857	req_prot_cleanup(prot->rsk_prot);
   3858	tw_prot_cleanup(prot->twsk_prot);
   3859}
   3860EXPORT_SYMBOL(proto_unregister);
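/*
 * Illustrative sketch of the registration pattern: a protocol module fills
 * in a struct proto (at minimum .name, .owner and .obj_size, where .obj_size
 * is the size of its protocol-specific sock structure), calls
 * proto_register() on load and proto_unregister() on unload.  Names below
 * are hypothetical; in a real module the two functions would be marked
 * __init/__exit and wired up with module_init()/module_exit().
 */
static struct proto example_proto __maybe_unused = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),	/* normally sizeof(struct example_sock) */
};

static int __maybe_unused example_proto_load(void)
{
	/* a non-zero second argument also creates the sock/request/timewait slabs */
	return proto_register(&example_proto, 1);
}

static void __maybe_unused example_proto_unload(void)
{
	proto_unregister(&example_proto);
}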
   3861
   3862int sock_load_diag_module(int family, int protocol)
   3863{
   3864	if (!protocol) {
   3865		if (!sock_is_registered(family))
   3866			return -ENOENT;
   3867
   3868		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
   3869				      NETLINK_SOCK_DIAG, family);
   3870	}
   3871
   3872#ifdef CONFIG_INET
   3873	if (family == AF_INET &&
   3874	    protocol != IPPROTO_RAW &&
   3875	    protocol < MAX_INET_PROTOS &&
   3876	    !rcu_access_pointer(inet_protos[protocol]))
   3877		return -ENOENT;
   3878#endif
   3879
   3880	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
   3881			      NETLINK_SOCK_DIAG, family, protocol);
   3882}
   3883EXPORT_SYMBOL(sock_load_diag_module);
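/*
 * The request_module() strings above line up with the aliases that the
 * *_diag modules declare, so tools like ss(8) can have them loaded on
 * demand.  For example, inet_diag uses the MODULE_ALIAS_NET_PF_PROTO_TYPE()
 * helper from <linux/net.h> (values: PF_NETLINK == 16, NETLINK_SOCK_DIAG == 4,
 * AF_INET == 2):
 *
 *	MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2);
 *
 * which expands to MODULE_ALIAS("net-pf-16-proto-4-type-2"); the
 * protocol-specific diag modules append "-<protocol>" so that the second
 * format string above matches them.
 */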
   3884
   3885#ifdef CONFIG_PROC_FS
   3886static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
   3887	__acquires(proto_list_mutex)
   3888{
   3889	mutex_lock(&proto_list_mutex);
   3890	return seq_list_start_head(&proto_list, *pos);
   3891}
   3892
   3893static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   3894{
   3895	return seq_list_next(v, &proto_list, pos);
   3896}
   3897
   3898static void proto_seq_stop(struct seq_file *seq, void *v)
   3899	__releases(proto_list_mutex)
   3900{
   3901	mutex_unlock(&proto_list_mutex);
   3902}
   3903
   3904static char proto_method_implemented(const void *method)
   3905{
   3906	return method == NULL ? 'n' : 'y';
   3907}
   3908static long sock_prot_memory_allocated(struct proto *proto)
   3909{
   3910	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
   3911}
   3912
   3913static const char *sock_prot_memory_pressure(struct proto *proto)
   3914{
   3915	return proto->memory_pressure != NULL ?
   3916	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
   3917}
   3918
   3919static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
   3920{
   3921
   3922	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
   3923			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
   3924		   proto->name,
   3925		   proto->obj_size,
   3926		   sock_prot_inuse_get(seq_file_net(seq), proto),
   3927		   sock_prot_memory_allocated(proto),
   3928		   sock_prot_memory_pressure(proto),
   3929		   proto->max_header,
   3930		   proto->slab == NULL ? "no" : "yes",
   3931		   module_name(proto->owner),
   3932		   proto_method_implemented(proto->close),
   3933		   proto_method_implemented(proto->connect),
   3934		   proto_method_implemented(proto->disconnect),
   3935		   proto_method_implemented(proto->accept),
   3936		   proto_method_implemented(proto->ioctl),
   3937		   proto_method_implemented(proto->init),
   3938		   proto_method_implemented(proto->destroy),
   3939		   proto_method_implemented(proto->shutdown),
   3940		   proto_method_implemented(proto->setsockopt),
   3941		   proto_method_implemented(proto->getsockopt),
   3942		   proto_method_implemented(proto->sendmsg),
   3943		   proto_method_implemented(proto->recvmsg),
   3944		   proto_method_implemented(proto->sendpage),
   3945		   proto_method_implemented(proto->bind),
   3946		   proto_method_implemented(proto->backlog_rcv),
   3947		   proto_method_implemented(proto->hash),
   3948		   proto_method_implemented(proto->unhash),
   3949		   proto_method_implemented(proto->get_port),
   3950		   proto_method_implemented(proto->enter_memory_pressure));
   3951}
   3952
   3953static int proto_seq_show(struct seq_file *seq, void *v)
   3954{
   3955	if (v == &proto_list)
   3956		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
   3957			   "protocol",
   3958			   "size",
   3959			   "sockets",
   3960			   "memory",
   3961			   "press",
   3962			   "maxhdr",
   3963			   "slab",
   3964			   "module",
   3965			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
   3966	else
   3967		proto_seq_printf(seq, list_entry(v, struct proto, node));
   3968	return 0;
   3969}
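/*
 * The abbreviated header above maps, column for column, onto the struct
 * proto callbacks printed by proto_seq_printf():
 *   cl=close      co=connect   di=disconnect  ac=accept      io=ioctl
 *   in=init       de=destroy   sh=shutdown    ss=setsockopt  gs=getsockopt
 *   se=sendmsg    re=recvmsg   sp=sendpage    bi=bind        br=backlog_rcv
 *   ha=hash       uh=unhash    gp=get_port    em=enter_memory_pressure
 * Each column prints 'y' or 'n' depending on whether the callback is set.
 */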
   3970
   3971static const struct seq_operations proto_seq_ops = {
   3972	.start  = proto_seq_start,
   3973	.next   = proto_seq_next,
   3974	.stop   = proto_seq_stop,
   3975	.show   = proto_seq_show,
   3976};
   3977
   3978static __net_init int proto_init_net(struct net *net)
   3979{
   3980	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
   3981			sizeof(struct seq_net_private)))
   3982		return -ENOMEM;
   3983
   3984	return 0;
   3985}
   3986
   3987static __net_exit void proto_exit_net(struct net *net)
   3988{
   3989	remove_proc_entry("protocols", net->proc_net);
   3990}
   3991
   3992
   3993static __net_initdata struct pernet_operations proto_net_ops = {
   3994	.init = proto_init_net,
   3995	.exit = proto_exit_net,
   3996};
   3997
   3998static int __init proto_init(void)
   3999{
   4000	return register_pernet_subsys(&proto_net_ops);
   4001}
   4002
   4003subsys_initcall(proto_init);
   4004
   4005#endif /* PROC_FS */
   4006
   4007#ifdef CONFIG_NET_RX_BUSY_POLL
   4008bool sk_busy_loop_end(void *p, unsigned long start_time)
   4009{
   4010	struct sock *sk = p;
   4011
   4012	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
   4013	       sk_busy_loop_timeout(sk, start_time);
   4014}
   4015EXPORT_SYMBOL(sk_busy_loop_end);
   4016#endif /* CONFIG_NET_RX_BUSY_POLL */
   4017
   4018int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
   4019{
   4020	if (!sk->sk_prot->bind_add)
   4021		return -EOPNOTSUPP;
   4022	return sk->sk_prot->bind_add(sk, addr, addr_len);
   4023}
   4024EXPORT_SYMBOL(sock_bind_add);
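/*
 * Illustrative sketch: ->bind_add() is an optional hook for adding an extra
 * local address to an already-bound socket (SCTP provides one for its
 * multihoming API).  This shows the shape such a callback takes, with a
 * hypothetical name and a stubbed-out body:
 */
static int __maybe_unused example_bind_add(struct sock *sk,
					   struct sockaddr *addr, int addr_len)
{
	if (addr_len < (int)sizeof(struct sockaddr_in))
		return -EINVAL;
	/* validate addr and append it to the socket's local address list */
	return 0;
}
/* ... hooked up in the protocol definition as  .bind_add = example_bind_add, */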