cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

af_packet.c (114463B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		PACKET - implements raw packet sockets.
      8 *
      9 * Authors:	Ross Biro
     10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     12 *
     13 * Fixes:
     14 *		Alan Cox	:	verify_area() now used correctly
     15 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
     16 *		Alan Cox	:	tidied skbuff lists.
     17 *		Alan Cox	:	Now uses generic datagram routines I
     18 *					added. Also fixed the peek/read crash
     19 *					from all old Linux datagram code.
     20 *		Alan Cox	:	Uses the improved datagram code.
     21 *		Alan Cox	:	Added NULL's for socket options.
     22 *		Alan Cox	:	Re-commented the code.
     23 *		Alan Cox	:	Use new kernel side addressing
     24 *		Rob Janssen	:	Correct MTU usage.
     25 *		Dave Platt	:	Counter leaks caused by incorrect
     26 *					interrupt locking and some slightly
     27 *					dubious gcc output. Can you read
     28 *					compiler: it said _VOLATILE_
     29 *	Richard Kooijman	:	Timestamp fixes.
     30 *		Alan Cox	:	New buffers. Use sk->mac.raw.
     31 *		Alan Cox	:	sendmsg/recvmsg support.
     32 *		Alan Cox	:	Protocol setting support
     33 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
     34 *	Cyrus Durgin		:	Fixed kerneld for kmod.
     35 *	Michal Ostrowski        :       Module initialization cleanup.
     36 *         Ulises Alonso        :       Frame number limit removal and
     37 *                                      packet_set_ring memory leak.
     38 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
     39 *					The convention is that longer addresses
     40 *					will simply extend the hardware address
     41 *					byte arrays at the end of sockaddr_ll
     42 *					and packet_mreq.
     43 *		Johann Baudy	:	Added TX RING.
     44 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
     45 *					layer.
     46 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
     47 */
     48
     49#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     50
     51#include <linux/ethtool.h>
     52#include <linux/filter.h>
     53#include <linux/types.h>
     54#include <linux/mm.h>
     55#include <linux/capability.h>
     56#include <linux/fcntl.h>
     57#include <linux/socket.h>
     58#include <linux/in.h>
     59#include <linux/inet.h>
     60#include <linux/netdevice.h>
     61#include <linux/if_packet.h>
     62#include <linux/wireless.h>
     63#include <linux/kernel.h>
     64#include <linux/kmod.h>
     65#include <linux/slab.h>
     66#include <linux/vmalloc.h>
     67#include <net/net_namespace.h>
     68#include <net/ip.h>
     69#include <net/protocol.h>
     70#include <linux/skbuff.h>
     71#include <net/sock.h>
     72#include <linux/errno.h>
     73#include <linux/timer.h>
     74#include <linux/uaccess.h>
     75#include <asm/ioctls.h>
     76#include <asm/page.h>
     77#include <asm/cacheflush.h>
     78#include <asm/io.h>
     79#include <linux/proc_fs.h>
     80#include <linux/seq_file.h>
     81#include <linux/poll.h>
     82#include <linux/module.h>
     83#include <linux/init.h>
     84#include <linux/mutex.h>
     85#include <linux/if_vlan.h>
     86#include <linux/virtio_net.h>
     87#include <linux/errqueue.h>
     88#include <linux/net_tstamp.h>
     89#include <linux/percpu.h>
     90#ifdef CONFIG_INET
     91#include <net/inet_common.h>
     92#endif
     93#include <linux/bpf.h>
     94#include <net/compat.h>
     95#include <linux/netfilter_netdev.h>
     96
     97#include "internal.h"
     98
     99/*
    100   Assumptions:
    101   - If the device has no dev->header_ops->create, there is no LL header
    102     visible above the device. In this case, its hard_header_len should be 0.
    103     The device may prepend its own header internally. In this case, its
    104     needed_headroom should be set to the space needed for it to add its
    105     internal header.
    106     For example, a WiFi driver pretending to be an Ethernet driver should
    107     set its hard_header_len to be the Ethernet header length, and set its
    108     needed_headroom to be (the real WiFi header length - the fake Ethernet
    109     header length).
    110   - packet socket receives packets with pulled ll header,
    111     so that SOCK_RAW should push it back.
    112
    113On receive:
    114-----------
    115
    116Incoming, dev_has_header(dev) == true
    117   mac_header -> ll header
    118   data       -> data
    119
    120Outgoing, dev_has_header(dev) == true
    121   mac_header -> ll header
    122   data       -> ll header
    123
    124Incoming, dev_has_header(dev) == false
    125   mac_header -> data
    126     However drivers often make it point to the ll header.
    127     This is incorrect because the ll header should be invisible to us.
    128   data       -> data
    129
    130Outgoing, dev_has_header(dev) == false
    131   mac_header -> data. ll header is invisible to us.
    132   data       -> data
    133
    134In summary
    135  If dev_has_header(dev) == false, we are unable to restore the ll header,
    136    because it is invisible to us.
    137
    138
    139On transmit:
    140------------
    141
    142dev_has_header(dev) == true
    143   mac_header -> ll header
    144   data       -> ll header
    145
    146dev_has_header(dev) == false (ll header is invisible to us)
    147   mac_header -> data
    148   data       -> data
    149
    150   We should set network_header on output to the correct position,
    151   packet classifier depends on it.
    152 */
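A minimal userspace sketch may help make the conventions above concrete (illustrative only, not part of af_packet.c; error handling and the required CAP_NET_RAW privilege are omitted). With SOCK_RAW the link-layer header is part of the buffer the application reads; with SOCK_DGRAM it is removed and reported out of band via sockaddr_ll, matching the "pulled ll header" assumption above.

    #include <stdio.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <linux/if_packet.h>
    #include <linux/if_ether.h>

    int main(void)
    {
            int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
            int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
            unsigned char buf[2048];
            struct sockaddr_ll sll;
            socklen_t slen = sizeof(sll);

            /* SOCK_RAW: buf begins with the link-layer (e.g. Ethernet) header. */
            ssize_t n = recvfrom(raw, buf, sizeof(buf), 0, NULL, NULL);

            /* SOCK_DGRAM: buf begins with the network-layer payload; the
             * link-layer addressing arrives separately in sll. */
            ssize_t m = recvfrom(dgram, buf, sizeof(buf), 0,
                                 (struct sockaddr *)&sll, &slen);

            printf("raw=%zd dgram=%zd halen=%u\n", n, m, (unsigned)sll.sll_halen);
            return 0;
    }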
    153
    154/* Private packet socket structures. */
    155
    156/* identical to struct packet_mreq except it has
    157 * a longer address field.
    158 */
    159struct packet_mreq_max {
    160	int		mr_ifindex;
    161	unsigned short	mr_type;
    162	unsigned short	mr_alen;
    163	unsigned char	mr_address[MAX_ADDR_LEN];
    164};
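packet_mreq_max is the kernel-side counterpart of the userspace struct packet_mreq passed to PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP; the wider mr_address field is what allows hardware addresses up to MAX_ADDR_LEN, per the changelog note above. A hedged userspace sketch of the option it backs (fd is assumed to be an open AF_PACKET socket, error handling omitted, "eth0" is only an example):

    #include <net/if.h>
    #include <linux/if_packet.h>

    struct packet_mreq mreq = {
            .mr_ifindex = if_nametoindex("eth0"),
            .mr_type    = PACKET_MR_PROMISC,   /* no address needed for promisc */
    };
    setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));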
    165
    166union tpacket_uhdr {
    167	struct tpacket_hdr  *h1;
    168	struct tpacket2_hdr *h2;
    169	struct tpacket3_hdr *h3;
    170	void *raw;
    171};
    172
    173static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
    174		int closing, int tx_ring);
    175
    176#define V3_ALIGNMENT	(8)
    177
    178#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
    179
    180#define BLK_PLUS_PRIV(sz_of_priv) \
    181	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
    182
    183#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
    184#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
    185#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
    186#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
    187#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
    188#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
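A quick worked example of the alignment macros above, using a hypothetical 13-byte per-block private area requested by user space:

    ALIGN(13, V3_ALIGNMENT)  = 16
    BLK_PLUS_PRIV(13)        = BLK_HDR_LEN + 16

BLK_HDR_LEN is itself sizeof(struct tpacket_block_desc) rounded up to 8 bytes, and prb_open_block() below places the first packet of every block at exactly BLK_PLUS_PRIV(tp_sizeof_priv), recording that offset in the block's offset_to_first_pkt field.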
    189
    190struct packet_sock;
    191static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
    192		       struct packet_type *pt, struct net_device *orig_dev);
    193
    194static void *packet_previous_frame(struct packet_sock *po,
    195		struct packet_ring_buffer *rb,
    196		int status);
    197static void packet_increment_head(struct packet_ring_buffer *buff);
    198static int prb_curr_blk_in_use(struct tpacket_block_desc *);
    199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
    200			struct packet_sock *);
    201static void prb_retire_current_block(struct tpacket_kbdq_core *,
    202		struct packet_sock *, unsigned int status);
    203static int prb_queue_frozen(struct tpacket_kbdq_core *);
    204static void prb_open_block(struct tpacket_kbdq_core *,
    205		struct tpacket_block_desc *);
    206static void prb_retire_rx_blk_timer_expired(struct timer_list *);
    207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
    208static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
    209static void prb_clear_rxhash(struct tpacket_kbdq_core *,
    210		struct tpacket3_hdr *);
    211static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
    212		struct tpacket3_hdr *);
    213static void packet_flush_mclist(struct sock *sk);
    214static u16 packet_pick_tx_queue(struct sk_buff *skb);
    215
    216struct packet_skb_cb {
    217	union {
    218		struct sockaddr_pkt pkt;
    219		union {
    220			/* Trick: alias skb original length with
    221			 * ll.sll_family and ll.protocol in order
    222			 * to save room.
    223			 */
    224			unsigned int origlen;
    225			struct sockaddr_ll ll;
    226		};
    227	} sa;
    228};
    229
    230#define vio_le() virtio_legacy_is_little_endian()
    231
    232#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
    233
    234#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
    235#define GET_PBLOCK_DESC(x, bid)	\
    236	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
    237#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
    238	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
    239#define GET_NEXT_PRB_BLK_NUM(x) \
    240	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
    241	((x)->kactive_blk_num+1) : 0)
    242
    243static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
    244static void __fanout_link(struct sock *sk, struct packet_sock *po);
    245
    246#ifdef CONFIG_NETFILTER_EGRESS
    247static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
    248{
    249	struct sk_buff *next, *head = NULL, *tail;
    250	int rc;
    251
    252	rcu_read_lock();
    253	for (; skb != NULL; skb = next) {
    254		next = skb->next;
    255		skb_mark_not_on_list(skb);
    256
    257		if (!nf_hook_egress(skb, &rc, skb->dev))
    258			continue;
    259
    260		if (!head)
    261			head = skb;
    262		else
    263			tail->next = skb;
    264
    265		tail = skb;
    266	}
    267	rcu_read_unlock();
    268
    269	return head;
    270}
    271#endif
    272
    273static int packet_direct_xmit(struct sk_buff *skb)
    274{
    275#ifdef CONFIG_NETFILTER_EGRESS
    276	if (nf_hook_egress_active()) {
    277		skb = nf_hook_direct_egress(skb);
    278		if (!skb)
    279			return NET_XMIT_DROP;
    280	}
    281#endif
    282	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
    283}
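packet_direct_xmit() is only used when the socket has opted out of the qdisc layer. User space requests that with the PACKET_QDISC_BYPASS socket option, which (later in af_packet.c) switches po->xmit from dev_queue_xmit to packet_direct_xmit. A hedged sketch, with fd an open AF_PACKET socket and error handling omitted:

    int one = 1;
    /* After this, transmitted frames bypass traffic shaping and go straight
     * to the driver via dev_direct_xmit(). */
    setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));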
    284
    285static struct net_device *packet_cached_dev_get(struct packet_sock *po)
    286{
    287	struct net_device *dev;
    288
    289	rcu_read_lock();
    290	dev = rcu_dereference(po->cached_dev);
    291	dev_hold(dev);
    292	rcu_read_unlock();
    293
    294	return dev;
    295}
    296
    297static void packet_cached_dev_assign(struct packet_sock *po,
    298				     struct net_device *dev)
    299{
    300	rcu_assign_pointer(po->cached_dev, dev);
    301}
    302
    303static void packet_cached_dev_reset(struct packet_sock *po)
    304{
    305	RCU_INIT_POINTER(po->cached_dev, NULL);
    306}
    307
    308static bool packet_use_direct_xmit(const struct packet_sock *po)
    309{
    310	return po->xmit == packet_direct_xmit;
    311}
    312
    313static u16 packet_pick_tx_queue(struct sk_buff *skb)
    314{
    315	struct net_device *dev = skb->dev;
    316	const struct net_device_ops *ops = dev->netdev_ops;
    317	int cpu = raw_smp_processor_id();
    318	u16 queue_index;
    319
    320#ifdef CONFIG_XPS
    321	skb->sender_cpu = cpu + 1;
    322#endif
    323	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
    324	if (ops->ndo_select_queue) {
    325		queue_index = ops->ndo_select_queue(dev, skb, NULL);
    326		queue_index = netdev_cap_txqueue(dev, queue_index);
    327	} else {
    328		queue_index = netdev_pick_tx(dev, skb, NULL);
    329	}
    330
    331	return queue_index;
    332}
    333
    334/* __register_prot_hook must be invoked through register_prot_hook
    335 * or from a context in which asynchronous accesses to the packet
    336 * socket is not possible (packet_create()).
    337 */
    338static void __register_prot_hook(struct sock *sk)
    339{
    340	struct packet_sock *po = pkt_sk(sk);
    341
    342	if (!po->running) {
    343		if (po->fanout)
    344			__fanout_link(sk, po);
    345		else
    346			dev_add_pack(&po->prot_hook);
    347
    348		sock_hold(sk);
    349		po->running = 1;
    350	}
    351}
    352
    353static void register_prot_hook(struct sock *sk)
    354{
    355	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
    356	__register_prot_hook(sk);
    357}
    358
    359/* If the sync parameter is true, we will temporarily drop
    360 * the po->bind_lock and do a synchronize_net to make sure no
    361 * asynchronous packet processing paths still refer to the elements
    362 * of po->prot_hook.  If the sync parameter is false, it is the
     363 * caller's responsibility to take care of this.
    364 */
    365static void __unregister_prot_hook(struct sock *sk, bool sync)
    366{
    367	struct packet_sock *po = pkt_sk(sk);
    368
    369	lockdep_assert_held_once(&po->bind_lock);
    370
    371	po->running = 0;
    372
    373	if (po->fanout)
    374		__fanout_unlink(sk, po);
    375	else
    376		__dev_remove_pack(&po->prot_hook);
    377
    378	__sock_put(sk);
    379
    380	if (sync) {
    381		spin_unlock(&po->bind_lock);
    382		synchronize_net();
    383		spin_lock(&po->bind_lock);
    384	}
    385}
    386
    387static void unregister_prot_hook(struct sock *sk, bool sync)
    388{
    389	struct packet_sock *po = pkt_sk(sk);
    390
    391	if (po->running)
    392		__unregister_prot_hook(sk, sync);
    393}
    394
    395static inline struct page * __pure pgv_to_page(void *addr)
    396{
    397	if (is_vmalloc_addr(addr))
    398		return vmalloc_to_page(addr);
    399	return virt_to_page(addr);
    400}
    401
    402static void __packet_set_status(struct packet_sock *po, void *frame, int status)
    403{
    404	union tpacket_uhdr h;
    405
    406	h.raw = frame;
    407	switch (po->tp_version) {
    408	case TPACKET_V1:
    409		h.h1->tp_status = status;
    410		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
    411		break;
    412	case TPACKET_V2:
    413		h.h2->tp_status = status;
    414		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
    415		break;
    416	case TPACKET_V3:
    417		h.h3->tp_status = status;
    418		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
    419		break;
    420	default:
    421		WARN(1, "TPACKET version not supported.\n");
    422		BUG();
    423	}
    424
    425	smp_wmb();
    426}
    427
    428static int __packet_get_status(const struct packet_sock *po, void *frame)
    429{
    430	union tpacket_uhdr h;
    431
    432	smp_rmb();
    433
    434	h.raw = frame;
    435	switch (po->tp_version) {
    436	case TPACKET_V1:
    437		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
    438		return h.h1->tp_status;
    439	case TPACKET_V2:
    440		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
    441		return h.h2->tp_status;
    442	case TPACKET_V3:
    443		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
    444		return h.h3->tp_status;
    445	default:
    446		WARN(1, "TPACKET version not supported.\n");
    447		BUG();
    448		return 0;
    449	}
    450}
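The tp_status word written and read above is the kernel half of the ring handshake: user space owns a frame while TP_STATUS_USER is set and returns it by storing TP_STATUS_KERNEL. A hedged userspace sketch for a TPACKET_V2 RX slot (frame, pfd and handle_frame() are hypothetical names; ring setup, poll() declarations and error handling are omitted):

    struct tpacket2_hdr *hdr = frame;    /* frame = ring base + slot offset   */

    while (!(hdr->tp_status & TP_STATUS_USER))
            poll(&pfd, 1, -1);           /* wait for the kernel to fill it    */

    handle_frame((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);

    __sync_synchronize();                /* keep the reads above the release  */
    hdr->tp_status = TP_STATUS_KERNEL;   /* hand the slot back to the kernel  */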
    451
    452static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
    453				   unsigned int flags)
    454{
    455	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
    456
    457	if (shhwtstamps &&
    458	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
    459	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
    460		return TP_STATUS_TS_RAW_HARDWARE;
    461
    462	if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
    463	    ktime_to_timespec64_cond(skb_tstamp(skb), ts))
    464		return TP_STATUS_TS_SOFTWARE;
    465
    466	return 0;
    467}
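Which branch above can fire is controlled by the flags user space requested with the PACKET_TIMESTAMP socket option (stored in po->tp_tstamp); the returned TP_STATUS_TS_* bit is OR'ed into the frame status so the reader knows which clock filled tp_sec/tp_nsec. A hedged sketch of the userspace side (fd is an open AF_PACKET socket):

    #include <linux/net_tstamp.h>

    int req = SOF_TIMESTAMPING_RAW_HARDWARE | SOF_TIMESTAMPING_SOFTWARE;
    setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));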
    468
    469static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
    470				    struct sk_buff *skb)
    471{
    472	union tpacket_uhdr h;
    473	struct timespec64 ts;
    474	__u32 ts_status;
    475
    476	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
    477		return 0;
    478
    479	h.raw = frame;
    480	/*
    481	 * versions 1 through 3 overflow the timestamps in y2106, since they
    482	 * all store the seconds in a 32-bit unsigned integer.
    483	 * If we create a version 4, that should have a 64-bit timestamp,
    484	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
    485	 * nanoseconds.
    486	 */
    487	switch (po->tp_version) {
    488	case TPACKET_V1:
    489		h.h1->tp_sec = ts.tv_sec;
    490		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
    491		break;
    492	case TPACKET_V2:
    493		h.h2->tp_sec = ts.tv_sec;
    494		h.h2->tp_nsec = ts.tv_nsec;
    495		break;
    496	case TPACKET_V3:
    497		h.h3->tp_sec = ts.tv_sec;
    498		h.h3->tp_nsec = ts.tv_nsec;
    499		break;
    500	default:
    501		WARN(1, "TPACKET version not supported.\n");
    502		BUG();
    503	}
    504
    505	/* one flush is safe, as both fields always lie on the same cacheline */
    506	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
    507	smp_wmb();
    508
    509	return ts_status;
    510}
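As a quick check of the y2106 remark above: a 32-bit unsigned seconds counter wraps after 2^32 s, which is roughly 4.29 * 10^9 s or about 136 years, and 1970 + 136 lands in the year 2106.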
    511
    512static void *packet_lookup_frame(const struct packet_sock *po,
    513				 const struct packet_ring_buffer *rb,
    514				 unsigned int position,
    515				 int status)
    516{
    517	unsigned int pg_vec_pos, frame_offset;
    518	union tpacket_uhdr h;
    519
    520	pg_vec_pos = position / rb->frames_per_block;
    521	frame_offset = position % rb->frames_per_block;
    522
    523	h.raw = rb->pg_vec[pg_vec_pos].buffer +
    524		(frame_offset * rb->frame_size);
    525
    526	if (status != __packet_get_status(po, h.raw))
    527		return NULL;
    528
    529	return h.raw;
    530}
    531
    532static void *packet_current_frame(struct packet_sock *po,
    533		struct packet_ring_buffer *rb,
    534		int status)
    535{
    536	return packet_lookup_frame(po, rb, rb->head, status);
    537}
    538
    539static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
    540{
    541	del_timer_sync(&pkc->retire_blk_timer);
    542}
    543
    544static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
    545		struct sk_buff_head *rb_queue)
    546{
    547	struct tpacket_kbdq_core *pkc;
    548
    549	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    550
    551	spin_lock_bh(&rb_queue->lock);
    552	pkc->delete_blk_timer = 1;
    553	spin_unlock_bh(&rb_queue->lock);
    554
    555	prb_del_retire_blk_timer(pkc);
    556}
    557
    558static void prb_setup_retire_blk_timer(struct packet_sock *po)
    559{
    560	struct tpacket_kbdq_core *pkc;
    561
    562	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    563	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
    564		    0);
    565	pkc->retire_blk_timer.expires = jiffies;
    566}
    567
    568static int prb_calc_retire_blk_tmo(struct packet_sock *po,
    569				int blk_size_in_bytes)
    570{
    571	struct net_device *dev;
    572	unsigned int mbits, div;
    573	struct ethtool_link_ksettings ecmd;
    574	int err;
    575
    576	rtnl_lock();
    577	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
    578	if (unlikely(!dev)) {
    579		rtnl_unlock();
    580		return DEFAULT_PRB_RETIRE_TOV;
    581	}
    582	err = __ethtool_get_link_ksettings(dev, &ecmd);
    583	rtnl_unlock();
    584	if (err)
    585		return DEFAULT_PRB_RETIRE_TOV;
    586
     587	/* If the link speed is so slow that you don't really
     588	 * need to worry about perf anyway
     589	 */
    590	if (ecmd.base.speed < SPEED_1000 ||
    591	    ecmd.base.speed == SPEED_UNKNOWN)
    592		return DEFAULT_PRB_RETIRE_TOV;
    593
    594	div = ecmd.base.speed / 1000;
    595	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
    596
    597	if (div)
    598		mbits /= div;
    599
    600	if (div)
    601		return mbits + 1;
    602	return mbits;
    603}
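Worked through for a 1 MiB block (a common choice in packet_mmap examples; the numbers follow directly from the code above, and the result is interpreted as milliseconds by init_prb_bdqc() below):

    1 GbE  (speed = 1000):  div = 1,  mbits = (1048576 * 8) / (1024 * 1024) = 8  ->  8 / 1 + 1 = 9 ms
    10 GbE (speed = 10000): div = 10, mbits = 8 / 10 = 0                         ->  0 + 1     = 1 ms
    slower than 1 GbE, or unknown speed:  DEFAULT_PRB_RETIRE_TOV is used instead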
    604
    605static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
    606			union tpacket_req_u *req_u)
    607{
    608	p1->feature_req_word = req_u->req3.tp_feature_req_word;
    609}
    610
    611static void init_prb_bdqc(struct packet_sock *po,
    612			struct packet_ring_buffer *rb,
    613			struct pgv *pg_vec,
    614			union tpacket_req_u *req_u)
    615{
    616	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
    617	struct tpacket_block_desc *pbd;
    618
    619	memset(p1, 0x0, sizeof(*p1));
    620
    621	p1->knxt_seq_num = 1;
    622	p1->pkbdq = pg_vec;
    623	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
    624	p1->pkblk_start	= pg_vec[0].buffer;
    625	p1->kblk_size = req_u->req3.tp_block_size;
    626	p1->knum_blocks	= req_u->req3.tp_block_nr;
    627	p1->hdrlen = po->tp_hdrlen;
    628	p1->version = po->tp_version;
    629	p1->last_kactive_blk_num = 0;
    630	po->stats.stats3.tp_freeze_q_cnt = 0;
    631	if (req_u->req3.tp_retire_blk_tov)
    632		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
    633	else
    634		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
    635						req_u->req3.tp_block_size);
    636	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
    637	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
    638	rwlock_init(&p1->blk_fill_in_prog_lock);
    639
    640	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
    641	prb_init_ft_ops(p1, req_u);
    642	prb_setup_retire_blk_timer(po);
    643	prb_open_block(p1, pbd);
    644}
    645
    646/*  Do NOT update the last_blk_num first.
    647 *  Assumes sk_buff_head lock is held.
    648 */
    649static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
    650{
    651	mod_timer(&pkc->retire_blk_timer,
    652			jiffies + pkc->tov_in_jiffies);
    653	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
    654}
    655
    656/*
    657 * Timer logic:
    658 * 1) We refresh the timer only when we open a block.
    659 *    By doing this we don't waste cycles refreshing the timer
     660 *	  on a packet-by-packet basis.
    661 *
    662 * With a 1MB block-size, on a 1Gbps line, it will take
    663 * i) ~8 ms to fill a block + ii) memcpy etc.
    664 * In this cut we are not accounting for the memcpy time.
    665 *
    666 * So, if the user sets the 'tmo' to 10ms then the timer
    667 * will never fire while the block is still getting filled
    668 * (which is what we want). However, the user could choose
    669 * to close a block early and that's fine.
    670 *
    671 * But when the timer does fire, we check whether or not to refresh it.
    672 * Since the tmo granularity is in msecs, it is not too expensive
     673 * to refresh the timer, let's say every '8' msecs.
    674 * Either the user can set the 'tmo' or we can derive it based on
    675 * a) line-speed and b) block-size.
    676 * prb_calc_retire_blk_tmo() calculates the tmo.
    677 *
    678 */
    679static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
    680{
    681	struct packet_sock *po =
    682		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
    683	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    684	unsigned int frozen;
    685	struct tpacket_block_desc *pbd;
    686
    687	spin_lock(&po->sk.sk_receive_queue.lock);
    688
    689	frozen = prb_queue_frozen(pkc);
    690	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
    691
    692	if (unlikely(pkc->delete_blk_timer))
    693		goto out;
    694
    695	/* We only need to plug the race when the block is partially filled.
    696	 * tpacket_rcv:
    697	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
    698	 *		copy_bits() is in progress ...
    699	 *		timer fires on other cpu:
    700	 *		we can't retire the current block because copy_bits
    701	 *		is in progress.
    702	 *
    703	 */
    704	if (BLOCK_NUM_PKTS(pbd)) {
    705		/* Waiting for skb_copy_bits to finish... */
    706		write_lock(&pkc->blk_fill_in_prog_lock);
    707		write_unlock(&pkc->blk_fill_in_prog_lock);
    708	}
    709
    710	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
    711		if (!frozen) {
    712			if (!BLOCK_NUM_PKTS(pbd)) {
    713				/* An empty block. Just refresh the timer. */
    714				goto refresh_timer;
    715			}
    716			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
    717			if (!prb_dispatch_next_block(pkc, po))
    718				goto refresh_timer;
    719			else
    720				goto out;
    721		} else {
    722			/* Case 1. Queue was frozen because user-space was
    723			 *	   lagging behind.
    724			 */
    725			if (prb_curr_blk_in_use(pbd)) {
    726				/*
    727				 * Ok, user-space is still behind.
    728				 * So just refresh the timer.
    729				 */
    730				goto refresh_timer;
    731			} else {
     732			       /* Case 2. Queue was frozen, user-space caught up,
     733				* now the link went idle and the timer fired.
     734				* We don't have a block to close, so we open this
     735				* block and restart the timer.
     736				* Opening a block thaws the queue and restarts the
     737				* timer; thawing/timer-refresh is a side effect.
     738				*/
    739				prb_open_block(pkc, pbd);
    740				goto out;
    741			}
    742		}
    743	}
    744
    745refresh_timer:
    746	_prb_refresh_rx_retire_blk_timer(pkc);
    747
    748out:
    749	spin_unlock(&po->sk.sk_receive_queue.lock);
    750}
    751
    752static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
    753		struct tpacket_block_desc *pbd1, __u32 status)
    754{
    755	/* Flush everything minus the block header */
    756
    757#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    758	u8 *start, *end;
    759
    760	start = (u8 *)pbd1;
    761
     762	/* Skip the block header (we know the header WILL fit in 4K) */
    763	start += PAGE_SIZE;
    764
    765	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
    766	for (; start < end; start += PAGE_SIZE)
    767		flush_dcache_page(pgv_to_page(start));
    768
    769	smp_wmb();
    770#endif
    771
    772	/* Now update the block status. */
    773
    774	BLOCK_STATUS(pbd1) = status;
    775
    776	/* Flush the block header */
    777
    778#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    779	start = (u8 *)pbd1;
    780	flush_dcache_page(pgv_to_page(start));
    781
    782	smp_wmb();
    783#endif
    784}
    785
    786/*
    787 * Side effect:
    788 *
    789 * 1) flush the block
    790 * 2) Increment active_blk_num
    791 *
    792 * Note:We DONT refresh the timer on purpose.
    793 *	Because almost always the next block will be opened.
    794 */
    795static void prb_close_block(struct tpacket_kbdq_core *pkc1,
    796		struct tpacket_block_desc *pbd1,
    797		struct packet_sock *po, unsigned int stat)
    798{
    799	__u32 status = TP_STATUS_USER | stat;
    800
    801	struct tpacket3_hdr *last_pkt;
    802	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
    803	struct sock *sk = &po->sk;
    804
    805	if (atomic_read(&po->tp_drops))
    806		status |= TP_STATUS_LOSING;
    807
    808	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
    809	last_pkt->tp_next_offset = 0;
    810
    811	/* Get the ts of the last pkt */
    812	if (BLOCK_NUM_PKTS(pbd1)) {
    813		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
    814		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
    815	} else {
    816		/* Ok, we tmo'd - so get the current time.
    817		 *
    818		 * It shouldn't really happen as we don't close empty
    819		 * blocks. See prb_retire_rx_blk_timer_expired().
    820		 */
    821		struct timespec64 ts;
    822		ktime_get_real_ts64(&ts);
    823		h1->ts_last_pkt.ts_sec = ts.tv_sec;
    824		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
    825	}
    826
    827	smp_wmb();
    828
    829	/* Flush the block */
    830	prb_flush_block(pkc1, pbd1, status);
    831
    832	sk->sk_data_ready(sk);
    833
    834	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
    835}
    836
    837static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
    838{
    839	pkc->reset_pending_on_curr_blk = 0;
    840}
    841
    842/*
    843 * Side effect of opening a block:
    844 *
    845 * 1) prb_queue is thawed.
    846 * 2) retire_blk_timer is refreshed.
    847 *
    848 */
    849static void prb_open_block(struct tpacket_kbdq_core *pkc1,
    850	struct tpacket_block_desc *pbd1)
    851{
    852	struct timespec64 ts;
    853	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
    854
    855	smp_rmb();
    856
    857	/* We could have just memset this but we will lose the
    858	 * flexibility of making the priv area sticky
    859	 */
    860
    861	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
    862	BLOCK_NUM_PKTS(pbd1) = 0;
    863	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
    864
    865	ktime_get_real_ts64(&ts);
    866
    867	h1->ts_first_pkt.ts_sec = ts.tv_sec;
    868	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
    869
    870	pkc1->pkblk_start = (char *)pbd1;
    871	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
    872
    873	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
    874	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
    875
    876	pbd1->version = pkc1->version;
    877	pkc1->prev = pkc1->nxt_offset;
    878	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
    879
    880	prb_thaw_queue(pkc1);
    881	_prb_refresh_rx_retire_blk_timer(pkc1);
    882
    883	smp_wmb();
    884}
    885
    886/*
    887 * Queue freeze logic:
    888 * 1) Assume tp_block_nr = 8 blocks.
    889 * 2) At time 't0', user opens Rx ring.
    890 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
    891 * 4) user-space is either sleeping or processing block '0'.
    892 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
     893 *    it will close block-7, loop around and try to fill block '0'.
    894 *    call-flow:
    895 *    __packet_lookup_frame_in_block
    896 *      prb_retire_current_block()
    897 *      prb_dispatch_next_block()
    898 *        |->(BLOCK_STATUS == USER) evaluates to true
    899 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
    900 * 6) Now there are two cases:
    901 *    6.1) Link goes idle right after the queue is frozen.
    902 *         But remember, the last open_block() refreshed the timer.
     903 *         When this timer expires, it will refresh itself so that we can
    904 *         re-open block-0 in near future.
    905 *    6.2) Link is busy and keeps on receiving packets. This is a simple
    906 *         case and __packet_lookup_frame_in_block will check if block-0
    907 *         is free and can now be re-used.
    908 */
    909static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
    910				  struct packet_sock *po)
    911{
    912	pkc->reset_pending_on_curr_blk = 1;
    913	po->stats.stats3.tp_freeze_q_cnt++;
    914}
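The thaw in step 6.2 of the comment above happens because user space eventually writes the block status back. A rough sketch of the userspace consumer loop that drives this cycle (illustrative; ring, block_size, block_nr, i, pfd and walk_block() are hypothetical names, and ring setup, error handling and memory barriers are omitted):

    struct tpacket_block_desc *pbd;

    for (;;) {
            pbd = (struct tpacket_block_desc *)(ring + i * block_size);

            while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
                    poll(&pfd, 1, -1);        /* wait for the kernel to close it */

            walk_block(pbd);                  /* iterate num_pkts frames via
                                               * offset_to_first_pkt and
                                               * tp_next_offset                 */

            pbd->hdr.bh1.block_status = TP_STATUS_KERNEL; /* release the block;
                                               * this is what lets a frozen
                                               * queue thaw                     */
            i = (i + 1) % block_nr;
    }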
    915
    916#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
    917
    918/*
    919 * If the next block is free then we will dispatch it
    920 * and return a good offset.
    921 * Else, we will freeze the queue.
    922 * So, caller must check the return value.
    923 */
    924static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
    925		struct packet_sock *po)
    926{
    927	struct tpacket_block_desc *pbd;
    928
    929	smp_rmb();
    930
    931	/* 1. Get current block num */
    932	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
    933
    934	/* 2. If this block is currently in_use then freeze the queue */
    935	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
    936		prb_freeze_queue(pkc, po);
    937		return NULL;
    938	}
    939
    940	/*
    941	 * 3.
    942	 * open this block and return the offset where the first packet
    943	 * needs to get stored.
    944	 */
    945	prb_open_block(pkc, pbd);
    946	return (void *)pkc->nxt_offset;
    947}
    948
    949static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
    950		struct packet_sock *po, unsigned int status)
    951{
    952	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
    953
    954	/* retire/close the current block */
    955	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
    956		/*
    957		 * Plug the case where copy_bits() is in progress on
    958		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
    959		 * have space to copy the pkt in the current block and
    960		 * called prb_retire_current_block()
    961		 *
    962		 * We don't need to worry about the TMO case because
    963		 * the timer-handler already handled this case.
    964		 */
    965		if (!(status & TP_STATUS_BLK_TMO)) {
    966			/* Waiting for skb_copy_bits to finish... */
    967			write_lock(&pkc->blk_fill_in_prog_lock);
    968			write_unlock(&pkc->blk_fill_in_prog_lock);
    969		}
    970		prb_close_block(pkc, pbd, po, status);
    971		return;
    972	}
    973}
    974
    975static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
    976{
    977	return TP_STATUS_USER & BLOCK_STATUS(pbd);
    978}
    979
    980static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
    981{
    982	return pkc->reset_pending_on_curr_blk;
    983}
    984
    985static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
    986	__releases(&pkc->blk_fill_in_prog_lock)
    987{
    988	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
    989
    990	read_unlock(&pkc->blk_fill_in_prog_lock);
    991}
    992
    993static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
    994			struct tpacket3_hdr *ppd)
    995{
    996	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
    997}
    998
    999static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
   1000			struct tpacket3_hdr *ppd)
   1001{
   1002	ppd->hv1.tp_rxhash = 0;
   1003}
   1004
   1005static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
   1006			struct tpacket3_hdr *ppd)
   1007{
   1008	if (skb_vlan_tag_present(pkc->skb)) {
   1009		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
   1010		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
   1011		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
   1012	} else {
   1013		ppd->hv1.tp_vlan_tci = 0;
   1014		ppd->hv1.tp_vlan_tpid = 0;
   1015		ppd->tp_status = TP_STATUS_AVAILABLE;
   1016	}
   1017}
   1018
   1019static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
   1020			struct tpacket3_hdr *ppd)
   1021{
   1022	ppd->hv1.tp_padding = 0;
   1023	prb_fill_vlan_info(pkc, ppd);
   1024
   1025	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
   1026		prb_fill_rxhash(pkc, ppd);
   1027	else
   1028		prb_clear_rxhash(pkc, ppd);
   1029}
   1030
   1031static void prb_fill_curr_block(char *curr,
   1032				struct tpacket_kbdq_core *pkc,
   1033				struct tpacket_block_desc *pbd,
   1034				unsigned int len)
   1035	__acquires(&pkc->blk_fill_in_prog_lock)
   1036{
   1037	struct tpacket3_hdr *ppd;
   1038
   1039	ppd  = (struct tpacket3_hdr *)curr;
   1040	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
   1041	pkc->prev = curr;
   1042	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
   1043	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
   1044	BLOCK_NUM_PKTS(pbd) += 1;
   1045	read_lock(&pkc->blk_fill_in_prog_lock);
   1046	prb_run_all_ft_ops(pkc, ppd);
   1047}
   1048
   1049/* Assumes caller has the sk->rx_queue.lock */
   1050static void *__packet_lookup_frame_in_block(struct packet_sock *po,
   1051					    struct sk_buff *skb,
   1052					    unsigned int len
   1053					    )
   1054{
   1055	struct tpacket_kbdq_core *pkc;
   1056	struct tpacket_block_desc *pbd;
   1057	char *curr, *end;
   1058
   1059	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
   1060	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
   1061
   1062	/* Queue is frozen when user space is lagging behind */
   1063	if (prb_queue_frozen(pkc)) {
   1064		/*
   1065		 * Check if that last block which caused the queue to freeze,
   1066		 * is still in_use by user-space.
   1067		 */
   1068		if (prb_curr_blk_in_use(pbd)) {
   1069			/* Can't record this packet */
   1070			return NULL;
   1071		} else {
   1072			/*
   1073			 * Ok, the block was released by user-space.
   1074			 * Now let's open that block.
   1075			 * opening a block also thaws the queue.
   1076			 * Thawing is a side effect.
   1077			 */
   1078			prb_open_block(pkc, pbd);
   1079		}
   1080	}
   1081
   1082	smp_mb();
   1083	curr = pkc->nxt_offset;
   1084	pkc->skb = skb;
   1085	end = (char *)pbd + pkc->kblk_size;
   1086
   1087	/* first try the current block */
   1088	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
   1089		prb_fill_curr_block(curr, pkc, pbd, len);
   1090		return (void *)curr;
   1091	}
   1092
   1093	/* Ok, close the current block */
   1094	prb_retire_current_block(pkc, po, 0);
   1095
   1096	/* Now, try to dispatch the next block */
   1097	curr = (char *)prb_dispatch_next_block(pkc, po);
   1098	if (curr) {
   1099		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
   1100		prb_fill_curr_block(curr, pkc, pbd, len);
   1101		return (void *)curr;
   1102	}
   1103
   1104	/*
    1105	 * No free blocks are available. User space hasn't caught up yet.
   1106	 * Queue was just frozen and now this packet will get dropped.
   1107	 */
   1108	return NULL;
   1109}
   1110
   1111static void *packet_current_rx_frame(struct packet_sock *po,
   1112					    struct sk_buff *skb,
   1113					    int status, unsigned int len)
   1114{
   1115	char *curr = NULL;
   1116	switch (po->tp_version) {
   1117	case TPACKET_V1:
   1118	case TPACKET_V2:
   1119		curr = packet_lookup_frame(po, &po->rx_ring,
   1120					po->rx_ring.head, status);
   1121		return curr;
   1122	case TPACKET_V3:
   1123		return __packet_lookup_frame_in_block(po, skb, len);
   1124	default:
   1125		WARN(1, "TPACKET version not supported\n");
   1126		BUG();
   1127		return NULL;
   1128	}
   1129}
   1130
   1131static void *prb_lookup_block(const struct packet_sock *po,
   1132			      const struct packet_ring_buffer *rb,
   1133			      unsigned int idx,
   1134			      int status)
   1135{
   1136	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
   1137	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
   1138
   1139	if (status != BLOCK_STATUS(pbd))
   1140		return NULL;
   1141	return pbd;
   1142}
   1143
   1144static int prb_previous_blk_num(struct packet_ring_buffer *rb)
   1145{
   1146	unsigned int prev;
   1147	if (rb->prb_bdqc.kactive_blk_num)
   1148		prev = rb->prb_bdqc.kactive_blk_num-1;
   1149	else
   1150		prev = rb->prb_bdqc.knum_blocks-1;
   1151	return prev;
   1152}
   1153
   1154/* Assumes caller has held the rx_queue.lock */
   1155static void *__prb_previous_block(struct packet_sock *po,
   1156					 struct packet_ring_buffer *rb,
   1157					 int status)
   1158{
   1159	unsigned int previous = prb_previous_blk_num(rb);
   1160	return prb_lookup_block(po, rb, previous, status);
   1161}
   1162
   1163static void *packet_previous_rx_frame(struct packet_sock *po,
   1164					     struct packet_ring_buffer *rb,
   1165					     int status)
   1166{
   1167	if (po->tp_version <= TPACKET_V2)
   1168		return packet_previous_frame(po, rb, status);
   1169
   1170	return __prb_previous_block(po, rb, status);
   1171}
   1172
   1173static void packet_increment_rx_head(struct packet_sock *po,
   1174					    struct packet_ring_buffer *rb)
   1175{
   1176	switch (po->tp_version) {
   1177	case TPACKET_V1:
   1178	case TPACKET_V2:
   1179		return packet_increment_head(rb);
   1180	case TPACKET_V3:
   1181	default:
   1182		WARN(1, "TPACKET version not supported.\n");
   1183		BUG();
   1184		return;
   1185	}
   1186}
   1187
   1188static void *packet_previous_frame(struct packet_sock *po,
   1189		struct packet_ring_buffer *rb,
   1190		int status)
   1191{
   1192	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
   1193	return packet_lookup_frame(po, rb, previous, status);
   1194}
   1195
   1196static void packet_increment_head(struct packet_ring_buffer *buff)
   1197{
   1198	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
   1199}
   1200
   1201static void packet_inc_pending(struct packet_ring_buffer *rb)
   1202{
   1203	this_cpu_inc(*rb->pending_refcnt);
   1204}
   1205
   1206static void packet_dec_pending(struct packet_ring_buffer *rb)
   1207{
   1208	this_cpu_dec(*rb->pending_refcnt);
   1209}
   1210
   1211static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
   1212{
   1213	unsigned int refcnt = 0;
   1214	int cpu;
   1215
   1216	/* We don't use pending refcount in rx_ring. */
   1217	if (rb->pending_refcnt == NULL)
   1218		return 0;
   1219
   1220	for_each_possible_cpu(cpu)
   1221		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
   1222
   1223	return refcnt;
   1224}
   1225
   1226static int packet_alloc_pending(struct packet_sock *po)
   1227{
   1228	po->rx_ring.pending_refcnt = NULL;
   1229
   1230	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
   1231	if (unlikely(po->tx_ring.pending_refcnt == NULL))
   1232		return -ENOBUFS;
   1233
   1234	return 0;
   1235}
   1236
   1237static void packet_free_pending(struct packet_sock *po)
   1238{
   1239	free_percpu(po->tx_ring.pending_refcnt);
   1240}
   1241
   1242#define ROOM_POW_OFF	2
   1243#define ROOM_NONE	0x0
   1244#define ROOM_LOW	0x1
   1245#define ROOM_NORMAL	0x2
   1246
   1247static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
   1248{
   1249	int idx, len;
   1250
   1251	len = READ_ONCE(po->rx_ring.frame_max) + 1;
   1252	idx = READ_ONCE(po->rx_ring.head);
   1253	if (pow_off)
   1254		idx += len >> pow_off;
   1255	if (idx >= len)
   1256		idx -= len;
   1257	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
   1258}
   1259
   1260static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
   1261{
   1262	int idx, len;
   1263
   1264	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
   1265	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
   1266	if (pow_off)
   1267		idx += len >> pow_off;
   1268	if (idx >= len)
   1269		idx -= len;
   1270	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
   1271}
   1272
   1273static int __packet_rcv_has_room(const struct packet_sock *po,
   1274				 const struct sk_buff *skb)
   1275{
   1276	const struct sock *sk = &po->sk;
   1277	int ret = ROOM_NONE;
   1278
   1279	if (po->prot_hook.func != tpacket_rcv) {
   1280		int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
   1281		int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
   1282				   - (skb ? skb->truesize : 0);
   1283
   1284		if (avail > (rcvbuf >> ROOM_POW_OFF))
   1285			return ROOM_NORMAL;
   1286		else if (avail > 0)
   1287			return ROOM_LOW;
   1288		else
   1289			return ROOM_NONE;
   1290	}
   1291
   1292	if (po->tp_version == TPACKET_V3) {
   1293		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
   1294			ret = ROOM_NORMAL;
   1295		else if (__tpacket_v3_has_room(po, 0))
   1296			ret = ROOM_LOW;
   1297	} else {
   1298		if (__tpacket_has_room(po, ROOM_POW_OFF))
   1299			ret = ROOM_NORMAL;
   1300		else if (__tpacket_has_room(po, 0))
   1301			ret = ROOM_LOW;
   1302	}
   1303
   1304	return ret;
   1305}
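For the non-ring path above, ROOM_POW_OFF = 2 means "normal" room requires more than a quarter of sk_rcvbuf to still be free. A worked example with hypothetical numbers (256 KiB receive buffer, 200 KiB already consumed by queued skbs, incoming skb's truesize ignored):

    avail                   = 262144 - 204800 = 57344
    rcvbuf >> ROOM_POW_OFF  = 262144 / 4      = 65536
    57344 <= 65536, but 57344 > 0             ->  ROOM_LOW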
   1306
   1307static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
   1308{
   1309	int pressure, ret;
   1310
   1311	ret = __packet_rcv_has_room(po, skb);
   1312	pressure = ret != ROOM_NORMAL;
   1313
   1314	if (READ_ONCE(po->pressure) != pressure)
   1315		WRITE_ONCE(po->pressure, pressure);
   1316
   1317	return ret;
   1318}
   1319
   1320static void packet_rcv_try_clear_pressure(struct packet_sock *po)
   1321{
   1322	if (READ_ONCE(po->pressure) &&
   1323	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
   1324		WRITE_ONCE(po->pressure,  0);
   1325}
   1326
   1327static void packet_sock_destruct(struct sock *sk)
   1328{
   1329	skb_queue_purge(&sk->sk_error_queue);
   1330
   1331	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
   1332	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
   1333
   1334	if (!sock_flag(sk, SOCK_DEAD)) {
   1335		pr_err("Attempt to release alive packet socket: %p\n", sk);
   1336		return;
   1337	}
   1338
   1339	sk_refcnt_debug_dec(sk);
   1340}
   1341
   1342static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
   1343{
   1344	u32 *history = po->rollover->history;
   1345	u32 victim, rxhash;
   1346	int i, count = 0;
   1347
   1348	rxhash = skb_get_hash(skb);
   1349	for (i = 0; i < ROLLOVER_HLEN; i++)
   1350		if (READ_ONCE(history[i]) == rxhash)
   1351			count++;
   1352
   1353	victim = prandom_u32() % ROLLOVER_HLEN;
   1354
   1355	/* Avoid dirtying the cache line if possible */
   1356	if (READ_ONCE(history[victim]) != rxhash)
   1357		WRITE_ONCE(history[victim], rxhash);
   1358
   1359	return count > (ROLLOVER_HLEN >> 1);
   1360}
   1361
   1362static unsigned int fanout_demux_hash(struct packet_fanout *f,
   1363				      struct sk_buff *skb,
   1364				      unsigned int num)
   1365{
   1366	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
   1367}
   1368
   1369static unsigned int fanout_demux_lb(struct packet_fanout *f,
   1370				    struct sk_buff *skb,
   1371				    unsigned int num)
   1372{
   1373	unsigned int val = atomic_inc_return(&f->rr_cur);
   1374
   1375	return val % num;
   1376}
   1377
   1378static unsigned int fanout_demux_cpu(struct packet_fanout *f,
   1379				     struct sk_buff *skb,
   1380				     unsigned int num)
   1381{
   1382	return smp_processor_id() % num;
   1383}
   1384
   1385static unsigned int fanout_demux_rnd(struct packet_fanout *f,
   1386				     struct sk_buff *skb,
   1387				     unsigned int num)
   1388{
   1389	return prandom_u32_max(num);
   1390}
   1391
   1392static unsigned int fanout_demux_rollover(struct packet_fanout *f,
   1393					  struct sk_buff *skb,
   1394					  unsigned int idx, bool try_self,
   1395					  unsigned int num)
   1396{
   1397	struct packet_sock *po, *po_next, *po_skip = NULL;
   1398	unsigned int i, j, room = ROOM_NONE;
   1399
   1400	po = pkt_sk(rcu_dereference(f->arr[idx]));
   1401
   1402	if (try_self) {
   1403		room = packet_rcv_has_room(po, skb);
   1404		if (room == ROOM_NORMAL ||
   1405		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
   1406			return idx;
   1407		po_skip = po;
   1408	}
   1409
   1410	i = j = min_t(int, po->rollover->sock, num - 1);
   1411	do {
   1412		po_next = pkt_sk(rcu_dereference(f->arr[i]));
   1413		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
   1414		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
   1415			if (i != j)
   1416				po->rollover->sock = i;
   1417			atomic_long_inc(&po->rollover->num);
   1418			if (room == ROOM_LOW)
   1419				atomic_long_inc(&po->rollover->num_huge);
   1420			return i;
   1421		}
   1422
   1423		if (++i == num)
   1424			i = 0;
   1425	} while (i != j);
   1426
   1427	atomic_long_inc(&po->rollover->num_failed);
   1428	return idx;
   1429}
   1430
   1431static unsigned int fanout_demux_qm(struct packet_fanout *f,
   1432				    struct sk_buff *skb,
   1433				    unsigned int num)
   1434{
   1435	return skb_get_queue_mapping(skb) % num;
   1436}
   1437
   1438static unsigned int fanout_demux_bpf(struct packet_fanout *f,
   1439				     struct sk_buff *skb,
   1440				     unsigned int num)
   1441{
   1442	struct bpf_prog *prog;
   1443	unsigned int ret = 0;
   1444
   1445	rcu_read_lock();
   1446	prog = rcu_dereference(f->bpf_prog);
   1447	if (prog)
   1448		ret = bpf_prog_run_clear_cb(prog, skb) % num;
   1449	rcu_read_unlock();
   1450
   1451	return ret;
   1452}
   1453
   1454static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
   1455{
   1456	return f->flags & (flag >> 8);
   1457}
   1458
   1459static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
   1460			     struct packet_type *pt, struct net_device *orig_dev)
   1461{
   1462	struct packet_fanout *f = pt->af_packet_priv;
   1463	unsigned int num = READ_ONCE(f->num_members);
   1464	struct net *net = read_pnet(&f->net);
   1465	struct packet_sock *po;
   1466	unsigned int idx;
   1467
   1468	if (!net_eq(dev_net(dev), net) || !num) {
   1469		kfree_skb(skb);
   1470		return 0;
   1471	}
   1472
   1473	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
   1474		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
   1475		if (!skb)
   1476			return 0;
   1477	}
   1478	switch (f->type) {
   1479	case PACKET_FANOUT_HASH:
   1480	default:
   1481		idx = fanout_demux_hash(f, skb, num);
   1482		break;
   1483	case PACKET_FANOUT_LB:
   1484		idx = fanout_demux_lb(f, skb, num);
   1485		break;
   1486	case PACKET_FANOUT_CPU:
   1487		idx = fanout_demux_cpu(f, skb, num);
   1488		break;
   1489	case PACKET_FANOUT_RND:
   1490		idx = fanout_demux_rnd(f, skb, num);
   1491		break;
   1492	case PACKET_FANOUT_QM:
   1493		idx = fanout_demux_qm(f, skb, num);
   1494		break;
   1495	case PACKET_FANOUT_ROLLOVER:
   1496		idx = fanout_demux_rollover(f, skb, 0, false, num);
   1497		break;
   1498	case PACKET_FANOUT_CBPF:
   1499	case PACKET_FANOUT_EBPF:
   1500		idx = fanout_demux_bpf(f, skb, num);
   1501		break;
   1502	}
   1503
   1504	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
   1505		idx = fanout_demux_rollover(f, skb, idx, true, num);
   1506
   1507	po = pkt_sk(rcu_dereference(f->arr[idx]));
   1508	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
   1509}
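Every socket in a fanout group shares the single prot_hook whose func is packet_rcv_fanout() above; the demux mode and group id are chosen when each member joins via the PACKET_FANOUT socket option. A hedged userspace sketch (fd is an already-bound AF_PACKET socket, error handling omitted, the id 42 is arbitrary):

    int fanout_id  = 42;                       /* any 16-bit group id */
    int fanout_arg = fanout_id |
                     ((PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER) << 16);

    /* Each socket that passes the same id joins the same group; flow-hash
     * demux keeps a given flow on one member, with rollover to another
     * member when that socket runs out of room. */
    setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg, sizeof(fanout_arg));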
   1510
   1511DEFINE_MUTEX(fanout_mutex);
   1512EXPORT_SYMBOL_GPL(fanout_mutex);
   1513static LIST_HEAD(fanout_list);
   1514static u16 fanout_next_id;
   1515
   1516static void __fanout_link(struct sock *sk, struct packet_sock *po)
   1517{
   1518	struct packet_fanout *f = po->fanout;
   1519
   1520	spin_lock(&f->lock);
   1521	rcu_assign_pointer(f->arr[f->num_members], sk);
   1522	smp_wmb();
   1523	f->num_members++;
   1524	if (f->num_members == 1)
   1525		dev_add_pack(&f->prot_hook);
   1526	spin_unlock(&f->lock);
   1527}
   1528
   1529static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
   1530{
   1531	struct packet_fanout *f = po->fanout;
   1532	int i;
   1533
   1534	spin_lock(&f->lock);
   1535	for (i = 0; i < f->num_members; i++) {
   1536		if (rcu_dereference_protected(f->arr[i],
   1537					      lockdep_is_held(&f->lock)) == sk)
   1538			break;
   1539	}
   1540	BUG_ON(i >= f->num_members);
   1541	rcu_assign_pointer(f->arr[i],
   1542			   rcu_dereference_protected(f->arr[f->num_members - 1],
   1543						     lockdep_is_held(&f->lock)));
   1544	f->num_members--;
   1545	if (f->num_members == 0)
   1546		__dev_remove_pack(&f->prot_hook);
   1547	spin_unlock(&f->lock);
   1548}
   1549
   1550static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
   1551{
   1552	if (sk->sk_family != PF_PACKET)
   1553		return false;
   1554
   1555	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
   1556}
   1557
   1558static void fanout_init_data(struct packet_fanout *f)
   1559{
   1560	switch (f->type) {
   1561	case PACKET_FANOUT_LB:
   1562		atomic_set(&f->rr_cur, 0);
   1563		break;
   1564	case PACKET_FANOUT_CBPF:
   1565	case PACKET_FANOUT_EBPF:
   1566		RCU_INIT_POINTER(f->bpf_prog, NULL);
   1567		break;
   1568	}
   1569}
   1570
   1571static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
   1572{
   1573	struct bpf_prog *old;
   1574
   1575	spin_lock(&f->lock);
   1576	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
   1577	rcu_assign_pointer(f->bpf_prog, new);
   1578	spin_unlock(&f->lock);
   1579
   1580	if (old) {
   1581		synchronize_net();
   1582		bpf_prog_destroy(old);
   1583	}
   1584}
   1585
   1586static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
   1587				unsigned int len)
   1588{
   1589	struct bpf_prog *new;
   1590	struct sock_fprog fprog;
   1591	int ret;
   1592
   1593	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
   1594		return -EPERM;
   1595
   1596	ret = copy_bpf_fprog_from_user(&fprog, data, len);
   1597	if (ret)
   1598		return ret;
   1599
   1600	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
   1601	if (ret)
   1602		return ret;
   1603
   1604	__fanout_set_data_bpf(po->fanout, new);
   1605	return 0;
   1606}
   1607
   1608static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
   1609				unsigned int len)
   1610{
   1611	struct bpf_prog *new;
   1612	u32 fd;
   1613
   1614	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
   1615		return -EPERM;
   1616	if (len != sizeof(fd))
   1617		return -EINVAL;
   1618	if (copy_from_sockptr(&fd, data, len))
   1619		return -EFAULT;
   1620
   1621	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
   1622	if (IS_ERR(new))
   1623		return PTR_ERR(new);
   1624
   1625	__fanout_set_data_bpf(po->fanout, new);
   1626	return 0;
   1627}
   1628
   1629static int fanout_set_data(struct packet_sock *po, sockptr_t data,
   1630			   unsigned int len)
   1631{
   1632	switch (po->fanout->type) {
   1633	case PACKET_FANOUT_CBPF:
   1634		return fanout_set_data_cbpf(po, data, len);
   1635	case PACKET_FANOUT_EBPF:
   1636		return fanout_set_data_ebpf(po, data, len);
   1637	default:
   1638		return -EINVAL;
   1639	}
   1640}
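For the CBPF/EBPF modes dispatched above, the program is attached after the group exists, via the PACKET_FANOUT_DATA socket option. A hedged classic-BPF sketch (the one-instruction filter always returns 0, so every packet goes to member 0; a real program would compute an index, which the kernel then reduces modulo the member count as in fanout_demux_bpf() above):

    #include <linux/filter.h>

    struct sock_filter insns[] = {
            { BPF_RET | BPF_K, 0, 0, 0 },      /* return 0 */
    };
    struct sock_fprog fprog = {
            .len    = 1,
            .filter = insns,
    };

    /* fd previously joined a PACKET_FANOUT_CBPF group via PACKET_FANOUT */
    setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &fprog, sizeof(fprog));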
   1641
   1642static void fanout_release_data(struct packet_fanout *f)
   1643{
   1644	switch (f->type) {
   1645	case PACKET_FANOUT_CBPF:
   1646	case PACKET_FANOUT_EBPF:
   1647		__fanout_set_data_bpf(f, NULL);
   1648	}
   1649}
   1650
   1651static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
   1652{
   1653	struct packet_fanout *f;
   1654
   1655	list_for_each_entry(f, &fanout_list, list) {
   1656		if (f->id == candidate_id &&
   1657		    read_pnet(&f->net) == sock_net(sk)) {
   1658			return false;
   1659		}
   1660	}
   1661	return true;
   1662}
   1663
   1664static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
   1665{
   1666	u16 id = fanout_next_id;
   1667
   1668	do {
   1669		if (__fanout_id_is_free(sk, id)) {
   1670			*new_id = id;
   1671			fanout_next_id = id + 1;
   1672			return true;
   1673		}
   1674
   1675		id++;
   1676	} while (id != fanout_next_id);
   1677
   1678	return false;
   1679}
   1680
   1681static int fanout_add(struct sock *sk, struct fanout_args *args)
   1682{
   1683	struct packet_rollover *rollover = NULL;
   1684	struct packet_sock *po = pkt_sk(sk);
   1685	u16 type_flags = args->type_flags;
   1686	struct packet_fanout *f, *match;
   1687	u8 type = type_flags & 0xff;
   1688	u8 flags = type_flags >> 8;
   1689	u16 id = args->id;
   1690	int err;
   1691
   1692	switch (type) {
   1693	case PACKET_FANOUT_ROLLOVER:
   1694		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
   1695			return -EINVAL;
   1696		break;
   1697	case PACKET_FANOUT_HASH:
   1698	case PACKET_FANOUT_LB:
   1699	case PACKET_FANOUT_CPU:
   1700	case PACKET_FANOUT_RND:
   1701	case PACKET_FANOUT_QM:
   1702	case PACKET_FANOUT_CBPF:
   1703	case PACKET_FANOUT_EBPF:
   1704		break;
   1705	default:
   1706		return -EINVAL;
   1707	}
   1708
   1709	mutex_lock(&fanout_mutex);
   1710
   1711	err = -EALREADY;
   1712	if (po->fanout)
   1713		goto out;
   1714
   1715	if (type == PACKET_FANOUT_ROLLOVER ||
   1716	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
   1717		err = -ENOMEM;
   1718		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
   1719		if (!rollover)
   1720			goto out;
   1721		atomic_long_set(&rollover->num, 0);
   1722		atomic_long_set(&rollover->num_huge, 0);
   1723		atomic_long_set(&rollover->num_failed, 0);
   1724	}
   1725
   1726	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
   1727		if (id != 0) {
   1728			err = -EINVAL;
   1729			goto out;
   1730		}
   1731		if (!fanout_find_new_id(sk, &id)) {
   1732			err = -ENOMEM;
   1733			goto out;
   1734		}
   1735		/* ephemeral flag for the first socket in the group: drop it */
   1736		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
   1737	}
   1738
   1739	match = NULL;
   1740	list_for_each_entry(f, &fanout_list, list) {
   1741		if (f->id == id &&
   1742		    read_pnet(&f->net) == sock_net(sk)) {
   1743			match = f;
   1744			break;
   1745		}
   1746	}
   1747	err = -EINVAL;
   1748	if (match) {
   1749		if (match->flags != flags)
   1750			goto out;
   1751		if (args->max_num_members &&
   1752		    args->max_num_members != match->max_num_members)
   1753			goto out;
   1754	} else {
   1755		if (args->max_num_members > PACKET_FANOUT_MAX)
   1756			goto out;
   1757		if (!args->max_num_members)
   1758			/* legacy PACKET_FANOUT_MAX */
   1759			args->max_num_members = 256;
   1760		err = -ENOMEM;
   1761		match = kvzalloc(struct_size(match, arr, args->max_num_members),
   1762				 GFP_KERNEL);
   1763		if (!match)
   1764			goto out;
   1765		write_pnet(&match->net, sock_net(sk));
   1766		match->id = id;
   1767		match->type = type;
   1768		match->flags = flags;
   1769		INIT_LIST_HEAD(&match->list);
   1770		spin_lock_init(&match->lock);
   1771		refcount_set(&match->sk_ref, 0);
   1772		fanout_init_data(match);
   1773		match->prot_hook.type = po->prot_hook.type;
   1774		match->prot_hook.dev = po->prot_hook.dev;
   1775		match->prot_hook.func = packet_rcv_fanout;
   1776		match->prot_hook.af_packet_priv = match;
   1777		match->prot_hook.af_packet_net = read_pnet(&match->net);
   1778		match->prot_hook.id_match = match_fanout_group;
   1779		match->max_num_members = args->max_num_members;
   1780		list_add(&match->list, &fanout_list);
   1781	}
   1782	err = -EINVAL;
   1783
   1784	spin_lock(&po->bind_lock);
   1785	if (po->running &&
   1786	    match->type == type &&
   1787	    match->prot_hook.type == po->prot_hook.type &&
   1788	    match->prot_hook.dev == po->prot_hook.dev) {
   1789		err = -ENOSPC;
   1790		if (refcount_read(&match->sk_ref) < match->max_num_members) {
   1791			__dev_remove_pack(&po->prot_hook);
   1792
   1793			/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
   1794			WRITE_ONCE(po->fanout, match);
   1795
   1796			po->rollover = rollover;
   1797			rollover = NULL;
   1798			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
   1799			__fanout_link(sk, po);
   1800			err = 0;
   1801		}
   1802	}
   1803	spin_unlock(&po->bind_lock);
   1804
   1805	if (err && !refcount_read(&match->sk_ref)) {
   1806		list_del(&match->list);
   1807		kvfree(match);
   1808	}
   1809
   1810out:
   1811	kfree(rollover);
   1812	mutex_unlock(&fanout_mutex);
   1813	return err;
   1814}
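
/*
 * Illustrative userspace sketch (not part of this file): sockets join a
 * fanout group with setsockopt(PACKET_FANOUT); the low 16 bits of the
 * argument carry the group id and the high 16 bits the type and flags
 * decoded by fanout_add() above.  The group id 42 is an arbitrary
 * placeholder.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned int arg = 42 | (PACKET_FANOUT_HASH << 16);
 *
 *	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg)))
 *		perror("PACKET_FANOUT");
 */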
   1815
   1816/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
   1817 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
   1818 * It is the responsibility of the caller to call fanout_release_data() and
   1819 * free the returned packet_fanout (after synchronize_net())
   1820 */
   1821static struct packet_fanout *fanout_release(struct sock *sk)
   1822{
   1823	struct packet_sock *po = pkt_sk(sk);
   1824	struct packet_fanout *f;
   1825
   1826	mutex_lock(&fanout_mutex);
   1827	f = po->fanout;
   1828	if (f) {
   1829		po->fanout = NULL;
   1830
   1831		if (refcount_dec_and_test(&f->sk_ref))
   1832			list_del(&f->list);
   1833		else
   1834			f = NULL;
   1835	}
   1836	mutex_unlock(&fanout_mutex);
   1837
   1838	return f;
   1839}
   1840
   1841static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
   1842					  struct sk_buff *skb)
   1843{
   1844	/* Earlier code assumed this would be a VLAN pkt, double-check
   1845	 * this now that we have the actual packet in hand. We can only
   1846	 * do this check on Ethernet devices.
   1847	 */
   1848	if (unlikely(dev->type != ARPHRD_ETHER))
   1849		return false;
   1850
   1851	skb_reset_mac_header(skb);
   1852	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
   1853}
   1854
   1855static const struct proto_ops packet_ops;
   1856
   1857static const struct proto_ops packet_ops_spkt;
   1858
   1859static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
   1860			   struct packet_type *pt, struct net_device *orig_dev)
   1861{
   1862	struct sock *sk;
   1863	struct sockaddr_pkt *spkt;
   1864
   1865	/*
   1866	 *	When we registered the protocol we saved the socket in the data
   1867	 *	field for just this event.
   1868	 */
   1869
   1870	sk = pt->af_packet_priv;
   1871
   1872	/*
   1873	 *	Yank back the headers [hope the device set this
   1874	 *	right or kerboom...]
   1875	 *
   1876	 *	Incoming packets have ll header pulled,
   1877	 *	push it back.
   1878	 *
    1879	 *	For outgoing ones skb->data == skb_mac_header(skb),
    1880	 *	so this procedure is a no-op.
   1881	 */
   1882
   1883	if (skb->pkt_type == PACKET_LOOPBACK)
   1884		goto out;
   1885
   1886	if (!net_eq(dev_net(dev), sock_net(sk)))
   1887		goto out;
   1888
   1889	skb = skb_share_check(skb, GFP_ATOMIC);
   1890	if (skb == NULL)
   1891		goto oom;
   1892
   1893	/* drop any routing info */
   1894	skb_dst_drop(skb);
   1895
   1896	/* drop conntrack reference */
   1897	nf_reset_ct(skb);
   1898
   1899	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
   1900
   1901	skb_push(skb, skb->data - skb_mac_header(skb));
   1902
   1903	/*
   1904	 *	The SOCK_PACKET socket receives _all_ frames.
   1905	 */
   1906
   1907	spkt->spkt_family = dev->type;
   1908	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
   1909	spkt->spkt_protocol = skb->protocol;
   1910
   1911	/*
   1912	 *	Charge the memory to the socket. This is done specifically
    1913	 *	to prevent sockets from using up all the memory.
   1914	 */
   1915
   1916	if (sock_queue_rcv_skb(sk, skb) == 0)
   1917		return 0;
   1918
   1919out:
   1920	kfree_skb(skb);
   1921oom:
   1922	return 0;
   1923}
   1924
   1925static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
   1926{
   1927	int depth;
   1928
   1929	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
   1930	    sock->type == SOCK_RAW) {
   1931		skb_reset_mac_header(skb);
   1932		skb->protocol = dev_parse_header_protocol(skb);
   1933	}
   1934
   1935	/* Move network header to the right position for VLAN tagged packets */
   1936	if (likely(skb->dev->type == ARPHRD_ETHER) &&
   1937	    eth_type_vlan(skb->protocol) &&
   1938	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0) {
   1939		if (pskb_may_pull(skb, depth))
   1940			skb_set_network_header(skb, depth);
   1941	}
   1942
   1943	skb_probe_transport_header(skb);
   1944}
   1945
   1946/*
    1947 *	Output a raw packet to the device layer. This bypasses all the other
    1948 *	protocol layers and you must therefore supply it with a complete frame.
   1949 */
   1950
   1951static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
   1952			       size_t len)
   1953{
   1954	struct sock *sk = sock->sk;
   1955	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
   1956	struct sk_buff *skb = NULL;
   1957	struct net_device *dev;
   1958	struct sockcm_cookie sockc;
   1959	__be16 proto = 0;
   1960	int err;
   1961	int extra_len = 0;
   1962
   1963	/*
   1964	 *	Get and verify the address.
   1965	 */
   1966
   1967	if (saddr) {
   1968		if (msg->msg_namelen < sizeof(struct sockaddr))
   1969			return -EINVAL;
   1970		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
   1971			proto = saddr->spkt_protocol;
   1972	} else
   1973		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
   1974
   1975	/*
   1976	 *	Find the device first to size check it
   1977	 */
   1978
   1979	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
   1980retry:
   1981	rcu_read_lock();
   1982	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
   1983	err = -ENODEV;
   1984	if (dev == NULL)
   1985		goto out_unlock;
   1986
   1987	err = -ENETDOWN;
   1988	if (!(dev->flags & IFF_UP))
   1989		goto out_unlock;
   1990
   1991	/*
    1992	 * You may not queue a frame bigger than the MTU. This is the lowest-level
    1993	 * raw protocol and you must do your own fragmentation at this level.
   1994	 */
   1995
   1996	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
   1997		if (!netif_supports_nofcs(dev)) {
   1998			err = -EPROTONOSUPPORT;
   1999			goto out_unlock;
   2000		}
   2001		extra_len = 4; /* We're doing our own CRC */
   2002	}
   2003
   2004	err = -EMSGSIZE;
   2005	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
   2006		goto out_unlock;
   2007
   2008	if (!skb) {
   2009		size_t reserved = LL_RESERVED_SPACE(dev);
   2010		int tlen = dev->needed_tailroom;
   2011		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
   2012
   2013		rcu_read_unlock();
   2014		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
   2015		if (skb == NULL)
   2016			return -ENOBUFS;
   2017		/* FIXME: Save some space for broken drivers that write a hard
   2018		 * header at transmission time by themselves. PPP is the notable
   2019		 * one here. This should really be fixed at the driver level.
   2020		 */
   2021		skb_reserve(skb, reserved);
   2022		skb_reset_network_header(skb);
   2023
   2024		/* Try to align data part correctly */
   2025		if (hhlen) {
   2026			skb->data -= hhlen;
   2027			skb->tail -= hhlen;
   2028			if (len < hhlen)
   2029				skb_reset_network_header(skb);
   2030		}
   2031		err = memcpy_from_msg(skb_put(skb, len), msg, len);
   2032		if (err)
   2033			goto out_free;
   2034		goto retry;
   2035	}
   2036
   2037	if (!dev_validate_header(dev, skb->data, len)) {
   2038		err = -EINVAL;
   2039		goto out_unlock;
   2040	}
   2041	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
   2042	    !packet_extra_vlan_len_allowed(dev, skb)) {
   2043		err = -EMSGSIZE;
   2044		goto out_unlock;
   2045	}
   2046
   2047	sockcm_init(&sockc, sk);
   2048	if (msg->msg_controllen) {
   2049		err = sock_cmsg_send(sk, msg, &sockc);
   2050		if (unlikely(err))
   2051			goto out_unlock;
   2052	}
   2053
   2054	skb->protocol = proto;
   2055	skb->dev = dev;
   2056	skb->priority = sk->sk_priority;
   2057	skb->mark = sk->sk_mark;
   2058	skb->tstamp = sockc.transmit_time;
   2059
   2060	skb_setup_tx_timestamp(skb, sockc.tsflags);
   2061
   2062	if (unlikely(extra_len == 4))
   2063		skb->no_fcs = 1;
   2064
   2065	packet_parse_headers(skb, sock);
   2066
   2067	dev_queue_xmit(skb);
   2068	rcu_read_unlock();
   2069	return len;
   2070
   2071out_unlock:
   2072	rcu_read_unlock();
   2073out_free:
   2074	kfree_skb(skb);
   2075	return err;
   2076}
   2077
   2078static unsigned int run_filter(struct sk_buff *skb,
   2079			       const struct sock *sk,
   2080			       unsigned int res)
   2081{
   2082	struct sk_filter *filter;
   2083
   2084	rcu_read_lock();
   2085	filter = rcu_dereference(sk->sk_filter);
   2086	if (filter != NULL)
   2087		res = bpf_prog_run_clear_cb(filter->prog, skb);
   2088	rcu_read_unlock();
   2089
   2090	return res;
   2091}
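
/*
 * Illustrative userspace sketch (not part of this file): the sk_filter run
 * above is the classic BPF program attached with SO_ATTACH_FILTER; its
 * return value becomes the snap length, and 0 drops the packet.  A minimal
 * accept-everything filter (needs <linux/filter.h>) looks like this:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0x00040000 },	// keep up to 256 KiB
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 */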
   2092
   2093static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
   2094			   size_t *len)
   2095{
   2096	struct virtio_net_hdr vnet_hdr;
   2097
   2098	if (*len < sizeof(vnet_hdr))
   2099		return -EINVAL;
   2100	*len -= sizeof(vnet_hdr);
   2101
   2102	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
   2103		return -EINVAL;
   2104
   2105	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
   2106}
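
/*
 * Illustrative userspace sketch (not part of this file): with
 * PACKET_VNET_HDR enabled on a SOCK_RAW packet socket, every packet read
 * from the socket is preceded by the struct virtio_net_hdr built by
 * packet_rcv_vnet() above, and the returned length includes that header.
 *
 *	int one = 1;
 *	char buf[65536];
 *	struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	// frame data starts at buf + sizeof(*vh)
 */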
   2107
   2108/*
    2109 * This function performs lazy skb cloning in the hope that most packets
    2110 * are discarded by BPF.
    2111 *
    2112 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
    2113 * and skb->cb are mangled. It works because (and as long as) packets
    2114 * falling here are owned by the current CPU. Output packets are cloned
    2115 * by dev_queue_xmit_nit() and input packets are processed by net_bh
    2116 * sequentially, so if we return the skb to its original state on exit,
    2117 * we will not harm anyone.
   2118 */
   2119
   2120static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
   2121		      struct packet_type *pt, struct net_device *orig_dev)
   2122{
   2123	struct sock *sk;
   2124	struct sockaddr_ll *sll;
   2125	struct packet_sock *po;
   2126	u8 *skb_head = skb->data;
   2127	int skb_len = skb->len;
   2128	unsigned int snaplen, res;
   2129	bool is_drop_n_account = false;
   2130
   2131	if (skb->pkt_type == PACKET_LOOPBACK)
   2132		goto drop;
   2133
   2134	sk = pt->af_packet_priv;
   2135	po = pkt_sk(sk);
   2136
   2137	if (!net_eq(dev_net(dev), sock_net(sk)))
   2138		goto drop;
   2139
   2140	skb->dev = dev;
   2141
   2142	if (dev_has_header(dev)) {
   2143		/* The device has an explicit notion of ll header,
   2144		 * exported to higher levels.
   2145		 *
   2146		 * Otherwise, the device hides details of its frame
    2147		 * structure, so the corresponding packet header is
    2148		 * never delivered to the user.
   2149		 */
   2150		if (sk->sk_type != SOCK_DGRAM)
   2151			skb_push(skb, skb->data - skb_mac_header(skb));
   2152		else if (skb->pkt_type == PACKET_OUTGOING) {
   2153			/* Special case: outgoing packets have ll header at head */
   2154			skb_pull(skb, skb_network_offset(skb));
   2155		}
   2156	}
   2157
   2158	snaplen = skb->len;
   2159
   2160	res = run_filter(skb, sk, snaplen);
   2161	if (!res)
   2162		goto drop_n_restore;
   2163	if (snaplen > res)
   2164		snaplen = res;
   2165
   2166	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
   2167		goto drop_n_acct;
   2168
   2169	if (skb_shared(skb)) {
   2170		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
   2171		if (nskb == NULL)
   2172			goto drop_n_acct;
   2173
   2174		if (skb_head != skb->data) {
   2175			skb->data = skb_head;
   2176			skb->len = skb_len;
   2177		}
   2178		consume_skb(skb);
   2179		skb = nskb;
   2180	}
   2181
   2182	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
   2183
   2184	sll = &PACKET_SKB_CB(skb)->sa.ll;
   2185	sll->sll_hatype = dev->type;
   2186	sll->sll_pkttype = skb->pkt_type;
   2187	if (unlikely(po->origdev))
   2188		sll->sll_ifindex = orig_dev->ifindex;
   2189	else
   2190		sll->sll_ifindex = dev->ifindex;
   2191
   2192	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
   2193
   2194	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
   2195	 * Use their space for storing the original skb length.
   2196	 */
   2197	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
   2198
   2199	if (pskb_trim(skb, snaplen))
   2200		goto drop_n_acct;
   2201
   2202	skb_set_owner_r(skb, sk);
   2203	skb->dev = NULL;
   2204	skb_dst_drop(skb);
   2205
   2206	/* drop conntrack reference */
   2207	nf_reset_ct(skb);
   2208
   2209	spin_lock(&sk->sk_receive_queue.lock);
   2210	po->stats.stats1.tp_packets++;
   2211	sock_skb_set_dropcount(sk, skb);
   2212	skb_clear_delivery_time(skb);
   2213	__skb_queue_tail(&sk->sk_receive_queue, skb);
   2214	spin_unlock(&sk->sk_receive_queue.lock);
   2215	sk->sk_data_ready(sk);
   2216	return 0;
   2217
   2218drop_n_acct:
   2219	is_drop_n_account = true;
   2220	atomic_inc(&po->tp_drops);
   2221	atomic_inc(&sk->sk_drops);
   2222
   2223drop_n_restore:
   2224	if (skb_head != skb->data && skb_shared(skb)) {
   2225		skb->data = skb_head;
   2226		skb->len = skb_len;
   2227	}
   2228drop:
   2229	if (!is_drop_n_account)
   2230		consume_skb(skb);
   2231	else
   2232		kfree_skb(skb);
   2233	return 0;
   2234}
   2235
   2236static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
   2237		       struct packet_type *pt, struct net_device *orig_dev)
   2238{
   2239	struct sock *sk;
   2240	struct packet_sock *po;
   2241	struct sockaddr_ll *sll;
   2242	union tpacket_uhdr h;
   2243	u8 *skb_head = skb->data;
   2244	int skb_len = skb->len;
   2245	unsigned int snaplen, res;
   2246	unsigned long status = TP_STATUS_USER;
   2247	unsigned short macoff, hdrlen;
   2248	unsigned int netoff;
   2249	struct sk_buff *copy_skb = NULL;
   2250	struct timespec64 ts;
   2251	__u32 ts_status;
   2252	bool is_drop_n_account = false;
   2253	unsigned int slot_id = 0;
   2254	bool do_vnet = false;
   2255
   2256	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
    2257	 * We may add members to them up to the current aligned size without forcing
   2258	 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
   2259	 */
   2260	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
   2261	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
   2262
   2263	if (skb->pkt_type == PACKET_LOOPBACK)
   2264		goto drop;
   2265
   2266	sk = pt->af_packet_priv;
   2267	po = pkt_sk(sk);
   2268
   2269	if (!net_eq(dev_net(dev), sock_net(sk)))
   2270		goto drop;
   2271
   2272	if (dev_has_header(dev)) {
   2273		if (sk->sk_type != SOCK_DGRAM)
   2274			skb_push(skb, skb->data - skb_mac_header(skb));
   2275		else if (skb->pkt_type == PACKET_OUTGOING) {
   2276			/* Special case: outgoing packets have ll header at head */
   2277			skb_pull(skb, skb_network_offset(skb));
   2278		}
   2279	}
   2280
   2281	snaplen = skb->len;
   2282
   2283	res = run_filter(skb, sk, snaplen);
   2284	if (!res)
   2285		goto drop_n_restore;
   2286
   2287	/* If we are flooded, just give up */
   2288	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
   2289		atomic_inc(&po->tp_drops);
   2290		goto drop_n_restore;
   2291	}
   2292
   2293	if (skb->ip_summed == CHECKSUM_PARTIAL)
   2294		status |= TP_STATUS_CSUMNOTREADY;
   2295	else if (skb->pkt_type != PACKET_OUTGOING &&
   2296		 (skb->ip_summed == CHECKSUM_COMPLETE ||
   2297		  skb_csum_unnecessary(skb)))
   2298		status |= TP_STATUS_CSUM_VALID;
   2299
   2300	if (snaplen > res)
   2301		snaplen = res;
   2302
   2303	if (sk->sk_type == SOCK_DGRAM) {
   2304		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
   2305				  po->tp_reserve;
   2306	} else {
   2307		unsigned int maclen = skb_network_offset(skb);
   2308		netoff = TPACKET_ALIGN(po->tp_hdrlen +
   2309				       (maclen < 16 ? 16 : maclen)) +
   2310				       po->tp_reserve;
   2311		if (po->has_vnet_hdr) {
   2312			netoff += sizeof(struct virtio_net_hdr);
   2313			do_vnet = true;
   2314		}
   2315		macoff = netoff - maclen;
   2316	}
   2317	if (netoff > USHRT_MAX) {
   2318		atomic_inc(&po->tp_drops);
   2319		goto drop_n_restore;
   2320	}
   2321	if (po->tp_version <= TPACKET_V2) {
   2322		if (macoff + snaplen > po->rx_ring.frame_size) {
   2323			if (po->copy_thresh &&
   2324			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
   2325				if (skb_shared(skb)) {
   2326					copy_skb = skb_clone(skb, GFP_ATOMIC);
   2327				} else {
   2328					copy_skb = skb_get(skb);
   2329					skb_head = skb->data;
   2330				}
   2331				if (copy_skb) {
   2332					memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
   2333					       sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
   2334					skb_set_owner_r(copy_skb, sk);
   2335				}
   2336			}
   2337			snaplen = po->rx_ring.frame_size - macoff;
   2338			if ((int)snaplen < 0) {
   2339				snaplen = 0;
   2340				do_vnet = false;
   2341			}
   2342		}
   2343	} else if (unlikely(macoff + snaplen >
   2344			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
   2345		u32 nval;
   2346
   2347		nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
   2348		pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
   2349			    snaplen, nval, macoff);
   2350		snaplen = nval;
   2351		if (unlikely((int)snaplen < 0)) {
   2352			snaplen = 0;
   2353			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
   2354			do_vnet = false;
   2355		}
   2356	}
   2357	spin_lock(&sk->sk_receive_queue.lock);
   2358	h.raw = packet_current_rx_frame(po, skb,
   2359					TP_STATUS_KERNEL, (macoff+snaplen));
   2360	if (!h.raw)
   2361		goto drop_n_account;
   2362
   2363	if (po->tp_version <= TPACKET_V2) {
   2364		slot_id = po->rx_ring.head;
   2365		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
   2366			goto drop_n_account;
   2367		__set_bit(slot_id, po->rx_ring.rx_owner_map);
   2368	}
   2369
   2370	if (do_vnet &&
   2371	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
   2372				    sizeof(struct virtio_net_hdr),
   2373				    vio_le(), true, 0)) {
   2374		if (po->tp_version == TPACKET_V3)
   2375			prb_clear_blk_fill_status(&po->rx_ring);
   2376		goto drop_n_account;
   2377	}
   2378
   2379	if (po->tp_version <= TPACKET_V2) {
   2380		packet_increment_rx_head(po, &po->rx_ring);
   2381	/*
    2382	 * TP_STATUS_LOSING will be reported until you read the stats,
    2383	 * because the drop statistics are COR - Clear On Read.
    2384	 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
    2385	 * at the packet level.
   2386	 */
   2387		if (atomic_read(&po->tp_drops))
   2388			status |= TP_STATUS_LOSING;
   2389	}
   2390
   2391	po->stats.stats1.tp_packets++;
   2392	if (copy_skb) {
   2393		status |= TP_STATUS_COPY;
   2394		skb_clear_delivery_time(copy_skb);
   2395		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
   2396	}
   2397	spin_unlock(&sk->sk_receive_queue.lock);
   2398
   2399	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
   2400
   2401	/* Always timestamp; prefer an existing software timestamp taken
   2402	 * closer to the time of capture.
   2403	 */
   2404	ts_status = tpacket_get_timestamp(skb, &ts,
   2405					  po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
   2406	if (!ts_status)
   2407		ktime_get_real_ts64(&ts);
   2408
   2409	status |= ts_status;
   2410
   2411	switch (po->tp_version) {
   2412	case TPACKET_V1:
   2413		h.h1->tp_len = skb->len;
   2414		h.h1->tp_snaplen = snaplen;
   2415		h.h1->tp_mac = macoff;
   2416		h.h1->tp_net = netoff;
   2417		h.h1->tp_sec = ts.tv_sec;
   2418		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
   2419		hdrlen = sizeof(*h.h1);
   2420		break;
   2421	case TPACKET_V2:
   2422		h.h2->tp_len = skb->len;
   2423		h.h2->tp_snaplen = snaplen;
   2424		h.h2->tp_mac = macoff;
   2425		h.h2->tp_net = netoff;
   2426		h.h2->tp_sec = ts.tv_sec;
   2427		h.h2->tp_nsec = ts.tv_nsec;
   2428		if (skb_vlan_tag_present(skb)) {
   2429			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
   2430			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
   2431			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
   2432		} else {
   2433			h.h2->tp_vlan_tci = 0;
   2434			h.h2->tp_vlan_tpid = 0;
   2435		}
   2436		memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
   2437		hdrlen = sizeof(*h.h2);
   2438		break;
   2439	case TPACKET_V3:
    2440		/* tp_next_offset and the vlan fields are already populated above,
    2441		 * so DON'T clear them here.
   2442		 */
   2443		h.h3->tp_status |= status;
   2444		h.h3->tp_len = skb->len;
   2445		h.h3->tp_snaplen = snaplen;
   2446		h.h3->tp_mac = macoff;
   2447		h.h3->tp_net = netoff;
   2448		h.h3->tp_sec  = ts.tv_sec;
   2449		h.h3->tp_nsec = ts.tv_nsec;
   2450		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
   2451		hdrlen = sizeof(*h.h3);
   2452		break;
   2453	default:
   2454		BUG();
   2455	}
   2456
   2457	sll = h.raw + TPACKET_ALIGN(hdrlen);
   2458	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
   2459	sll->sll_family = AF_PACKET;
   2460	sll->sll_hatype = dev->type;
   2461	sll->sll_protocol = skb->protocol;
   2462	sll->sll_pkttype = skb->pkt_type;
   2463	if (unlikely(po->origdev))
   2464		sll->sll_ifindex = orig_dev->ifindex;
   2465	else
   2466		sll->sll_ifindex = dev->ifindex;
   2467
   2468	smp_mb();
   2469
   2470#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
   2471	if (po->tp_version <= TPACKET_V2) {
   2472		u8 *start, *end;
   2473
   2474		end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
   2475					macoff + snaplen);
   2476
   2477		for (start = h.raw; start < end; start += PAGE_SIZE)
   2478			flush_dcache_page(pgv_to_page(start));
   2479	}
   2480	smp_wmb();
   2481#endif
   2482
   2483	if (po->tp_version <= TPACKET_V2) {
   2484		spin_lock(&sk->sk_receive_queue.lock);
   2485		__packet_set_status(po, h.raw, status);
   2486		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
   2487		spin_unlock(&sk->sk_receive_queue.lock);
   2488		sk->sk_data_ready(sk);
   2489	} else if (po->tp_version == TPACKET_V3) {
   2490		prb_clear_blk_fill_status(&po->rx_ring);
   2491	}
   2492
   2493drop_n_restore:
   2494	if (skb_head != skb->data && skb_shared(skb)) {
   2495		skb->data = skb_head;
   2496		skb->len = skb_len;
   2497	}
   2498drop:
   2499	if (!is_drop_n_account)
   2500		consume_skb(skb);
   2501	else
   2502		kfree_skb(skb);
   2503	return 0;
   2504
   2505drop_n_account:
   2506	spin_unlock(&sk->sk_receive_queue.lock);
   2507	atomic_inc(&po->tp_drops);
   2508	is_drop_n_account = true;
   2509
   2510	sk->sk_data_ready(sk);
   2511	kfree_skb(copy_skb);
   2512	goto drop_n_restore;
   2513}
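
/*
 * Illustrative userspace sketch (not part of this file): tpacket_rcv()
 * above fills frames of a memory-mapped RX ring.  A TPACKET_V3 ring is set
 * up roughly as follows (needs <linux/if_packet.h> and <sys/mman.h>); the
 * block/frame geometry and the 60 ms block-retire timeout are placeholder
 * values.
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 22,
 *		.tp_block_nr	   = 64,
 *		.tp_frame_size	   = 1 << 11,
 *		.tp_frame_nr	   = (1 << 22) / (1 << 11) * 64,
 *		.tp_retire_blk_tov = 60,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */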
   2514
   2515static void tpacket_destruct_skb(struct sk_buff *skb)
   2516{
   2517	struct packet_sock *po = pkt_sk(skb->sk);
   2518
   2519	if (likely(po->tx_ring.pg_vec)) {
   2520		void *ph;
   2521		__u32 ts;
   2522
   2523		ph = skb_zcopy_get_nouarg(skb);
   2524		packet_dec_pending(&po->tx_ring);
   2525
   2526		ts = __packet_set_timestamp(po, ph, skb);
   2527		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
   2528
   2529		if (!packet_read_pending(&po->tx_ring))
   2530			complete(&po->skb_completion);
   2531	}
   2532
   2533	sock_wfree(skb);
   2534}
   2535
   2536static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
   2537{
   2538	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
   2539	    (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
   2540	     __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
   2541	      __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
   2542		vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
   2543			 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
   2544			__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
   2545
   2546	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
   2547		return -EINVAL;
   2548
   2549	return 0;
   2550}
   2551
   2552static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
   2553				 struct virtio_net_hdr *vnet_hdr)
   2554{
   2555	if (*len < sizeof(*vnet_hdr))
   2556		return -EINVAL;
   2557	*len -= sizeof(*vnet_hdr);
   2558
   2559	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
   2560		return -EFAULT;
   2561
   2562	return __packet_snd_vnet_parse(vnet_hdr, *len);
   2563}
   2564
   2565static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
   2566		void *frame, struct net_device *dev, void *data, int tp_len,
   2567		__be16 proto, unsigned char *addr, int hlen, int copylen,
   2568		const struct sockcm_cookie *sockc)
   2569{
   2570	union tpacket_uhdr ph;
   2571	int to_write, offset, len, nr_frags, len_max;
   2572	struct socket *sock = po->sk.sk_socket;
   2573	struct page *page;
   2574	int err;
   2575
   2576	ph.raw = frame;
   2577
   2578	skb->protocol = proto;
   2579	skb->dev = dev;
   2580	skb->priority = po->sk.sk_priority;
   2581	skb->mark = po->sk.sk_mark;
   2582	skb->tstamp = sockc->transmit_time;
   2583	skb_setup_tx_timestamp(skb, sockc->tsflags);
   2584	skb_zcopy_set_nouarg(skb, ph.raw);
   2585
   2586	skb_reserve(skb, hlen);
   2587	skb_reset_network_header(skb);
   2588
   2589	to_write = tp_len;
   2590
   2591	if (sock->type == SOCK_DGRAM) {
   2592		err = dev_hard_header(skb, dev, ntohs(proto), addr,
   2593				NULL, tp_len);
   2594		if (unlikely(err < 0))
   2595			return -EINVAL;
   2596	} else if (copylen) {
   2597		int hdrlen = min_t(int, copylen, tp_len);
   2598
   2599		skb_push(skb, dev->hard_header_len);
   2600		skb_put(skb, copylen - dev->hard_header_len);
   2601		err = skb_store_bits(skb, 0, data, hdrlen);
   2602		if (unlikely(err))
   2603			return err;
   2604		if (!dev_validate_header(dev, skb->data, hdrlen))
   2605			return -EINVAL;
   2606
   2607		data += hdrlen;
   2608		to_write -= hdrlen;
   2609	}
   2610
   2611	offset = offset_in_page(data);
   2612	len_max = PAGE_SIZE - offset;
   2613	len = ((to_write > len_max) ? len_max : to_write);
   2614
   2615	skb->data_len = to_write;
   2616	skb->len += to_write;
   2617	skb->truesize += to_write;
   2618	refcount_add(to_write, &po->sk.sk_wmem_alloc);
   2619
   2620	while (likely(to_write)) {
   2621		nr_frags = skb_shinfo(skb)->nr_frags;
   2622
   2623		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
    2624			pr_err("Packet exceeds the number of skb frags (%lu)\n",
   2625			       MAX_SKB_FRAGS);
   2626			return -EFAULT;
   2627		}
   2628
   2629		page = pgv_to_page(data);
   2630		data += len;
   2631		flush_dcache_page(page);
   2632		get_page(page);
   2633		skb_fill_page_desc(skb, nr_frags, page, offset, len);
   2634		to_write -= len;
   2635		offset = 0;
   2636		len_max = PAGE_SIZE;
   2637		len = ((to_write > len_max) ? len_max : to_write);
   2638	}
   2639
   2640	packet_parse_headers(skb, sock);
   2641
   2642	return tp_len;
   2643}
   2644
   2645static int tpacket_parse_header(struct packet_sock *po, void *frame,
   2646				int size_max, void **data)
   2647{
   2648	union tpacket_uhdr ph;
   2649	int tp_len, off;
   2650
   2651	ph.raw = frame;
   2652
   2653	switch (po->tp_version) {
   2654	case TPACKET_V3:
   2655		if (ph.h3->tp_next_offset != 0) {
   2656			pr_warn_once("variable sized slot not supported");
   2657			return -EINVAL;
   2658		}
   2659		tp_len = ph.h3->tp_len;
   2660		break;
   2661	case TPACKET_V2:
   2662		tp_len = ph.h2->tp_len;
   2663		break;
   2664	default:
   2665		tp_len = ph.h1->tp_len;
   2666		break;
   2667	}
   2668	if (unlikely(tp_len > size_max)) {
    2669		pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
   2670		return -EMSGSIZE;
   2671	}
   2672
   2673	if (unlikely(po->tp_tx_has_off)) {
   2674		int off_min, off_max;
   2675
   2676		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
   2677		off_max = po->tx_ring.frame_size - tp_len;
   2678		if (po->sk.sk_type == SOCK_DGRAM) {
   2679			switch (po->tp_version) {
   2680			case TPACKET_V3:
   2681				off = ph.h3->tp_net;
   2682				break;
   2683			case TPACKET_V2:
   2684				off = ph.h2->tp_net;
   2685				break;
   2686			default:
   2687				off = ph.h1->tp_net;
   2688				break;
   2689			}
   2690		} else {
   2691			switch (po->tp_version) {
   2692			case TPACKET_V3:
   2693				off = ph.h3->tp_mac;
   2694				break;
   2695			case TPACKET_V2:
   2696				off = ph.h2->tp_mac;
   2697				break;
   2698			default:
   2699				off = ph.h1->tp_mac;
   2700				break;
   2701			}
   2702		}
   2703		if (unlikely((off < off_min) || (off_max < off)))
   2704			return -EINVAL;
   2705	} else {
   2706		off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
   2707	}
   2708
   2709	*data = frame + off;
   2710	return tp_len;
   2711}
   2712
   2713static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
   2714{
   2715	struct sk_buff *skb = NULL;
   2716	struct net_device *dev;
   2717	struct virtio_net_hdr *vnet_hdr = NULL;
   2718	struct sockcm_cookie sockc;
   2719	__be16 proto;
   2720	int err, reserve = 0;
   2721	void *ph;
   2722	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
   2723	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
   2724	unsigned char *addr = NULL;
   2725	int tp_len, size_max;
   2726	void *data;
   2727	int len_sum = 0;
   2728	int status = TP_STATUS_AVAILABLE;
   2729	int hlen, tlen, copylen = 0;
   2730	long timeo = 0;
   2731
   2732	mutex_lock(&po->pg_vec_lock);
   2733
    2734	/* The packet_sendmsg() check on tx_ring.pg_vec was lockless,
    2735	 * so we need to confirm it under the protection of pg_vec_lock.
   2736	 */
   2737	if (unlikely(!po->tx_ring.pg_vec)) {
   2738		err = -EBUSY;
   2739		goto out;
   2740	}
   2741	if (likely(saddr == NULL)) {
   2742		dev	= packet_cached_dev_get(po);
   2743		proto	= READ_ONCE(po->num);
   2744	} else {
   2745		err = -EINVAL;
   2746		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
   2747			goto out;
   2748		if (msg->msg_namelen < (saddr->sll_halen
   2749					+ offsetof(struct sockaddr_ll,
   2750						sll_addr)))
   2751			goto out;
   2752		proto	= saddr->sll_protocol;
   2753		dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
   2754		if (po->sk.sk_socket->type == SOCK_DGRAM) {
   2755			if (dev && msg->msg_namelen < dev->addr_len +
   2756				   offsetof(struct sockaddr_ll, sll_addr))
   2757				goto out_put;
   2758			addr = saddr->sll_addr;
   2759		}
   2760	}
   2761
   2762	err = -ENXIO;
   2763	if (unlikely(dev == NULL))
   2764		goto out;
   2765	err = -ENETDOWN;
   2766	if (unlikely(!(dev->flags & IFF_UP)))
   2767		goto out_put;
   2768
   2769	sockcm_init(&sockc, &po->sk);
   2770	if (msg->msg_controllen) {
   2771		err = sock_cmsg_send(&po->sk, msg, &sockc);
   2772		if (unlikely(err))
   2773			goto out_put;
   2774	}
   2775
   2776	if (po->sk.sk_socket->type == SOCK_RAW)
   2777		reserve = dev->hard_header_len;
   2778	size_max = po->tx_ring.frame_size
   2779		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
   2780
   2781	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
   2782		size_max = dev->mtu + reserve + VLAN_HLEN;
   2783
   2784	reinit_completion(&po->skb_completion);
   2785
   2786	do {
   2787		ph = packet_current_frame(po, &po->tx_ring,
   2788					  TP_STATUS_SEND_REQUEST);
   2789		if (unlikely(ph == NULL)) {
   2790			if (need_wait && skb) {
   2791				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
   2792				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
   2793				if (timeo <= 0) {
   2794					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
   2795					goto out_put;
   2796				}
   2797			}
   2798			/* check for additional frames */
   2799			continue;
   2800		}
   2801
   2802		skb = NULL;
   2803		tp_len = tpacket_parse_header(po, ph, size_max, &data);
   2804		if (tp_len < 0)
   2805			goto tpacket_error;
   2806
   2807		status = TP_STATUS_SEND_REQUEST;
   2808		hlen = LL_RESERVED_SPACE(dev);
   2809		tlen = dev->needed_tailroom;
   2810		if (po->has_vnet_hdr) {
   2811			vnet_hdr = data;
   2812			data += sizeof(*vnet_hdr);
   2813			tp_len -= sizeof(*vnet_hdr);
   2814			if (tp_len < 0 ||
   2815			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
   2816				tp_len = -EINVAL;
   2817				goto tpacket_error;
   2818			}
   2819			copylen = __virtio16_to_cpu(vio_le(),
   2820						    vnet_hdr->hdr_len);
   2821		}
   2822		copylen = max_t(int, copylen, dev->hard_header_len);
   2823		skb = sock_alloc_send_skb(&po->sk,
   2824				hlen + tlen + sizeof(struct sockaddr_ll) +
   2825				(copylen - dev->hard_header_len),
   2826				!need_wait, &err);
   2827
   2828		if (unlikely(skb == NULL)) {
   2829			/* we assume the socket was initially writeable ... */
   2830			if (likely(len_sum > 0))
   2831				err = len_sum;
   2832			goto out_status;
   2833		}
   2834		tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
   2835					  addr, hlen, copylen, &sockc);
   2836		if (likely(tp_len >= 0) &&
   2837		    tp_len > dev->mtu + reserve &&
   2838		    !po->has_vnet_hdr &&
   2839		    !packet_extra_vlan_len_allowed(dev, skb))
   2840			tp_len = -EMSGSIZE;
   2841
   2842		if (unlikely(tp_len < 0)) {
   2843tpacket_error:
   2844			if (po->tp_loss) {
   2845				__packet_set_status(po, ph,
   2846						TP_STATUS_AVAILABLE);
   2847				packet_increment_head(&po->tx_ring);
   2848				kfree_skb(skb);
   2849				continue;
   2850			} else {
   2851				status = TP_STATUS_WRONG_FORMAT;
   2852				err = tp_len;
   2853				goto out_status;
   2854			}
   2855		}
   2856
   2857		if (po->has_vnet_hdr) {
   2858			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
   2859				tp_len = -EINVAL;
   2860				goto tpacket_error;
   2861			}
   2862			virtio_net_hdr_set_proto(skb, vnet_hdr);
   2863		}
   2864
   2865		skb->destructor = tpacket_destruct_skb;
   2866		__packet_set_status(po, ph, TP_STATUS_SENDING);
   2867		packet_inc_pending(&po->tx_ring);
   2868
   2869		status = TP_STATUS_SEND_REQUEST;
   2870		err = po->xmit(skb);
   2871		if (unlikely(err != 0)) {
   2872			if (err > 0)
   2873				err = net_xmit_errno(err);
   2874			if (err && __packet_get_status(po, ph) ==
   2875				   TP_STATUS_AVAILABLE) {
   2876				/* skb was destructed already */
   2877				skb = NULL;
   2878				goto out_status;
   2879			}
   2880			/*
   2881			 * skb was dropped but not destructed yet;
   2882			 * let's treat it like congestion or err < 0
   2883			 */
   2884			err = 0;
   2885		}
   2886		packet_increment_head(&po->tx_ring);
   2887		len_sum += tp_len;
   2888	} while (likely((ph != NULL) ||
   2889		/* Note: packet_read_pending() might be slow if we have
    2890		 * to call it, as it's a per-cpu variable, but in the fast path
    2891		 * we already short-circuit the loop with the first
    2892		 * condition and luckily don't have to take that path
    2893		 * anyway.
   2894		 */
   2895		 (need_wait && packet_read_pending(&po->tx_ring))));
   2896
   2897	err = len_sum;
   2898	goto out_put;
   2899
   2900out_status:
   2901	__packet_set_status(po, ph, status);
   2902	kfree_skb(skb);
   2903out_put:
   2904	dev_put(dev);
   2905out:
   2906	mutex_unlock(&po->pg_vec_lock);
   2907	return err;
   2908}
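
/*
 * Illustrative userspace sketch (not part of this file): tpacket_snd()
 * above consumes frames that userspace marked TP_STATUS_SEND_REQUEST in a
 * mmap()ed TX ring (set up with PACKET_VERSION/PACKET_TX_RING, analogous
 * to the RX ring).  For TPACKET_V2 without PACKET_TX_HAS_OFF, frame data
 * starts at the offset computed by tpacket_parse_header():
 *
 *	struct tpacket2_hdr *hdr = frame;	// one slot of the TX ring
 *	void *data = frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *	memcpy(data, pkt, pkt_len);		// complete link-layer frame
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// kick the kernel
 */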
   2909
   2910static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
   2911				        size_t reserve, size_t len,
   2912				        size_t linear, int noblock,
   2913				        int *err)
   2914{
   2915	struct sk_buff *skb;
   2916
   2917	/* Under a page?  Don't bother with paged skb. */
   2918	if (prepad + len < PAGE_SIZE || !linear)
   2919		linear = len;
   2920
   2921	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
   2922				   err, 0);
   2923	if (!skb)
   2924		return NULL;
   2925
   2926	skb_reserve(skb, reserve);
   2927	skb_put(skb, linear);
   2928	skb->data_len = len - linear;
   2929	skb->len += len - linear;
   2930
   2931	return skb;
   2932}
   2933
   2934static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
   2935{
   2936	struct sock *sk = sock->sk;
   2937	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
   2938	struct sk_buff *skb;
   2939	struct net_device *dev;
   2940	__be16 proto;
   2941	unsigned char *addr = NULL;
   2942	int err, reserve = 0;
   2943	struct sockcm_cookie sockc;
   2944	struct virtio_net_hdr vnet_hdr = { 0 };
   2945	int offset = 0;
   2946	struct packet_sock *po = pkt_sk(sk);
   2947	bool has_vnet_hdr = false;
   2948	int hlen, tlen, linear;
   2949	int extra_len = 0;
   2950
   2951	/*
   2952	 *	Get and verify the address.
   2953	 */
   2954
   2955	if (likely(saddr == NULL)) {
   2956		dev	= packet_cached_dev_get(po);
   2957		proto	= READ_ONCE(po->num);
   2958	} else {
   2959		err = -EINVAL;
   2960		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
   2961			goto out;
   2962		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
   2963			goto out;
   2964		proto	= saddr->sll_protocol;
   2965		dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
   2966		if (sock->type == SOCK_DGRAM) {
   2967			if (dev && msg->msg_namelen < dev->addr_len +
   2968				   offsetof(struct sockaddr_ll, sll_addr))
   2969				goto out_unlock;
   2970			addr = saddr->sll_addr;
   2971		}
   2972	}
   2973
   2974	err = -ENXIO;
   2975	if (unlikely(dev == NULL))
   2976		goto out_unlock;
   2977	err = -ENETDOWN;
   2978	if (unlikely(!(dev->flags & IFF_UP)))
   2979		goto out_unlock;
   2980
   2981	sockcm_init(&sockc, sk);
   2982	sockc.mark = sk->sk_mark;
   2983	if (msg->msg_controllen) {
   2984		err = sock_cmsg_send(sk, msg, &sockc);
   2985		if (unlikely(err))
   2986			goto out_unlock;
   2987	}
   2988
   2989	if (sock->type == SOCK_RAW)
   2990		reserve = dev->hard_header_len;
   2991	if (po->has_vnet_hdr) {
   2992		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
   2993		if (err)
   2994			goto out_unlock;
   2995		has_vnet_hdr = true;
   2996	}
   2997
   2998	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
   2999		if (!netif_supports_nofcs(dev)) {
   3000			err = -EPROTONOSUPPORT;
   3001			goto out_unlock;
   3002		}
   3003		extra_len = 4; /* We're doing our own CRC */
   3004	}
   3005
   3006	err = -EMSGSIZE;
   3007	if (!vnet_hdr.gso_type &&
   3008	    (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
   3009		goto out_unlock;
   3010
   3011	err = -ENOBUFS;
   3012	hlen = LL_RESERVED_SPACE(dev);
   3013	tlen = dev->needed_tailroom;
   3014	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
   3015	linear = max(linear, min_t(int, len, dev->hard_header_len));
   3016	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
   3017			       msg->msg_flags & MSG_DONTWAIT, &err);
   3018	if (skb == NULL)
   3019		goto out_unlock;
   3020
   3021	skb_reset_network_header(skb);
   3022
   3023	err = -EINVAL;
   3024	if (sock->type == SOCK_DGRAM) {
   3025		offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
   3026		if (unlikely(offset < 0))
   3027			goto out_free;
   3028	} else if (reserve) {
   3029		skb_reserve(skb, -reserve);
   3030		if (len < reserve + sizeof(struct ipv6hdr) &&
   3031		    dev->min_header_len != dev->hard_header_len)
   3032			skb_reset_network_header(skb);
   3033	}
   3034
   3035	/* Returns -EFAULT on error */
   3036	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
   3037	if (err)
   3038		goto out_free;
   3039
   3040	if (sock->type == SOCK_RAW &&
   3041	    !dev_validate_header(dev, skb->data, len)) {
   3042		err = -EINVAL;
   3043		goto out_free;
   3044	}
   3045
   3046	skb_setup_tx_timestamp(skb, sockc.tsflags);
   3047
   3048	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
   3049	    !packet_extra_vlan_len_allowed(dev, skb)) {
   3050		err = -EMSGSIZE;
   3051		goto out_free;
   3052	}
   3053
   3054	skb->protocol = proto;
   3055	skb->dev = dev;
   3056	skb->priority = sk->sk_priority;
   3057	skb->mark = sockc.mark;
   3058	skb->tstamp = sockc.transmit_time;
   3059
   3060	if (unlikely(extra_len == 4))
   3061		skb->no_fcs = 1;
   3062
   3063	packet_parse_headers(skb, sock);
   3064
   3065	if (has_vnet_hdr) {
   3066		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
   3067		if (err)
   3068			goto out_free;
   3069		len += sizeof(vnet_hdr);
   3070		virtio_net_hdr_set_proto(skb, &vnet_hdr);
   3071	}
   3072
   3073	err = po->xmit(skb);
   3074	if (unlikely(err != 0)) {
   3075		if (err > 0)
   3076			err = net_xmit_errno(err);
   3077		if (err)
   3078			goto out_unlock;
   3079	}
   3080
   3081	dev_put(dev);
   3082
   3083	return len;
   3084
   3085out_free:
   3086	kfree_skb(skb);
   3087out_unlock:
   3088	dev_put(dev);
   3089out:
   3090	return err;
   3091}
   3092
   3093static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
   3094{
   3095	struct sock *sk = sock->sk;
   3096	struct packet_sock *po = pkt_sk(sk);
   3097
   3098	/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
   3099	 * tpacket_snd() will redo the check safely.
   3100	 */
   3101	if (data_race(po->tx_ring.pg_vec))
   3102		return tpacket_snd(po, msg);
   3103
   3104	return packet_snd(sock, msg, len);
   3105}
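
/*
 * Illustrative userspace sketch (not part of this file): without a TX ring
 * the packet_snd() path above is reached through an ordinary sendto(); for
 * SOCK_RAW the buffer must contain the full link-layer frame and the
 * sockaddr_ll mainly selects the output device.  The interface name "eth0"
 * is a placeholder (if_nametoindex() needs <net/if.h>).
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family  = AF_PACKET,
 *		.sll_ifindex = if_nametoindex("eth0"),
 *		.sll_halen   = ETH_ALEN,
 *	};
 *
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&sll, sizeof(sll));
 */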
   3106
   3107/*
   3108 *	Close a PACKET socket. This is fairly simple. We immediately go
   3109 *	to 'closed' state and remove our protocol entry in the device list.
   3110 */
   3111
   3112static int packet_release(struct socket *sock)
   3113{
   3114	struct sock *sk = sock->sk;
   3115	struct packet_sock *po;
   3116	struct packet_fanout *f;
   3117	struct net *net;
   3118	union tpacket_req_u req_u;
   3119
   3120	if (!sk)
   3121		return 0;
   3122
   3123	net = sock_net(sk);
   3124	po = pkt_sk(sk);
   3125
   3126	mutex_lock(&net->packet.sklist_lock);
   3127	sk_del_node_init_rcu(sk);
   3128	mutex_unlock(&net->packet.sklist_lock);
   3129
   3130	sock_prot_inuse_add(net, sk->sk_prot, -1);
   3131
   3132	spin_lock(&po->bind_lock);
   3133	unregister_prot_hook(sk, false);
   3134	packet_cached_dev_reset(po);
   3135
   3136	if (po->prot_hook.dev) {
   3137		dev_put_track(po->prot_hook.dev, &po->prot_hook.dev_tracker);
   3138		po->prot_hook.dev = NULL;
   3139	}
   3140	spin_unlock(&po->bind_lock);
   3141
   3142	packet_flush_mclist(sk);
   3143
   3144	lock_sock(sk);
   3145	if (po->rx_ring.pg_vec) {
   3146		memset(&req_u, 0, sizeof(req_u));
   3147		packet_set_ring(sk, &req_u, 1, 0);
   3148	}
   3149
   3150	if (po->tx_ring.pg_vec) {
   3151		memset(&req_u, 0, sizeof(req_u));
   3152		packet_set_ring(sk, &req_u, 1, 1);
   3153	}
   3154	release_sock(sk);
   3155
   3156	f = fanout_release(sk);
   3157
   3158	synchronize_net();
   3159
   3160	kfree(po->rollover);
   3161	if (f) {
   3162		fanout_release_data(f);
   3163		kvfree(f);
   3164	}
   3165	/*
   3166	 *	Now the socket is dead. No more input will appear.
   3167	 */
   3168	sock_orphan(sk);
   3169	sock->sk = NULL;
   3170
   3171	/* Purge queues */
   3172
   3173	skb_queue_purge(&sk->sk_receive_queue);
   3174	packet_free_pending(po);
   3175	sk_refcnt_debug_release(sk);
   3176
   3177	sock_put(sk);
   3178	return 0;
   3179}
   3180
   3181/*
   3182 *	Attach a packet hook.
   3183 */
   3184
   3185static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
   3186			  __be16 proto)
   3187{
   3188	struct packet_sock *po = pkt_sk(sk);
   3189	struct net_device *dev = NULL;
   3190	bool unlisted = false;
   3191	bool need_rehook;
   3192	int ret = 0;
   3193
   3194	lock_sock(sk);
   3195	spin_lock(&po->bind_lock);
   3196	rcu_read_lock();
   3197
   3198	if (po->fanout) {
   3199		ret = -EINVAL;
   3200		goto out_unlock;
   3201	}
   3202
   3203	if (name) {
   3204		dev = dev_get_by_name_rcu(sock_net(sk), name);
   3205		if (!dev) {
   3206			ret = -ENODEV;
   3207			goto out_unlock;
   3208		}
   3209	} else if (ifindex) {
   3210		dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
   3211		if (!dev) {
   3212			ret = -ENODEV;
   3213			goto out_unlock;
   3214		}
   3215	}
   3216
   3217	need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
   3218
   3219	if (need_rehook) {
   3220		dev_hold(dev);
   3221		if (po->running) {
   3222			rcu_read_unlock();
   3223			/* prevents packet_notifier() from calling
   3224			 * register_prot_hook()
   3225			 */
   3226			WRITE_ONCE(po->num, 0);
   3227			__unregister_prot_hook(sk, true);
   3228			rcu_read_lock();
   3229			if (dev)
   3230				unlisted = !dev_get_by_index_rcu(sock_net(sk),
   3231								 dev->ifindex);
   3232		}
   3233
   3234		BUG_ON(po->running);
   3235		WRITE_ONCE(po->num, proto);
   3236		po->prot_hook.type = proto;
   3237
   3238		dev_put_track(po->prot_hook.dev, &po->prot_hook.dev_tracker);
   3239
   3240		if (unlikely(unlisted)) {
   3241			po->prot_hook.dev = NULL;
   3242			WRITE_ONCE(po->ifindex, -1);
   3243			packet_cached_dev_reset(po);
   3244		} else {
   3245			dev_hold_track(dev, &po->prot_hook.dev_tracker,
   3246				       GFP_ATOMIC);
   3247			po->prot_hook.dev = dev;
   3248			WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
   3249			packet_cached_dev_assign(po, dev);
   3250		}
   3251		dev_put(dev);
   3252	}
   3253
   3254	if (proto == 0 || !need_rehook)
   3255		goto out_unlock;
   3256
   3257	if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
   3258		register_prot_hook(sk);
   3259	} else {
   3260		sk->sk_err = ENETDOWN;
   3261		if (!sock_flag(sk, SOCK_DEAD))
   3262			sk_error_report(sk);
   3263	}
   3264
   3265out_unlock:
   3266	rcu_read_unlock();
   3267	spin_unlock(&po->bind_lock);
   3268	release_sock(sk);
   3269	return ret;
   3270}
   3271
   3272/*
   3273 *	Bind a packet socket to a device
   3274 */
   3275
   3276static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
   3277			    int addr_len)
   3278{
   3279	struct sock *sk = sock->sk;
   3280	char name[sizeof(uaddr->sa_data) + 1];
   3281
   3282	/*
   3283	 *	Check legality
   3284	 */
   3285
   3286	if (addr_len != sizeof(struct sockaddr))
   3287		return -EINVAL;
    3288	/* uaddr->sa_data comes from userspace; it's not guaranteed to be
   3289	 * zero-terminated.
   3290	 */
   3291	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
   3292	name[sizeof(uaddr->sa_data)] = 0;
   3293
   3294	return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
   3295}
   3296
   3297static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
   3298{
   3299	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
   3300	struct sock *sk = sock->sk;
   3301
   3302	/*
   3303	 *	Check legality
   3304	 */
   3305
   3306	if (addr_len < sizeof(struct sockaddr_ll))
   3307		return -EINVAL;
   3308	if (sll->sll_family != AF_PACKET)
   3309		return -EINVAL;
   3310
   3311	return packet_do_bind(sk, NULL, sll->sll_ifindex,
   3312			      sll->sll_protocol ? : pkt_sk(sk)->num);
   3313}
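
/*
 * Illustrative userspace sketch (not part of this file): packet_bind()
 * above is reached through an ordinary bind() with a struct sockaddr_ll;
 * a zero sll_ifindex keeps the socket listening on all devices.  The
 * interface name "eth0" is a placeholder.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */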
   3314
   3315static struct proto packet_proto = {
   3316	.name	  = "PACKET",
   3317	.owner	  = THIS_MODULE,
   3318	.obj_size = sizeof(struct packet_sock),
   3319};
   3320
   3321/*
    3322 *	Create a packet socket (SOCK_RAW, SOCK_DGRAM or the obsolete SOCK_PACKET).
   3323 */
   3324
   3325static int packet_create(struct net *net, struct socket *sock, int protocol,
   3326			 int kern)
   3327{
   3328	struct sock *sk;
   3329	struct packet_sock *po;
   3330	__be16 proto = (__force __be16)protocol; /* weird, but documented */
   3331	int err;
   3332
   3333	if (!ns_capable(net->user_ns, CAP_NET_RAW))
   3334		return -EPERM;
   3335	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
   3336	    sock->type != SOCK_PACKET)
   3337		return -ESOCKTNOSUPPORT;
   3338
   3339	sock->state = SS_UNCONNECTED;
   3340
   3341	err = -ENOBUFS;
   3342	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
   3343	if (sk == NULL)
   3344		goto out;
   3345
   3346	sock->ops = &packet_ops;
   3347	if (sock->type == SOCK_PACKET)
   3348		sock->ops = &packet_ops_spkt;
   3349
   3350	sock_init_data(sock, sk);
   3351
   3352	po = pkt_sk(sk);
   3353	init_completion(&po->skb_completion);
   3354	sk->sk_family = PF_PACKET;
   3355	po->num = proto;
   3356	po->xmit = dev_queue_xmit;
   3357
   3358	err = packet_alloc_pending(po);
   3359	if (err)
   3360		goto out2;
   3361
   3362	packet_cached_dev_reset(po);
   3363
   3364	sk->sk_destruct = packet_sock_destruct;
   3365	sk_refcnt_debug_inc(sk);
   3366
   3367	/*
   3368	 *	Attach a protocol block
   3369	 */
   3370
   3371	spin_lock_init(&po->bind_lock);
   3372	mutex_init(&po->pg_vec_lock);
   3373	po->rollover = NULL;
   3374	po->prot_hook.func = packet_rcv;
   3375
   3376	if (sock->type == SOCK_PACKET)
   3377		po->prot_hook.func = packet_rcv_spkt;
   3378
   3379	po->prot_hook.af_packet_priv = sk;
   3380	po->prot_hook.af_packet_net = sock_net(sk);
   3381
   3382	if (proto) {
   3383		po->prot_hook.type = proto;
   3384		__register_prot_hook(sk);
   3385	}
   3386
   3387	mutex_lock(&net->packet.sklist_lock);
   3388	sk_add_node_tail_rcu(sk, &net->packet.sklist);
   3389	mutex_unlock(&net->packet.sklist_lock);
   3390
   3391	sock_prot_inuse_add(net, &packet_proto, 1);
   3392
   3393	return 0;
   3394out2:
   3395	sk_free(sk);
   3396out:
   3397	return err;
   3398}
   3399
   3400/*
   3401 *	Pull a packet from our receive queue and hand it to the user.
   3402 *	If necessary we block.
   3403 */
   3404
   3405static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
   3406			  int flags)
   3407{
   3408	struct sock *sk = sock->sk;
   3409	struct sk_buff *skb;
   3410	int copied, err;
   3411	int vnet_hdr_len = 0;
   3412	unsigned int origlen = 0;
   3413
   3414	err = -EINVAL;
   3415	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
   3416		goto out;
   3417
   3418#if 0
   3419	/* What error should we return now? EUNATTACH? */
   3420	if (pkt_sk(sk)->ifindex < 0)
   3421		return -ENODEV;
   3422#endif
   3423
   3424	if (flags & MSG_ERRQUEUE) {
   3425		err = sock_recv_errqueue(sk, msg, len,
   3426					 SOL_PACKET, PACKET_TX_TIMESTAMP);
   3427		goto out;
   3428	}
   3429
   3430	/*
   3431	 *	Call the generic datagram receiver. This handles all sorts
   3432	 *	of horrible races and re-entrancy so we can forget about it
   3433	 *	in the protocol layers.
   3434	 *
   3435	 *	Now it will return ENETDOWN, if device have just gone down,
   3436	 *	but then it will block.
   3437	 */
   3438
   3439	skb = skb_recv_datagram(sk, flags, &err);
   3440
   3441	/*
    3442	 *	If an error occurred, return it. Because skb_recv_datagram()
    3443	 *	handles the blocking, we don't have to see or worry about blocking
    3444	 *	retries.
   3445	 */
   3446
   3447	if (skb == NULL)
   3448		goto out;
   3449
   3450	packet_rcv_try_clear_pressure(pkt_sk(sk));
   3451
   3452	if (pkt_sk(sk)->has_vnet_hdr) {
   3453		err = packet_rcv_vnet(msg, skb, &len);
   3454		if (err)
   3455			goto out_free;
   3456		vnet_hdr_len = sizeof(struct virtio_net_hdr);
   3457	}
   3458
    3459	/* You lose any data beyond the buffer you gave. If this worries
    3460	 * a user program, it can ask the device for its MTU
   3461	 * anyway.
   3462	 */
   3463	copied = skb->len;
   3464	if (copied > len) {
   3465		copied = len;
   3466		msg->msg_flags |= MSG_TRUNC;
   3467	}
   3468
   3469	err = skb_copy_datagram_msg(skb, 0, msg, copied);
   3470	if (err)
   3471		goto out_free;
   3472
   3473	if (sock->type != SOCK_PACKET) {
   3474		struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
   3475
   3476		/* Original length was stored in sockaddr_ll fields */
   3477		origlen = PACKET_SKB_CB(skb)->sa.origlen;
   3478		sll->sll_family = AF_PACKET;
   3479		sll->sll_protocol = skb->protocol;
   3480	}
   3481
   3482	sock_recv_cmsgs(msg, sk, skb);
   3483
   3484	if (msg->msg_name) {
   3485		const size_t max_len = min(sizeof(skb->cb),
   3486					   sizeof(struct sockaddr_storage));
   3487		int copy_len;
   3488
   3489		/* If the address length field is there to be filled
   3490		 * in, we fill it in now.
   3491		 */
   3492		if (sock->type == SOCK_PACKET) {
   3493			__sockaddr_check_size(sizeof(struct sockaddr_pkt));
   3494			msg->msg_namelen = sizeof(struct sockaddr_pkt);
   3495			copy_len = msg->msg_namelen;
   3496		} else {
   3497			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
   3498
   3499			msg->msg_namelen = sll->sll_halen +
   3500				offsetof(struct sockaddr_ll, sll_addr);
   3501			copy_len = msg->msg_namelen;
   3502			if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
   3503				memset(msg->msg_name +
   3504				       offsetof(struct sockaddr_ll, sll_addr),
   3505				       0, sizeof(sll->sll_addr));
   3506				msg->msg_namelen = sizeof(struct sockaddr_ll);
   3507			}
   3508		}
   3509		if (WARN_ON_ONCE(copy_len > max_len)) {
   3510			copy_len = max_len;
   3511			msg->msg_namelen = copy_len;
   3512		}
   3513		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
   3514	}
   3515
   3516	if (pkt_sk(sk)->auxdata) {
   3517		struct tpacket_auxdata aux;
   3518
   3519		aux.tp_status = TP_STATUS_USER;
   3520		if (skb->ip_summed == CHECKSUM_PARTIAL)
   3521			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
   3522		else if (skb->pkt_type != PACKET_OUTGOING &&
   3523			 (skb->ip_summed == CHECKSUM_COMPLETE ||
   3524			  skb_csum_unnecessary(skb)))
   3525			aux.tp_status |= TP_STATUS_CSUM_VALID;
   3526
   3527		aux.tp_len = origlen;
   3528		aux.tp_snaplen = skb->len;
   3529		aux.tp_mac = 0;
   3530		aux.tp_net = skb_network_offset(skb);
   3531		if (skb_vlan_tag_present(skb)) {
   3532			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
   3533			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
   3534			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
   3535		} else {
   3536			aux.tp_vlan_tci = 0;
   3537			aux.tp_vlan_tpid = 0;
   3538		}
   3539		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
   3540	}
   3541
   3542	/*
   3543	 *	Free or return the buffer as appropriate. Again this
   3544	 *	hides all the races and re-entrancy issues from us.
   3545	 */
   3546	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
   3547
   3548out_free:
   3549	skb_free_datagram(sk, skb);
   3550out:
   3551	return err;
   3552}
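
/*
 * Illustrative userspace sketch (not part of this file): the
 * tpacket_auxdata control message assembled above is retrieved with
 * recvmsg() after enabling PACKET_AUXDATA; buf/iov are assumed to be a
 * caller-provided receive buffer.
 *
 *	int one = 1;
 *	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len holds the original frame length
 *		}
 */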
   3553
   3554static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
   3555			       int peer)
   3556{
   3557	struct net_device *dev;
   3558	struct sock *sk	= sock->sk;
   3559
   3560	if (peer)
   3561		return -EOPNOTSUPP;
   3562
   3563	uaddr->sa_family = AF_PACKET;
   3564	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
   3565	rcu_read_lock();
   3566	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
   3567	if (dev)
   3568		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
   3569	rcu_read_unlock();
   3570
   3571	return sizeof(*uaddr);
   3572}
   3573
   3574static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
   3575			  int peer)
   3576{
   3577	struct net_device *dev;
   3578	struct sock *sk = sock->sk;
   3579	struct packet_sock *po = pkt_sk(sk);
   3580	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
   3581	int ifindex;
   3582
   3583	if (peer)
   3584		return -EOPNOTSUPP;
   3585
   3586	ifindex = READ_ONCE(po->ifindex);
   3587	sll->sll_family = AF_PACKET;
   3588	sll->sll_ifindex = ifindex;
   3589	sll->sll_protocol = READ_ONCE(po->num);
   3590	sll->sll_pkttype = 0;
   3591	rcu_read_lock();
   3592	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
   3593	if (dev) {
   3594		sll->sll_hatype = dev->type;
   3595		sll->sll_halen = dev->addr_len;
   3596		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
   3597	} else {
   3598		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
   3599		sll->sll_halen = 0;
   3600	}
   3601	rcu_read_unlock();
   3602
   3603	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
   3604}
   3605
   3606static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
   3607			 int what)
   3608{
   3609	switch (i->type) {
   3610	case PACKET_MR_MULTICAST:
   3611		if (i->alen != dev->addr_len)
   3612			return -EINVAL;
   3613		if (what > 0)
   3614			return dev_mc_add(dev, i->addr);
   3615		else
   3616			return dev_mc_del(dev, i->addr);
   3617		break;
   3618	case PACKET_MR_PROMISC:
   3619		return dev_set_promiscuity(dev, what);
   3620	case PACKET_MR_ALLMULTI:
   3621		return dev_set_allmulti(dev, what);
   3622	case PACKET_MR_UNICAST:
   3623		if (i->alen != dev->addr_len)
   3624			return -EINVAL;
   3625		if (what > 0)
   3626			return dev_uc_add(dev, i->addr);
   3627		else
   3628			return dev_uc_del(dev, i->addr);
   3629		break;
   3630	default:
   3631		break;
   3632	}
   3633	return 0;
   3634}
   3635
   3636static void packet_dev_mclist_delete(struct net_device *dev,
   3637				     struct packet_mclist **mlp)
   3638{
   3639	struct packet_mclist *ml;
   3640
   3641	while ((ml = *mlp) != NULL) {
   3642		if (ml->ifindex == dev->ifindex) {
   3643			packet_dev_mc(dev, ml, -1);
   3644			*mlp = ml->next;
   3645			kfree(ml);
   3646		} else
   3647			mlp = &ml->next;
   3648	}
   3649}
   3650
   3651static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
   3652{
   3653	struct packet_sock *po = pkt_sk(sk);
   3654	struct packet_mclist *ml, *i;
   3655	struct net_device *dev;
   3656	int err;
   3657
   3658	rtnl_lock();
   3659
   3660	err = -ENODEV;
   3661	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
   3662	if (!dev)
   3663		goto done;
   3664
   3665	err = -EINVAL;
   3666	if (mreq->mr_alen > dev->addr_len)
   3667		goto done;
   3668
   3669	err = -ENOBUFS;
   3670	i = kmalloc(sizeof(*i), GFP_KERNEL);
   3671	if (i == NULL)
   3672		goto done;
   3673
   3674	err = 0;
   3675	for (ml = po->mclist; ml; ml = ml->next) {
   3676		if (ml->ifindex == mreq->mr_ifindex &&
   3677		    ml->type == mreq->mr_type &&
   3678		    ml->alen == mreq->mr_alen &&
   3679		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
   3680			ml->count++;
   3681			/* Free the new element ... */
   3682			kfree(i);
   3683			goto done;
   3684		}
   3685	}
   3686
   3687	i->type = mreq->mr_type;
   3688	i->ifindex = mreq->mr_ifindex;
   3689	i->alen = mreq->mr_alen;
   3690	memcpy(i->addr, mreq->mr_address, i->alen);
   3691	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
   3692	i->count = 1;
   3693	i->next = po->mclist;
   3694	po->mclist = i;
   3695	err = packet_dev_mc(dev, i, 1);
   3696	if (err) {
   3697		po->mclist = i->next;
   3698		kfree(i);
   3699	}
   3700
   3701done:
   3702	rtnl_unlock();
   3703	return err;
   3704}
   3705
   3706static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
   3707{
   3708	struct packet_mclist *ml, **mlp;
   3709
   3710	rtnl_lock();
   3711
   3712	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
   3713		if (ml->ifindex == mreq->mr_ifindex &&
   3714		    ml->type == mreq->mr_type &&
   3715		    ml->alen == mreq->mr_alen &&
   3716		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
   3717			if (--ml->count == 0) {
   3718				struct net_device *dev;
   3719				*mlp = ml->next;
   3720				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
   3721				if (dev)
   3722					packet_dev_mc(dev, ml, -1);
   3723				kfree(ml);
   3724			}
   3725			break;
   3726		}
   3727	}
   3728	rtnl_unlock();
   3729	return 0;
   3730}
   3731
   3732static void packet_flush_mclist(struct sock *sk)
   3733{
   3734	struct packet_sock *po = pkt_sk(sk);
   3735	struct packet_mclist *ml;
   3736
   3737	if (!po->mclist)
   3738		return;
   3739
   3740	rtnl_lock();
   3741	while ((ml = po->mclist) != NULL) {
   3742		struct net_device *dev;
   3743
   3744		po->mclist = ml->next;
   3745		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
   3746		if (dev != NULL)
   3747			packet_dev_mc(dev, ml, -1);
   3748		kfree(ml);
   3749	}
   3750	rtnl_unlock();
   3751}
   3752
   3753static int
   3754packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
   3755		  unsigned int optlen)
   3756{
   3757	struct sock *sk = sock->sk;
   3758	struct packet_sock *po = pkt_sk(sk);
   3759	int ret;
   3760
   3761	if (level != SOL_PACKET)
   3762		return -ENOPROTOOPT;
   3763
   3764	switch (optname) {
   3765	case PACKET_ADD_MEMBERSHIP:
   3766	case PACKET_DROP_MEMBERSHIP:
   3767	{
   3768		struct packet_mreq_max mreq;
   3769		int len = optlen;
   3770		memset(&mreq, 0, sizeof(mreq));
   3771		if (len < sizeof(struct packet_mreq))
   3772			return -EINVAL;
   3773		if (len > sizeof(mreq))
   3774			len = sizeof(mreq);
   3775		if (copy_from_sockptr(&mreq, optval, len))
   3776			return -EFAULT;
   3777		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
   3778			return -EINVAL;
   3779		if (optname == PACKET_ADD_MEMBERSHIP)
   3780			ret = packet_mc_add(sk, &mreq);
   3781		else
   3782			ret = packet_mc_drop(sk, &mreq);
   3783		return ret;
   3784	}
   3785
   3786	case PACKET_RX_RING:
   3787	case PACKET_TX_RING:
   3788	{
   3789		union tpacket_req_u req_u;
   3790		int len;
   3791
   3792		lock_sock(sk);
   3793		switch (po->tp_version) {
   3794		case TPACKET_V1:
   3795		case TPACKET_V2:
   3796			len = sizeof(req_u.req);
   3797			break;
   3798		case TPACKET_V3:
   3799		default:
   3800			len = sizeof(req_u.req3);
   3801			break;
   3802		}
   3803		if (optlen < len) {
   3804			ret = -EINVAL;
   3805		} else {
   3806			if (copy_from_sockptr(&req_u.req, optval, len))
   3807				ret = -EFAULT;
   3808			else
   3809				ret = packet_set_ring(sk, &req_u, 0,
   3810						    optname == PACKET_TX_RING);
   3811		}
   3812		release_sock(sk);
   3813		return ret;
   3814	}
   3815	case PACKET_COPY_THRESH:
   3816	{
   3817		int val;
   3818
   3819		if (optlen != sizeof(val))
   3820			return -EINVAL;
   3821		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3822			return -EFAULT;
   3823
   3824		pkt_sk(sk)->copy_thresh = val;
   3825		return 0;
   3826	}
   3827	case PACKET_VERSION:
   3828	{
   3829		int val;
   3830
   3831		if (optlen != sizeof(val))
   3832			return -EINVAL;
   3833		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3834			return -EFAULT;
   3835		switch (val) {
   3836		case TPACKET_V1:
   3837		case TPACKET_V2:
   3838		case TPACKET_V3:
   3839			break;
   3840		default:
   3841			return -EINVAL;
   3842		}
   3843		lock_sock(sk);
   3844		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
   3845			ret = -EBUSY;
   3846		} else {
   3847			po->tp_version = val;
   3848			ret = 0;
   3849		}
   3850		release_sock(sk);
   3851		return ret;
   3852	}
   3853	case PACKET_RESERVE:
   3854	{
   3855		unsigned int val;
   3856
   3857		if (optlen != sizeof(val))
   3858			return -EINVAL;
   3859		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3860			return -EFAULT;
   3861		if (val > INT_MAX)
   3862			return -EINVAL;
   3863		lock_sock(sk);
   3864		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
   3865			ret = -EBUSY;
   3866		} else {
   3867			po->tp_reserve = val;
   3868			ret = 0;
   3869		}
   3870		release_sock(sk);
   3871		return ret;
   3872	}
   3873	case PACKET_LOSS:
   3874	{
   3875		unsigned int val;
   3876
   3877		if (optlen != sizeof(val))
   3878			return -EINVAL;
   3879		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3880			return -EFAULT;
   3881
   3882		lock_sock(sk);
   3883		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
   3884			ret = -EBUSY;
   3885		} else {
   3886			po->tp_loss = !!val;
   3887			ret = 0;
   3888		}
   3889		release_sock(sk);
   3890		return ret;
   3891	}
   3892	case PACKET_AUXDATA:
   3893	{
   3894		int val;
   3895
   3896		if (optlen < sizeof(val))
   3897			return -EINVAL;
   3898		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3899			return -EFAULT;
   3900
   3901		lock_sock(sk);
   3902		po->auxdata = !!val;
   3903		release_sock(sk);
   3904		return 0;
   3905	}
   3906	case PACKET_ORIGDEV:
   3907	{
   3908		int val;
   3909
   3910		if (optlen < sizeof(val))
   3911			return -EINVAL;
   3912		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3913			return -EFAULT;
   3914
   3915		lock_sock(sk);
   3916		po->origdev = !!val;
   3917		release_sock(sk);
   3918		return 0;
   3919	}
   3920	case PACKET_VNET_HDR:
   3921	{
   3922		int val;
   3923
   3924		if (sock->type != SOCK_RAW)
   3925			return -EINVAL;
   3926		if (optlen < sizeof(val))
   3927			return -EINVAL;
   3928		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3929			return -EFAULT;
   3930
   3931		lock_sock(sk);
   3932		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
   3933			ret = -EBUSY;
   3934		} else {
   3935			po->has_vnet_hdr = !!val;
   3936			ret = 0;
   3937		}
   3938		release_sock(sk);
   3939		return ret;
   3940	}
   3941	case PACKET_TIMESTAMP:
   3942	{
   3943		int val;
   3944
   3945		if (optlen != sizeof(val))
   3946			return -EINVAL;
   3947		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3948			return -EFAULT;
   3949
   3950		po->tp_tstamp = val;
   3951		return 0;
   3952	}
   3953	case PACKET_FANOUT:
   3954	{
   3955		struct fanout_args args = { 0 };
   3956
   3957		if (optlen != sizeof(int) && optlen != sizeof(args))
   3958			return -EINVAL;
   3959		if (copy_from_sockptr(&args, optval, optlen))
   3960			return -EFAULT;
   3961
   3962		return fanout_add(sk, &args);
   3963	}
   3964	case PACKET_FANOUT_DATA:
   3965	{
   3966		/* Paired with the WRITE_ONCE() in fanout_add() */
   3967		if (!READ_ONCE(po->fanout))
   3968			return -EINVAL;
   3969
   3970		return fanout_set_data(po, optval, optlen);
   3971	}
   3972	case PACKET_IGNORE_OUTGOING:
   3973	{
   3974		int val;
   3975
   3976		if (optlen != sizeof(val))
   3977			return -EINVAL;
   3978		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3979			return -EFAULT;
   3980		if (val < 0 || val > 1)
   3981			return -EINVAL;
   3982
   3983		po->prot_hook.ignore_outgoing = !!val;
   3984		return 0;
   3985	}
   3986	case PACKET_TX_HAS_OFF:
   3987	{
   3988		unsigned int val;
   3989
   3990		if (optlen != sizeof(val))
   3991			return -EINVAL;
   3992		if (copy_from_sockptr(&val, optval, sizeof(val)))
   3993			return -EFAULT;
   3994
   3995		lock_sock(sk);
   3996		if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
   3997			po->tp_tx_has_off = !!val;
   3998
   3999		release_sock(sk);
   4000		return 0;
   4001	}
   4002	case PACKET_QDISC_BYPASS:
   4003	{
   4004		int val;
   4005
   4006		if (optlen != sizeof(val))
   4007			return -EINVAL;
   4008		if (copy_from_sockptr(&val, optval, sizeof(val)))
   4009			return -EFAULT;
   4010
   4011		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
   4012		return 0;
   4013	}
   4014	default:
   4015		return -ENOPROTOOPT;
   4016	}
   4017}
   4018
   4019static int packet_getsockopt(struct socket *sock, int level, int optname,
   4020			     char __user *optval, int __user *optlen)
   4021{
   4022	int len;
   4023	int val, lv = sizeof(val);
   4024	struct sock *sk = sock->sk;
   4025	struct packet_sock *po = pkt_sk(sk);
   4026	void *data = &val;
   4027	union tpacket_stats_u st;
   4028	struct tpacket_rollover_stats rstats;
   4029	int drops;
   4030
   4031	if (level != SOL_PACKET)
   4032		return -ENOPROTOOPT;
   4033
   4034	if (get_user(len, optlen))
   4035		return -EFAULT;
   4036
   4037	if (len < 0)
   4038		return -EINVAL;
   4039
   4040	switch (optname) {
   4041	case PACKET_STATISTICS:
   4042		spin_lock_bh(&sk->sk_receive_queue.lock);
   4043		memcpy(&st, &po->stats, sizeof(st));
   4044		memset(&po->stats, 0, sizeof(po->stats));
   4045		spin_unlock_bh(&sk->sk_receive_queue.lock);
   4046		drops = atomic_xchg(&po->tp_drops, 0);
   4047
   4048		if (po->tp_version == TPACKET_V3) {
   4049			lv = sizeof(struct tpacket_stats_v3);
   4050			st.stats3.tp_drops = drops;
   4051			st.stats3.tp_packets += drops;
   4052			data = &st.stats3;
   4053		} else {
   4054			lv = sizeof(struct tpacket_stats);
   4055			st.stats1.tp_drops = drops;
   4056			st.stats1.tp_packets += drops;
   4057			data = &st.stats1;
   4058		}
   4059
   4060		break;
   4061	case PACKET_AUXDATA:
   4062		val = po->auxdata;
   4063		break;
   4064	case PACKET_ORIGDEV:
   4065		val = po->origdev;
   4066		break;
   4067	case PACKET_VNET_HDR:
   4068		val = po->has_vnet_hdr;
   4069		break;
   4070	case PACKET_VERSION:
   4071		val = po->tp_version;
   4072		break;
   4073	case PACKET_HDRLEN:
   4074		if (len > sizeof(int))
   4075			len = sizeof(int);
   4076		if (len < sizeof(int))
   4077			return -EINVAL;
   4078		if (copy_from_user(&val, optval, len))
   4079			return -EFAULT;
   4080		switch (val) {
   4081		case TPACKET_V1:
   4082			val = sizeof(struct tpacket_hdr);
   4083			break;
   4084		case TPACKET_V2:
   4085			val = sizeof(struct tpacket2_hdr);
   4086			break;
   4087		case TPACKET_V3:
   4088			val = sizeof(struct tpacket3_hdr);
   4089			break;
   4090		default:
   4091			return -EINVAL;
   4092		}
   4093		break;
   4094	case PACKET_RESERVE:
   4095		val = po->tp_reserve;
   4096		break;
   4097	case PACKET_LOSS:
   4098		val = po->tp_loss;
   4099		break;
   4100	case PACKET_TIMESTAMP:
   4101		val = po->tp_tstamp;
   4102		break;
   4103	case PACKET_FANOUT:
   4104		val = (po->fanout ?
   4105		       ((u32)po->fanout->id |
   4106			((u32)po->fanout->type << 16) |
   4107			((u32)po->fanout->flags << 24)) :
   4108		       0);
   4109		break;
   4110	case PACKET_IGNORE_OUTGOING:
   4111		val = po->prot_hook.ignore_outgoing;
   4112		break;
   4113	case PACKET_ROLLOVER_STATS:
   4114		if (!po->rollover)
   4115			return -EINVAL;
   4116		rstats.tp_all = atomic_long_read(&po->rollover->num);
   4117		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
   4118		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
   4119		data = &rstats;
   4120		lv = sizeof(rstats);
   4121		break;
   4122	case PACKET_TX_HAS_OFF:
   4123		val = po->tp_tx_has_off;
   4124		break;
   4125	case PACKET_QDISC_BYPASS:
   4126		val = packet_use_direct_xmit(po);
   4127		break;
   4128	default:
   4129		return -ENOPROTOOPT;
   4130	}
   4131
   4132	if (len > lv)
   4133		len = lv;
   4134	if (put_user(len, optlen))
   4135		return -EFAULT;
   4136	if (copy_to_user(optval, data, len))
   4137		return -EFAULT;
   4138	return 0;
   4139}
   4140
   4141static int packet_notifier(struct notifier_block *this,
   4142			   unsigned long msg, void *ptr)
   4143{
   4144	struct sock *sk;
   4145	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   4146	struct net *net = dev_net(dev);
   4147
   4148	rcu_read_lock();
   4149	sk_for_each_rcu(sk, &net->packet.sklist) {
   4150		struct packet_sock *po = pkt_sk(sk);
   4151
   4152		switch (msg) {
   4153		case NETDEV_UNREGISTER:
   4154			if (po->mclist)
   4155				packet_dev_mclist_delete(dev, &po->mclist);
   4156			fallthrough;
   4157
   4158		case NETDEV_DOWN:
   4159			if (dev->ifindex == po->ifindex) {
   4160				spin_lock(&po->bind_lock);
   4161				if (po->running) {
   4162					__unregister_prot_hook(sk, false);
   4163					sk->sk_err = ENETDOWN;
   4164					if (!sock_flag(sk, SOCK_DEAD))
   4165						sk_error_report(sk);
   4166				}
   4167				if (msg == NETDEV_UNREGISTER) {
   4168					packet_cached_dev_reset(po);
   4169					WRITE_ONCE(po->ifindex, -1);
   4170					dev_put_track(po->prot_hook.dev,
   4171						      &po->prot_hook.dev_tracker);
   4172					po->prot_hook.dev = NULL;
   4173				}
   4174				spin_unlock(&po->bind_lock);
   4175			}
   4176			break;
   4177		case NETDEV_UP:
   4178			if (dev->ifindex == po->ifindex) {
   4179				spin_lock(&po->bind_lock);
   4180				if (po->num)
   4181					register_prot_hook(sk);
   4182				spin_unlock(&po->bind_lock);
   4183			}
   4184			break;
   4185		}
   4186	}
   4187	rcu_read_unlock();
   4188	return NOTIFY_DONE;
   4189}
   4190
   4191
   4192static int packet_ioctl(struct socket *sock, unsigned int cmd,
   4193			unsigned long arg)
   4194{
   4195	struct sock *sk = sock->sk;
   4196
   4197	switch (cmd) {
   4198	case SIOCOUTQ:
   4199	{
   4200		int amount = sk_wmem_alloc_get(sk);
   4201
   4202		return put_user(amount, (int __user *)arg);
   4203	}
   4204	case SIOCINQ:
   4205	{
   4206		struct sk_buff *skb;
   4207		int amount = 0;
   4208
   4209		spin_lock_bh(&sk->sk_receive_queue.lock);
   4210		skb = skb_peek(&sk->sk_receive_queue);
   4211		if (skb)
   4212			amount = skb->len;
   4213		spin_unlock_bh(&sk->sk_receive_queue.lock);
   4214		return put_user(amount, (int __user *)arg);
   4215	}
   4216#ifdef CONFIG_INET
   4217	case SIOCADDRT:
   4218	case SIOCDELRT:
   4219	case SIOCDARP:
   4220	case SIOCGARP:
   4221	case SIOCSARP:
   4222	case SIOCGIFADDR:
   4223	case SIOCSIFADDR:
   4224	case SIOCGIFBRDADDR:
   4225	case SIOCSIFBRDADDR:
   4226	case SIOCGIFNETMASK:
   4227	case SIOCSIFNETMASK:
   4228	case SIOCGIFDSTADDR:
   4229	case SIOCSIFDSTADDR:
   4230	case SIOCSIFFLAGS:
   4231		return inet_dgram_ops.ioctl(sock, cmd, arg);
   4232#endif
   4233
   4234	default:
   4235		return -ENOIOCTLCMD;
   4236	}
   4237	return 0;
   4238}
   4239
   4240static __poll_t packet_poll(struct file *file, struct socket *sock,
   4241				poll_table *wait)
   4242{
   4243	struct sock *sk = sock->sk;
   4244	struct packet_sock *po = pkt_sk(sk);
   4245	__poll_t mask = datagram_poll(file, sock, wait);
   4246
   4247	spin_lock_bh(&sk->sk_receive_queue.lock);
   4248	if (po->rx_ring.pg_vec) {
   4249		if (!packet_previous_rx_frame(po, &po->rx_ring,
   4250			TP_STATUS_KERNEL))
   4251			mask |= EPOLLIN | EPOLLRDNORM;
   4252	}
   4253	packet_rcv_try_clear_pressure(po);
   4254	spin_unlock_bh(&sk->sk_receive_queue.lock);
   4255	spin_lock_bh(&sk->sk_write_queue.lock);
   4256	if (po->tx_ring.pg_vec) {
   4257		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
   4258			mask |= EPOLLOUT | EPOLLWRNORM;
   4259	}
   4260	spin_unlock_bh(&sk->sk_write_queue.lock);
   4261	return mask;
   4262}
   4263
   4264
    4265	/* Dirty? Well, I still have not found a better way to account
   4266 * for user mmaps.
   4267 */
   4268
   4269static void packet_mm_open(struct vm_area_struct *vma)
   4270{
   4271	struct file *file = vma->vm_file;
   4272	struct socket *sock = file->private_data;
   4273	struct sock *sk = sock->sk;
   4274
   4275	if (sk)
   4276		atomic_inc(&pkt_sk(sk)->mapped);
   4277}
   4278
   4279static void packet_mm_close(struct vm_area_struct *vma)
   4280{
   4281	struct file *file = vma->vm_file;
   4282	struct socket *sock = file->private_data;
   4283	struct sock *sk = sock->sk;
   4284
   4285	if (sk)
   4286		atomic_dec(&pkt_sk(sk)->mapped);
   4287}
   4288
   4289static const struct vm_operations_struct packet_mmap_ops = {
   4290	.open	=	packet_mm_open,
   4291	.close	=	packet_mm_close,
   4292};
   4293
   4294static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
   4295			unsigned int len)
   4296{
   4297	int i;
   4298
   4299	for (i = 0; i < len; i++) {
   4300		if (likely(pg_vec[i].buffer)) {
   4301			if (is_vmalloc_addr(pg_vec[i].buffer))
   4302				vfree(pg_vec[i].buffer);
   4303			else
   4304				free_pages((unsigned long)pg_vec[i].buffer,
   4305					   order);
   4306			pg_vec[i].buffer = NULL;
   4307		}
   4308	}
   4309	kfree(pg_vec);
   4310}
   4311
   4312static char *alloc_one_pg_vec_page(unsigned long order)
   4313{
   4314	char *buffer;
   4315	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
   4316			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
   4317
   4318	buffer = (char *) __get_free_pages(gfp_flags, order);
   4319	if (buffer)
   4320		return buffer;
   4321
   4322	/* __get_free_pages failed, fall back to vmalloc */
   4323	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
   4324	if (buffer)
   4325		return buffer;
   4326
    4327	/* vmalloc failed, let's dig into swap here */
   4328	gfp_flags &= ~__GFP_NORETRY;
   4329	buffer = (char *) __get_free_pages(gfp_flags, order);
   4330	if (buffer)
   4331		return buffer;
   4332
   4333	/* complete and utter failure */
   4334	return NULL;
   4335}
   4336
   4337static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
   4338{
   4339	unsigned int block_nr = req->tp_block_nr;
   4340	struct pgv *pg_vec;
   4341	int i;
   4342
   4343	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
   4344	if (unlikely(!pg_vec))
   4345		goto out;
   4346
   4347	for (i = 0; i < block_nr; i++) {
   4348		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
   4349		if (unlikely(!pg_vec[i].buffer))
   4350			goto out_free_pgvec;
   4351	}
   4352
   4353out:
   4354	return pg_vec;
   4355
   4356out_free_pgvec:
   4357	free_pg_vec(pg_vec, order, block_nr);
   4358	pg_vec = NULL;
   4359	goto out;
   4360}
   4361
   4362static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
   4363		int closing, int tx_ring)
   4364{
   4365	struct pgv *pg_vec = NULL;
   4366	struct packet_sock *po = pkt_sk(sk);
   4367	unsigned long *rx_owner_map = NULL;
   4368	int was_running, order = 0;
   4369	struct packet_ring_buffer *rb;
   4370	struct sk_buff_head *rb_queue;
   4371	__be16 num;
   4372	int err;
    4373	/* Alias added to keep the code churn minimal */
   4374	struct tpacket_req *req = &req_u->req;
   4375
   4376	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
   4377	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
   4378
   4379	err = -EBUSY;
   4380	if (!closing) {
   4381		if (atomic_read(&po->mapped))
   4382			goto out;
   4383		if (packet_read_pending(rb))
   4384			goto out;
   4385	}
   4386
   4387	if (req->tp_block_nr) {
   4388		unsigned int min_frame_size;
   4389
   4390		/* Sanity tests and some calculations */
   4391		err = -EBUSY;
   4392		if (unlikely(rb->pg_vec))
   4393			goto out;
   4394
   4395		switch (po->tp_version) {
   4396		case TPACKET_V1:
   4397			po->tp_hdrlen = TPACKET_HDRLEN;
   4398			break;
   4399		case TPACKET_V2:
   4400			po->tp_hdrlen = TPACKET2_HDRLEN;
   4401			break;
   4402		case TPACKET_V3:
   4403			po->tp_hdrlen = TPACKET3_HDRLEN;
   4404			break;
   4405		}
   4406
   4407		err = -EINVAL;
   4408		if (unlikely((int)req->tp_block_size <= 0))
   4409			goto out;
   4410		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
   4411			goto out;
   4412		min_frame_size = po->tp_hdrlen + po->tp_reserve;
   4413		if (po->tp_version >= TPACKET_V3 &&
   4414		    req->tp_block_size <
   4415		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
   4416			goto out;
   4417		if (unlikely(req->tp_frame_size < min_frame_size))
   4418			goto out;
   4419		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
   4420			goto out;
   4421
   4422		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
   4423		if (unlikely(rb->frames_per_block == 0))
   4424			goto out;
   4425		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
   4426			goto out;
   4427		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
   4428					req->tp_frame_nr))
   4429			goto out;
   4430
   4431		err = -ENOMEM;
   4432		order = get_order(req->tp_block_size);
   4433		pg_vec = alloc_pg_vec(req, order);
   4434		if (unlikely(!pg_vec))
   4435			goto out;
   4436		switch (po->tp_version) {
   4437		case TPACKET_V3:
   4438			/* Block transmit is not supported yet */
   4439			if (!tx_ring) {
   4440				init_prb_bdqc(po, rb, pg_vec, req_u);
   4441			} else {
   4442				struct tpacket_req3 *req3 = &req_u->req3;
   4443
   4444				if (req3->tp_retire_blk_tov ||
   4445				    req3->tp_sizeof_priv ||
   4446				    req3->tp_feature_req_word) {
   4447					err = -EINVAL;
   4448					goto out_free_pg_vec;
   4449				}
   4450			}
   4451			break;
   4452		default:
   4453			if (!tx_ring) {
   4454				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
   4455					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
   4456				if (!rx_owner_map)
   4457					goto out_free_pg_vec;
   4458			}
   4459			break;
   4460		}
   4461	}
   4462	/* Done */
   4463	else {
   4464		err = -EINVAL;
   4465		if (unlikely(req->tp_frame_nr))
   4466			goto out;
   4467	}
   4468
   4469
   4470	/* Detach socket from network */
   4471	spin_lock(&po->bind_lock);
   4472	was_running = po->running;
   4473	num = po->num;
   4474	if (was_running) {
   4475		WRITE_ONCE(po->num, 0);
   4476		__unregister_prot_hook(sk, false);
   4477	}
   4478	spin_unlock(&po->bind_lock);
   4479
   4480	synchronize_net();
   4481
   4482	err = -EBUSY;
   4483	mutex_lock(&po->pg_vec_lock);
   4484	if (closing || atomic_read(&po->mapped) == 0) {
   4485		err = 0;
   4486		spin_lock_bh(&rb_queue->lock);
   4487		swap(rb->pg_vec, pg_vec);
   4488		if (po->tp_version <= TPACKET_V2)
   4489			swap(rb->rx_owner_map, rx_owner_map);
   4490		rb->frame_max = (req->tp_frame_nr - 1);
   4491		rb->head = 0;
   4492		rb->frame_size = req->tp_frame_size;
   4493		spin_unlock_bh(&rb_queue->lock);
   4494
   4495		swap(rb->pg_vec_order, order);
   4496		swap(rb->pg_vec_len, req->tp_block_nr);
   4497
   4498		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
   4499		po->prot_hook.func = (po->rx_ring.pg_vec) ?
   4500						tpacket_rcv : packet_rcv;
   4501		skb_queue_purge(rb_queue);
   4502		if (atomic_read(&po->mapped))
   4503			pr_err("packet_mmap: vma is busy: %d\n",
   4504			       atomic_read(&po->mapped));
   4505	}
   4506	mutex_unlock(&po->pg_vec_lock);
   4507
   4508	spin_lock(&po->bind_lock);
   4509	if (was_running) {
   4510		WRITE_ONCE(po->num, num);
   4511		register_prot_hook(sk);
   4512	}
   4513	spin_unlock(&po->bind_lock);
   4514	if (pg_vec && (po->tp_version > TPACKET_V2)) {
   4515		/* Because we don't support block-based V3 on tx-ring */
   4516		if (!tx_ring)
   4517			prb_shutdown_retire_blk_timer(po, rb_queue);
   4518	}
   4519
   4520out_free_pg_vec:
   4521	if (pg_vec) {
   4522		bitmap_free(rx_owner_map);
   4523		free_pg_vec(pg_vec, order, req->tp_block_nr);
   4524	}
   4525out:
   4526	return err;
   4527}
   4528
   4529static int packet_mmap(struct file *file, struct socket *sock,
   4530		struct vm_area_struct *vma)
   4531{
   4532	struct sock *sk = sock->sk;
   4533	struct packet_sock *po = pkt_sk(sk);
   4534	unsigned long size, expected_size;
   4535	struct packet_ring_buffer *rb;
   4536	unsigned long start;
   4537	int err = -EINVAL;
   4538	int i;
   4539
   4540	if (vma->vm_pgoff)
   4541		return -EINVAL;
   4542
   4543	mutex_lock(&po->pg_vec_lock);
   4544
   4545	expected_size = 0;
   4546	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
   4547		if (rb->pg_vec) {
   4548			expected_size += rb->pg_vec_len
   4549						* rb->pg_vec_pages
   4550						* PAGE_SIZE;
   4551		}
   4552	}
   4553
   4554	if (expected_size == 0)
   4555		goto out;
   4556
   4557	size = vma->vm_end - vma->vm_start;
   4558	if (size != expected_size)
   4559		goto out;
   4560
   4561	start = vma->vm_start;
   4562	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
   4563		if (rb->pg_vec == NULL)
   4564			continue;
   4565
   4566		for (i = 0; i < rb->pg_vec_len; i++) {
   4567			struct page *page;
   4568			void *kaddr = rb->pg_vec[i].buffer;
   4569			int pg_num;
   4570
   4571			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
   4572				page = pgv_to_page(kaddr);
   4573				err = vm_insert_page(vma, start, page);
   4574				if (unlikely(err))
   4575					goto out;
   4576				start += PAGE_SIZE;
   4577				kaddr += PAGE_SIZE;
   4578			}
   4579		}
   4580	}
   4581
   4582	atomic_inc(&po->mapped);
   4583	vma->vm_ops = &packet_mmap_ops;
   4584	err = 0;
   4585
   4586out:
   4587	mutex_unlock(&po->pg_vec_lock);
   4588	return err;
   4589}
   4590
   4591static const struct proto_ops packet_ops_spkt = {
   4592	.family =	PF_PACKET,
   4593	.owner =	THIS_MODULE,
   4594	.release =	packet_release,
   4595	.bind =		packet_bind_spkt,
   4596	.connect =	sock_no_connect,
   4597	.socketpair =	sock_no_socketpair,
   4598	.accept =	sock_no_accept,
   4599	.getname =	packet_getname_spkt,
   4600	.poll =		datagram_poll,
   4601	.ioctl =	packet_ioctl,
   4602	.gettstamp =	sock_gettstamp,
   4603	.listen =	sock_no_listen,
   4604	.shutdown =	sock_no_shutdown,
   4605	.sendmsg =	packet_sendmsg_spkt,
   4606	.recvmsg =	packet_recvmsg,
   4607	.mmap =		sock_no_mmap,
   4608	.sendpage =	sock_no_sendpage,
   4609};
   4610
   4611static const struct proto_ops packet_ops = {
   4612	.family =	PF_PACKET,
   4613	.owner =	THIS_MODULE,
   4614	.release =	packet_release,
   4615	.bind =		packet_bind,
   4616	.connect =	sock_no_connect,
   4617	.socketpair =	sock_no_socketpair,
   4618	.accept =	sock_no_accept,
   4619	.getname =	packet_getname,
   4620	.poll =		packet_poll,
   4621	.ioctl =	packet_ioctl,
   4622	.gettstamp =	sock_gettstamp,
   4623	.listen =	sock_no_listen,
   4624	.shutdown =	sock_no_shutdown,
   4625	.setsockopt =	packet_setsockopt,
   4626	.getsockopt =	packet_getsockopt,
   4627	.sendmsg =	packet_sendmsg,
   4628	.recvmsg =	packet_recvmsg,
   4629	.mmap =		packet_mmap,
   4630	.sendpage =	sock_no_sendpage,
   4631};
   4632
   4633static const struct net_proto_family packet_family_ops = {
   4634	.family =	PF_PACKET,
   4635	.create =	packet_create,
   4636	.owner	=	THIS_MODULE,
   4637};
   4638
   4639static struct notifier_block packet_netdev_notifier = {
   4640	.notifier_call =	packet_notifier,
   4641};
   4642
   4643#ifdef CONFIG_PROC_FS
   4644
   4645static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
   4646	__acquires(RCU)
   4647{
   4648	struct net *net = seq_file_net(seq);
   4649
   4650	rcu_read_lock();
   4651	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
   4652}
   4653
   4654static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
   4655{
   4656	struct net *net = seq_file_net(seq);
   4657	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
   4658}
   4659
   4660static void packet_seq_stop(struct seq_file *seq, void *v)
   4661	__releases(RCU)
   4662{
   4663	rcu_read_unlock();
   4664}
   4665
   4666static int packet_seq_show(struct seq_file *seq, void *v)
   4667{
   4668	if (v == SEQ_START_TOKEN)
   4669		seq_printf(seq,
   4670			   "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
   4671			   IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
   4672	else {
   4673		struct sock *s = sk_entry(v);
   4674		const struct packet_sock *po = pkt_sk(s);
   4675
   4676		seq_printf(seq,
   4677			   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
   4678			   s,
   4679			   refcount_read(&s->sk_refcnt),
   4680			   s->sk_type,
   4681			   ntohs(READ_ONCE(po->num)),
   4682			   READ_ONCE(po->ifindex),
   4683			   po->running,
   4684			   atomic_read(&s->sk_rmem_alloc),
   4685			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
   4686			   sock_i_ino(s));
   4687	}
   4688
   4689	return 0;
   4690}
   4691
   4692static const struct seq_operations packet_seq_ops = {
   4693	.start	= packet_seq_start,
   4694	.next	= packet_seq_next,
   4695	.stop	= packet_seq_stop,
   4696	.show	= packet_seq_show,
   4697};
   4698#endif
   4699
   4700static int __net_init packet_net_init(struct net *net)
   4701{
   4702	mutex_init(&net->packet.sklist_lock);
   4703	INIT_HLIST_HEAD(&net->packet.sklist);
   4704
   4705#ifdef CONFIG_PROC_FS
   4706	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
   4707			sizeof(struct seq_net_private)))
   4708		return -ENOMEM;
   4709#endif /* CONFIG_PROC_FS */
   4710
   4711	return 0;
   4712}
   4713
   4714static void __net_exit packet_net_exit(struct net *net)
   4715{
   4716	remove_proc_entry("packet", net->proc_net);
   4717	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
   4718}
   4719
   4720static struct pernet_operations packet_net_ops = {
   4721	.init = packet_net_init,
   4722	.exit = packet_net_exit,
   4723};
   4724
   4725
   4726static void __exit packet_exit(void)
   4727{
   4728	unregister_netdevice_notifier(&packet_netdev_notifier);
   4729	unregister_pernet_subsys(&packet_net_ops);
   4730	sock_unregister(PF_PACKET);
   4731	proto_unregister(&packet_proto);
   4732}
   4733
   4734static int __init packet_init(void)
   4735{
   4736	int rc;
   4737
   4738	rc = proto_register(&packet_proto, 0);
   4739	if (rc)
   4740		goto out;
   4741	rc = sock_register(&packet_family_ops);
   4742	if (rc)
   4743		goto out_proto;
   4744	rc = register_pernet_subsys(&packet_net_ops);
   4745	if (rc)
   4746		goto out_sock;
   4747	rc = register_netdevice_notifier(&packet_netdev_notifier);
   4748	if (rc)
   4749		goto out_pernet;
   4750
   4751	return 0;
   4752
   4753out_pernet:
   4754	unregister_pernet_subsys(&packet_net_ops);
   4755out_sock:
   4756	sock_unregister(PF_PACKET);
   4757out_proto:
   4758	proto_unregister(&packet_proto);
   4759out:
   4760	return rc;
   4761}
   4762
   4763module_init(packet_init);
   4764module_exit(packet_exit);
   4765MODULE_LICENSE("GPL");
   4766MODULE_ALIAS_NETPROTO(PF_PACKET);