cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

tun.c (88131B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *  TUN - Universal TUN/TAP device driver.
      4 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
      5 *
      6 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
      7 */
      8
      9/*
     10 *  Changes:
     11 *
     12 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
     13 *    Add TUNSETLINK ioctl to set the link encapsulation
     14 *
     15 *  Mark Smith <markzzzsmith@yahoo.com.au>
     16 *    Use eth_random_addr() for tap MAC address.
     17 *
     18 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
     19 *    Fixes in packet dropping, queue length setting and queue wakeup.
     20 *    Increased default tx queue length.
     21 *    Added ethtool API.
     22 *    Minor cleanups
     23 *
     24 *  Daniel Podlejski <underley@underley.eu.org>
     25 *    Modifications for 2.3.99-pre5 kernel.
     26 */
     27
     28#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     29
     30#define DRV_NAME	"tun"
     31#define DRV_VERSION	"1.6"
     32#define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
     33#define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
     34
     35#include <linux/module.h>
     36#include <linux/errno.h>
     37#include <linux/kernel.h>
     38#include <linux/sched/signal.h>
     39#include <linux/major.h>
     40#include <linux/slab.h>
     41#include <linux/poll.h>
     42#include <linux/fcntl.h>
     43#include <linux/init.h>
     44#include <linux/skbuff.h>
     45#include <linux/netdevice.h>
     46#include <linux/etherdevice.h>
     47#include <linux/miscdevice.h>
     48#include <linux/ethtool.h>
     49#include <linux/rtnetlink.h>
     50#include <linux/compat.h>
     51#include <linux/if.h>
     52#include <linux/if_arp.h>
     53#include <linux/if_ether.h>
     54#include <linux/if_tun.h>
     55#include <linux/if_vlan.h>
     56#include <linux/crc32.h>
     57#include <linux/nsproxy.h>
     58#include <linux/virtio_net.h>
     59#include <linux/rcupdate.h>
     60#include <net/net_namespace.h>
     61#include <net/netns/generic.h>
     62#include <net/rtnetlink.h>
     63#include <net/sock.h>
     64#include <net/xdp.h>
     65#include <net/ip_tunnels.h>
     66#include <linux/seq_file.h>
     67#include <linux/uio.h>
     68#include <linux/skb_array.h>
     69#include <linux/bpf.h>
     70#include <linux/bpf_trace.h>
     71#include <linux/mutex.h>
     72#include <linux/ieee802154.h>
     73#include <linux/if_ltalk.h>
     74#include <uapi/linux/if_fddi.h>
     75#include <uapi/linux/if_hippi.h>
     76#include <uapi/linux/if_fc.h>
     77#include <net/ax25.h>
     78#include <net/rose.h>
     79#include <net/6lowpan.h>
     80
     81#include <linux/uaccess.h>
     82#include <linux/proc_fs.h>
     83
     84static void tun_default_link_ksettings(struct net_device *dev,
     85				       struct ethtool_link_ksettings *cmd);
     86
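        /* Headroom reserved in front of packet data on the tun_build_skb()
         * receive path.
         */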
     87#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
     88
     89/* TUN device flags */
     90
     91/* IFF_ATTACH_QUEUE is never stored in device flags,
      92 * so overload it to mean fasync when stored there.
     93 */
     94#define TUN_FASYNC	IFF_ATTACH_QUEUE
     95/* High bits in flags field are unused. */
     96#define TUN_VNET_LE     0x80000000
     97#define TUN_VNET_BE     0x40000000
     98
     99#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
    100		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
    101
    102#define GOODCOPY_LEN 128
    103
    104#define FLT_EXACT_COUNT 8
    105struct tap_filter {
    106	unsigned int    count;    /* Number of addrs. Zero means disabled */
    107	u32             mask[2];  /* Mask of the hashed addrs */
    108	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
    109};
    110
    111/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
     112 * to the maximum number of VCPUs in a guest. */
    113#define MAX_TAP_QUEUES 256
    114#define MAX_TAP_FLOWS  4096
    115
    116#define TUN_FLOW_EXPIRE (3 * HZ)
    117
    118/* A tun_file connects an open character device to a tuntap netdevice. It
    119 * also contains all socket related structures (except sock_fprog and tap_filter)
     120 * to serve as one transmit queue for the tuntap device. The sock_fprog and
     121 * tap_filter are kept in tun_struct since they are used for filtering on the
     122 * netdevice as a whole, not on a specific queue (at least I didn't see a
     123 * requirement for this).
    124 *
    125 * RCU usage:
    126 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
    127 * other can only be read while rcu_read_lock or rtnl_lock is held.
    128 */
    129struct tun_file {
    130	struct sock sk;
    131	struct socket socket;
    132	struct tun_struct __rcu *tun;
    133	struct fasync_struct *fasync;
     134	/* only used for fasync */
    135	unsigned int flags;
    136	union {
    137		u16 queue_index;
    138		unsigned int ifindex;
    139	};
    140	struct napi_struct napi;
    141	bool napi_enabled;
    142	bool napi_frags_enabled;
    143	struct mutex napi_mutex;	/* Protects access to the above napi */
    144	struct list_head next;
    145	struct tun_struct *detached;
    146	struct ptr_ring tx_ring;
    147	struct xdp_rxq_info xdp_rxq;
    148};
    149
    150struct tun_page {
    151	struct page *page;
    152	int count;
    153};
    154
    155struct tun_flow_entry {
    156	struct hlist_node hash_link;
    157	struct rcu_head rcu;
    158	struct tun_struct *tun;
    159
    160	u32 rxhash;
    161	u32 rps_rxhash;
    162	int queue_index;
    163	unsigned long updated ____cacheline_aligned_in_smp;
    164};
    165
    166#define TUN_NUM_FLOW_ENTRIES 1024
    167#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
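        /* The flow table is indexed by masking the rxhash (see tun_hashfn()),
         * so TUN_NUM_FLOW_ENTRIES must be a power of two.
         */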
    168
    169struct tun_prog {
    170	struct rcu_head rcu;
    171	struct bpf_prog *prog;
    172};
    173
     174/* Since the socket was moved to tun_file, to preserve the behavior of a persist
     175 * device, the socket filter, sndbuf and vnet header size are restored when the
     176 * file is attached to a persist device.
    177 */
    178struct tun_struct {
    179	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];
    180	unsigned int            numqueues;
    181	unsigned int 		flags;
    182	kuid_t			owner;
    183	kgid_t			group;
    184
    185	struct net_device	*dev;
    186	netdev_features_t	set_features;
    187#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
    188			  NETIF_F_TSO6)
    189
    190	int			align;
    191	int			vnet_hdr_sz;
    192	int			sndbuf;
    193	struct tap_filter	txflt;
    194	struct sock_fprog	fprog;
    195	/* protected by rtnl lock */
    196	bool			filter_attached;
    197	u32			msg_enable;
    198	spinlock_t lock;
    199	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
    200	struct timer_list flow_gc_timer;
    201	unsigned long ageing_time;
    202	unsigned int numdisabled;
    203	struct list_head disabled;
    204	void *security;
    205	u32 flow_count;
    206	u32 rx_batched;
    207	atomic_long_t rx_frame_errors;
    208	struct bpf_prog __rcu *xdp_prog;
    209	struct tun_prog __rcu *steering_prog;
    210	struct tun_prog __rcu *filter_prog;
    211	struct ethtool_link_ksettings link_ksettings;
    212	/* init args */
    213	struct file *file;
    214	struct ifreq *ifr;
    215};
    216
    217struct veth {
    218	__be16 h_vlan_proto;
    219	__be16 h_vlan_TCI;
    220};
    221
    222static void tun_flow_init(struct tun_struct *tun);
    223static void tun_flow_uninit(struct tun_struct *tun);
    224
    225static int tun_napi_receive(struct napi_struct *napi, int budget)
    226{
    227	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
    228	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
    229	struct sk_buff_head process_queue;
    230	struct sk_buff *skb;
    231	int received = 0;
    232
    233	__skb_queue_head_init(&process_queue);
    234
    235	spin_lock(&queue->lock);
    236	skb_queue_splice_tail_init(queue, &process_queue);
    237	spin_unlock(&queue->lock);
    238
    239	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
    240		napi_gro_receive(napi, skb);
    241		++received;
    242	}
    243
    244	if (!skb_queue_empty(&process_queue)) {
    245		spin_lock(&queue->lock);
    246		skb_queue_splice(&process_queue, queue);
    247		spin_unlock(&queue->lock);
    248	}
    249
    250	return received;
    251}
    252
    253static int tun_napi_poll(struct napi_struct *napi, int budget)
    254{
    255	unsigned int received;
    256
    257	received = tun_napi_receive(napi, budget);
    258
    259	if (received < budget)
    260		napi_complete_done(napi, received);
    261
    262	return received;
    263}
    264
    265static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
    266			  bool napi_en, bool napi_frags)
    267{
    268	tfile->napi_enabled = napi_en;
    269	tfile->napi_frags_enabled = napi_en && napi_frags;
    270	if (napi_en) {
    271		netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll);
    272		napi_enable(&tfile->napi);
    273	}
    274}
    275
    276static void tun_napi_enable(struct tun_file *tfile)
    277{
    278	if (tfile->napi_enabled)
    279		napi_enable(&tfile->napi);
    280}
    281
    282static void tun_napi_disable(struct tun_file *tfile)
    283{
    284	if (tfile->napi_enabled)
    285		napi_disable(&tfile->napi);
    286}
    287
    288static void tun_napi_del(struct tun_file *tfile)
    289{
    290	if (tfile->napi_enabled)
    291		netif_napi_del(&tfile->napi);
    292}
    293
    294static bool tun_napi_frags_enabled(const struct tun_file *tfile)
    295{
    296	return tfile->napi_frags_enabled;
    297}
    298
    299#ifdef CONFIG_TUN_VNET_CROSS_LE
    300static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
    301{
    302	return tun->flags & TUN_VNET_BE ? false :
    303		virtio_legacy_is_little_endian();
    304}
    305
    306static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
    307{
    308	int be = !!(tun->flags & TUN_VNET_BE);
    309
    310	if (put_user(be, argp))
    311		return -EFAULT;
    312
    313	return 0;
    314}
    315
    316static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
    317{
    318	int be;
    319
    320	if (get_user(be, argp))
    321		return -EFAULT;
    322
    323	if (be)
    324		tun->flags |= TUN_VNET_BE;
    325	else
    326		tun->flags &= ~TUN_VNET_BE;
    327
    328	return 0;
    329}
    330#else
    331static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
    332{
    333	return virtio_legacy_is_little_endian();
    334}
    335
    336static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
    337{
    338	return -EINVAL;
    339}
    340
    341static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
    342{
    343	return -EINVAL;
    344}
    345#endif /* CONFIG_TUN_VNET_CROSS_LE */
    346
    347static inline bool tun_is_little_endian(struct tun_struct *tun)
    348{
    349	return tun->flags & TUN_VNET_LE ||
    350		tun_legacy_is_little_endian(tun);
    351}
    352
    353static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
    354{
    355	return __virtio16_to_cpu(tun_is_little_endian(tun), val);
    356}
    357
    358static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
    359{
    360	return __cpu_to_virtio16(tun_is_little_endian(tun), val);
    361}
    362
    363static inline u32 tun_hashfn(u32 rxhash)
    364{
    365	return rxhash & TUN_MASK_FLOW_ENTRIES;
    366}
    367
    368static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
    369{
    370	struct tun_flow_entry *e;
    371
    372	hlist_for_each_entry_rcu(e, head, hash_link) {
    373		if (e->rxhash == rxhash)
    374			return e;
    375	}
    376	return NULL;
    377}
    378
    379static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
    380					      struct hlist_head *head,
    381					      u32 rxhash, u16 queue_index)
    382{
    383	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
    384
    385	if (e) {
    386		netif_info(tun, tx_queued, tun->dev,
    387			   "create flow: hash %u index %u\n",
    388			   rxhash, queue_index);
    389		e->updated = jiffies;
    390		e->rxhash = rxhash;
    391		e->rps_rxhash = 0;
    392		e->queue_index = queue_index;
    393		e->tun = tun;
    394		hlist_add_head_rcu(&e->hash_link, head);
    395		++tun->flow_count;
    396	}
    397	return e;
    398}
    399
    400static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
    401{
    402	netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
    403		   e->rxhash, e->queue_index);
    404	hlist_del_rcu(&e->hash_link);
    405	kfree_rcu(e, rcu);
    406	--tun->flow_count;
    407}
    408
    409static void tun_flow_flush(struct tun_struct *tun)
    410{
    411	int i;
    412
    413	spin_lock_bh(&tun->lock);
    414	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
    415		struct tun_flow_entry *e;
    416		struct hlist_node *n;
    417
    418		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
    419			tun_flow_delete(tun, e);
    420	}
    421	spin_unlock_bh(&tun->lock);
    422}
    423
    424static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
    425{
    426	int i;
    427
    428	spin_lock_bh(&tun->lock);
    429	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
    430		struct tun_flow_entry *e;
    431		struct hlist_node *n;
    432
    433		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
    434			if (e->queue_index == queue_index)
    435				tun_flow_delete(tun, e);
    436		}
    437	}
    438	spin_unlock_bh(&tun->lock);
    439}
    440
    441static void tun_flow_cleanup(struct timer_list *t)
    442{
    443	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
    444	unsigned long delay = tun->ageing_time;
    445	unsigned long next_timer = jiffies + delay;
    446	unsigned long count = 0;
    447	int i;
    448
    449	spin_lock(&tun->lock);
    450	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
    451		struct tun_flow_entry *e;
    452		struct hlist_node *n;
    453
    454		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
    455			unsigned long this_timer;
    456
    457			this_timer = e->updated + delay;
    458			if (time_before_eq(this_timer, jiffies)) {
    459				tun_flow_delete(tun, e);
    460				continue;
    461			}
    462			count++;
    463			if (time_before(this_timer, next_timer))
    464				next_timer = this_timer;
    465		}
    466	}
    467
    468	if (count)
    469		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
    470	spin_unlock(&tun->lock);
    471}
    472
    473static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
    474			    struct tun_file *tfile)
    475{
    476	struct hlist_head *head;
    477	struct tun_flow_entry *e;
    478	unsigned long delay = tun->ageing_time;
    479	u16 queue_index = tfile->queue_index;
    480
    481	head = &tun->flows[tun_hashfn(rxhash)];
    482
    483	rcu_read_lock();
    484
    485	e = tun_flow_find(head, rxhash);
    486	if (likely(e)) {
    487		/* TODO: keep queueing to old queue until it's empty? */
    488		if (READ_ONCE(e->queue_index) != queue_index)
    489			WRITE_ONCE(e->queue_index, queue_index);
    490		if (e->updated != jiffies)
    491			e->updated = jiffies;
    492		sock_rps_record_flow_hash(e->rps_rxhash);
    493	} else {
    494		spin_lock_bh(&tun->lock);
    495		if (!tun_flow_find(head, rxhash) &&
    496		    tun->flow_count < MAX_TAP_FLOWS)
    497			tun_flow_create(tun, head, rxhash, queue_index);
    498
    499		if (!timer_pending(&tun->flow_gc_timer))
    500			mod_timer(&tun->flow_gc_timer,
    501				  round_jiffies_up(jiffies + delay));
    502		spin_unlock_bh(&tun->lock);
    503	}
    504
    505	rcu_read_unlock();
    506}
    507
    508/* Save the hash received in the stack receive path and update the
    509 * flow_hash table accordingly.
    510 */
    511static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
    512{
    513	if (unlikely(e->rps_rxhash != hash))
    514		e->rps_rxhash = hash;
    515}
    516
    517/* We try to identify a flow through its rxhash. The reason that
     518 * we do not check the rxq no. is that some cards (e.g. 82599) choose
     519 * the rxq based on the txq on which the last packet of the flow was sent. As
     520 * the userspace application moves between processors, we may get a
    521 * different rxq no. here.
    522 */
    523static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
    524{
    525	struct tun_flow_entry *e;
    526	u32 txq = 0;
    527	u32 numqueues = 0;
    528
    529	numqueues = READ_ONCE(tun->numqueues);
    530
    531	txq = __skb_get_hash_symmetric(skb);
    532	e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
    533	if (e) {
    534		tun_flow_save_rps_rxhash(e, txq);
    535		txq = e->queue_index;
    536	} else {
    537		/* use multiply and shift instead of expensive divide */
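        		/* ((u64)hash * numqueues) >> 32 maps a 32-bit hash
        		 * uniformly onto [0, numqueues) without a division. */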
    538		txq = ((u64)txq * numqueues) >> 32;
    539	}
    540
    541	return txq;
    542}
    543
    544static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
    545{
    546	struct tun_prog *prog;
    547	u32 numqueues;
    548	u16 ret = 0;
    549
    550	numqueues = READ_ONCE(tun->numqueues);
    551	if (!numqueues)
    552		return 0;
    553
    554	prog = rcu_dereference(tun->steering_prog);
    555	if (prog)
    556		ret = bpf_prog_run_clear_cb(prog->prog, skb);
    557
    558	return ret % numqueues;
    559}
    560
    561static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
    562			    struct net_device *sb_dev)
    563{
    564	struct tun_struct *tun = netdev_priv(dev);
    565	u16 ret;
    566
    567	rcu_read_lock();
    568	if (rcu_dereference(tun->steering_prog))
    569		ret = tun_ebpf_select_queue(tun, skb);
    570	else
    571		ret = tun_automq_select_queue(tun, skb);
    572	rcu_read_unlock();
    573
    574	return ret;
    575}
    576
    577static inline bool tun_not_capable(struct tun_struct *tun)
    578{
    579	const struct cred *cred = current_cred();
    580	struct net *net = dev_net(tun->dev);
    581
    582	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
    583		  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
    584		!ns_capable(net->user_ns, CAP_NET_ADMIN);
    585}
    586
    587static void tun_set_real_num_queues(struct tun_struct *tun)
    588{
    589	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
    590	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
    591}
    592
    593static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
    594{
    595	tfile->detached = tun;
    596	list_add_tail(&tfile->next, &tun->disabled);
    597	++tun->numdisabled;
    598}
    599
    600static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
    601{
    602	struct tun_struct *tun = tfile->detached;
    603
    604	tfile->detached = NULL;
    605	list_del_init(&tfile->next);
    606	--tun->numdisabled;
    607	return tun;
    608}
    609
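        /* tx_ring entries are either sk_buff or xdp_frame pointers; xdp_frames
         * are tagged in the low bit of the pointer (see tun_xdp_to_ptr() and
         * tun_is_xdp_frame()).
         */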
    610void tun_ptr_free(void *ptr)
    611{
    612	if (!ptr)
    613		return;
    614	if (tun_is_xdp_frame(ptr)) {
    615		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
    616
    617		xdp_return_frame(xdpf);
    618	} else {
    619		__skb_array_destroy_skb(ptr);
    620	}
    621}
    622EXPORT_SYMBOL_GPL(tun_ptr_free);
    623
    624static void tun_queue_purge(struct tun_file *tfile)
    625{
    626	void *ptr;
    627
    628	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
    629		tun_ptr_free(ptr);
    630
    631	skb_queue_purge(&tfile->sk.sk_write_queue);
    632	skb_queue_purge(&tfile->sk.sk_error_queue);
    633}
    634
    635static void __tun_detach(struct tun_file *tfile, bool clean)
    636{
    637	struct tun_file *ntfile;
    638	struct tun_struct *tun;
    639
    640	tun = rtnl_dereference(tfile->tun);
    641
    642	if (tun && clean) {
    643		if (!tfile->detached)
    644			tun_napi_disable(tfile);
    645		tun_napi_del(tfile);
    646	}
    647
    648	if (tun && !tfile->detached) {
    649		u16 index = tfile->queue_index;
    650		BUG_ON(index >= tun->numqueues);
    651
    652		rcu_assign_pointer(tun->tfiles[index],
    653				   tun->tfiles[tun->numqueues - 1]);
    654		ntfile = rtnl_dereference(tun->tfiles[index]);
    655		ntfile->queue_index = index;
    656		rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
    657				   NULL);
    658
    659		--tun->numqueues;
    660		if (clean) {
    661			RCU_INIT_POINTER(tfile->tun, NULL);
    662			sock_put(&tfile->sk);
    663		} else {
    664			tun_disable_queue(tun, tfile);
    665			tun_napi_disable(tfile);
    666		}
    667
    668		synchronize_net();
    669		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
    670		/* Drop read queue */
    671		tun_queue_purge(tfile);
    672		tun_set_real_num_queues(tun);
    673	} else if (tfile->detached && clean) {
    674		tun = tun_enable_queue(tfile);
    675		sock_put(&tfile->sk);
    676	}
    677
    678	if (clean) {
    679		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
    680			netif_carrier_off(tun->dev);
    681
    682			if (!(tun->flags & IFF_PERSIST) &&
    683			    tun->dev->reg_state == NETREG_REGISTERED)
    684				unregister_netdevice(tun->dev);
    685		}
    686		if (tun)
    687			xdp_rxq_info_unreg(&tfile->xdp_rxq);
    688		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
    689		sock_put(&tfile->sk);
    690	}
    691}
    692
    693static void tun_detach(struct tun_file *tfile, bool clean)
    694{
    695	struct tun_struct *tun;
    696	struct net_device *dev;
    697
    698	rtnl_lock();
    699	tun = rtnl_dereference(tfile->tun);
    700	dev = tun ? tun->dev : NULL;
    701	__tun_detach(tfile, clean);
    702	if (dev)
    703		netdev_state_change(dev);
    704	rtnl_unlock();
    705}
    706
    707static void tun_detach_all(struct net_device *dev)
    708{
    709	struct tun_struct *tun = netdev_priv(dev);
    710	struct tun_file *tfile, *tmp;
    711	int i, n = tun->numqueues;
    712
    713	for (i = 0; i < n; i++) {
    714		tfile = rtnl_dereference(tun->tfiles[i]);
    715		BUG_ON(!tfile);
    716		tun_napi_disable(tfile);
    717		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
    718		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
    719		RCU_INIT_POINTER(tfile->tun, NULL);
    720		--tun->numqueues;
    721	}
    722	list_for_each_entry(tfile, &tun->disabled, next) {
    723		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
    724		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
    725		RCU_INIT_POINTER(tfile->tun, NULL);
    726	}
    727	BUG_ON(tun->numqueues != 0);
    728
    729	synchronize_net();
    730	for (i = 0; i < n; i++) {
    731		tfile = rtnl_dereference(tun->tfiles[i]);
    732		tun_napi_del(tfile);
    733		/* Drop read queue */
    734		tun_queue_purge(tfile);
    735		xdp_rxq_info_unreg(&tfile->xdp_rxq);
    736		sock_put(&tfile->sk);
    737	}
    738	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
    739		tun_napi_del(tfile);
    740		tun_enable_queue(tfile);
    741		tun_queue_purge(tfile);
    742		xdp_rxq_info_unreg(&tfile->xdp_rxq);
    743		sock_put(&tfile->sk);
    744	}
    745	BUG_ON(tun->numdisabled != 0);
    746
    747	if (tun->flags & IFF_PERSIST)
    748		module_put(THIS_MODULE);
    749}
    750
    751static int tun_attach(struct tun_struct *tun, struct file *file,
    752		      bool skip_filter, bool napi, bool napi_frags,
    753		      bool publish_tun)
    754{
    755	struct tun_file *tfile = file->private_data;
    756	struct net_device *dev = tun->dev;
    757	int err;
    758
    759	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
    760	if (err < 0)
    761		goto out;
    762
    763	err = -EINVAL;
    764	if (rtnl_dereference(tfile->tun) && !tfile->detached)
    765		goto out;
    766
    767	err = -EBUSY;
    768	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
    769		goto out;
    770
    771	err = -E2BIG;
    772	if (!tfile->detached &&
    773	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
    774		goto out;
    775
    776	err = 0;
    777
    778	/* Re-attach the filter to persist device */
    779	if (!skip_filter && (tun->filter_attached == true)) {
    780		lock_sock(tfile->socket.sk);
    781		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
    782		release_sock(tfile->socket.sk);
    783		if (!err)
    784			goto out;
    785	}
    786
    787	if (!tfile->detached &&
    788	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
    789			    GFP_KERNEL, tun_ptr_free)) {
    790		err = -ENOMEM;
    791		goto out;
    792	}
    793
    794	tfile->queue_index = tun->numqueues;
    795	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
    796
    797	if (tfile->detached) {
    798		/* Re-attach detached tfile, updating XDP queue_index */
    799		WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
    800
    801		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
    802			tfile->xdp_rxq.queue_index = tfile->queue_index;
    803	} else {
    804		/* Setup XDP RX-queue info, for new tfile getting attached */
    805		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
    806				       tun->dev, tfile->queue_index, 0);
    807		if (err < 0)
    808			goto out;
    809		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
    810						 MEM_TYPE_PAGE_SHARED, NULL);
    811		if (err < 0) {
    812			xdp_rxq_info_unreg(&tfile->xdp_rxq);
    813			goto out;
    814		}
    815		err = 0;
    816	}
    817
    818	if (tfile->detached) {
    819		tun_enable_queue(tfile);
    820		tun_napi_enable(tfile);
    821	} else {
    822		sock_hold(&tfile->sk);
    823		tun_napi_init(tun, tfile, napi, napi_frags);
    824	}
    825
    826	if (rtnl_dereference(tun->xdp_prog))
    827		sock_set_flag(&tfile->sk, SOCK_XDP);
    828
    829	/* device is allowed to go away first, so no need to hold extra
    830	 * refcnt.
    831	 */
    832
    833	/* Publish tfile->tun and tun->tfiles only after we've fully
    834	 * initialized tfile; otherwise we risk using half-initialized
    835	 * object.
    836	 */
    837	if (publish_tun)
    838		rcu_assign_pointer(tfile->tun, tun);
    839	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
    840	tun->numqueues++;
    841	tun_set_real_num_queues(tun);
    842out:
    843	return err;
    844}
    845
    846static struct tun_struct *tun_get(struct tun_file *tfile)
    847{
    848	struct tun_struct *tun;
    849
    850	rcu_read_lock();
    851	tun = rcu_dereference(tfile->tun);
    852	if (tun)
    853		dev_hold(tun->dev);
    854	rcu_read_unlock();
    855
    856	return tun;
    857}
    858
    859static void tun_put(struct tun_struct *tun)
    860{
    861	dev_put(tun->dev);
    862}
    863
    864/* TAP filtering */
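        /* Multicast addresses beyond the exact filter are tracked in a 64-bit
         * hash bitmap (mask[2]): the top six bits of the Ethernet CRC of an
         * address select one of the 64 bits.
         */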
    865static void addr_hash_set(u32 *mask, const u8 *addr)
    866{
    867	int n = ether_crc(ETH_ALEN, addr) >> 26;
    868	mask[n >> 5] |= (1 << (n & 31));
    869}
    870
    871static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
    872{
    873	int n = ether_crc(ETH_ALEN, addr) >> 26;
    874	return mask[n >> 5] & (1 << (n & 31));
    875}
    876
    877static int update_filter(struct tap_filter *filter, void __user *arg)
    878{
    879	struct { u8 u[ETH_ALEN]; } *addr;
    880	struct tun_filter uf;
    881	int err, alen, n, nexact;
    882
    883	if (copy_from_user(&uf, arg, sizeof(uf)))
    884		return -EFAULT;
    885
    886	if (!uf.count) {
    887		/* Disabled */
    888		filter->count = 0;
    889		return 0;
    890	}
    891
    892	alen = ETH_ALEN * uf.count;
    893	addr = memdup_user(arg + sizeof(uf), alen);
    894	if (IS_ERR(addr))
    895		return PTR_ERR(addr);
    896
     897	/* The filter is updated without holding any locks, which is
     898	 * perfectly safe: we disable it first and in the worst
     899	 * case we'll accept a few undesired packets. */
    900	filter->count = 0;
    901	wmb();
    902
    903	/* Use first set of addresses as an exact filter */
    904	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
    905		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
    906
    907	nexact = n;
    908
    909	/* Remaining multicast addresses are hashed,
    910	 * unicast will leave the filter disabled. */
    911	memset(filter->mask, 0, sizeof(filter->mask));
    912	for (; n < uf.count; n++) {
    913		if (!is_multicast_ether_addr(addr[n].u)) {
    914			err = 0; /* no filter */
    915			goto free_addr;
    916		}
    917		addr_hash_set(filter->mask, addr[n].u);
    918	}
    919
    920	/* For ALLMULTI just set the mask to all ones.
    921	 * This overrides the mask populated above. */
    922	if ((uf.flags & TUN_FLT_ALLMULTI))
    923		memset(filter->mask, ~0, sizeof(filter->mask));
    924
    925	/* Now enable the filter */
    926	wmb();
    927	filter->count = nexact;
    928
    929	/* Return the number of exact filters */
    930	err = nexact;
    931free_addr:
    932	kfree(addr);
    933	return err;
    934}
    935
    936/* Returns: 0 - drop, !=0 - accept */
    937static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
    938{
     939	/* Cannot use eth_hdr(skb) here because skb_mac_header() is incorrect
    940	 * at this point. */
    941	struct ethhdr *eh = (struct ethhdr *) skb->data;
    942	int i;
    943
    944	/* Exact match */
    945	for (i = 0; i < filter->count; i++)
    946		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
    947			return 1;
    948
    949	/* Inexact match (multicast only) */
    950	if (is_multicast_ether_addr(eh->h_dest))
    951		return addr_hash_test(filter->mask, eh->h_dest);
    952
    953	return 0;
    954}
    955
    956/*
    957 * Checks whether the packet is accepted or not.
    958 * Returns: 0 - drop, !=0 - accept
    959 */
    960static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
    961{
    962	if (!filter->count)
    963		return 1;
    964
    965	return run_filter(filter, skb);
    966}
    967
    968/* Network device part of the driver */
    969
    970static const struct ethtool_ops tun_ethtool_ops;
    971
    972static int tun_net_init(struct net_device *dev)
    973{
    974	struct tun_struct *tun = netdev_priv(dev);
    975	struct ifreq *ifr = tun->ifr;
    976	int err;
    977
    978	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
    979	if (!dev->tstats)
    980		return -ENOMEM;
    981
    982	spin_lock_init(&tun->lock);
    983
    984	err = security_tun_dev_alloc_security(&tun->security);
    985	if (err < 0) {
    986		free_percpu(dev->tstats);
    987		return err;
    988	}
    989
    990	tun_flow_init(tun);
    991
    992	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
    993			   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
    994			   NETIF_F_HW_VLAN_STAG_TX;
    995	dev->features = dev->hw_features | NETIF_F_LLTX;
    996	dev->vlan_features = dev->features &
    997			     ~(NETIF_F_HW_VLAN_CTAG_TX |
    998			       NETIF_F_HW_VLAN_STAG_TX);
    999
   1000	tun->flags = (tun->flags & ~TUN_FEATURES) |
   1001		      (ifr->ifr_flags & TUN_FEATURES);
   1002
   1003	INIT_LIST_HEAD(&tun->disabled);
   1004	err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
   1005			 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
   1006	if (err < 0) {
   1007		tun_flow_uninit(tun);
   1008		security_tun_dev_free_security(tun->security);
   1009		free_percpu(dev->tstats);
   1010		return err;
   1011	}
   1012	return 0;
   1013}
   1014
   1015/* Net device detach from fd. */
   1016static void tun_net_uninit(struct net_device *dev)
   1017{
   1018	tun_detach_all(dev);
   1019}
   1020
   1021/* Net device open. */
   1022static int tun_net_open(struct net_device *dev)
   1023{
   1024	netif_tx_start_all_queues(dev);
   1025
   1026	return 0;
   1027}
   1028
   1029/* Net device close. */
   1030static int tun_net_close(struct net_device *dev)
   1031{
   1032	netif_tx_stop_all_queues(dev);
   1033	return 0;
   1034}
   1035
   1036/* Net device start xmit */
   1037static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
   1038{
   1039#ifdef CONFIG_RPS
   1040	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
   1041		/* Select queue was not called for the skbuff, so we extract the
   1042		 * RPS hash and save it into the flow_table here.
   1043		 */
   1044		struct tun_flow_entry *e;
   1045		__u32 rxhash;
   1046
   1047		rxhash = __skb_get_hash_symmetric(skb);
   1048		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
   1049		if (e)
   1050			tun_flow_save_rps_rxhash(e, rxhash);
   1051	}
   1052#endif
   1053}
   1054
   1055static unsigned int run_ebpf_filter(struct tun_struct *tun,
   1056				    struct sk_buff *skb,
   1057				    int len)
   1058{
   1059	struct tun_prog *prog = rcu_dereference(tun->filter_prog);
   1060
   1061	if (prog)
   1062		len = bpf_prog_run_clear_cb(prog->prog, skb);
   1063
   1064	return len;
   1065}
   1066
   1067/* Net device start xmit */
   1068static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
   1069{
   1070	struct tun_struct *tun = netdev_priv(dev);
   1071	enum skb_drop_reason drop_reason;
   1072	int txq = skb->queue_mapping;
   1073	struct netdev_queue *queue;
   1074	struct tun_file *tfile;
   1075	int len = skb->len;
   1076
   1077	rcu_read_lock();
   1078	tfile = rcu_dereference(tun->tfiles[txq]);
   1079
   1080	/* Drop packet if interface is not attached */
   1081	if (!tfile) {
   1082		drop_reason = SKB_DROP_REASON_DEV_READY;
   1083		goto drop;
   1084	}
   1085
   1086	if (!rcu_dereference(tun->steering_prog))
   1087		tun_automq_xmit(tun, skb);
   1088
   1089	netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
   1090
   1091	/* Drop if the filter does not like it.
   1092	 * This is a noop if the filter is disabled.
   1093	 * Filter can be enabled only for the TAP devices. */
   1094	if (!check_filter(&tun->txflt, skb)) {
   1095		drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
   1096		goto drop;
   1097	}
   1098
   1099	if (tfile->socket.sk->sk_filter &&
   1100	    sk_filter(tfile->socket.sk, skb)) {
   1101		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
   1102		goto drop;
   1103	}
   1104
   1105	len = run_ebpf_filter(tun, skb, len);
   1106	if (len == 0) {
   1107		drop_reason = SKB_DROP_REASON_TAP_FILTER;
   1108		goto drop;
   1109	}
   1110
   1111	if (pskb_trim(skb, len)) {
   1112		drop_reason = SKB_DROP_REASON_NOMEM;
   1113		goto drop;
   1114	}
   1115
   1116	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
   1117		drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
   1118		goto drop;
   1119	}
   1120
   1121	skb_tx_timestamp(skb);
   1122
   1123	/* Orphan the skb - required as we might hang on to it
    1124	 * for an indefinite time.
   1125	 */
   1126	skb_orphan(skb);
   1127
   1128	nf_reset_ct(skb);
   1129
   1130	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
   1131		drop_reason = SKB_DROP_REASON_FULL_RING;
   1132		goto drop;
   1133	}
   1134
    1135	/* NETIF_F_LLTX requires us to do our own update of trans_start */
   1136	queue = netdev_get_tx_queue(dev, txq);
   1137	txq_trans_cond_update(queue);
   1138
   1139	/* Notify and wake up reader process */
   1140	if (tfile->flags & TUN_FASYNC)
   1141		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
   1142	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
   1143
   1144	rcu_read_unlock();
   1145	return NETDEV_TX_OK;
   1146
   1147drop:
   1148	dev_core_stats_tx_dropped_inc(dev);
   1149	skb_tx_error(skb);
   1150	kfree_skb_reason(skb, drop_reason);
   1151	rcu_read_unlock();
   1152	return NET_XMIT_DROP;
   1153}
   1154
   1155static void tun_net_mclist(struct net_device *dev)
   1156{
   1157	/*
    1158	 * This callback is supposed to deal with the mc filter in
    1159	 * the _rx_ path and has nothing to do with the _tx_ path.
    1160	 * In the rx path we always accept everything userspace gives us.
   1161	 */
   1162}
   1163
   1164static netdev_features_t tun_net_fix_features(struct net_device *dev,
   1165	netdev_features_t features)
   1166{
   1167	struct tun_struct *tun = netdev_priv(dev);
   1168
   1169	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
   1170}
   1171
   1172static void tun_set_headroom(struct net_device *dev, int new_hr)
   1173{
   1174	struct tun_struct *tun = netdev_priv(dev);
   1175
   1176	if (new_hr < NET_SKB_PAD)
   1177		new_hr = NET_SKB_PAD;
   1178
   1179	tun->align = new_hr;
   1180}
   1181
   1182static void
   1183tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
   1184{
   1185	struct tun_struct *tun = netdev_priv(dev);
   1186
   1187	dev_get_tstats64(dev, stats);
   1188
   1189	stats->rx_frame_errors +=
   1190		(unsigned long)atomic_long_read(&tun->rx_frame_errors);
   1191}
   1192
   1193static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
   1194		       struct netlink_ext_ack *extack)
   1195{
   1196	struct tun_struct *tun = netdev_priv(dev);
   1197	struct tun_file *tfile;
   1198	struct bpf_prog *old_prog;
   1199	int i;
   1200
   1201	old_prog = rtnl_dereference(tun->xdp_prog);
   1202	rcu_assign_pointer(tun->xdp_prog, prog);
   1203	if (old_prog)
   1204		bpf_prog_put(old_prog);
   1205
   1206	for (i = 0; i < tun->numqueues; i++) {
   1207		tfile = rtnl_dereference(tun->tfiles[i]);
   1208		if (prog)
   1209			sock_set_flag(&tfile->sk, SOCK_XDP);
   1210		else
   1211			sock_reset_flag(&tfile->sk, SOCK_XDP);
   1212	}
   1213	list_for_each_entry(tfile, &tun->disabled, next) {
   1214		if (prog)
   1215			sock_set_flag(&tfile->sk, SOCK_XDP);
   1216		else
   1217			sock_reset_flag(&tfile->sk, SOCK_XDP);
   1218	}
   1219
   1220	return 0;
   1221}
   1222
   1223static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
   1224{
   1225	switch (xdp->command) {
   1226	case XDP_SETUP_PROG:
   1227		return tun_xdp_set(dev, xdp->prog, xdp->extack);
   1228	default:
   1229		return -EINVAL;
   1230	}
   1231}
   1232
   1233static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
   1234{
   1235	if (new_carrier) {
   1236		struct tun_struct *tun = netdev_priv(dev);
   1237
   1238		if (!tun->numqueues)
   1239			return -EPERM;
   1240
   1241		netif_carrier_on(dev);
   1242	} else {
   1243		netif_carrier_off(dev);
   1244	}
   1245	return 0;
   1246}
   1247
   1248static const struct net_device_ops tun_netdev_ops = {
   1249	.ndo_init		= tun_net_init,
   1250	.ndo_uninit		= tun_net_uninit,
   1251	.ndo_open		= tun_net_open,
   1252	.ndo_stop		= tun_net_close,
   1253	.ndo_start_xmit		= tun_net_xmit,
   1254	.ndo_fix_features	= tun_net_fix_features,
   1255	.ndo_select_queue	= tun_select_queue,
   1256	.ndo_set_rx_headroom	= tun_set_headroom,
   1257	.ndo_get_stats64	= tun_net_get_stats64,
   1258	.ndo_change_carrier	= tun_net_change_carrier,
   1259};
   1260
   1261static void __tun_xdp_flush_tfile(struct tun_file *tfile)
   1262{
   1263	/* Notify and wake up reader process */
   1264	if (tfile->flags & TUN_FASYNC)
   1265		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
   1266	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
   1267}
   1268
   1269static int tun_xdp_xmit(struct net_device *dev, int n,
   1270			struct xdp_frame **frames, u32 flags)
   1271{
   1272	struct tun_struct *tun = netdev_priv(dev);
   1273	struct tun_file *tfile;
   1274	u32 numqueues;
   1275	int nxmit = 0;
   1276	int i;
   1277
   1278	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
   1279		return -EINVAL;
   1280
   1281	rcu_read_lock();
   1282
   1283resample:
   1284	numqueues = READ_ONCE(tun->numqueues);
   1285	if (!numqueues) {
   1286		rcu_read_unlock();
   1287		return -ENXIO; /* Caller will free/return all frames */
   1288	}
   1289
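        	/* Spread XDP transmits over the queues based on the current CPU. */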
   1290	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
   1291					    numqueues]);
   1292	if (unlikely(!tfile))
   1293		goto resample;
   1294
   1295	spin_lock(&tfile->tx_ring.producer_lock);
   1296	for (i = 0; i < n; i++) {
   1297		struct xdp_frame *xdp = frames[i];
    1298		/* Encode the XDP flag into the lowest bit so the consumer can
    1299		 * tell an XDP buffer from an sk_buff.
   1300		 */
   1301		void *frame = tun_xdp_to_ptr(xdp);
   1302
   1303		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
   1304			dev_core_stats_tx_dropped_inc(dev);
   1305			break;
   1306		}
   1307		nxmit++;
   1308	}
   1309	spin_unlock(&tfile->tx_ring.producer_lock);
   1310
   1311	if (flags & XDP_XMIT_FLUSH)
   1312		__tun_xdp_flush_tfile(tfile);
   1313
   1314	rcu_read_unlock();
   1315	return nxmit;
   1316}
   1317
   1318static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
   1319{
   1320	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
   1321	int nxmit;
   1322
   1323	if (unlikely(!frame))
   1324		return -EOVERFLOW;
   1325
   1326	nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
   1327	if (!nxmit)
   1328		xdp_return_frame_rx_napi(frame);
   1329	return nxmit;
   1330}
   1331
   1332static const struct net_device_ops tap_netdev_ops = {
   1333	.ndo_init		= tun_net_init,
   1334	.ndo_uninit		= tun_net_uninit,
   1335	.ndo_open		= tun_net_open,
   1336	.ndo_stop		= tun_net_close,
   1337	.ndo_start_xmit		= tun_net_xmit,
   1338	.ndo_fix_features	= tun_net_fix_features,
   1339	.ndo_set_rx_mode	= tun_net_mclist,
   1340	.ndo_set_mac_address	= eth_mac_addr,
   1341	.ndo_validate_addr	= eth_validate_addr,
   1342	.ndo_select_queue	= tun_select_queue,
   1343	.ndo_features_check	= passthru_features_check,
   1344	.ndo_set_rx_headroom	= tun_set_headroom,
   1345	.ndo_get_stats64	= dev_get_tstats64,
   1346	.ndo_bpf		= tun_xdp,
   1347	.ndo_xdp_xmit		= tun_xdp_xmit,
   1348	.ndo_change_carrier	= tun_net_change_carrier,
   1349};
   1350
   1351static void tun_flow_init(struct tun_struct *tun)
   1352{
   1353	int i;
   1354
   1355	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
   1356		INIT_HLIST_HEAD(&tun->flows[i]);
   1357
   1358	tun->ageing_time = TUN_FLOW_EXPIRE;
   1359	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
   1360	mod_timer(&tun->flow_gc_timer,
   1361		  round_jiffies_up(jiffies + tun->ageing_time));
   1362}
   1363
   1364static void tun_flow_uninit(struct tun_struct *tun)
   1365{
   1366	del_timer_sync(&tun->flow_gc_timer);
   1367	tun_flow_flush(tun);
   1368}
   1369
   1370#define MIN_MTU 68
   1371#define MAX_MTU 65535
   1372
   1373/* Initialize net device. */
   1374static void tun_net_initialize(struct net_device *dev)
   1375{
   1376	struct tun_struct *tun = netdev_priv(dev);
   1377
   1378	switch (tun->flags & TUN_TYPE_MASK) {
   1379	case IFF_TUN:
   1380		dev->netdev_ops = &tun_netdev_ops;
   1381		dev->header_ops = &ip_tunnel_header_ops;
   1382
   1383		/* Point-to-Point TUN Device */
   1384		dev->hard_header_len = 0;
   1385		dev->addr_len = 0;
   1386		dev->mtu = 1500;
   1387
   1388		/* Zero header length */
   1389		dev->type = ARPHRD_NONE;
   1390		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
   1391		break;
   1392
   1393	case IFF_TAP:
   1394		dev->netdev_ops = &tap_netdev_ops;
   1395		/* Ethernet TAP Device */
   1396		ether_setup(dev);
   1397		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
   1398		dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
   1399
   1400		eth_hw_addr_random(dev);
   1401
   1402		break;
   1403	}
   1404
   1405	dev->min_mtu = MIN_MTU;
   1406	dev->max_mtu = MAX_MTU - dev->hard_header_len;
   1407}
   1408
   1409static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
   1410{
   1411	struct sock *sk = tfile->socket.sk;
   1412
   1413	return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
   1414}
   1415
   1416/* Character device part */
   1417
   1418/* Poll */
   1419static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
   1420{
   1421	struct tun_file *tfile = file->private_data;
   1422	struct tun_struct *tun = tun_get(tfile);
   1423	struct sock *sk;
   1424	__poll_t mask = 0;
   1425
   1426	if (!tun)
   1427		return EPOLLERR;
   1428
   1429	sk = tfile->socket.sk;
   1430
   1431	poll_wait(file, sk_sleep(sk), wait);
   1432
   1433	if (!ptr_ring_empty(&tfile->tx_ring))
   1434		mask |= EPOLLIN | EPOLLRDNORM;
   1435
   1436	/* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
    1437	 * guarantee that EPOLLOUT is raised either here or by
    1438	 * tun_sock_write_space(). Then the process can get a notification
   1439	 * after it writes to a down device and meets -EIO.
   1440	 */
   1441	if (tun_sock_writeable(tun, tfile) ||
   1442	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
   1443	     tun_sock_writeable(tun, tfile)))
   1444		mask |= EPOLLOUT | EPOLLWRNORM;
   1445
   1446	if (tun->dev->reg_state != NETREG_REGISTERED)
   1447		mask = EPOLLERR;
   1448
   1449	tun_put(tun);
   1450	return mask;
   1451}
   1452
   1453static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
   1454					    size_t len,
   1455					    const struct iov_iter *it)
   1456{
   1457	struct sk_buff *skb;
   1458	size_t linear;
   1459	int err;
   1460	int i;
   1461
   1462	if (it->nr_segs > MAX_SKB_FRAGS + 1)
   1463		return ERR_PTR(-EMSGSIZE);
   1464
   1465	local_bh_disable();
   1466	skb = napi_get_frags(&tfile->napi);
   1467	local_bh_enable();
   1468	if (!skb)
   1469		return ERR_PTR(-ENOMEM);
   1470
   1471	linear = iov_iter_single_seg_count(it);
   1472	err = __skb_grow(skb, linear);
   1473	if (err)
   1474		goto free;
   1475
   1476	skb->len = len;
   1477	skb->data_len = len - linear;
   1478	skb->truesize += skb->data_len;
   1479
   1480	for (i = 1; i < it->nr_segs; i++) {
   1481		size_t fragsz = it->iov[i].iov_len;
   1482		struct page *page;
   1483		void *frag;
   1484
   1485		if (fragsz == 0 || fragsz > PAGE_SIZE) {
   1486			err = -EINVAL;
   1487			goto free;
   1488		}
   1489		frag = netdev_alloc_frag(fragsz);
   1490		if (!frag) {
   1491			err = -ENOMEM;
   1492			goto free;
   1493		}
   1494		page = virt_to_head_page(frag);
   1495		skb_fill_page_desc(skb, i - 1, page,
   1496				   frag - page_address(page), fragsz);
   1497	}
   1498
   1499	return skb;
   1500free:
   1501	/* frees skb and all frags allocated with napi_alloc_frag() */
   1502	napi_free_frags(&tfile->napi);
   1503	return ERR_PTR(err);
   1504}
   1505
   1506/* prepad is the amount to reserve at front.  len is length after that.
   1507 * linear is a hint as to how much to copy (usually headers). */
   1508static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
   1509				     size_t prepad, size_t len,
   1510				     size_t linear, int noblock)
   1511{
   1512	struct sock *sk = tfile->socket.sk;
   1513	struct sk_buff *skb;
   1514	int err;
   1515
   1516	/* Under a page?  Don't bother with paged skb. */
   1517	if (prepad + len < PAGE_SIZE || !linear)
   1518		linear = len;
   1519
   1520	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
   1521				   &err, 0);
   1522	if (!skb)
   1523		return ERR_PTR(err);
   1524
   1525	skb_reserve(skb, prepad);
   1526	skb_put(skb, linear);
   1527	skb->data_len = len - linear;
   1528	skb->len += len - linear;
   1529
   1530	return skb;
   1531}
   1532
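        /* Deliver an skb to the stack, optionally batching: packets are parked
         * on sk_write_queue until rx_batched of them have accumulated (or the
         * sender signals there are no more), then the whole batch is passed to
         * netif_receive_skb() under a single local_bh_disable() section.
         */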
   1533static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
   1534			   struct sk_buff *skb, int more)
   1535{
   1536	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
   1537	struct sk_buff_head process_queue;
   1538	u32 rx_batched = tun->rx_batched;
   1539	bool rcv = false;
   1540
   1541	if (!rx_batched || (!more && skb_queue_empty(queue))) {
   1542		local_bh_disable();
   1543		skb_record_rx_queue(skb, tfile->queue_index);
   1544		netif_receive_skb(skb);
   1545		local_bh_enable();
   1546		return;
   1547	}
   1548
   1549	spin_lock(&queue->lock);
   1550	if (!more || skb_queue_len(queue) == rx_batched) {
   1551		__skb_queue_head_init(&process_queue);
   1552		skb_queue_splice_tail_init(queue, &process_queue);
   1553		rcv = true;
   1554	} else {
   1555		__skb_queue_tail(queue, skb);
   1556	}
   1557	spin_unlock(&queue->lock);
   1558
   1559	if (rcv) {
   1560		struct sk_buff *nskb;
   1561
   1562		local_bh_disable();
   1563		while ((nskb = __skb_dequeue(&process_queue))) {
   1564			skb_record_rx_queue(nskb, tfile->queue_index);
   1565			netif_receive_skb(nskb);
   1566		}
   1567		skb_record_rx_queue(skb, tfile->queue_index);
   1568		netif_receive_skb(skb);
   1569		local_bh_enable();
   1570	}
   1571}
   1572
   1573static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
   1574			      int len, int noblock, bool zerocopy)
   1575{
   1576	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
   1577		return false;
   1578
   1579	if (tfile->socket.sk->sk_sndbuf != INT_MAX)
   1580		return false;
   1581
   1582	if (!noblock)
   1583		return false;
   1584
   1585	if (zerocopy)
   1586		return false;
   1587
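        	/* Headroom, packet data and the trailing skb_shared_info must all
        	 * fit into a single page for the build_skb() fast path.
        	 */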
   1588	if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
   1589	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
   1590		return false;
   1591
   1592	return true;
   1593}
   1594
   1595static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
   1596				       struct page_frag *alloc_frag, char *buf,
   1597				       int buflen, int len, int pad)
   1598{
   1599	struct sk_buff *skb = build_skb(buf, buflen);
   1600
   1601	if (!skb)
   1602		return ERR_PTR(-ENOMEM);
   1603
   1604	skb_reserve(skb, pad);
   1605	skb_put(skb, len);
   1606	skb_set_owner_w(skb, tfile->socket.sk);
   1607
   1608	get_page(alloc_frag->page);
   1609	alloc_frag->offset += buflen;
   1610
   1611	return skb;
   1612}
   1613
   1614static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
   1615		       struct xdp_buff *xdp, u32 act)
   1616{
   1617	int err;
   1618
   1619	switch (act) {
   1620	case XDP_REDIRECT:
   1621		err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
   1622		if (err)
   1623			return err;
   1624		break;
   1625	case XDP_TX:
   1626		err = tun_xdp_tx(tun->dev, xdp);
   1627		if (err < 0)
   1628			return err;
   1629		break;
   1630	case XDP_PASS:
   1631		break;
   1632	default:
   1633		bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act);
   1634		fallthrough;
   1635	case XDP_ABORTED:
   1636		trace_xdp_exception(tun->dev, xdp_prog, act);
   1637		fallthrough;
   1638	case XDP_DROP:
   1639		dev_core_stats_rx_dropped_inc(tun->dev);
   1640		break;
   1641	}
   1642
   1643	return act;
   1644}
   1645
   1646static struct sk_buff *tun_build_skb(struct tun_struct *tun,
   1647				     struct tun_file *tfile,
   1648				     struct iov_iter *from,
   1649				     struct virtio_net_hdr *hdr,
   1650				     int len, int *skb_xdp)
   1651{
   1652	struct page_frag *alloc_frag = &current->task_frag;
   1653	struct bpf_prog *xdp_prog;
   1654	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
   1655	char *buf;
   1656	size_t copied;
   1657	int pad = TUN_RX_PAD;
   1658	int err = 0;
   1659
   1660	rcu_read_lock();
   1661	xdp_prog = rcu_dereference(tun->xdp_prog);
   1662	if (xdp_prog)
   1663		pad += XDP_PACKET_HEADROOM;
   1664	buflen += SKB_DATA_ALIGN(len + pad);
   1665	rcu_read_unlock();
   1666
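        	/* Reserve one contiguous chunk in the per-task page frag: headroom
        	 * (pad), the packet data (len) and room for the skb_shared_info
        	 * that build_skb() places at the end.
        	 */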
   1667	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
   1668	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
   1669		return ERR_PTR(-ENOMEM);
   1670
   1671	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
   1672	copied = copy_page_from_iter(alloc_frag->page,
   1673				     alloc_frag->offset + pad,
   1674				     len, from);
   1675	if (copied != len)
   1676		return ERR_PTR(-EFAULT);
   1677
    1678	/* There's a small window in which XDP may be set after the check
    1679	 * of xdp_prog above; this should be rare and for simplicity
    1680	 * we do XDP on the skb in case the headroom is not enough.
   1681	 */
   1682	if (hdr->gso_type || !xdp_prog) {
   1683		*skb_xdp = 1;
   1684		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
   1685				       pad);
   1686	}
   1687
   1688	*skb_xdp = 0;
   1689
   1690	local_bh_disable();
   1691	rcu_read_lock();
   1692	xdp_prog = rcu_dereference(tun->xdp_prog);
   1693	if (xdp_prog) {
   1694		struct xdp_buff xdp;
   1695		u32 act;
   1696
   1697		xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq);
   1698		xdp_prepare_buff(&xdp, buf, pad, len, false);
   1699
   1700		act = bpf_prog_run_xdp(xdp_prog, &xdp);
   1701		if (act == XDP_REDIRECT || act == XDP_TX) {
   1702			get_page(alloc_frag->page);
   1703			alloc_frag->offset += buflen;
   1704		}
   1705		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
   1706		if (err < 0) {
   1707			if (act == XDP_REDIRECT || act == XDP_TX)
   1708				put_page(alloc_frag->page);
   1709			goto out;
   1710		}
   1711
   1712		if (err == XDP_REDIRECT)
   1713			xdp_do_flush();
   1714		if (err != XDP_PASS)
   1715			goto out;
   1716
   1717		pad = xdp.data - xdp.data_hard_start;
   1718		len = xdp.data_end - xdp.data;
   1719	}
   1720	rcu_read_unlock();
   1721	local_bh_enable();
   1722
   1723	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
   1724
   1725out:
   1726	rcu_read_unlock();
   1727	local_bh_enable();
   1728	return NULL;
   1729}
   1730
   1731/* Get packet from user space buffer */
   1732static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
   1733			    void *msg_control, struct iov_iter *from,
   1734			    int noblock, bool more)
   1735{
   1736	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
   1737	struct sk_buff *skb;
   1738	size_t total_len = iov_iter_count(from);
   1739	size_t len = total_len, align = tun->align, linear;
   1740	struct virtio_net_hdr gso = { 0 };
   1741	int good_linear;
   1742	int copylen;
   1743	bool zerocopy = false;
   1744	int err;
   1745	u32 rxhash = 0;
   1746	int skb_xdp = 1;
   1747	bool frags = tun_napi_frags_enabled(tfile);
   1748	enum skb_drop_reason drop_reason;
   1749
   1750	if (!(tun->flags & IFF_NO_PI)) {
   1751		if (len < sizeof(pi))
   1752			return -EINVAL;
   1753		len -= sizeof(pi);
   1754
   1755		if (!copy_from_iter_full(&pi, sizeof(pi), from))
   1756			return -EFAULT;
   1757	}
   1758
   1759	if (tun->flags & IFF_VNET_HDR) {
   1760		int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
   1761
   1762		if (len < vnet_hdr_sz)
   1763			return -EINVAL;
   1764		len -= vnet_hdr_sz;
   1765
   1766		if (!copy_from_iter_full(&gso, sizeof(gso), from))
   1767			return -EFAULT;
   1768
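        		/* hdr_len must cover at least up to and including the 16-bit
        		 * checksum field at csum_start + csum_offset; extend it if not. */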
   1769		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
   1770		    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
   1771			gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
   1772
   1773		if (tun16_to_cpu(tun, gso.hdr_len) > len)
   1774			return -EINVAL;
   1775		iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
   1776	}
   1777
   1778	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
   1779		align += NET_IP_ALIGN;
   1780		if (unlikely(len < ETH_HLEN ||
   1781			     (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
   1782			return -EINVAL;
   1783	}
   1784
   1785	good_linear = SKB_MAX_HEAD(align);
   1786
   1787	if (msg_control) {
   1788		struct iov_iter i = *from;
   1789
    1790		/* There are 256 bytes to be copied into the skb, so there is
    1791		 * enough room to expand the skb head in case it is needed.
   1792		 * The rest of the buffer is mapped from userspace.
   1793		 */
   1794		copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
   1795		if (copylen > good_linear)
   1796			copylen = good_linear;
   1797		linear = copylen;
   1798		iov_iter_advance(&i, copylen);
   1799		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
   1800			zerocopy = true;
   1801	}
   1802
   1803	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
    1804		/* For packets that are not easy to process
    1805		 * (e.g. gso or jumbo packets), we do it after the
    1806		 * skb has been created, with the generic XDP routine.
   1807		 */
   1808		skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
   1809		if (IS_ERR(skb)) {
   1810			dev_core_stats_rx_dropped_inc(tun->dev);
   1811			return PTR_ERR(skb);
   1812		}
   1813		if (!skb)
   1814			return total_len;
   1815	} else {
   1816		if (!zerocopy) {
   1817			copylen = len;
   1818			if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
   1819				linear = good_linear;
   1820			else
   1821				linear = tun16_to_cpu(tun, gso.hdr_len);
   1822		}
   1823
   1824		if (frags) {
   1825			mutex_lock(&tfile->napi_mutex);
   1826			skb = tun_napi_alloc_frags(tfile, copylen, from);
   1827			/* tun_napi_alloc_frags() enforces a layout for the skb.
   1828			 * If zerocopy is enabled, then this layout will be
   1829			 * overwritten by zerocopy_sg_from_iter().
   1830			 */
   1831			zerocopy = false;
   1832		} else {
   1833			skb = tun_alloc_skb(tfile, align, copylen, linear,
   1834					    noblock);
   1835		}
   1836
   1837		if (IS_ERR(skb)) {
   1838			if (PTR_ERR(skb) != -EAGAIN)
   1839				dev_core_stats_rx_dropped_inc(tun->dev);
   1840			if (frags)
   1841				mutex_unlock(&tfile->napi_mutex);
   1842			return PTR_ERR(skb);
   1843		}
   1844
   1845		if (zerocopy)
   1846			err = zerocopy_sg_from_iter(skb, from);
   1847		else
   1848			err = skb_copy_datagram_from_iter(skb, 0, from, len);
   1849
   1850		if (err) {
   1851			err = -EFAULT;
   1852			drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
   1853drop:
   1854			dev_core_stats_rx_dropped_inc(tun->dev);
   1855			kfree_skb_reason(skb, drop_reason);
   1856			if (frags) {
   1857				tfile->napi.skb = NULL;
   1858				mutex_unlock(&tfile->napi_mutex);
   1859			}
   1860
   1861			return err;
   1862		}
   1863	}
   1864
   1865	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
   1866		atomic_long_inc(&tun->rx_frame_errors);
   1867		kfree_skb(skb);
   1868		if (frags) {
   1869			tfile->napi.skb = NULL;
   1870			mutex_unlock(&tfile->napi_mutex);
   1871		}
   1872
   1873		return -EINVAL;
   1874	}
   1875
   1876	switch (tun->flags & TUN_TYPE_MASK) {
   1877	case IFF_TUN:
   1878		if (tun->flags & IFF_NO_PI) {
   1879			u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;
   1880
   1881			switch (ip_version) {
   1882			case 4:
   1883				pi.proto = htons(ETH_P_IP);
   1884				break;
   1885			case 6:
   1886				pi.proto = htons(ETH_P_IPV6);
   1887				break;
   1888			default:
   1889				dev_core_stats_rx_dropped_inc(tun->dev);
   1890				kfree_skb(skb);
   1891				return -EINVAL;
   1892			}
   1893		}
   1894
   1895		skb_reset_mac_header(skb);
   1896		skb->protocol = pi.proto;
   1897		skb->dev = tun->dev;
   1898		break;
   1899	case IFF_TAP:
   1900		if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
   1901			err = -ENOMEM;
   1902			drop_reason = SKB_DROP_REASON_HDR_TRUNC;
   1903			goto drop;
   1904		}
   1905		skb->protocol = eth_type_trans(skb, tun->dev);
   1906		break;
   1907	}
   1908
   1909	/* copy skb_ubuf_info for callback when skb has no error */
   1910	if (zerocopy) {
   1911		skb_zcopy_init(skb, msg_control);
   1912	} else if (msg_control) {
   1913		struct ubuf_info *uarg = msg_control;
   1914		uarg->callback(NULL, uarg, false);
   1915	}
   1916
   1917	skb_reset_network_header(skb);
   1918	skb_probe_transport_header(skb);
   1919	skb_record_rx_queue(skb, tfile->queue_index);
   1920
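        	/* Run generic XDP here unless tun_build_skb() already ran the
        	 * program natively (in which case it cleared skb_xdp).
        	 */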
   1921	if (skb_xdp) {
   1922		struct bpf_prog *xdp_prog;
   1923		int ret;
   1924
   1925		local_bh_disable();
   1926		rcu_read_lock();
   1927		xdp_prog = rcu_dereference(tun->xdp_prog);
   1928		if (xdp_prog) {
   1929			ret = do_xdp_generic(xdp_prog, skb);
   1930			if (ret != XDP_PASS) {
   1931				rcu_read_unlock();
   1932				local_bh_enable();
   1933				if (frags) {
   1934					tfile->napi.skb = NULL;
   1935					mutex_unlock(&tfile->napi_mutex);
   1936				}
   1937				return total_len;
   1938			}
   1939		}
   1940		rcu_read_unlock();
   1941		local_bh_enable();
   1942	}
   1943
    1944	/* Compute the costly rx hash only if needed for flow updates.
    1945	 * There is a very small chance of out-of-order delivery while
    1946	 * switching queues, but it is not worth optimizing for.
    1947	 */
   1948	if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
   1949	    !tfile->detached)
   1950		rxhash = __skb_get_hash_symmetric(skb);
   1951
   1952	rcu_read_lock();
   1953	if (unlikely(!(tun->dev->flags & IFF_UP))) {
   1954		err = -EIO;
   1955		rcu_read_unlock();
   1956		drop_reason = SKB_DROP_REASON_DEV_READY;
   1957		goto drop;
   1958	}
   1959
   1960	if (frags) {
   1961		u32 headlen;
   1962
   1963		/* Exercise flow dissector code path. */
   1964		skb_push(skb, ETH_HLEN);
   1965		headlen = eth_get_headlen(tun->dev, skb->data,
   1966					  skb_headlen(skb));
   1967
   1968		if (unlikely(headlen > skb_headlen(skb))) {
   1969			dev_core_stats_rx_dropped_inc(tun->dev);
   1970			napi_free_frags(&tfile->napi);
   1971			rcu_read_unlock();
   1972			mutex_unlock(&tfile->napi_mutex);
   1973			WARN_ON(1);
   1974			return -ENOMEM;
   1975		}
   1976
   1977		local_bh_disable();
   1978		napi_gro_frags(&tfile->napi);
   1979		local_bh_enable();
   1980		mutex_unlock(&tfile->napi_mutex);
   1981	} else if (tfile->napi_enabled) {
   1982		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
   1983		int queue_len;
   1984
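        		/* The lock is taken with spin_lock_bh() but released with plain
        		 * spin_unlock(): bottom halves remain disabled until the
        		 * local_bh_enable() below, so the scheduled NAPI poll only runs
        		 * once they are re-enabled.
        		 */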
   1985		spin_lock_bh(&queue->lock);
   1986		__skb_queue_tail(queue, skb);
   1987		queue_len = skb_queue_len(queue);
   1988		spin_unlock(&queue->lock);
   1989
   1990		if (!more || queue_len > NAPI_POLL_WEIGHT)
   1991			napi_schedule(&tfile->napi);
   1992
   1993		local_bh_enable();
   1994	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
   1995		tun_rx_batched(tun, tfile, skb, more);
   1996	} else {
   1997		netif_rx(skb);
   1998	}
   1999	rcu_read_unlock();
   2000
   2001	preempt_disable();
   2002	dev_sw_netstats_rx_add(tun->dev, len);
   2003	preempt_enable();
   2004
   2005	if (rxhash)
   2006		tun_flow_update(tun, rxhash, tfile);
   2007
   2008	return total_len;
   2009}
   2010
   2011static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
   2012{
   2013	struct file *file = iocb->ki_filp;
   2014	struct tun_file *tfile = file->private_data;
   2015	struct tun_struct *tun = tun_get(tfile);
   2016	ssize_t result;
   2017	int noblock = 0;
   2018
   2019	if (!tun)
   2020		return -EBADFD;
   2021
   2022	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
   2023		noblock = 1;
   2024
   2025	result = tun_get_user(tun, tfile, NULL, from, noblock, false);
   2026
   2027	tun_put(tun);
   2028	return result;
   2029}
   2030
   2031static ssize_t tun_put_user_xdp(struct tun_struct *tun,
   2032				struct tun_file *tfile,
   2033				struct xdp_frame *xdp_frame,
   2034				struct iov_iter *iter)
   2035{
   2036	int vnet_hdr_sz = 0;
   2037	size_t size = xdp_frame->len;
   2038	size_t ret;
   2039
   2040	if (tun->flags & IFF_VNET_HDR) {
   2041		struct virtio_net_hdr gso = { 0 };
   2042
   2043		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
   2044		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
   2045			return -EINVAL;
   2046		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
   2047			     sizeof(gso)))
   2048			return -EFAULT;
   2049		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
   2050	}
   2051
   2052	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
   2053
   2054	preempt_disable();
   2055	dev_sw_netstats_tx_add(tun->dev, 1, ret);
   2056	preempt_enable();
   2057
   2058	return ret;
   2059}
   2060
   2061/* Put packet to the user space buffer */
   2062static ssize_t tun_put_user(struct tun_struct *tun,
   2063			    struct tun_file *tfile,
   2064			    struct sk_buff *skb,
   2065			    struct iov_iter *iter)
   2066{
   2067	struct tun_pi pi = { 0, skb->protocol };
   2068	ssize_t total;
   2069	int vlan_offset = 0;
   2070	int vlan_hlen = 0;
   2071	int vnet_hdr_sz = 0;
   2072
   2073	if (skb_vlan_tag_present(skb))
   2074		vlan_hlen = VLAN_HLEN;
   2075
   2076	if (tun->flags & IFF_VNET_HDR)
   2077		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
   2078
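        	/* total is what we report back to the reader: payload plus any
        	 * packet-info, vnet and VLAN headers prepended below.
        	 */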
   2079	total = skb->len + vlan_hlen + vnet_hdr_sz;
   2080
   2081	if (!(tun->flags & IFF_NO_PI)) {
   2082		if (iov_iter_count(iter) < sizeof(pi))
   2083			return -EINVAL;
   2084
   2085		total += sizeof(pi);
   2086		if (iov_iter_count(iter) < total) {
    2087			/* Packet will be stripped (truncated) */
   2088			pi.flags |= TUN_PKT_STRIP;
   2089		}
   2090
   2091		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
   2092			return -EFAULT;
   2093	}
   2094
   2095	if (vnet_hdr_sz) {
   2096		struct virtio_net_hdr gso;
   2097
   2098		if (iov_iter_count(iter) < vnet_hdr_sz)
   2099			return -EINVAL;
   2100
   2101		if (virtio_net_hdr_from_skb(skb, &gso,
   2102					    tun_is_little_endian(tun), true,
   2103					    vlan_hlen)) {
   2104			struct skb_shared_info *sinfo = skb_shinfo(skb);
   2105			pr_err("unexpected GSO type: "
   2106			       "0x%x, gso_size %d, hdr_len %d\n",
   2107			       sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
   2108			       tun16_to_cpu(tun, gso.hdr_len));
   2109			print_hex_dump(KERN_ERR, "tun: ",
   2110				       DUMP_PREFIX_NONE,
   2111				       16, 1, skb->head,
   2112				       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
   2113			WARN_ON_ONCE(1);
   2114			return -EINVAL;
   2115		}
   2116
   2117		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
   2118			return -EFAULT;
   2119
   2120		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
   2121	}
   2122
   2123	if (vlan_hlen) {
   2124		int ret;
   2125		struct veth veth;
   2126
   2127		veth.h_vlan_proto = skb->vlan_proto;
   2128		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
   2129
   2130		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
   2131
   2132		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
   2133		if (ret || !iov_iter_count(iter))
   2134			goto done;
   2135
   2136		ret = copy_to_iter(&veth, sizeof(veth), iter);
   2137		if (ret != sizeof(veth) || !iov_iter_count(iter))
   2138			goto done;
   2139	}
   2140
   2141	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
   2142
   2143done:
    2144	/* Caller is in process context; disable preemption for the per-cpu stats update. */
   2145	preempt_disable();
   2146	dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
   2147	preempt_enable();
   2148
   2149	return total;
   2150}
   2151
   2152static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
   2153{
   2154	DECLARE_WAITQUEUE(wait, current);
   2155	void *ptr = NULL;
   2156	int error = 0;
   2157
   2158	ptr = ptr_ring_consume(&tfile->tx_ring);
   2159	if (ptr)
   2160		goto out;
   2161	if (noblock) {
   2162		error = -EAGAIN;
   2163		goto out;
   2164	}
   2165
   2166	add_wait_queue(&tfile->socket.wq.wait, &wait);
   2167
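        	/* Standard sleep/wake loop: re-check the ring only after setting
        	 * TASK_INTERRUPTIBLE so a wakeup from a concurrent producer cannot
        	 * be missed between the check and schedule().
        	 */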
   2168	while (1) {
   2169		set_current_state(TASK_INTERRUPTIBLE);
   2170		ptr = ptr_ring_consume(&tfile->tx_ring);
   2171		if (ptr)
   2172			break;
   2173		if (signal_pending(current)) {
   2174			error = -ERESTARTSYS;
   2175			break;
   2176		}
   2177		if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
   2178			error = -EFAULT;
   2179			break;
   2180		}
   2181
   2182		schedule();
   2183	}
   2184
   2185	__set_current_state(TASK_RUNNING);
   2186	remove_wait_queue(&tfile->socket.wq.wait, &wait);
   2187
   2188out:
   2189	*err = error;
   2190	return ptr;
   2191}
   2192
   2193static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
   2194			   struct iov_iter *to,
   2195			   int noblock, void *ptr)
   2196{
   2197	ssize_t ret;
   2198	int err;
   2199
   2200	if (!iov_iter_count(to)) {
   2201		tun_ptr_free(ptr);
   2202		return 0;
   2203	}
   2204
   2205	if (!ptr) {
   2206		/* Read frames from ring */
   2207		ptr = tun_ring_recv(tfile, noblock, &err);
   2208		if (!ptr)
   2209			return err;
   2210	}
   2211
   2212	if (tun_is_xdp_frame(ptr)) {
   2213		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
   2214
   2215		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
   2216		xdp_return_frame(xdpf);
   2217	} else {
   2218		struct sk_buff *skb = ptr;
   2219
   2220		ret = tun_put_user(tun, tfile, skb, to);
   2221		if (unlikely(ret < 0))
   2222			kfree_skb(skb);
   2223		else
   2224			consume_skb(skb);
   2225	}
   2226
   2227	return ret;
   2228}
   2229
   2230static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
   2231{
   2232	struct file *file = iocb->ki_filp;
   2233	struct tun_file *tfile = file->private_data;
   2234	struct tun_struct *tun = tun_get(tfile);
   2235	ssize_t len = iov_iter_count(to), ret;
   2236	int noblock = 0;
   2237
   2238	if (!tun)
   2239		return -EBADFD;
   2240
   2241	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
   2242		noblock = 1;
   2243
   2244	ret = tun_do_read(tun, tfile, to, noblock, NULL);
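        	/* Never report more bytes than the caller's buffer could hold. */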
   2245	ret = min_t(ssize_t, ret, len);
   2246	if (ret > 0)
   2247		iocb->ki_pos = ret;
   2248	tun_put(tun);
   2249	return ret;
   2250}
   2251
   2252static void tun_prog_free(struct rcu_head *rcu)
   2253{
   2254	struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
   2255
   2256	bpf_prog_destroy(prog->prog);
   2257	kfree(prog);
   2258}
   2259
   2260static int __tun_set_ebpf(struct tun_struct *tun,
   2261			  struct tun_prog __rcu **prog_p,
   2262			  struct bpf_prog *prog)
   2263{
   2264	struct tun_prog *old, *new = NULL;
   2265
   2266	if (prog) {
   2267		new = kmalloc(sizeof(*new), GFP_KERNEL);
   2268		if (!new)
   2269			return -ENOMEM;
   2270		new->prog = prog;
   2271	}
   2272
   2273	spin_lock_bh(&tun->lock);
   2274	old = rcu_dereference_protected(*prog_p,
   2275					lockdep_is_held(&tun->lock));
   2276	rcu_assign_pointer(*prog_p, new);
   2277	spin_unlock_bh(&tun->lock);
   2278
   2279	if (old)
   2280		call_rcu(&old->rcu, tun_prog_free);
   2281
   2282	return 0;
   2283}
   2284
   2285static void tun_free_netdev(struct net_device *dev)
   2286{
   2287	struct tun_struct *tun = netdev_priv(dev);
   2288
   2289	BUG_ON(!(list_empty(&tun->disabled)));
   2290
   2291	free_percpu(dev->tstats);
   2292	tun_flow_uninit(tun);
   2293	security_tun_dev_free_security(tun->security);
   2294	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
   2295	__tun_set_ebpf(tun, &tun->filter_prog, NULL);
   2296}
   2297
   2298static void tun_setup(struct net_device *dev)
   2299{
   2300	struct tun_struct *tun = netdev_priv(dev);
   2301
   2302	tun->owner = INVALID_UID;
   2303	tun->group = INVALID_GID;
   2304	tun_default_link_ksettings(dev, &tun->link_ksettings);
   2305
   2306	dev->ethtool_ops = &tun_ethtool_ops;
   2307	dev->needs_free_netdev = true;
   2308	dev->priv_destructor = tun_free_netdev;
   2309	/* We prefer our own queue length */
   2310	dev->tx_queue_len = TUN_READQ_SIZE;
   2311}
   2312
   2313/* Trivial set of netlink ops to allow deleting tun or tap
   2314 * device with netlink.
   2315 */
   2316static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
   2317			struct netlink_ext_ack *extack)
   2318{
   2319	NL_SET_ERR_MSG(extack,
   2320		       "tun/tap creation via rtnetlink is not supported.");
   2321	return -EOPNOTSUPP;
   2322}
   2323
   2324static size_t tun_get_size(const struct net_device *dev)
   2325{
   2326	BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
   2327	BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
   2328
   2329	return nla_total_size(sizeof(uid_t)) + /* OWNER */
   2330	       nla_total_size(sizeof(gid_t)) + /* GROUP */
   2331	       nla_total_size(sizeof(u8)) + /* TYPE */
   2332	       nla_total_size(sizeof(u8)) + /* PI */
   2333	       nla_total_size(sizeof(u8)) + /* VNET_HDR */
   2334	       nla_total_size(sizeof(u8)) + /* PERSIST */
   2335	       nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
   2336	       nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
   2337	       nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
   2338	       0;
   2339}
   2340
   2341static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
   2342{
   2343	struct tun_struct *tun = netdev_priv(dev);
   2344
   2345	if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
   2346		goto nla_put_failure;
   2347	if (uid_valid(tun->owner) &&
   2348	    nla_put_u32(skb, IFLA_TUN_OWNER,
   2349			from_kuid_munged(current_user_ns(), tun->owner)))
   2350		goto nla_put_failure;
   2351	if (gid_valid(tun->group) &&
   2352	    nla_put_u32(skb, IFLA_TUN_GROUP,
   2353			from_kgid_munged(current_user_ns(), tun->group)))
   2354		goto nla_put_failure;
   2355	if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
   2356		goto nla_put_failure;
   2357	if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
   2358		goto nla_put_failure;
   2359	if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
   2360		goto nla_put_failure;
   2361	if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
   2362		       !!(tun->flags & IFF_MULTI_QUEUE)))
   2363		goto nla_put_failure;
   2364	if (tun->flags & IFF_MULTI_QUEUE) {
   2365		if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
   2366			goto nla_put_failure;
   2367		if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
   2368				tun->numdisabled))
   2369			goto nla_put_failure;
   2370	}
   2371
   2372	return 0;
   2373
   2374nla_put_failure:
   2375	return -EMSGSIZE;
   2376}
   2377
   2378static struct rtnl_link_ops tun_link_ops __read_mostly = {
   2379	.kind		= DRV_NAME,
   2380	.priv_size	= sizeof(struct tun_struct),
   2381	.setup		= tun_setup,
   2382	.validate	= tun_validate,
   2383	.get_size       = tun_get_size,
   2384	.fill_info      = tun_fill_info,
   2385};
   2386
   2387static void tun_sock_write_space(struct sock *sk)
   2388{
   2389	struct tun_file *tfile;
   2390	wait_queue_head_t *wqueue;
   2391
   2392	if (!sock_writeable(sk))
   2393		return;
   2394
   2395	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
   2396		return;
   2397
   2398	wqueue = sk_sleep(sk);
   2399	if (wqueue && waitqueue_active(wqueue))
   2400		wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
   2401						EPOLLWRNORM | EPOLLWRBAND);
   2402
   2403	tfile = container_of(sk, struct tun_file, sk);
   2404	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
   2405}
   2406
   2407static void tun_put_page(struct tun_page *tpage)
   2408{
   2409	if (tpage->page)
   2410		__page_frag_cache_drain(tpage->page, tpage->count);
   2411}
   2412
   2413static int tun_xdp_one(struct tun_struct *tun,
   2414		       struct tun_file *tfile,
   2415		       struct xdp_buff *xdp, int *flush,
   2416		       struct tun_page *tpage)
   2417{
   2418	unsigned int datasize = xdp->data_end - xdp->data;
   2419	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
   2420	struct virtio_net_hdr *gso = &hdr->gso;
   2421	struct bpf_prog *xdp_prog;
   2422	struct sk_buff *skb = NULL;
   2423	struct sk_buff_head *queue;
   2424	u32 rxhash = 0, act;
   2425	int buflen = hdr->buflen;
   2426	int ret = 0;
   2427	bool skb_xdp = false;
   2428	struct page *page;
   2429
   2430	xdp_prog = rcu_dereference(tun->xdp_prog);
   2431	if (xdp_prog) {
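        		/* GSO packets cannot go through the native XDP path; build an
        		 * skb first and run the program via generic XDP instead.
        		 */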
   2432		if (gso->gso_type) {
   2433			skb_xdp = true;
   2434			goto build;
   2435		}
   2436
   2437		xdp_init_buff(xdp, buflen, &tfile->xdp_rxq);
   2438		xdp_set_data_meta_invalid(xdp);
   2439
   2440		act = bpf_prog_run_xdp(xdp_prog, xdp);
   2441		ret = tun_xdp_act(tun, xdp_prog, xdp, act);
   2442		if (ret < 0) {
   2443			put_page(virt_to_head_page(xdp->data));
   2444			return ret;
   2445		}
   2446
   2447		switch (ret) {
   2448		case XDP_REDIRECT:
   2449			*flush = true;
   2450			fallthrough;
   2451		case XDP_TX:
   2452			return 0;
   2453		case XDP_PASS:
   2454			break;
   2455		default:
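        			/* XDP_DROP and any other verdict: batch the page reference
        			 * so the caller can release it in one go via tun_put_page().
        			 */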
   2456			page = virt_to_head_page(xdp->data);
   2457			if (tpage->page == page) {
   2458				++tpage->count;
   2459			} else {
   2460				tun_put_page(tpage);
   2461				tpage->page = page;
   2462				tpage->count = 1;
   2463			}
   2464			return 0;
   2465		}
   2466	}
   2467
   2468build:
   2469	skb = build_skb(xdp->data_hard_start, buflen);
   2470	if (!skb) {
   2471		ret = -ENOMEM;
   2472		goto out;
   2473	}
   2474
   2475	skb_reserve(skb, xdp->data - xdp->data_hard_start);
   2476	skb_put(skb, xdp->data_end - xdp->data);
   2477
   2478	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
   2479		atomic_long_inc(&tun->rx_frame_errors);
   2480		kfree_skb(skb);
   2481		ret = -EINVAL;
   2482		goto out;
   2483	}
   2484
   2485	skb->protocol = eth_type_trans(skb, tun->dev);
   2486	skb_reset_network_header(skb);
   2487	skb_probe_transport_header(skb);
   2488	skb_record_rx_queue(skb, tfile->queue_index);
   2489
   2490	if (skb_xdp) {
   2491		ret = do_xdp_generic(xdp_prog, skb);
   2492		if (ret != XDP_PASS) {
   2493			ret = 0;
   2494			goto out;
   2495		}
   2496	}
   2497
   2498	if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
   2499	    !tfile->detached)
   2500		rxhash = __skb_get_hash_symmetric(skb);
   2501
   2502	if (tfile->napi_enabled) {
   2503		queue = &tfile->sk.sk_write_queue;
   2504		spin_lock(&queue->lock);
   2505		__skb_queue_tail(queue, skb);
   2506		spin_unlock(&queue->lock);
   2507		ret = 1;
   2508	} else {
   2509		netif_receive_skb(skb);
   2510		ret = 0;
   2511	}
   2512
   2513	/* No need to disable preemption here since this function is
   2514	 * always called with bh disabled
   2515	 */
   2516	dev_sw_netstats_rx_add(tun->dev, datasize);
   2517
   2518	if (rxhash)
   2519		tun_flow_update(tun, rxhash, tfile);
   2520
   2521out:
   2522	return ret;
   2523}
   2524
   2525static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
   2526{
   2527	int ret, i;
   2528	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
   2529	struct tun_struct *tun = tun_get(tfile);
   2530	struct tun_msg_ctl *ctl = m->msg_control;
   2531	struct xdp_buff *xdp;
   2532
   2533	if (!tun)
   2534		return -EBADFD;
   2535
   2536	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
   2537	    ctl && ctl->type == TUN_MSG_PTR) {
   2538		struct tun_page tpage;
   2539		int n = ctl->num;
   2540		int flush = 0, queued = 0;
   2541
   2542		memset(&tpage, 0, sizeof(tpage));
   2543
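        		/* TUN_MSG_PTR carries an array of xdp_buffs (used by vhost-net
        		 * for batched submission); consume them with BH disabled under
        		 * the RCU read lock.
        		 */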
   2544		local_bh_disable();
   2545		rcu_read_lock();
   2546
   2547		for (i = 0; i < n; i++) {
   2548			xdp = &((struct xdp_buff *)ctl->ptr)[i];
   2549			ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
   2550			if (ret > 0)
   2551				queued += ret;
   2552		}
   2553
   2554		if (flush)
   2555			xdp_do_flush();
   2556
   2557		if (tfile->napi_enabled && queued > 0)
   2558			napi_schedule(&tfile->napi);
   2559
   2560		rcu_read_unlock();
   2561		local_bh_enable();
   2562
   2563		tun_put_page(&tpage);
   2564
   2565		ret = total_len;
   2566		goto out;
   2567	}
   2568
   2569	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
   2570			   m->msg_flags & MSG_DONTWAIT,
   2571			   m->msg_flags & MSG_MORE);
   2572out:
   2573	tun_put(tun);
   2574	return ret;
   2575}
   2576
   2577static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
   2578		       int flags)
   2579{
   2580	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
   2581	struct tun_struct *tun = tun_get(tfile);
   2582	void *ptr = m->msg_control;
   2583	int ret;
   2584
   2585	if (!tun) {
   2586		ret = -EBADFD;
   2587		goto out_free;
   2588	}
   2589
   2590	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
   2591		ret = -EINVAL;
   2592		goto out_put_tun;
   2593	}
   2594	if (flags & MSG_ERRQUEUE) {
   2595		ret = sock_recv_errqueue(sock->sk, m, total_len,
   2596					 SOL_PACKET, TUN_TX_TIMESTAMP);
   2597		goto out;
   2598	}
   2599	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
   2600	if (ret > (ssize_t)total_len) {
   2601		m->msg_flags |= MSG_TRUNC;
   2602		ret = flags & MSG_TRUNC ? ret : total_len;
   2603	}
   2604out:
   2605	tun_put(tun);
   2606	return ret;
   2607
   2608out_put_tun:
   2609	tun_put(tun);
   2610out_free:
   2611	tun_ptr_free(ptr);
   2612	return ret;
   2613}
   2614
   2615static int tun_ptr_peek_len(void *ptr)
   2616{
   2617	if (likely(ptr)) {
   2618		if (tun_is_xdp_frame(ptr)) {
   2619			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
   2620
   2621			return xdpf->len;
   2622		}
   2623		return __skb_array_len_with_tag(ptr);
   2624	} else {
   2625		return 0;
   2626	}
   2627}
   2628
   2629static int tun_peek_len(struct socket *sock)
   2630{
   2631	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
   2632	struct tun_struct *tun;
   2633	int ret = 0;
   2634
   2635	tun = tun_get(tfile);
   2636	if (!tun)
   2637		return 0;
   2638
   2639	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
   2640	tun_put(tun);
   2641
   2642	return ret;
   2643}
   2644
   2645/* Ops structure to mimic raw sockets with tun */
   2646static const struct proto_ops tun_socket_ops = {
   2647	.peek_len = tun_peek_len,
   2648	.sendmsg = tun_sendmsg,
   2649	.recvmsg = tun_recvmsg,
   2650};
   2651
   2652static struct proto tun_proto = {
   2653	.name		= "tun",
   2654	.owner		= THIS_MODULE,
   2655	.obj_size	= sizeof(struct tun_file),
   2656};
   2657
   2658static int tun_flags(struct tun_struct *tun)
   2659{
   2660	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
   2661}
   2662
   2663static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr,
   2664			      char *buf)
   2665{
   2666	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
   2667	return sprintf(buf, "0x%x\n", tun_flags(tun));
   2668}
   2669
   2670static ssize_t owner_show(struct device *dev, struct device_attribute *attr,
   2671			  char *buf)
   2672{
   2673	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
   2674	return uid_valid(tun->owner)?
   2675		sprintf(buf, "%u\n",
   2676			from_kuid_munged(current_user_ns(), tun->owner)):
   2677		sprintf(buf, "-1\n");
   2678}
   2679
   2680static ssize_t group_show(struct device *dev, struct device_attribute *attr,
   2681			  char *buf)
   2682{
   2683	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
   2684	return gid_valid(tun->group) ?
   2685		sprintf(buf, "%u\n",
   2686			from_kgid_munged(current_user_ns(), tun->group)):
   2687		sprintf(buf, "-1\n");
   2688}
   2689
   2690static DEVICE_ATTR_RO(tun_flags);
   2691static DEVICE_ATTR_RO(owner);
   2692static DEVICE_ATTR_RO(group);
   2693
   2694static struct attribute *tun_dev_attrs[] = {
   2695	&dev_attr_tun_flags.attr,
   2696	&dev_attr_owner.attr,
   2697	&dev_attr_group.attr,
   2698	NULL
   2699};
   2700
   2701static const struct attribute_group tun_attr_group = {
   2702	.attrs = tun_dev_attrs
   2703};
   2704
   2705static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
   2706{
   2707	struct tun_struct *tun;
   2708	struct tun_file *tfile = file->private_data;
   2709	struct net_device *dev;
   2710	int err;
   2711
   2712	if (tfile->detached)
   2713		return -EINVAL;
   2714
   2715	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
   2716		if (!capable(CAP_NET_ADMIN))
   2717			return -EPERM;
   2718
   2719		if (!(ifr->ifr_flags & IFF_NAPI) ||
   2720		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
   2721			return -EINVAL;
   2722	}
   2723
   2724	dev = __dev_get_by_name(net, ifr->ifr_name);
   2725	if (dev) {
   2726		if (ifr->ifr_flags & IFF_TUN_EXCL)
   2727			return -EBUSY;
   2728		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
   2729			tun = netdev_priv(dev);
   2730		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
   2731			tun = netdev_priv(dev);
   2732		else
   2733			return -EINVAL;
   2734
   2735		if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
   2736		    !!(tun->flags & IFF_MULTI_QUEUE))
   2737			return -EINVAL;
   2738
   2739		if (tun_not_capable(tun))
   2740			return -EPERM;
   2741		err = security_tun_dev_open(tun->security);
   2742		if (err < 0)
   2743			return err;
   2744
   2745		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
   2746				 ifr->ifr_flags & IFF_NAPI,
   2747				 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
   2748		if (err < 0)
   2749			return err;
   2750
   2751		if (tun->flags & IFF_MULTI_QUEUE &&
   2752		    (tun->numqueues + tun->numdisabled > 1)) {
    2753			/* One or more queues have already been attached; no need
    2754			 * to initialize the device again.
    2755			 */
   2756			netdev_state_change(dev);
   2757			return 0;
   2758		}
   2759
   2760		tun->flags = (tun->flags & ~TUN_FEATURES) |
   2761			      (ifr->ifr_flags & TUN_FEATURES);
   2762
   2763		netdev_state_change(dev);
   2764	} else {
   2765		char *name;
   2766		unsigned long flags = 0;
   2767		int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
   2768			     MAX_TAP_QUEUES : 1;
   2769
   2770		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
   2771			return -EPERM;
   2772		err = security_tun_dev_create();
   2773		if (err < 0)
   2774			return err;
   2775
   2776		/* Set dev type */
   2777		if (ifr->ifr_flags & IFF_TUN) {
   2778			/* TUN device */
   2779			flags |= IFF_TUN;
   2780			name = "tun%d";
   2781		} else if (ifr->ifr_flags & IFF_TAP) {
   2782			/* TAP device */
   2783			flags |= IFF_TAP;
   2784			name = "tap%d";
   2785		} else
   2786			return -EINVAL;
   2787
   2788		if (*ifr->ifr_name)
   2789			name = ifr->ifr_name;
   2790
   2791		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
   2792				       NET_NAME_UNKNOWN, tun_setup, queues,
   2793				       queues);
   2794
   2795		if (!dev)
   2796			return -ENOMEM;
   2797
   2798		dev_net_set(dev, net);
   2799		dev->rtnl_link_ops = &tun_link_ops;
   2800		dev->ifindex = tfile->ifindex;
   2801		dev->sysfs_groups[0] = &tun_attr_group;
   2802
   2803		tun = netdev_priv(dev);
   2804		tun->dev = dev;
   2805		tun->flags = flags;
   2806		tun->txflt.count = 0;
   2807		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
   2808
   2809		tun->align = NET_SKB_PAD;
   2810		tun->filter_attached = false;
   2811		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
   2812		tun->rx_batched = 0;
   2813		RCU_INIT_POINTER(tun->steering_prog, NULL);
   2814
   2815		tun->ifr = ifr;
   2816		tun->file = file;
   2817
   2818		tun_net_initialize(dev);
   2819
   2820		err = register_netdevice(tun->dev);
   2821		if (err < 0) {
   2822			free_netdev(dev);
   2823			return err;
   2824		}
    2825		/* free_netdev() won't check the refcnt; to avoid a race
    2826		 * with dev_put() we must publish tun only after registration.
    2827		 */
   2828		rcu_assign_pointer(tfile->tun, tun);
   2829	}
   2830
   2831	netif_carrier_on(tun->dev);
   2832
   2833	/* Make sure persistent devices do not get stuck in
   2834	 * xoff state.
   2835	 */
   2836	if (netif_running(tun->dev))
   2837		netif_tx_wake_all_queues(tun->dev);
   2838
   2839	strcpy(ifr->ifr_name, tun->dev->name);
   2840	return 0;
   2841}
   2842
   2843static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
   2844{
   2845	strcpy(ifr->ifr_name, tun->dev->name);
   2846
   2847	ifr->ifr_flags = tun_flags(tun);
   2848
   2849}
   2850
   2851/* This is like a cut-down ethtool ops, except done via tun fd so no
   2852 * privs required. */
   2853static int set_offload(struct tun_struct *tun, unsigned long arg)
   2854{
   2855	netdev_features_t features = 0;
   2856
   2857	if (arg & TUN_F_CSUM) {
   2858		features |= NETIF_F_HW_CSUM;
   2859		arg &= ~TUN_F_CSUM;
   2860
   2861		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
   2862			if (arg & TUN_F_TSO_ECN) {
   2863				features |= NETIF_F_TSO_ECN;
   2864				arg &= ~TUN_F_TSO_ECN;
   2865			}
   2866			if (arg & TUN_F_TSO4)
   2867				features |= NETIF_F_TSO;
   2868			if (arg & TUN_F_TSO6)
   2869				features |= NETIF_F_TSO6;
   2870			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
   2871		}
   2872
   2873		arg &= ~TUN_F_UFO;
   2874	}
   2875
   2876	/* This gives the user a way to test for new features in future by
   2877	 * trying to set them. */
   2878	if (arg)
   2879		return -EINVAL;
   2880
   2881	tun->set_features = features;
   2882	tun->dev->wanted_features &= ~TUN_USER_FEATURES;
   2883	tun->dev->wanted_features |= features;
   2884	netdev_update_features(tun->dev);
   2885
   2886	return 0;
   2887}
   2888
   2889static void tun_detach_filter(struct tun_struct *tun, int n)
   2890{
   2891	int i;
   2892	struct tun_file *tfile;
   2893
   2894	for (i = 0; i < n; i++) {
   2895		tfile = rtnl_dereference(tun->tfiles[i]);
   2896		lock_sock(tfile->socket.sk);
   2897		sk_detach_filter(tfile->socket.sk);
   2898		release_sock(tfile->socket.sk);
   2899	}
   2900
   2901	tun->filter_attached = false;
   2902}
   2903
   2904static int tun_attach_filter(struct tun_struct *tun)
   2905{
   2906	int i, ret = 0;
   2907	struct tun_file *tfile;
   2908
   2909	for (i = 0; i < tun->numqueues; i++) {
   2910		tfile = rtnl_dereference(tun->tfiles[i]);
   2911		lock_sock(tfile->socket.sk);
   2912		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
   2913		release_sock(tfile->socket.sk);
   2914		if (ret) {
   2915			tun_detach_filter(tun, i);
   2916			return ret;
   2917		}
   2918	}
   2919
   2920	tun->filter_attached = true;
   2921	return ret;
   2922}
   2923
   2924static void tun_set_sndbuf(struct tun_struct *tun)
   2925{
   2926	struct tun_file *tfile;
   2927	int i;
   2928
   2929	for (i = 0; i < tun->numqueues; i++) {
   2930		tfile = rtnl_dereference(tun->tfiles[i]);
   2931		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
   2932	}
   2933}
   2934
   2935static int tun_set_queue(struct file *file, struct ifreq *ifr)
   2936{
   2937	struct tun_file *tfile = file->private_data;
   2938	struct tun_struct *tun;
   2939	int ret = 0;
   2940
   2941	rtnl_lock();
   2942
   2943	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
   2944		tun = tfile->detached;
   2945		if (!tun) {
   2946			ret = -EINVAL;
   2947			goto unlock;
   2948		}
   2949		ret = security_tun_dev_attach_queue(tun->security);
   2950		if (ret < 0)
   2951			goto unlock;
   2952		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
   2953				 tun->flags & IFF_NAPI_FRAGS, true);
   2954	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
   2955		tun = rtnl_dereference(tfile->tun);
   2956		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
   2957			ret = -EINVAL;
   2958		else
   2959			__tun_detach(tfile, false);
   2960	} else
   2961		ret = -EINVAL;
   2962
   2963	if (ret >= 0)
   2964		netdev_state_change(tun->dev);
   2965
   2966unlock:
   2967	rtnl_unlock();
   2968	return ret;
   2969}
   2970
   2971static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
   2972			void __user *data)
   2973{
   2974	struct bpf_prog *prog;
   2975	int fd;
   2976
   2977	if (copy_from_user(&fd, data, sizeof(fd)))
   2978		return -EFAULT;
   2979
   2980	if (fd == -1) {
   2981		prog = NULL;
   2982	} else {
   2983		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
   2984		if (IS_ERR(prog))
   2985			return PTR_ERR(prog);
   2986	}
   2987
   2988	return __tun_set_ebpf(tun, prog_p, prog);
   2989}
   2990
   2991/* Return correct value for tun->dev->addr_len based on tun->dev->type. */
   2992static unsigned char tun_get_addr_len(unsigned short type)
   2993{
   2994	switch (type) {
   2995	case ARPHRD_IP6GRE:
   2996	case ARPHRD_TUNNEL6:
   2997		return sizeof(struct in6_addr);
   2998	case ARPHRD_IPGRE:
   2999	case ARPHRD_TUNNEL:
   3000	case ARPHRD_SIT:
   3001		return 4;
   3002	case ARPHRD_ETHER:
   3003		return ETH_ALEN;
   3004	case ARPHRD_IEEE802154:
   3005	case ARPHRD_IEEE802154_MONITOR:
   3006		return IEEE802154_EXTENDED_ADDR_LEN;
   3007	case ARPHRD_PHONET_PIPE:
   3008	case ARPHRD_PPP:
   3009	case ARPHRD_NONE:
   3010		return 0;
   3011	case ARPHRD_6LOWPAN:
   3012		return EUI64_ADDR_LEN;
   3013	case ARPHRD_FDDI:
   3014		return FDDI_K_ALEN;
   3015	case ARPHRD_HIPPI:
   3016		return HIPPI_ALEN;
   3017	case ARPHRD_IEEE802:
   3018		return FC_ALEN;
   3019	case ARPHRD_ROSE:
   3020		return ROSE_ADDR_LEN;
   3021	case ARPHRD_NETROM:
   3022		return AX25_ADDR_LEN;
   3023	case ARPHRD_LOCALTLK:
   3024		return LTALK_ALEN;
   3025	default:
   3026		return 0;
   3027	}
   3028}
   3029
   3030static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
   3031			    unsigned long arg, int ifreq_len)
   3032{
   3033	struct tun_file *tfile = file->private_data;
   3034	struct net *net = sock_net(&tfile->sk);
   3035	struct tun_struct *tun;
   3036	void __user* argp = (void __user*)arg;
   3037	unsigned int ifindex, carrier;
   3038	struct ifreq ifr;
   3039	kuid_t owner;
   3040	kgid_t group;
   3041	int sndbuf;
   3042	int vnet_hdr_sz;
   3043	int le;
   3044	int ret;
   3045	bool do_notify = false;
   3046
   3047	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
   3048	    (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
   3049		if (copy_from_user(&ifr, argp, ifreq_len))
   3050			return -EFAULT;
   3051	} else {
   3052		memset(&ifr, 0, sizeof(ifr));
   3053	}
   3054	if (cmd == TUNGETFEATURES) {
   3055		/* Currently this just means: "what IFF flags are valid?".
   3056		 * This is needed because we never checked for invalid flags on
   3057		 * TUNSETIFF.
   3058		 */
   3059		return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
   3060				(unsigned int __user*)argp);
   3061	} else if (cmd == TUNSETQUEUE) {
   3062		return tun_set_queue(file, &ifr);
   3063	} else if (cmd == SIOCGSKNS) {
   3064		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
   3065			return -EPERM;
   3066		return open_related_ns(&net->ns, get_net_ns);
   3067	}
   3068
   3069	rtnl_lock();
   3070
   3071	tun = tun_get(tfile);
   3072	if (cmd == TUNSETIFF) {
   3073		ret = -EEXIST;
   3074		if (tun)
   3075			goto unlock;
   3076
   3077		ifr.ifr_name[IFNAMSIZ-1] = '\0';
   3078
   3079		ret = tun_set_iff(net, file, &ifr);
   3080
   3081		if (ret)
   3082			goto unlock;
   3083
   3084		if (copy_to_user(argp, &ifr, ifreq_len))
   3085			ret = -EFAULT;
   3086		goto unlock;
   3087	}
   3088	if (cmd == TUNSETIFINDEX) {
   3089		ret = -EPERM;
   3090		if (tun)
   3091			goto unlock;
   3092
   3093		ret = -EFAULT;
   3094		if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
   3095			goto unlock;
   3096
   3097		ret = 0;
   3098		tfile->ifindex = ifindex;
   3099		goto unlock;
   3100	}
   3101
   3102	ret = -EBADFD;
   3103	if (!tun)
   3104		goto unlock;
   3105
   3106	netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
   3107
   3108	net = dev_net(tun->dev);
   3109	ret = 0;
   3110	switch (cmd) {
   3111	case TUNGETIFF:
   3112		tun_get_iff(tun, &ifr);
   3113
   3114		if (tfile->detached)
   3115			ifr.ifr_flags |= IFF_DETACH_QUEUE;
   3116		if (!tfile->socket.sk->sk_filter)
   3117			ifr.ifr_flags |= IFF_NOFILTER;
   3118
   3119		if (copy_to_user(argp, &ifr, ifreq_len))
   3120			ret = -EFAULT;
   3121		break;
   3122
   3123	case TUNSETNOCSUM:
   3124		/* Disable/Enable checksum */
   3125
   3126		/* [unimplemented] */
   3127		netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
   3128			   arg ? "disabled" : "enabled");
   3129		break;
   3130
   3131	case TUNSETPERSIST:
    3132		/* Disable/Enable persist mode. Keep an extra reference to the
    3133		 * module to prevent it from being unloaded while persist is set.
    3134		 */
   3135		if (arg && !(tun->flags & IFF_PERSIST)) {
   3136			tun->flags |= IFF_PERSIST;
   3137			__module_get(THIS_MODULE);
   3138			do_notify = true;
   3139		}
   3140		if (!arg && (tun->flags & IFF_PERSIST)) {
   3141			tun->flags &= ~IFF_PERSIST;
   3142			module_put(THIS_MODULE);
   3143			do_notify = true;
   3144		}
   3145
   3146		netif_info(tun, drv, tun->dev, "persist %s\n",
   3147			   arg ? "enabled" : "disabled");
   3148		break;
   3149
   3150	case TUNSETOWNER:
   3151		/* Set owner of the device */
   3152		owner = make_kuid(current_user_ns(), arg);
   3153		if (!uid_valid(owner)) {
   3154			ret = -EINVAL;
   3155			break;
   3156		}
   3157		tun->owner = owner;
   3158		do_notify = true;
   3159		netif_info(tun, drv, tun->dev, "owner set to %u\n",
   3160			   from_kuid(&init_user_ns, tun->owner));
   3161		break;
   3162
   3163	case TUNSETGROUP:
   3164		/* Set group of the device */
   3165		group = make_kgid(current_user_ns(), arg);
   3166		if (!gid_valid(group)) {
   3167			ret = -EINVAL;
   3168			break;
   3169		}
   3170		tun->group = group;
   3171		do_notify = true;
   3172		netif_info(tun, drv, tun->dev, "group set to %u\n",
   3173			   from_kgid(&init_user_ns, tun->group));
   3174		break;
   3175
   3176	case TUNSETLINK:
   3177		/* Only allow setting the type when the interface is down */
   3178		if (tun->dev->flags & IFF_UP) {
   3179			netif_info(tun, drv, tun->dev,
   3180				   "Linktype set failed because interface is up\n");
   3181			ret = -EBUSY;
   3182		} else {
   3183			ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
   3184						       tun->dev);
   3185			ret = notifier_to_errno(ret);
   3186			if (ret) {
   3187				netif_info(tun, drv, tun->dev,
   3188					   "Refused to change device type\n");
   3189				break;
   3190			}
   3191			tun->dev->type = (int) arg;
   3192			tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
   3193			netif_info(tun, drv, tun->dev, "linktype set to %d\n",
   3194				   tun->dev->type);
   3195			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
   3196						 tun->dev);
   3197		}
   3198		break;
   3199
   3200	case TUNSETDEBUG:
   3201		tun->msg_enable = (u32)arg;
   3202		break;
   3203
   3204	case TUNSETOFFLOAD:
   3205		ret = set_offload(tun, arg);
   3206		break;
   3207
   3208	case TUNSETTXFILTER:
   3209		/* Can be set only for TAPs */
   3210		ret = -EINVAL;
   3211		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
   3212			break;
   3213		ret = update_filter(&tun->txflt, (void __user *)arg);
   3214		break;
   3215
   3216	case SIOCGIFHWADDR:
   3217		/* Get hw address */
   3218		dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
   3219		if (copy_to_user(argp, &ifr, ifreq_len))
   3220			ret = -EFAULT;
   3221		break;
   3222
   3223	case SIOCSIFHWADDR:
   3224		/* Set hw address */
   3225		ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
   3226		break;
   3227
   3228	case TUNGETSNDBUF:
   3229		sndbuf = tfile->socket.sk->sk_sndbuf;
   3230		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
   3231			ret = -EFAULT;
   3232		break;
   3233
   3234	case TUNSETSNDBUF:
   3235		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
   3236			ret = -EFAULT;
   3237			break;
   3238		}
   3239		if (sndbuf <= 0) {
   3240			ret = -EINVAL;
   3241			break;
   3242		}
   3243
   3244		tun->sndbuf = sndbuf;
   3245		tun_set_sndbuf(tun);
   3246		break;
   3247
   3248	case TUNGETVNETHDRSZ:
   3249		vnet_hdr_sz = tun->vnet_hdr_sz;
   3250		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
   3251			ret = -EFAULT;
   3252		break;
   3253
   3254	case TUNSETVNETHDRSZ:
   3255		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
   3256			ret = -EFAULT;
   3257			break;
   3258		}
   3259		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
   3260			ret = -EINVAL;
   3261			break;
   3262		}
   3263
   3264		tun->vnet_hdr_sz = vnet_hdr_sz;
   3265		break;
   3266
   3267	case TUNGETVNETLE:
   3268		le = !!(tun->flags & TUN_VNET_LE);
   3269		if (put_user(le, (int __user *)argp))
   3270			ret = -EFAULT;
   3271		break;
   3272
   3273	case TUNSETVNETLE:
   3274		if (get_user(le, (int __user *)argp)) {
   3275			ret = -EFAULT;
   3276			break;
   3277		}
   3278		if (le)
   3279			tun->flags |= TUN_VNET_LE;
   3280		else
   3281			tun->flags &= ~TUN_VNET_LE;
   3282		break;
   3283
   3284	case TUNGETVNETBE:
   3285		ret = tun_get_vnet_be(tun, argp);
   3286		break;
   3287
   3288	case TUNSETVNETBE:
   3289		ret = tun_set_vnet_be(tun, argp);
   3290		break;
   3291
   3292	case TUNATTACHFILTER:
   3293		/* Can be set only for TAPs */
   3294		ret = -EINVAL;
   3295		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
   3296			break;
   3297		ret = -EFAULT;
   3298		if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
   3299			break;
   3300
   3301		ret = tun_attach_filter(tun);
   3302		break;
   3303
   3304	case TUNDETACHFILTER:
   3305		/* Can be set only for TAPs */
   3306		ret = -EINVAL;
   3307		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
   3308			break;
   3309		ret = 0;
   3310		tun_detach_filter(tun, tun->numqueues);
   3311		break;
   3312
   3313	case TUNGETFILTER:
   3314		ret = -EINVAL;
   3315		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
   3316			break;
   3317		ret = -EFAULT;
   3318		if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
   3319			break;
   3320		ret = 0;
   3321		break;
   3322
   3323	case TUNSETSTEERINGEBPF:
   3324		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
   3325		break;
   3326
   3327	case TUNSETFILTEREBPF:
   3328		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
   3329		break;
   3330
   3331	case TUNSETCARRIER:
   3332		ret = -EFAULT;
   3333		if (copy_from_user(&carrier, argp, sizeof(carrier)))
   3334			goto unlock;
   3335
   3336		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
   3337		break;
   3338
   3339	case TUNGETDEVNETNS:
   3340		ret = -EPERM;
   3341		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
   3342			goto unlock;
   3343		ret = open_related_ns(&net->ns, get_net_ns);
   3344		break;
   3345
   3346	default:
   3347		ret = -EINVAL;
   3348		break;
   3349	}
   3350
   3351	if (do_notify)
   3352		netdev_state_change(tun->dev);
   3353
   3354unlock:
   3355	rtnl_unlock();
   3356	if (tun)
   3357		tun_put(tun);
   3358	return ret;
   3359}
   3360
   3361static long tun_chr_ioctl(struct file *file,
   3362			  unsigned int cmd, unsigned long arg)
   3363{
   3364	return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
   3365}
   3366
   3367#ifdef CONFIG_COMPAT
   3368static long tun_chr_compat_ioctl(struct file *file,
   3369			 unsigned int cmd, unsigned long arg)
   3370{
   3371	switch (cmd) {
   3372	case TUNSETIFF:
   3373	case TUNGETIFF:
   3374	case TUNSETTXFILTER:
   3375	case TUNGETSNDBUF:
   3376	case TUNSETSNDBUF:
   3377	case SIOCGIFHWADDR:
   3378	case SIOCSIFHWADDR:
   3379		arg = (unsigned long)compat_ptr(arg);
   3380		break;
   3381	default:
   3382		arg = (compat_ulong_t)arg;
   3383		break;
   3384	}
   3385
   3386	/*
   3387	 * compat_ifreq is shorter than ifreq, so we must not access beyond
   3388	 * the end of that structure. All fields that are used in this
   3389	 * driver are compatible though, we don't need to convert the
   3390	 * contents.
   3391	 */
   3392	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
   3393}
   3394#endif /* CONFIG_COMPAT */
   3395
   3396static int tun_chr_fasync(int fd, struct file *file, int on)
   3397{
   3398	struct tun_file *tfile = file->private_data;
   3399	int ret;
   3400
   3401	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
   3402		goto out;
   3403
   3404	if (on) {
   3405		__f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
   3406		tfile->flags |= TUN_FASYNC;
   3407	} else
   3408		tfile->flags &= ~TUN_FASYNC;
   3409	ret = 0;
   3410out:
   3411	return ret;
   3412}
   3413
   3414static int tun_chr_open(struct inode *inode, struct file * file)
   3415{
   3416	struct net *net = current->nsproxy->net_ns;
   3417	struct tun_file *tfile;
   3418
   3419	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
   3420					    &tun_proto, 0);
   3421	if (!tfile)
   3422		return -ENOMEM;
   3423	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
   3424		sk_free(&tfile->sk);
   3425		return -ENOMEM;
   3426	}
   3427
   3428	mutex_init(&tfile->napi_mutex);
   3429	RCU_INIT_POINTER(tfile->tun, NULL);
   3430	tfile->flags = 0;
   3431	tfile->ifindex = 0;
   3432
   3433	init_waitqueue_head(&tfile->socket.wq.wait);
   3434
   3435	tfile->socket.file = file;
   3436	tfile->socket.ops = &tun_socket_ops;
   3437
   3438	sock_init_data(&tfile->socket, &tfile->sk);
   3439
   3440	tfile->sk.sk_write_space = tun_sock_write_space;
   3441	tfile->sk.sk_sndbuf = INT_MAX;
   3442
   3443	file->private_data = tfile;
   3444	INIT_LIST_HEAD(&tfile->next);
   3445
   3446	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
   3447
   3448	return 0;
   3449}
   3450
   3451static int tun_chr_close(struct inode *inode, struct file *file)
   3452{
   3453	struct tun_file *tfile = file->private_data;
   3454
   3455	tun_detach(tfile, true);
   3456
   3457	return 0;
   3458}
   3459
   3460#ifdef CONFIG_PROC_FS
   3461static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
   3462{
   3463	struct tun_file *tfile = file->private_data;
   3464	struct tun_struct *tun;
   3465	struct ifreq ifr;
   3466
   3467	memset(&ifr, 0, sizeof(ifr));
   3468
   3469	rtnl_lock();
   3470	tun = tun_get(tfile);
   3471	if (tun)
   3472		tun_get_iff(tun, &ifr);
   3473	rtnl_unlock();
   3474
   3475	if (tun)
   3476		tun_put(tun);
   3477
   3478	seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
   3479}
   3480#endif
   3481
   3482static const struct file_operations tun_fops = {
   3483	.owner	= THIS_MODULE,
   3484	.llseek = no_llseek,
   3485	.read_iter  = tun_chr_read_iter,
   3486	.write_iter = tun_chr_write_iter,
   3487	.poll	= tun_chr_poll,
   3488	.unlocked_ioctl	= tun_chr_ioctl,
   3489#ifdef CONFIG_COMPAT
   3490	.compat_ioctl = tun_chr_compat_ioctl,
   3491#endif
   3492	.open	= tun_chr_open,
   3493	.release = tun_chr_close,
   3494	.fasync = tun_chr_fasync,
   3495#ifdef CONFIG_PROC_FS
   3496	.show_fdinfo = tun_chr_show_fdinfo,
   3497#endif
   3498};
   3499
   3500static struct miscdevice tun_miscdev = {
   3501	.minor = TUN_MINOR,
   3502	.name = "tun",
   3503	.nodename = "net/tun",
   3504	.fops = &tun_fops,
   3505};
   3506
   3507/* ethtool interface */
   3508
   3509static void tun_default_link_ksettings(struct net_device *dev,
   3510				       struct ethtool_link_ksettings *cmd)
   3511{
   3512	ethtool_link_ksettings_zero_link_mode(cmd, supported);
   3513	ethtool_link_ksettings_zero_link_mode(cmd, advertising);
   3514	cmd->base.speed		= SPEED_10;
   3515	cmd->base.duplex	= DUPLEX_FULL;
   3516	cmd->base.port		= PORT_TP;
   3517	cmd->base.phy_address	= 0;
   3518	cmd->base.autoneg	= AUTONEG_DISABLE;
   3519}
   3520
   3521static int tun_get_link_ksettings(struct net_device *dev,
   3522				  struct ethtool_link_ksettings *cmd)
   3523{
   3524	struct tun_struct *tun = netdev_priv(dev);
   3525
   3526	memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
   3527	return 0;
   3528}
   3529
   3530static int tun_set_link_ksettings(struct net_device *dev,
   3531				  const struct ethtool_link_ksettings *cmd)
   3532{
   3533	struct tun_struct *tun = netdev_priv(dev);
   3534
   3535	memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
   3536	return 0;
   3537}
   3538
   3539static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
   3540{
   3541	struct tun_struct *tun = netdev_priv(dev);
   3542
   3543	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
   3544	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
   3545
   3546	switch (tun->flags & TUN_TYPE_MASK) {
   3547	case IFF_TUN:
   3548		strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
   3549		break;
   3550	case IFF_TAP:
   3551		strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
   3552		break;
   3553	}
   3554}
   3555
   3556static u32 tun_get_msglevel(struct net_device *dev)
   3557{
   3558	struct tun_struct *tun = netdev_priv(dev);
   3559
   3560	return tun->msg_enable;
   3561}
   3562
   3563static void tun_set_msglevel(struct net_device *dev, u32 value)
   3564{
   3565	struct tun_struct *tun = netdev_priv(dev);
   3566
   3567	tun->msg_enable = value;
   3568}
   3569
   3570static int tun_get_coalesce(struct net_device *dev,
   3571			    struct ethtool_coalesce *ec,
   3572			    struct kernel_ethtool_coalesce *kernel_coal,
   3573			    struct netlink_ext_ack *extack)
   3574{
   3575	struct tun_struct *tun = netdev_priv(dev);
   3576
   3577	ec->rx_max_coalesced_frames = tun->rx_batched;
   3578
   3579	return 0;
   3580}
   3581
   3582static int tun_set_coalesce(struct net_device *dev,
   3583			    struct ethtool_coalesce *ec,
   3584			    struct kernel_ethtool_coalesce *kernel_coal,
   3585			    struct netlink_ext_ack *extack)
   3586{
   3587	struct tun_struct *tun = netdev_priv(dev);
   3588
   3589	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
   3590		tun->rx_batched = NAPI_POLL_WEIGHT;
   3591	else
   3592		tun->rx_batched = ec->rx_max_coalesced_frames;
   3593
   3594	return 0;
   3595}
   3596
   3597static const struct ethtool_ops tun_ethtool_ops = {
   3598	.supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
   3599	.get_drvinfo	= tun_get_drvinfo,
   3600	.get_msglevel	= tun_get_msglevel,
   3601	.set_msglevel	= tun_set_msglevel,
   3602	.get_link	= ethtool_op_get_link,
   3603	.get_ts_info	= ethtool_op_get_ts_info,
   3604	.get_coalesce   = tun_get_coalesce,
   3605	.set_coalesce   = tun_set_coalesce,
   3606	.get_link_ksettings = tun_get_link_ksettings,
   3607	.set_link_ksettings = tun_set_link_ksettings,
   3608};
   3609
   3610static int tun_queue_resize(struct tun_struct *tun)
   3611{
   3612	struct net_device *dev = tun->dev;
   3613	struct tun_file *tfile;
   3614	struct ptr_ring **rings;
   3615	int n = tun->numqueues + tun->numdisabled;
   3616	int ret, i;
   3617
   3618	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
   3619	if (!rings)
   3620		return -ENOMEM;
   3621
   3622	for (i = 0; i < tun->numqueues; i++) {
   3623		tfile = rtnl_dereference(tun->tfiles[i]);
   3624		rings[i] = &tfile->tx_ring;
   3625	}
   3626	list_for_each_entry(tfile, &tun->disabled, next)
   3627		rings[i++] = &tfile->tx_ring;
   3628
   3629	ret = ptr_ring_resize_multiple(rings, n,
   3630				       dev->tx_queue_len, GFP_KERNEL,
   3631				       tun_ptr_free);
   3632
   3633	kfree(rings);
   3634	return ret;
   3635}
   3636
   3637static int tun_device_event(struct notifier_block *unused,
   3638			    unsigned long event, void *ptr)
   3639{
   3640	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   3641	struct tun_struct *tun = netdev_priv(dev);
   3642	int i;
   3643
   3644	if (dev->rtnl_link_ops != &tun_link_ops)
   3645		return NOTIFY_DONE;
   3646
   3647	switch (event) {
   3648	case NETDEV_CHANGE_TX_QUEUE_LEN:
   3649		if (tun_queue_resize(tun))
   3650			return NOTIFY_BAD;
   3651		break;
   3652	case NETDEV_UP:
   3653		for (i = 0; i < tun->numqueues; i++) {
   3654			struct tun_file *tfile;
   3655
   3656			tfile = rtnl_dereference(tun->tfiles[i]);
   3657			tfile->socket.sk->sk_write_space(tfile->socket.sk);
   3658		}
   3659		break;
   3660	default:
   3661		break;
   3662	}
   3663
   3664	return NOTIFY_DONE;
   3665}
   3666
   3667static struct notifier_block tun_notifier_block __read_mostly = {
   3668	.notifier_call	= tun_device_event,
   3669};
   3670
   3671static int __init tun_init(void)
   3672{
   3673	int ret = 0;
   3674
   3675	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
   3676
   3677	ret = rtnl_link_register(&tun_link_ops);
   3678	if (ret) {
   3679		pr_err("Can't register link_ops\n");
   3680		goto err_linkops;
   3681	}
   3682
   3683	ret = misc_register(&tun_miscdev);
   3684	if (ret) {
   3685		pr_err("Can't register misc device %d\n", TUN_MINOR);
   3686		goto err_misc;
   3687	}
   3688
   3689	ret = register_netdevice_notifier(&tun_notifier_block);
   3690	if (ret) {
   3691		pr_err("Can't register netdevice notifier\n");
   3692		goto err_notifier;
   3693	}
   3694
   3695	return  0;
   3696
   3697err_notifier:
   3698	misc_deregister(&tun_miscdev);
   3699err_misc:
   3700	rtnl_link_unregister(&tun_link_ops);
   3701err_linkops:
   3702	return ret;
   3703}
   3704
   3705static void tun_cleanup(void)
   3706{
   3707	misc_deregister(&tun_miscdev);
   3708	rtnl_link_unregister(&tun_link_ops);
   3709	unregister_netdevice_notifier(&tun_notifier_block);
   3710}
   3711
   3712/* Get an underlying socket object from tun file.  Returns error unless file is
   3713 * attached to a device.  The returned object works like a packet socket, it
   3714 * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
   3715 * holding a reference to the file for as long as the socket is in use. */
   3716struct socket *tun_get_socket(struct file *file)
   3717{
   3718	struct tun_file *tfile;
   3719	if (file->f_op != &tun_fops)
   3720		return ERR_PTR(-EINVAL);
   3721	tfile = file->private_data;
   3722	if (!tfile)
   3723		return ERR_PTR(-EBADFD);
   3724	return &tfile->socket;
   3725}
   3726EXPORT_SYMBOL_GPL(tun_get_socket);
   3727
   3728struct ptr_ring *tun_get_tx_ring(struct file *file)
   3729{
   3730	struct tun_file *tfile;
   3731
   3732	if (file->f_op != &tun_fops)
   3733		return ERR_PTR(-EINVAL);
   3734	tfile = file->private_data;
   3735	if (!tfile)
   3736		return ERR_PTR(-EBADFD);
   3737	return &tfile->tx_ring;
   3738}
   3739EXPORT_SYMBOL_GPL(tun_get_tx_ring);
   3740
   3741module_init(tun_init);
   3742module_exit(tun_cleanup);
   3743MODULE_DESCRIPTION(DRV_DESCRIPTION);
   3744MODULE_AUTHOR(DRV_COPYRIGHT);
   3745MODULE_LICENSE("GPL");
   3746MODULE_ALIAS_MISCDEV(TUN_MINOR);
   3747MODULE_ALIAS("devname:net/tun");