dev.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
dev.c (292930B)
      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *      NET3    Protocol independent device support routines.
      4 *
      5 *	Derived from the non IP parts of dev.c 1.0.19
      6 *              Authors:	Ross Biro
      7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
      8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
      9 *
     10 *	Additional Authors:
     11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
     12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
     13 *		David Hinds <dahinds@users.sourceforge.net>
     14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
     15 *		Adam Sulmicki <adam@cfar.umd.edu>
     16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
     17 *
     18 *	Changes:
     19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
     20 *                                      to 2 if register_netdev gets called
     21 *                                      before net_dev_init & also removed a
     22 *                                      few lines of code in the process.
     23 *		Alan Cox	:	device private ioctl copies fields back.
     24 *		Alan Cox	:	Transmit queue code does relevant
     25 *					stunts to keep the queue safe.
     26 *		Alan Cox	:	Fixed double lock.
     27 *		Alan Cox	:	Fixed promisc NULL pointer trap
     28 *		????????	:	Support the full private ioctl range
     29 *		Alan Cox	:	Moved ioctl permission check into
     30 *					drivers
     31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
     32 *		Alan Cox	:	100 backlog just doesn't cut it when
     33 *					you start doing multicast video 8)
     34 *		Alan Cox	:	Rewrote net_bh and list manager.
     35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
     36 *		Alan Cox	:	Took out transmit every packet pass
     37 *					Saved a few bytes in the ioctl handler
     38 *		Alan Cox	:	Network driver sets packet type before
     39 *					calling netif_rx. Saves a function
     40 *					call a packet.
     41 *		Alan Cox	:	Hashed net_bh()
     42 *		Richard Kooijman:	Timestamp fixes.
     43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
     44 *		Alan Cox	:	Device lock protection.
     45 *              Alan Cox        :       Fixed nasty side effect of device close
     46 *					changes.
     47 *		Rudi Cilibrasi	:	Pass the right thing to
     48 *					set_mac_address()
     49 *		Dave Miller	:	32bit quantity for the device lock to
     50 *					make it work out on a Sparc.
     51 *		Bjorn Ekwall	:	Added KERNELD hack.
     52 *		Alan Cox	:	Cleaned up the backlog initialise.
     53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
     54 *					1 device.
     55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
     56 *					is no device open function.
     57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
     58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
     59 *		Cyrus Durgin	:	Cleaned for KMOD
     60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
     61 *					A network device unload needs to purge
     62 *					the backlog queue.
     63 *	Paul Rusty Russell	:	SIOCSIFNAME
     64 *              Pekka Riikonen  :	Netdev boot-time settings code
     65 *              Andrew Morton   :       Make unregister_netdevice wait
     66 *                                      indefinitely on dev->refcnt
     67 *              J Hadi Salim    :       - Backlog queue sampling
     68 *				        - netif_rx() feedback
     69 */
     70
     71#include <linux/uaccess.h>
     72#include <linux/bitops.h>
     73#include <linux/capability.h>
     74#include <linux/cpu.h>
     75#include <linux/types.h>
     76#include <linux/kernel.h>
     77#include <linux/hash.h>
     78#include <linux/slab.h>
     79#include <linux/sched.h>
     80#include <linux/sched/mm.h>
     81#include <linux/mutex.h>
     82#include <linux/rwsem.h>
     83#include <linux/string.h>
     84#include <linux/mm.h>
     85#include <linux/socket.h>
     86#include <linux/sockios.h>
     87#include <linux/errno.h>
     88#include <linux/interrupt.h>
     89#include <linux/if_ether.h>
     90#include <linux/netdevice.h>
     91#include <linux/etherdevice.h>
     92#include <linux/ethtool.h>
     93#include <linux/skbuff.h>
     94#include <linux/kthread.h>
     95#include <linux/bpf.h>
     96#include <linux/bpf_trace.h>
     97#include <net/net_namespace.h>
     98#include <net/sock.h>
     99#include <net/busy_poll.h>
    100#include <linux/rtnetlink.h>
    101#include <linux/stat.h>
    102#include <net/dsa.h>
    103#include <net/dst.h>
    104#include <net/dst_metadata.h>
    105#include <net/gro.h>
    106#include <net/pkt_sched.h>
    107#include <net/pkt_cls.h>
    108#include <net/checksum.h>
    109#include <net/xfrm.h>
    110#include <linux/highmem.h>
    111#include <linux/init.h>
    112#include <linux/module.h>
    113#include <linux/netpoll.h>
    114#include <linux/rcupdate.h>
    115#include <linux/delay.h>
    116#include <net/iw_handler.h>
    117#include <asm/current.h>
    118#include <linux/audit.h>
    119#include <linux/dmaengine.h>
    120#include <linux/err.h>
    121#include <linux/ctype.h>
    122#include <linux/if_arp.h>
    123#include <linux/if_vlan.h>
    124#include <linux/ip.h>
    125#include <net/ip.h>
    126#include <net/mpls.h>
    127#include <linux/ipv6.h>
    128#include <linux/in.h>
    129#include <linux/jhash.h>
    130#include <linux/random.h>
    131#include <trace/events/napi.h>
    132#include <trace/events/net.h>
    133#include <trace/events/skb.h>
    134#include <trace/events/qdisc.h>
    135#include <linux/inetdevice.h>
    136#include <linux/cpu_rmap.h>
    137#include <linux/static_key.h>
    138#include <linux/hashtable.h>
    139#include <linux/vmalloc.h>
    140#include <linux/if_macvlan.h>
    141#include <linux/errqueue.h>
    142#include <linux/hrtimer.h>
    143#include <linux/netfilter_netdev.h>
    144#include <linux/crash_dump.h>
    145#include <linux/sctp.h>
    146#include <net/udp_tunnel.h>
    147#include <linux/net_namespace.h>
    148#include <linux/indirect_call_wrapper.h>
    149#include <net/devlink.h>
    150#include <linux/pm_runtime.h>
    151#include <linux/prandom.h>
    152#include <linux/once_lite.h>
    153
    154#include "dev.h"
    155#include "net-sysfs.h"
    156
    157
/* Taken by dev_add_pack() and __dev_remove_pack() around their updates
 * of the packet-type lists.
 */
static DEFINE_SPINLOCK(ptype_lock);
/* Handler lists for specific protocols that are not bound to a device;
 * ptype_head() picks the slot with ntohs(type) & PTYPE_HASH_MASK.
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* ETH_P_ALL handlers that are not bound to a device. */
struct list_head ptype_all __read_mostly;	/* Taps */

/* Prototypes for static helpers that are used before they are defined. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
    169
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* NOTE(review): presumably serialises updates of a device's alias string;
 * the users are outside this part of the file - confirm.
 */
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* Starts at NR_CPUS; NOTE(review): presumably so that generated NAPI ids
 * never collide with values below NR_CPUS - confirm against the allocator.
 */
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/* Held for reading by netdev_get_name() while it copies a device name. */
static DECLARE_RWSEM(devnet_rename_sem);
    201
    202static inline void dev_base_seq_inc(struct net *net)
    203{
    204	while (++net->dev_base_seq == 0)
    205		;
    206}
    207
    208static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
    209{
    210	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
    211
    212	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
    213}
    214
    215static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
    216{
    217	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
    218}
    219
    220static inline void rps_lock_irqsave(struct softnet_data *sd,
    221				    unsigned long *flags)
    222{
    223	if (IS_ENABLED(CONFIG_RPS))
    224		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
    225	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
    226		local_irq_save(*flags);
    227}
    228
    229static inline void rps_lock_irq_disable(struct softnet_data *sd)
    230{
    231	if (IS_ENABLED(CONFIG_RPS))
    232		spin_lock_irq(&sd->input_pkt_queue.lock);
    233	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
    234		local_irq_disable();
    235}
    236
    237static inline void rps_unlock_irq_restore(struct softnet_data *sd,
    238					  unsigned long *flags)
    239{
    240	if (IS_ENABLED(CONFIG_RPS))
    241		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
    242	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
    243		local_irq_restore(*flags);
    244}
    245
    246static inline void rps_unlock_irq_enable(struct softnet_data *sd)
    247{
    248	if (IS_ENABLED(CONFIG_RPS))
    249		spin_unlock_irq(&sd->input_pkt_queue.lock);
    250	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
    251		local_irq_enable();
    252}
    253
    254static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
    255						       const char *name)
    256{
    257	struct netdev_name_node *name_node;
    258
    259	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
    260	if (!name_node)
    261		return NULL;
    262	INIT_HLIST_NODE(&name_node->hlist);
    263	name_node->dev = dev;
    264	name_node->name = name;
    265	return name_node;
    266}
    267
    268static struct netdev_name_node *
    269netdev_name_node_head_alloc(struct net_device *dev)
    270{
    271	struct netdev_name_node *name_node;
    272
    273	name_node = netdev_name_node_alloc(dev, dev->name);
    274	if (!name_node)
    275		return NULL;
    276	INIT_LIST_HEAD(&name_node->list);
    277	return name_node;
    278}
    279
/* Release the memory of @name_node itself.  The name string it points to
 * is not freed here.
 */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}
    284
    285static void netdev_name_node_add(struct net *net,
    286				 struct netdev_name_node *name_node)
    287{
    288	hlist_add_head_rcu(&name_node->hlist,
    289			   dev_name_hash(net, name_node->name));
    290}
    291
/* Unhash @name_node from the name hash table (RCU-aware removal); the
 * node itself is not freed.
 */
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
	hlist_del_rcu(&name_node->hlist);
}
    296
    297static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
    298							const char *name)
    299{
    300	struct hlist_head *head = dev_name_hash(net, name);
    301	struct netdev_name_node *name_node;
    302
    303	hlist_for_each_entry(name_node, head, hlist)
    304		if (!strcmp(name_node->name, name))
    305			return name_node;
    306	return NULL;
    307}
    308
    309static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
    310							    const char *name)
    311{
    312	struct hlist_head *head = dev_name_hash(net, name);
    313	struct netdev_name_node *name_node;
    314
    315	hlist_for_each_entry_rcu(name_node, head, hlist)
    316		if (!strcmp(name_node->name, name))
    317			return name_node;
    318	return NULL;
    319}
    320
    321bool netdev_name_in_use(struct net *net, const char *name)
    322{
    323	return netdev_name_node_lookup(net, name);
    324}
    325EXPORT_SYMBOL(netdev_name_in_use);
    326
    327int netdev_name_node_alt_create(struct net_device *dev, const char *name)
    328{
    329	struct netdev_name_node *name_node;
    330	struct net *net = dev_net(dev);
    331
    332	name_node = netdev_name_node_lookup(net, name);
    333	if (name_node)
    334		return -EEXIST;
    335	name_node = netdev_name_node_alloc(dev, name);
    336	if (!name_node)
    337		return -ENOMEM;
    338	netdev_name_node_add(net, name_node);
    339	/* The node that holds dev->name acts as a head of per-device list. */
    340	list_add_tail(&name_node->list, &dev->name_node->list);
    341
    342	return 0;
    343}
    344
    345static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
    346{
    347	list_del(&name_node->list);
    348	netdev_name_node_del(name_node);
    349	kfree(name_node->name);
    350	netdev_name_node_free(name_node);
    351}
    352
    353int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
    354{
    355	struct netdev_name_node *name_node;
    356	struct net *net = dev_net(dev);
    357
    358	name_node = netdev_name_node_lookup(net, name);
    359	if (!name_node)
    360		return -ENOENT;
    361	/* lookup might have found our primary name or a name belonging
    362	 * to another device.
    363	 */
    364	if (name_node == dev->name_node || name_node->dev != dev)
    365		return -EINVAL;
    366
    367	__netdev_name_node_alt_destroy(name_node);
    368
    369	return 0;
    370}
    371
    372static void netdev_name_node_alt_flush(struct net_device *dev)
    373{
    374	struct netdev_name_node *name_node, *tmp;
    375
    376	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
    377		__netdev_name_node_alt_destroy(name_node);
    378}
    379
    380/* Device list insertion */
    381static void list_netdevice(struct net_device *dev)
    382{
    383	struct net *net = dev_net(dev);
    384
    385	ASSERT_RTNL();
    386
    387	write_lock(&dev_base_lock);
    388	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
    389	netdev_name_node_add(net, dev->name_node);
    390	hlist_add_head_rcu(&dev->index_hlist,
    391			   dev_index_hash(net, dev->ifindex));
    392	write_unlock(&dev_base_lock);
    393
    394	dev_base_seq_inc(net);
    395}
    396
    397/* Device list removal
    398 * caller must respect a RCU grace period before freeing/reusing dev
    399 */
    400static void unlist_netdevice(struct net_device *dev, bool lock)
    401{
    402	ASSERT_RTNL();
    403
    404	/* Unlink dev from the device chain */
    405	if (lock)
    406		write_lock(&dev_base_lock);
    407	list_del_rcu(&dev->dev_list);
    408	netdev_name_node_del(dev->name_node);
    409	hlist_del_rcu(&dev->index_hlist);
    410	if (lock)
    411		write_unlock(&dev_base_lock);
    412
    413	dev_base_seq_inc(dev_net(dev));
    414}
    415
/*
 *	Our notifier list: the head of the netdevice notifier chain.  It is a
 *	raw notifier head, i.e. it carries no locking of its own.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
    421
/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	There is one instance per CPU; it is exported for use outside this
 *	file.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
    429
    430#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel arrays: entry i
 * of the name table is the lockdep class name used for the device type in
 * entry i of the type table.  netdev_lock_pos() falls back to the last
 * entry (ARPHRD_NONE) for any type that is not listed, so the two tables
 * must stay the same length and in the same order.
 */
static const unsigned short netdev_lock_type[] = {
	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and for the address
 * list lock respectively.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
    471
    472static inline unsigned short netdev_lock_pos(unsigned short dev_type)
    473{
    474	int i;
    475
    476	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
    477		if (netdev_lock_type[i] == dev_type)
    478			return i;
    479	/* the last key is used by default */
    480	return ARRAY_SIZE(netdev_lock_type) - 1;
    481}
    482
    483static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
    484						 unsigned short dev_type)
    485{
    486	int i;
    487
    488	i = netdev_lock_pos(dev_type);
    489	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
    490				   netdev_lock_name[i]);
    491}
    492
    493static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
    494{
    495	int i;
    496
    497	i = netdev_lock_pos(dev->type);
    498	lockdep_set_class_and_name(&dev->addr_list_lock,
    499				   &netdev_addr_lock_key[i],
    500				   netdev_lock_name[i]);
    501}
    502#else
/* !CONFIG_LOCKDEP: there are no lock classes to assign, so this is a no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
    507
/* !CONFIG_LOCKDEP: there are no lock classes to assign, so this is a no-op. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
    511#endif
    512
    513/*******************************************************************************
    514 *
    515 *		Protocol management and registration routines
    516 *
    517 *******************************************************************************/
    518
    519
    520/*
    521 *	Add a protocol ID to the list. Now that the input handler is
    522 *	smarter we can dispense with all the messy stuff that used to be
    523 *	here.
    524 *
    525 *	BEWARE!!! Protocol handlers, mangling input packets,
    526 *	MUST BE last in hash buckets and checking protocol handlers
    527 *	MUST start from promiscuous ptype_all chain in net_bh.
    528 *	It is true now, do not change it.
    529 *	Explanation follows: if protocol handler, mangling packet, will
    530 *	be the first on list, it is not able to sense, that packet
    531 *	is cloned and should be copied-on-write, so that it will
    532 *	change it and subsequent readers will get broken packet.
    533 *							--ANK (980803)
    534 */
    535
    536static inline struct list_head *ptype_head(const struct packet_type *pt)
    537{
    538	if (pt->type == htons(ETH_P_ALL))
    539		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
    540	else
    541		return pt->dev ? &pt->dev->ptype_specific :
    542				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
    543}
    544
    545/**
    546 *	dev_add_pack - add packet handler
    547 *	@pt: packet type declaration
    548 *
    549 *	Add a protocol handler to the networking stack. The passed &packet_type
    550 *	is linked into kernel lists and may not be freed until it has been
    551 *	removed from the kernel lists.
    552 *
    553 *	This call does not sleep therefore it can not
    554 *	guarantee all CPU's that are in middle of receiving packets
    555 *	will see the new packet type (until the next received packet).
    556 */
    557
    558void dev_add_pack(struct packet_type *pt)
    559{
    560	struct list_head *head = ptype_head(pt);
    561
    562	spin_lock(&ptype_lock);
    563	list_add_rcu(&pt->list, head);
    564	spin_unlock(&ptype_lock);
    565}
    566EXPORT_SYMBOL(dev_add_pack);
    567
    568/**
    569 *	__dev_remove_pack	 - remove packet handler
    570 *	@pt: packet type declaration
    571 *
    572 *	Remove a protocol handler that was previously added to the kernel
    573 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
    574 *	from the kernel lists and can be freed or reused once this function
    575 *	returns.
    576 *
    577 *      The packet type might still be in use by receivers
    578 *	and must not be freed until after all the CPU's have gone
    579 *	through a quiescent state.
    580 */
    581void __dev_remove_pack(struct packet_type *pt)
    582{
    583	struct list_head *head = ptype_head(pt);
    584	struct packet_type *pt1;
    585
    586	spin_lock(&ptype_lock);
    587
    588	list_for_each_entry(pt1, head, list) {
    589		if (pt == pt1) {
    590			list_del_rcu(&pt->list);
    591			goto out;
    592		}
    593	}
    594
    595	pr_warn("dev_remove_pack: %p not found\n", pt);
    596out:
    597	spin_unlock(&ptype_lock);
    598}
    599EXPORT_SYMBOL(__dev_remove_pack);
    600
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink only; receivers may still be using @pt at this point. */
	__dev_remove_pack(pt);

	/* Wait for those receivers before handing @pt back to the caller. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
    620
    621
    622/*******************************************************************************
    623 *
    624 *			    Device Interface Subroutines
    625 *
    626 *******************************************************************************/
    627
    628/**
    629 *	dev_get_iflink	- get 'iflink' value of a interface
    630 *	@dev: targeted interface
    631 *
    632 *	Indicates the ifindex the interface is linked to.
    633 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
    634 */
    635
    636int dev_get_iflink(const struct net_device *dev)
    637{
    638	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
    639		return dev->netdev_ops->ndo_get_iflink(dev);
    640
    641	return dev->ifindex;
    642}
    643EXPORT_SYMBOL(dev_get_iflink);
    644
    645/**
    646 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
    647 *	@dev: targeted interface
    648 *	@skb: The packet.
    649 *
    650 *	For better visibility of tunnel traffic OVS needs to retrieve
    651 *	egress tunnel information for a packet. Following API allows
    652 *	user to get this info.
    653 */
    654int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
    655{
    656	struct ip_tunnel_info *info;
    657
    658	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
    659		return -EINVAL;
    660
    661	info = skb_tunnel_info_unclone(skb);
    662	if (!info)
    663		return -ENOMEM;
    664	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
    665		return -EINVAL;
    666
    667	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
    668}
    669EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
    670
    671static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
    672{
    673	int k = stack->num_paths++;
    674
    675	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
    676		return NULL;
    677
    678	return &stack->path[k];
    679}
    680
    681int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
    682			  struct net_device_path_stack *stack)
    683{
    684	const struct net_device *last_dev;
    685	struct net_device_path_ctx ctx = {
    686		.dev	= dev,
    687	};
    688	struct net_device_path *path;
    689	int ret = 0;
    690
    691	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
    692	stack->num_paths = 0;
    693	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
    694		last_dev = ctx.dev;
    695		path = dev_fwd_path(stack);
    696		if (!path)
    697			return -1;
    698
    699		memset(path, 0, sizeof(struct net_device_path));
    700		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
    701		if (ret < 0)
    702			return -1;
    703
    704		if (WARN_ON_ONCE(last_dev == ctx.dev))
    705			return -1;
    706	}
    707
    708	if (!ctx.dev)
    709		return ret;
    710
    711	path = dev_fwd_path(stack);
    712	if (!path)
    713		return -1;
    714	path->type = DEV_PATH_ETHERNET;
    715	path->dev = ctx.dev;
    716
    717	return ret;
    718}
    719EXPORT_SYMBOL_GPL(dev_fill_forward_path);
    720
    721/**
    722 *	__dev_get_by_name	- find a device by its name
    723 *	@net: the applicable net namespace
    724 *	@name: name to find
    725 *
    726 *	Find an interface by name. Must be called under RTNL semaphore
    727 *	or @dev_base_lock. If the name is found a pointer to the device
    728 *	is returned. If the name is not found then %NULL is returned. The
    729 *	reference counters are not incremented so the caller must be
    730 *	careful with locks.
    731 */
    732
    733struct net_device *__dev_get_by_name(struct net *net, const char *name)
    734{
    735	struct netdev_name_node *node_name;
    736
    737	node_name = netdev_name_node_lookup(net, name);
    738	return node_name ? node_name->dev : NULL;
    739}
    740EXPORT_SYMBOL(__dev_get_by_name);
    741
    742/**
    743 * dev_get_by_name_rcu	- find a device by its name
    744 * @net: the applicable net namespace
    745 * @name: name to find
    746 *
    747 * Find an interface by name.
    748 * If the name is found a pointer to the device is returned.
    749 * If the name is not found then %NULL is returned.
    750 * The reference counters are not incremented so the caller must be
    751 * careful with locks. The caller must hold RCU lock.
    752 */
    753
    754struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
    755{
    756	struct netdev_name_node *node_name;
    757
    758	node_name = netdev_name_node_lookup_rcu(net, name);
    759	return node_name ? node_name->dev : NULL;
    760}
    761EXPORT_SYMBOL(dev_get_by_name_rcu);
    762
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	/* The reference is taken inside the RCU read-side section in which
	 * the lookup was done.
	 */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	/* Also reached with dev == NULL when nothing matched; presumably
	 * dev_hold() tolerates NULL - TODO confirm.
	 */
	dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
    786
    787/**
    788 *	__dev_get_by_index - find a device by its ifindex
    789 *	@net: the applicable net namespace
    790 *	@ifindex: index of device
    791 *
    792 *	Search for an interface by index. Returns %NULL if the device
    793 *	is not found or a pointer to the device. The device has not
    794 *	had its reference counter increased so the caller must be careful
    795 *	about locking. The caller must hold either the RTNL semaphore
    796 *	or @dev_base_lock.
    797 */
    798
    799struct net_device *__dev_get_by_index(struct net *net, int ifindex)
    800{
    801	struct net_device *dev;
    802	struct hlist_head *head = dev_index_hash(net, ifindex);
    803
    804	hlist_for_each_entry(dev, head, index_hlist)
    805		if (dev->ifindex == ifindex)
    806			return dev;
    807
    808	return NULL;
    809}
    810EXPORT_SYMBOL(__dev_get_by_index);
    811
    812/**
    813 *	dev_get_by_index_rcu - find a device by its ifindex
    814 *	@net: the applicable net namespace
    815 *	@ifindex: index of device
    816 *
    817 *	Search for an interface by index. Returns %NULL if the device
    818 *	is not found or a pointer to the device. The device has not
    819 *	had its reference counter increased so the caller must be careful
    820 *	about locking. The caller must hold RCU lock.
    821 */
    822
    823struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
    824{
    825	struct net_device *dev;
    826	struct hlist_head *head = dev_index_hash(net, ifindex);
    827
    828	hlist_for_each_entry_rcu(dev, head, index_hlist)
    829		if (dev->ifindex == ifindex)
    830			return dev;
    831
    832	return NULL;
    833}
    834EXPORT_SYMBOL(dev_get_by_index_rcu);
    835
    836
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	/* The reference is taken inside the RCU read-side section in which
	 * the lookup was done.
	 */
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	/* Also reached with dev == NULL when nothing matched; presumably
	 * dev_hold() tolerates NULL - TODO confirm.
	 */
	dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
    859
    860/**
    861 *	dev_get_by_napi_id - find a device by napi_id
    862 *	@napi_id: ID of the NAPI struct
    863 *
    864 *	Search for an interface by NAPI ID. Returns %NULL if the device
    865 *	is not found or a pointer to the device. The device has not had
    866 *	its reference counter increased so the caller must be careful
    867 *	about locking. The caller must hold RCU lock.
    868 */
    869
    870struct net_device *dev_get_by_napi_id(unsigned int napi_id)
    871{
    872	struct napi_struct *napi;
    873
    874	WARN_ON_ONCE(!rcu_read_lock_held());
    875
    876	if (napi_id < MIN_NAPI_ID)
    877		return NULL;
    878
    879	napi = napi_by_id(napi_id);
    880
    881	return napi ? napi->dev : NULL;
    882}
    883EXPORT_SYMBOL(dev_get_by_napi_id);
    884
    885/**
    886 *	netdev_get_name - get a netdevice name, knowing its ifindex.
    887 *	@net: network namespace
    888 *	@name: a pointer to the buffer where the name will be stored.
    889 *	@ifindex: the ifindex of the interface to get the name from.
    890 */
    891int netdev_get_name(struct net *net, char *name, int ifindex)
    892{
    893	struct net_device *dev;
    894	int ret;
    895
    896	down_read(&devnet_rename_sem);
    897	rcu_read_lock();
    898
    899	dev = dev_get_by_index_rcu(net, ifindex);
    900	if (!dev) {
    901		ret = -ENODEV;
    902		goto out;
    903	}
    904
    905	strcpy(name, dev->name);
    906
    907	ret = 0;
    908out:
    909	rcu_read_unlock();
    910	up_read(&devnet_rename_sem);
    911	return ret;
    912}
    913
    914/**
    915 *	dev_getbyhwaddr_rcu - find a device by its hardware address
    916 *	@net: the applicable net namespace
    917 *	@type: media type of device
    918 *	@ha: hardware address
    919 *
    920 *	Search for an interface by MAC address. Returns NULL if the device
    921 *	is not found or a pointer to the device.
    922 *	The caller must hold RCU or RTNL.
    923 *	The returned device has not had its ref count increased
    924 *	and the caller must therefore be careful about locking
    925 *
    926 */
    927
    928struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
    929				       const char *ha)
    930{
    931	struct net_device *dev;
    932
    933	for_each_netdev_rcu(net, dev)
    934		if (dev->type == type &&
    935		    !memcmp(dev->dev_addr, ha, dev->addr_len))
    936			return dev;
    937
    938	return NULL;
    939}
    940EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
    941
    942struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
    943{
    944	struct net_device *dev, *ret = NULL;
    945
    946	rcu_read_lock();
    947	for_each_netdev_rcu(net, dev)
    948		if (dev->type == type) {
    949			dev_hold(dev);
    950			ret = dev;
    951			break;
    952		}
    953	rcu_read_unlock();
    954	return ret;
    955}
    956EXPORT_SYMBOL(dev_getfirstbyhwtype);
    957
    958/**
    959 *	__dev_get_by_flags - find any device with given flags
    960 *	@net: the applicable net namespace
    961 *	@if_flags: IFF_* values
    962 *	@mask: bitmask of bits in if_flags to check
    963 *
    964 *	Search for any interface with the given flags. Returns NULL if a device
    965 *	is not found or a pointer to the device. Must be called inside
    966 *	rtnl_lock(), and result refcount is unchanged.
    967 */
    968
    969struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
    970				      unsigned short mask)
    971{
    972	struct net_device *dev, *ret;
    973
    974	ASSERT_RTNL();
    975
    976	ret = NULL;
    977	for_each_netdev(net, dev) {
    978		if (((dev->flags ^ if_flags) & mask) == 0) {
    979			ret = dev;
    980			break;
    981		}
    982	}
    983	return ret;
    984}
    985EXPORT_SYMBOL(__dev_get_by_flags);
    986
    987/**
    988 *	dev_valid_name - check if name is okay for network device
    989 *	@name: name string
    990 *
    991 *	Network device names need to be valid file names to
    992 *	allow sysfs to work.  We also disallow any kind of
    993 *	whitespace.
    994 */
    995bool dev_valid_name(const char *name)
    996{
    997	if (*name == '\0')
    998		return false;
    999	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
   1000		return false;
   1001	if (!strcmp(name, ".") || !strcmp(name, ".."))
   1002		return false;
   1003
   1004	while (*name) {
   1005		if (*name == '/' || *name == ':' || isspace(*name))
   1006			return false;
   1007		name++;
   1008	}
   1009	return true;
   1010}
   1011EXPORT_SYMBOL(dev_valid_name);
   1012
   1013/**
   1014 *	__dev_alloc_name - allocate a name for a device
   1015 *	@net: network namespace to allocate the device name in
   1016 *	@name: name format string
   1017 *	@buf:  scratch buffer and result name string
   1018 *
   1019 *	Passed a format string - eg "lt%d" it will try and find a suitable
   1020 *	id. It scans list of devices to build up a free map, then chooses
   1021 *	the first empty slot. The caller must hold the dev_base or rtnl lock
   1022 *	while allocating the name and adding the device in order to avoid
   1023 *	duplicates.
   1024 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
   1025 *	Returns the number of the unit assigned or a negative errno code.
   1026 */
   1027
   1028static int __dev_alloc_name(struct net *net, const char *name, char *buf)
   1029{
   1030	int i = 0;
   1031	const char *p;
   1032	const int max_netdevices = 8*PAGE_SIZE;
   1033	unsigned long *inuse;
   1034	struct net_device *d;
   1035
   1036	if (!dev_valid_name(name))
   1037		return -EINVAL;
   1038
   1039	p = strchr(name, '%');
   1040	if (p) {
   1041		/*
   1042		 * Verify the string as this thing may have come from
   1043		 * the user.  There must be either one "%d" and no other "%"
   1044		 * characters.
   1045		 */
   1046		if (p[1] != 'd' || strchr(p + 2, '%'))
   1047			return -EINVAL;
   1048
   1049		/* Use one page as a bit array of possible slots */
   1050		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
   1051		if (!inuse)
   1052			return -ENOMEM;
   1053
   1054		for_each_netdev(net, d) {
   1055			struct netdev_name_node *name_node;
   1056			list_for_each_entry(name_node, &d->name_node->list, list) {
   1057				if (!sscanf(name_node->name, name, &i))
   1058					continue;
   1059				if (i < 0 || i >= max_netdevices)
   1060					continue;
   1061
   1062				/*  avoid cases where sscanf is not exact inverse of printf */
   1063				snprintf(buf, IFNAMSIZ, name, i);
   1064				if (!strncmp(buf, name_node->name, IFNAMSIZ))
   1065					__set_bit(i, inuse);
   1066			}
   1067			if (!sscanf(d->name, name, &i))
   1068				continue;
   1069			if (i < 0 || i >= max_netdevices)
   1070				continue;
   1071
   1072			/*  avoid cases where sscanf is not exact inverse of printf */
   1073			snprintf(buf, IFNAMSIZ, name, i);
   1074			if (!strncmp(buf, d->name, IFNAMSIZ))
   1075				__set_bit(i, inuse);
   1076		}
   1077
   1078		i = find_first_zero_bit(inuse, max_netdevices);
   1079		free_page((unsigned long) inuse);
   1080	}
   1081
   1082	snprintf(buf, IFNAMSIZ, name, i);
   1083	if (!netdev_name_in_use(net, buf))
   1084		return i;
   1085
   1086	/* It is possible to run out of possible slots
   1087	 * when the name is long and there isn't enough space left
   1088	 * for the digits, or if all bits are used.
   1089	 */
   1090	return -ENFILE;
   1091}
   1092
   1093static int dev_alloc_name_ns(struct net *net,
   1094			     struct net_device *dev,
   1095			     const char *name)
   1096{
   1097	char buf[IFNAMSIZ];
   1098	int ret;
   1099
   1100	BUG_ON(!net);
   1101	ret = __dev_alloc_name(net, name, buf);
   1102	if (ret >= 0)
   1103		strlcpy(dev->name, buf, IFNAMSIZ);
   1104	return ret;
   1105}
   1106
   1107/**
   1108 *	dev_alloc_name - allocate a name for a device
   1109 *	@dev: device
   1110 *	@name: name format string
   1111 *
   1112 *	Passed a format string - eg "lt%d" it will try and find a suitable
   1113 *	id. It scans list of devices to build up a free map, then chooses
   1114 *	the first empty slot. The caller must hold the dev_base or rtnl lock
   1115 *	while allocating the name and adding the device in order to avoid
   1116 *	duplicates.
   1117 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
   1118 *	Returns the number of the unit assigned or a negative errno code.
   1119 */
   1120
   1121int dev_alloc_name(struct net_device *dev, const char *name)
   1122{
   1123	return dev_alloc_name_ns(dev_net(dev), dev, name);
   1124}
   1125EXPORT_SYMBOL(dev_alloc_name);
   1126
   1127static int dev_get_valid_name(struct net *net, struct net_device *dev,
   1128			      const char *name)
   1129{
   1130	BUG_ON(!net);
   1131
   1132	if (!dev_valid_name(name))
   1133		return -EINVAL;
   1134
   1135	if (strchr(name, '%'))
   1136		return dev_alloc_name_ns(net, dev, name);
   1137	else if (netdev_name_in_use(net, name))
   1138		return -EEXIST;
   1139	else if (dev->name != name)
   1140		strlcpy(dev->name, name, IFNAMSIZ);
   1141
   1142	return 0;
   1143}
   1144
   1145/**
   1146 *	dev_change_name - change name of a device
   1147 *	@dev: device
   1148 *	@newname: name (or format string) must be at least IFNAMSIZ
   1149 *
   1150 *	Change name of a device, can pass format strings "eth%d".
   1151 *	for wildcarding.
   1152 */
   1153int dev_change_name(struct net_device *dev, const char *newname)
   1154{
   1155	unsigned char old_assign_type;
   1156	char oldname[IFNAMSIZ];
   1157	int err = 0;
   1158	int ret;
   1159	struct net *net;
   1160
   1161	ASSERT_RTNL();
   1162	BUG_ON(!dev_net(dev));
   1163
   1164	net = dev_net(dev);
   1165
   1166	/* Some auto-enslaved devices e.g. failover slaves are
   1167	 * special, as userspace might rename the device after
   1168	 * the interface had been brought up and running since
   1169	 * the point kernel initiated auto-enslavement. Allow
   1170	 * live name change even when these slave devices are
   1171	 * up and running.
   1172	 *
   1173	 * Typically, users of these auto-enslaving devices
   1174	 * don't actually care about slave name change, as
   1175	 * they are supposed to operate on master interface
   1176	 * directly.
   1177	 */
   1178	if (dev->flags & IFF_UP &&
   1179	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
   1180		return -EBUSY;
   1181
   1182	down_write(&devnet_rename_sem);
   1183
   1184	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
   1185		up_write(&devnet_rename_sem);
   1186		return 0;
   1187	}
   1188
   1189	memcpy(oldname, dev->name, IFNAMSIZ);
   1190
   1191	err = dev_get_valid_name(net, dev, newname);
   1192	if (err < 0) {
   1193		up_write(&devnet_rename_sem);
   1194		return err;
   1195	}
   1196
   1197	if (oldname[0] && !strchr(oldname, '%'))
   1198		netdev_info(dev, "renamed from %s\n", oldname);
   1199
   1200	old_assign_type = dev->name_assign_type;
   1201	dev->name_assign_type = NET_NAME_RENAMED;
   1202
   1203rollback:
   1204	ret = device_rename(&dev->dev, dev->name);
   1205	if (ret) {
   1206		memcpy(dev->name, oldname, IFNAMSIZ);
   1207		dev->name_assign_type = old_assign_type;
   1208		up_write(&devnet_rename_sem);
   1209		return ret;
   1210	}
   1211
   1212	up_write(&devnet_rename_sem);
   1213
   1214	netdev_adjacent_rename_links(dev, oldname);
   1215
   1216	write_lock(&dev_base_lock);
   1217	netdev_name_node_del(dev->name_node);
   1218	write_unlock(&dev_base_lock);
   1219
   1220	synchronize_rcu();
   1221
   1222	write_lock(&dev_base_lock);
   1223	netdev_name_node_add(net, dev->name_node);
   1224	write_unlock(&dev_base_lock);
   1225
   1226	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
   1227	ret = notifier_to_errno(ret);
   1228
   1229	if (ret) {
   1230		/* err >= 0 after dev_alloc_name() or stores the first errno */
   1231		if (err >= 0) {
   1232			err = ret;
   1233			down_write(&devnet_rename_sem);
   1234			memcpy(dev->name, oldname, IFNAMSIZ);
   1235			memcpy(oldname, newname, IFNAMSIZ);
   1236			dev->name_assign_type = old_assign_type;
   1237			old_assign_type = NET_NAME_RENAMED;
   1238			goto rollback;
   1239		} else {
   1240			netdev_err(dev, "name change rollback failed: %d\n",
   1241				   ret);
   1242		}
   1243	}
   1244
   1245	return err;
   1246}
   1247
   1248/**
   1249 *	dev_set_alias - change ifalias of a device
   1250 *	@dev: device
   1251 *	@alias: name up to IFALIASZ
   1252 *	@len: limit of bytes to copy from info
   1253 *
   1254 *	Set ifalias for a device,
   1255 */
   1256int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
   1257{
   1258	struct dev_ifalias *new_alias = NULL;
   1259
   1260	if (len >= IFALIASZ)
   1261		return -EINVAL;
   1262
   1263	if (len) {
   1264		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
   1265		if (!new_alias)
   1266			return -ENOMEM;
   1267
   1268		memcpy(new_alias->ifalias, alias, len);
   1269		new_alias->ifalias[len] = 0;
   1270	}
   1271
   1272	mutex_lock(&ifalias_mutex);
   1273	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
   1274					mutex_is_locked(&ifalias_mutex));
   1275	mutex_unlock(&ifalias_mutex);
   1276
   1277	if (new_alias)
   1278		kfree_rcu(new_alias, rcuhead);
   1279
   1280	return len;
   1281}
   1282EXPORT_SYMBOL(dev_set_alias);
   1283
   1284/**
   1285 *	dev_get_alias - get ifalias of a device
   1286 *	@dev: device
   1287 *	@name: buffer to store name of ifalias
   1288 *	@len: size of buffer
   1289 *
   1290 *	get ifalias for a device.  Caller must make sure dev cannot go
   1291 *	away,  e.g. rcu read lock or own a reference count to device.
   1292 */
   1293int dev_get_alias(const struct net_device *dev, char *name, size_t len)
   1294{
   1295	const struct dev_ifalias *alias;
   1296	int ret = 0;
   1297
   1298	rcu_read_lock();
   1299	alias = rcu_dereference(dev->ifalias);
   1300	if (alias)
   1301		ret = snprintf(name, len, "%s", alias->ifalias);
   1302	rcu_read_unlock();
   1303
   1304	return ret;
   1305}
   1306
   1307/**
   1308 *	netdev_features_change - device changes features
   1309 *	@dev: device to cause notification
   1310 *
   1311 *	Called to indicate a device has changed features.
   1312 */
   1313void netdev_features_change(struct net_device *dev)
   1314{
   1315	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
   1316}
   1317EXPORT_SYMBOL(netdev_features_change);
   1318
   1319/**
   1320 *	netdev_state_change - device changes state
   1321 *	@dev: device to cause notification
   1322 *
   1323 *	Called to indicate a device has changed state. This function calls
   1324 *	the notifier chains for netdev_chain and sends a NEWLINK message
   1325 *	to the routing socket.
   1326 */
   1327void netdev_state_change(struct net_device *dev)
   1328{
   1329	if (dev->flags & IFF_UP) {
   1330		struct netdev_notifier_change_info change_info = {
   1331			.info.dev = dev,
   1332		};
   1333
   1334		call_netdevice_notifiers_info(NETDEV_CHANGE,
   1335					      &change_info.info);
   1336		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
   1337	}
   1338}
   1339EXPORT_SYMBOL(netdev_state_change);
   1340
   1341/**
   1342 * __netdev_notify_peers - notify network peers about existence of @dev,
   1343 * to be called when rtnl lock is already held.
   1344 * @dev: network device
   1345 *
   1346 * Generate traffic such that interested network peers are aware of
   1347 * @dev, such as by generating a gratuitous ARP. This may be used when
   1348 * a device wants to inform the rest of the network about some sort of
   1349 * reconfiguration such as a failover event or virtual machine
   1350 * migration.
   1351 */
   1352void __netdev_notify_peers(struct net_device *dev)
   1353{
   1354	ASSERT_RTNL();
   1355	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
   1356	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
   1357}
   1358EXPORT_SYMBOL(__netdev_notify_peers);
   1359
   1360/**
   1361 * netdev_notify_peers - notify network peers about existence of @dev
   1362 * @dev: network device
   1363 *
   1364 * Generate traffic such that interested network peers are aware of
   1365 * @dev, such as by generating a gratuitous ARP. This may be used when
   1366 * a device wants to inform the rest of the network about some sort of
   1367 * reconfiguration such as a failover event or virtual machine
   1368 * migration.
   1369 */
   1370void netdev_notify_peers(struct net_device *dev)
   1371{
   1372	rtnl_lock();
   1373	__netdev_notify_peers(dev);
   1374	rtnl_unlock();
   1375}
   1376EXPORT_SYMBOL(netdev_notify_peers);
   1377
   1378static int napi_threaded_poll(void *data);
   1379
   1380static int napi_kthread_create(struct napi_struct *n)
   1381{
   1382	int err = 0;
   1383
   1384	/* Create and wake up the kthread once to put it in
   1385	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
   1386	 * warning and work with loadavg.
   1387	 */
   1388	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
   1389				n->dev->name, n->napi_id);
   1390	if (IS_ERR(n->thread)) {
   1391		err = PTR_ERR(n->thread);
   1392		pr_err("kthread_run failed with err %d\n", err);
   1393		n->thread = NULL;
   1394	}
   1395
   1396	return err;
   1397}
   1398
   1399static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
   1400{
   1401	const struct net_device_ops *ops = dev->netdev_ops;
   1402	int ret;
   1403
   1404	ASSERT_RTNL();
   1405	dev_addr_check(dev);
   1406
   1407	if (!netif_device_present(dev)) {
   1408		/* may be detached because parent is runtime-suspended */
   1409		if (dev->dev.parent)
   1410			pm_runtime_resume(dev->dev.parent);
   1411		if (!netif_device_present(dev))
   1412			return -ENODEV;
   1413	}
   1414
   1415	/* Block netpoll from trying to do any rx path servicing.
   1416	 * If we don't do this there is a chance ndo_poll_controller
   1417	 * or ndo_poll may be running while we open the device
   1418	 */
   1419	netpoll_poll_disable(dev);
   1420
   1421	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
   1422	ret = notifier_to_errno(ret);
   1423	if (ret)
   1424		return ret;
   1425
   1426	set_bit(__LINK_STATE_START, &dev->state);
   1427
   1428	if (ops->ndo_validate_addr)
   1429		ret = ops->ndo_validate_addr(dev);
   1430
   1431	if (!ret && ops->ndo_open)
   1432		ret = ops->ndo_open(dev);
   1433
   1434	netpoll_poll_enable(dev);
   1435
   1436	if (ret)
   1437		clear_bit(__LINK_STATE_START, &dev->state);
   1438	else {
   1439		dev->flags |= IFF_UP;
   1440		dev_set_rx_mode(dev);
   1441		dev_activate(dev);
   1442		add_device_randomness(dev->dev_addr, dev->addr_len);
   1443	}
   1444
   1445	return ret;
   1446}
   1447
   1448/**
   1449 *	dev_open	- prepare an interface for use.
   1450 *	@dev: device to open
   1451 *	@extack: netlink extended ack
   1452 *
   1453 *	Takes a device from down to up state. The device's private open
   1454 *	function is invoked and then the multicast lists are loaded. Finally
   1455 *	the device is moved into the up state and a %NETDEV_UP message is
   1456 *	sent to the netdev notifier chain.
   1457 *
   1458 *	Calling this function on an active interface is a nop. On a failure
   1459 *	a negative errno code is returned.
   1460 */
   1461int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
   1462{
   1463	int ret;
   1464
   1465	if (dev->flags & IFF_UP)
   1466		return 0;
   1467
   1468	ret = __dev_open(dev, extack);
   1469	if (ret < 0)
   1470		return ret;
   1471
   1472	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
   1473	call_netdevice_notifiers(NETDEV_UP, dev);
   1474
   1475	return ret;
   1476}
   1477EXPORT_SYMBOL(dev_open);
   1478
   1479static void __dev_close_many(struct list_head *head)
   1480{
   1481	struct net_device *dev;
   1482
   1483	ASSERT_RTNL();
   1484	might_sleep();
   1485
   1486	list_for_each_entry(dev, head, close_list) {
   1487		/* Temporarily disable netpoll until the interface is down */
   1488		netpoll_poll_disable(dev);
   1489
   1490		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
   1491
   1492		clear_bit(__LINK_STATE_START, &dev->state);
   1493
   1494		/* Synchronize to scheduled poll. We cannot touch poll list, it
   1495		 * can be even on different cpu. So just clear netif_running().
   1496		 *
   1497		 * dev->stop() will invoke napi_disable() on all of it's
   1498		 * napi_struct instances on this device.
   1499		 */
   1500		smp_mb__after_atomic(); /* Commit netif_running(). */
   1501	}
   1502
   1503	dev_deactivate_many(head);
   1504
   1505	list_for_each_entry(dev, head, close_list) {
   1506		const struct net_device_ops *ops = dev->netdev_ops;
   1507
   1508		/*
   1509		 *	Call the device specific close. This cannot fail.
   1510		 *	Only if device is UP
   1511		 *
   1512		 *	We allow it to be called even after a DETACH hot-plug
   1513		 *	event.
   1514		 */
   1515		if (ops->ndo_stop)
   1516			ops->ndo_stop(dev);
   1517
   1518		dev->flags &= ~IFF_UP;
   1519		netpoll_poll_enable(dev);
   1520	}
   1521}
   1522
   1523static void __dev_close(struct net_device *dev)
   1524{
   1525	LIST_HEAD(single);
   1526
   1527	list_add(&dev->close_list, &single);
   1528	__dev_close_many(&single);
   1529	list_del(&single);
   1530}
   1531
   1532void dev_close_many(struct list_head *head, bool unlink)
   1533{
   1534	struct net_device *dev, *tmp;
   1535
   1536	/* Remove the devices that don't need to be closed */
   1537	list_for_each_entry_safe(dev, tmp, head, close_list)
   1538		if (!(dev->flags & IFF_UP))
   1539			list_del_init(&dev->close_list);
   1540
   1541	__dev_close_many(head);
   1542
   1543	list_for_each_entry_safe(dev, tmp, head, close_list) {
   1544		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
   1545		call_netdevice_notifiers(NETDEV_DOWN, dev);
   1546		if (unlink)
   1547			list_del_init(&dev->close_list);
   1548	}
   1549}
   1550EXPORT_SYMBOL(dev_close_many);
   1551
   1552/**
   1553 *	dev_close - shutdown an interface.
   1554 *	@dev: device to shutdown
   1555 *
   1556 *	This function moves an active device into down state. A
   1557 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
   1558 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
   1559 *	chain.
   1560 */
   1561void dev_close(struct net_device *dev)
   1562{
   1563	if (dev->flags & IFF_UP) {
   1564		LIST_HEAD(single);
   1565
   1566		list_add(&dev->close_list, &single);
   1567		dev_close_many(&single, true);
   1568		list_del(&single);
   1569	}
   1570}
   1571EXPORT_SYMBOL(dev_close);
   1572
   1573
   1574/**
   1575 *	dev_disable_lro - disable Large Receive Offload on a device
   1576 *	@dev: device
   1577 *
   1578 *	Disable Large Receive Offload (LRO) on a net device.  Must be
   1579 *	called under RTNL.  This is needed if received packets may be
   1580 *	forwarded to another interface.
   1581 */
   1582void dev_disable_lro(struct net_device *dev)
   1583{
   1584	struct net_device *lower_dev;
   1585	struct list_head *iter;
   1586
   1587	dev->wanted_features &= ~NETIF_F_LRO;
   1588	netdev_update_features(dev);
   1589
   1590	if (unlikely(dev->features & NETIF_F_LRO))
   1591		netdev_WARN(dev, "failed to disable LRO!\n");
   1592
   1593	netdev_for_each_lower_dev(dev, lower_dev, iter)
   1594		dev_disable_lro(lower_dev);
   1595}
   1596EXPORT_SYMBOL(dev_disable_lro);
   1597
   1598/**
   1599 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
   1600 *	@dev: device
   1601 *
   1602 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
   1603 *	called under RTNL.  This is needed if Generic XDP is installed on
   1604 *	the device.
   1605 */
   1606static void dev_disable_gro_hw(struct net_device *dev)
   1607{
   1608	dev->wanted_features &= ~NETIF_F_GRO_HW;
   1609	netdev_update_features(dev);
   1610
   1611	if (unlikely(dev->features & NETIF_F_GRO_HW))
   1612		netdev_WARN(dev, "failed to disable GRO_HW!\n");
   1613}
   1614
   1615const char *netdev_cmd_to_name(enum netdev_cmd cmd)
   1616{
   1617#define N(val) 						\
   1618	case NETDEV_##val:				\
   1619		return "NETDEV_" __stringify(val);
   1620	switch (cmd) {
   1621	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
   1622	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
   1623	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
   1624	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
   1625	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
   1626	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
   1627	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
   1628	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
   1629	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
   1630	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
   1631	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
   1632	}
   1633#undef N
   1634	return "UNKNOWN_NETDEV_EVENT";
   1635}
   1636EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
   1637
   1638static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
   1639				   struct net_device *dev)
   1640{
   1641	struct netdev_notifier_info info = {
   1642		.dev = dev,
   1643	};
   1644
   1645	return nb->notifier_call(nb, val, &info);
   1646}
   1647
   1648static int call_netdevice_register_notifiers(struct notifier_block *nb,
   1649					     struct net_device *dev)
   1650{
   1651	int err;
   1652
   1653	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
   1654	err = notifier_to_errno(err);
   1655	if (err)
   1656		return err;
   1657
   1658	if (!(dev->flags & IFF_UP))
   1659		return 0;
   1660
   1661	call_netdevice_notifier(nb, NETDEV_UP, dev);
   1662	return 0;
   1663}
   1664
   1665static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
   1666						struct net_device *dev)
   1667{
   1668	if (dev->flags & IFF_UP) {
   1669		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
   1670					dev);
   1671		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
   1672	}
   1673	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
   1674}
   1675
   1676static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
   1677						 struct net *net)
   1678{
   1679	struct net_device *dev;
   1680	int err;
   1681
   1682	for_each_netdev(net, dev) {
   1683		err = call_netdevice_register_notifiers(nb, dev);
   1684		if (err)
   1685			goto rollback;
   1686	}
   1687	return 0;
   1688
   1689rollback:
   1690	for_each_netdev_continue_reverse(net, dev)
   1691		call_netdevice_unregister_notifiers(nb, dev);
   1692	return err;
   1693}
   1694
   1695static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
   1696						    struct net *net)
   1697{
   1698	struct net_device *dev;
   1699
   1700	for_each_netdev(net, dev)
   1701		call_netdevice_unregister_notifiers(nb, dev);
   1702}
   1703
   1704static int dev_boot_phase = 1;
   1705
   1706/**
   1707 * register_netdevice_notifier - register a network notifier block
   1708 * @nb: notifier
   1709 *
   1710 * Register a notifier to be called when network device events occur.
   1711 * The notifier passed is linked into the kernel structures and must
   1712 * not be reused until it has been unregistered. A negative errno code
   1713 * is returned on a failure.
   1714 *
   1715 * When registered all registration and up events are replayed
   1716 * to the new notifier to allow device to have a race free
   1717 * view of the network device list.
   1718 */
   1719
   1720int register_netdevice_notifier(struct notifier_block *nb)
   1721{
   1722	struct net *net;
   1723	int err;
   1724
   1725	/* Close race with setup_net() and cleanup_net() */
   1726	down_write(&pernet_ops_rwsem);
   1727	rtnl_lock();
   1728	err = raw_notifier_chain_register(&netdev_chain, nb);
   1729	if (err)
   1730		goto unlock;
   1731	if (dev_boot_phase)
   1732		goto unlock;
   1733	for_each_net(net) {
   1734		err = call_netdevice_register_net_notifiers(nb, net);
   1735		if (err)
   1736			goto rollback;
   1737	}
   1738
   1739unlock:
   1740	rtnl_unlock();
   1741	up_write(&pernet_ops_rwsem);
   1742	return err;
   1743
   1744rollback:
   1745	for_each_net_continue_reverse(net)
   1746		call_netdevice_unregister_net_notifiers(nb, net);
   1747
   1748	raw_notifier_chain_unregister(&netdev_chain, nb);
   1749	goto unlock;
   1750}
   1751EXPORT_SYMBOL(register_netdevice_notifier);
   1752
   1753/**
   1754 * unregister_netdevice_notifier - unregister a network notifier block
   1755 * @nb: notifier
   1756 *
   1757 * Unregister a notifier previously registered by
    1758 * register_netdevice_notifier(). The notifier is unlinked from the
   1759 * kernel structures and may then be reused. A negative errno code
   1760 * is returned on a failure.
   1761 *
   1762 * After unregistering unregister and down device events are synthesized
   1763 * for all devices on the device list to the removed notifier to remove
   1764 * the need for special case cleanup code.
   1765 */
   1766
   1767int unregister_netdevice_notifier(struct notifier_block *nb)
   1768{
   1769	struct net *net;
   1770	int err;
   1771
   1772	/* Close race with setup_net() and cleanup_net() */
   1773	down_write(&pernet_ops_rwsem);
   1774	rtnl_lock();
   1775	err = raw_notifier_chain_unregister(&netdev_chain, nb);
   1776	if (err)
   1777		goto unlock;
   1778
   1779	for_each_net(net)
   1780		call_netdevice_unregister_net_notifiers(nb, net);
   1781
   1782unlock:
   1783	rtnl_unlock();
   1784	up_write(&pernet_ops_rwsem);
   1785	return err;
   1786}
   1787EXPORT_SYMBOL(unregister_netdevice_notifier);
   1788
   1789static int __register_netdevice_notifier_net(struct net *net,
   1790					     struct notifier_block *nb,
   1791					     bool ignore_call_fail)
   1792{
   1793	int err;
   1794
   1795	err = raw_notifier_chain_register(&net->netdev_chain, nb);
   1796	if (err)
   1797		return err;
   1798	if (dev_boot_phase)
   1799		return 0;
   1800
   1801	err = call_netdevice_register_net_notifiers(nb, net);
   1802	if (err && !ignore_call_fail)
   1803		goto chain_unregister;
   1804
   1805	return 0;
   1806
   1807chain_unregister:
   1808	raw_notifier_chain_unregister(&net->netdev_chain, nb);
   1809	return err;
   1810}
   1811
   1812static int __unregister_netdevice_notifier_net(struct net *net,
   1813					       struct notifier_block *nb)
   1814{
   1815	int err;
   1816
   1817	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
   1818	if (err)
   1819		return err;
   1820
   1821	call_netdevice_unregister_net_notifiers(nb, net);
   1822	return 0;
   1823}
   1824
   1825/**
   1826 * register_netdevice_notifier_net - register a per-netns network notifier block
   1827 * @net: network namespace
   1828 * @nb: notifier
   1829 *
   1830 * Register a notifier to be called when network device events occur.
   1831 * The notifier passed is linked into the kernel structures and must
   1832 * not be reused until it has been unregistered. A negative errno code
   1833 * is returned on a failure.
   1834 *
   1835 * When registered all registration and up events are replayed
   1836 * to the new notifier to allow device to have a race free
   1837 * view of the network device list.
   1838 */
   1839
   1840int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
   1841{
   1842	int err;
   1843
   1844	rtnl_lock();
   1845	err = __register_netdevice_notifier_net(net, nb, false);
   1846	rtnl_unlock();
   1847	return err;
   1848}
   1849EXPORT_SYMBOL(register_netdevice_notifier_net);
   1850
   1851/**
   1852 * unregister_netdevice_notifier_net - unregister a per-netns
   1853 *                                     network notifier block
   1854 * @net: network namespace
   1855 * @nb: notifier
   1856 *
   1857 * Unregister a notifier previously registered by
    1858 * register_netdevice_notifier_net(). The notifier is unlinked from the
   1859 * kernel structures and may then be reused. A negative errno code
   1860 * is returned on a failure.
   1861 *
   1862 * After unregistering unregister and down device events are synthesized
   1863 * for all devices on the device list to the removed notifier to remove
   1864 * the need for special case cleanup code.
   1865 */
   1866
   1867int unregister_netdevice_notifier_net(struct net *net,
   1868				      struct notifier_block *nb)
   1869{
   1870	int err;
   1871
   1872	rtnl_lock();
   1873	err = __unregister_netdevice_notifier_net(net, nb);
   1874	rtnl_unlock();
   1875	return err;
   1876}
   1877EXPORT_SYMBOL(unregister_netdevice_notifier_net);
   1878
   1879int register_netdevice_notifier_dev_net(struct net_device *dev,
   1880					struct notifier_block *nb,
   1881					struct netdev_net_notifier *nn)
   1882{
   1883	int err;
   1884
   1885	rtnl_lock();
   1886	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
   1887	if (!err) {
   1888		nn->nb = nb;
   1889		list_add(&nn->list, &dev->net_notifier_list);
   1890	}
   1891	rtnl_unlock();
   1892	return err;
   1893}
   1894EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
   1895
   1896int unregister_netdevice_notifier_dev_net(struct net_device *dev,
   1897					  struct notifier_block *nb,
   1898					  struct netdev_net_notifier *nn)
   1899{
   1900	int err;
   1901
   1902	rtnl_lock();
   1903	list_del(&nn->list);
   1904	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
   1905	rtnl_unlock();
   1906	return err;
   1907}
   1908EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
   1909
   1910static void move_netdevice_notifiers_dev_net(struct net_device *dev,
   1911					     struct net *net)
   1912{
   1913	struct netdev_net_notifier *nn;
   1914
   1915	list_for_each_entry(nn, &dev->net_notifier_list, list) {
   1916		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
   1917		__register_netdevice_notifier_net(net, nn->nb, true);
   1918	}
   1919}
   1920
   1921/**
   1922 *	call_netdevice_notifiers_info - call all network notifier blocks
   1923 *	@val: value passed unmodified to notifier function
   1924 *	@info: notifier information data
   1925 *
   1926 *	Call all network notifier blocks.  Parameters and return value
   1927 *	are as for raw_notifier_call_chain().
   1928 */
   1929
   1930static int call_netdevice_notifiers_info(unsigned long val,
   1931					 struct netdev_notifier_info *info)
   1932{
   1933	struct net *net = dev_net(info->dev);
   1934	int ret;
   1935
   1936	ASSERT_RTNL();
   1937
   1938	/* Run per-netns notifier block chain first, then run the global one.
   1939	 * Hopefully, one day, the global one is going to be removed after
   1940	 * all notifier block registrators get converted to be per-netns.
   1941	 */
   1942	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
   1943	if (ret & NOTIFY_STOP_MASK)
   1944		return ret;
   1945	return raw_notifier_call_chain(&netdev_chain, val, info);
   1946}
   1947
   1948/**
   1949 *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
   1950 *	                                       for and rollback on error
   1951 *	@val_up: value passed unmodified to notifier function
   1952 *	@val_down: value passed unmodified to the notifier function when
   1953 *	           recovering from an error on @val_up
   1954 *	@info: notifier information data
   1955 *
   1956 *	Call all per-netns network notifier blocks, but not notifier blocks on
   1957 *	the global notifier chain. Parameters and return value are as for
   1958 *	raw_notifier_call_chain_robust().
   1959 */
   1960
   1961static int
   1962call_netdevice_notifiers_info_robust(unsigned long val_up,
   1963				     unsigned long val_down,
   1964				     struct netdev_notifier_info *info)
   1965{
   1966	struct net *net = dev_net(info->dev);
   1967
   1968	ASSERT_RTNL();
   1969
   1970	return raw_notifier_call_chain_robust(&net->netdev_chain,
   1971					      val_up, val_down, info);
   1972}
   1973
   1974static int call_netdevice_notifiers_extack(unsigned long val,
   1975					   struct net_device *dev,
   1976					   struct netlink_ext_ack *extack)
   1977{
   1978	struct netdev_notifier_info info = {
   1979		.dev = dev,
   1980		.extack = extack,
   1981	};
   1982
   1983	return call_netdevice_notifiers_info(val, &info);
   1984}
   1985
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks without extended ack information.
 *	Parameters and return value are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	/* Same as the extack variant, with no extack to report into. */
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
   2000
   2001/**
   2002 *	call_netdevice_notifiers_mtu - call all network notifier blocks
   2003 *	@val: value passed unmodified to notifier function
   2004 *	@dev: net_device pointer passed unmodified to notifier function
   2005 *	@arg: additional u32 argument passed to the notifier function
   2006 *
   2007 *	Call all network notifier blocks.  Parameters and return value
   2008 *	are as for raw_notifier_call_chain().
   2009 */
   2010static int call_netdevice_notifiers_mtu(unsigned long val,
   2011					struct net_device *dev, u32 arg)
   2012{
   2013	struct netdev_notifier_info_ext info = {
   2014		.info.dev = dev,
   2015		.ext.mtu = arg,
   2016	};
   2017
   2018	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
   2019
   2020	return call_netdevice_notifiers_info(val, &info.info);
   2021}
   2022
#ifdef CONFIG_NET_INGRESS
/* Static key counting ingress users; defined in the "false" state. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
   2038
#ifdef CONFIG_NET_EGRESS
/* Static key counting egress users; defined in the "false" state. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
   2054
   2055DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
   2056EXPORT_SYMBOL(netstamp_needed_key);
   2057#ifdef CONFIG_JUMP_LABEL
   2058static atomic_t netstamp_needed_deferred;
   2059static atomic_t netstamp_wanted;
   2060static void netstamp_clear(struct work_struct *work)
   2061{
   2062	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
   2063	int wanted;
   2064
   2065	wanted = atomic_add_return(deferred, &netstamp_wanted);
   2066	if (wanted > 0)
   2067		static_branch_enable(&netstamp_needed_key);
   2068	else
   2069		static_branch_disable(&netstamp_needed_key);
   2070}
   2071static DECLARE_WORK(netstamp_work, netstamp_clear);
   2072#endif
   2073
   2074void net_enable_timestamp(void)
   2075{
   2076#ifdef CONFIG_JUMP_LABEL
   2077	int wanted;
   2078
   2079	while (1) {
   2080		wanted = atomic_read(&netstamp_wanted);
   2081		if (wanted <= 0)
   2082			break;
   2083		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
   2084			return;
   2085	}
   2086	atomic_inc(&netstamp_needed_deferred);
   2087	schedule_work(&netstamp_work);
   2088#else
   2089	static_branch_inc(&netstamp_needed_key);
   2090#endif
   2091}
   2092EXPORT_SYMBOL(net_enable_timestamp);
   2093
   2094void net_disable_timestamp(void)
   2095{
   2096#ifdef CONFIG_JUMP_LABEL
   2097	int wanted;
   2098
   2099	while (1) {
   2100		wanted = atomic_read(&netstamp_wanted);
   2101		if (wanted <= 1)
   2102			break;
   2103		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
   2104			return;
   2105	}
   2106	atomic_dec(&netstamp_needed_deferred);
   2107	schedule_work(&netstamp_work);
   2108#else
   2109	static_branch_dec(&netstamp_needed_key);
   2110#endif
   2111}
   2112EXPORT_SYMBOL(net_disable_timestamp);
   2113
/* Reset the timestamp state of @skb (tstamp and mono_delivery_time are
 * cleared) and, when netstamp_needed_key is enabled, stamp it with the
 * current real time.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	skb->mono_delivery_time = 0;
	if (static_branch_unlikely(&netstamp_needed_key))
		skb->tstamp = ktime_get_real();
}
   2121
   2122#define net_timestamp_check(COND, SKB)				\
   2123	if (static_branch_unlikely(&netstamp_needed_key)) {	\
   2124		if ((COND) && !(SKB)->tstamp)			\
   2125			(SKB)->tstamp = ktime_get_real();	\
   2126	}							\
   2127
/* Exported wrapper around __is_skb_forwardable() that always passes true
 * as its third argument.
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
   2133
   2134static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
   2135			      bool check_mtu)
   2136{
   2137	int ret = ____dev_forward_skb(dev, skb, check_mtu);
   2138
   2139	if (likely(!ret)) {
   2140		skb->protocol = eth_type_trans(skb, dev);
   2141		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
   2142	}
   2143
   2144	return ret;
   2145}
   2146
/* Exported variant of __dev_forward_skb2() with the MTU check enabled. */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
   2152
   2153/**
   2154 * dev_forward_skb - loopback an skb to another netif
   2155 *
   2156 * @dev: destination network device
   2157 * @skb: buffer to forward
   2158 *
   2159 * return values:
   2160 *	NET_RX_SUCCESS	(no congestion)
   2161 *	NET_RX_DROP     (packet was dropped, but freed)
   2162 *
   2163 * dev_forward_skb can be used for injecting an skb from the
   2164 * start_xmit function of one device into the receive queue
   2165 * of another device.
   2166 *
   2167 * The receiving device may be in another namespace, so
   2168 * we have to clear all information in the skb that could
   2169 * impact namespace isolation.
   2170 */
   2171int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
   2172{
   2173	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
   2174}
   2175EXPORT_SYMBOL_GPL(dev_forward_skb);
   2176
   2177int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
   2178{
   2179	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
   2180}
   2181
/* Hand @skb to the handler of @pt_prev.
 *
 * Returns -ENOMEM when skb_orphan_frags_rx() fails; otherwise an extra
 * reference is taken on skb->users before the handler is called and the
 * handler's return value is passed back.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	/* the handler gets its own reference on the skb */
	refcount_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
   2191
   2192static inline void deliver_ptype_list_skb(struct sk_buff *skb,
   2193					  struct packet_type **pt,
   2194					  struct net_device *orig_dev,
   2195					  __be16 type,
   2196					  struct list_head *ptype_list)
   2197{
   2198	struct packet_type *ptype, *pt_prev = *pt;
   2199
   2200	list_for_each_entry_rcu(ptype, ptype_list, list) {
   2201		if (ptype->type != type)
   2202			continue;
   2203		if (pt_prev)
   2204			deliver_skb(skb, pt_prev, orig_dev);
   2205		pt_prev = ptype;
   2206	}
   2207	*pt = pt_prev;
   2208}
   2209
   2210static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
   2211{
   2212	if (!ptype->af_packet_priv || !skb->sk)
   2213		return false;
   2214
   2215	if (ptype->id_match)
   2216		return ptype->id_match(ptype, skb->sk);
   2217	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
   2218		return true;
   2219
   2220	return false;
   2221}
   2222
   2223/**
   2224 * dev_nit_active - return true if any network interface taps are in use
   2225 *
   2226 * @dev: network device to check for the presence of taps
   2227 */
   2228bool dev_nit_active(struct net_device *dev)
   2229{
   2230	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
   2231}
   2232EXPORT_SYMBOL_GPL(dev_nit_active);
   2233
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	The global ptype_all list is walked first, then the list private
 *	to @dev. The original @skb is never handed to a tap: a single clone
 *	is created on the first matching tap and shared by all of them.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* taps that opted out of outgoing traffic */
		if (ptype->ignore_outgoing)
			continue;

		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* A clone already exists: deliver it to the previously
		 * matched tap and remember this one as the new pending tap.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		/* pt_prev is still NULL here, so nothing is delivered below */
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* second pass: the taps bound to this device only */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last matching tap is called directly, without taking the
	 * extra reference deliver_skb() would take; if orphaning the
	 * frags fails the clone is freed instead.
	 */
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
   2304
   2305/**
   2306 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
   2307 * @dev: Network device
   2308 * @txq: number of queues available
   2309 *
   2310 * If real_num_tx_queues is changed the tc mappings may no longer be
   2311 * valid. To resolve this verify the tc mapping remains valid and if
   2312 * not NULL the mapping. With no priorities mapping to this
   2313 * offset/count pair it will no longer be used. In the worst case TC0
   2314 * is invalid nothing can be done so disable priority mappings. If is
   2315 * expected that drivers will fix this mapping if they can before
   2316 * calling netif_set_real_num_tx_queues.
   2317 */
   2318static void netif_setup_tc(struct net_device *dev, unsigned int txq)
   2319{
   2320	int i;
   2321	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
   2322
   2323	/* If TC0 is invalidated disable TC mapping */
   2324	if (tc->offset + tc->count > txq) {
   2325		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
   2326		dev->num_tc = 0;
   2327		return;
   2328	}
   2329
   2330	/* Invalidated prio to tc mappings set to TC0 */
   2331	for (i = 1; i < TC_BITMASK + 1; i++) {
   2332		int q = netdev_get_prio_tc_map(dev, i);
   2333
   2334		tc = &dev->tc_to_txq[q];
   2335		if (tc->offset + tc->count > txq) {
   2336			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
   2337				    i, q);
   2338			netdev_set_prio_tc_map(dev, i, 0);
   2339		}
   2340	}
   2341}
   2342
   2343int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
   2344{
   2345	if (dev->num_tc) {
   2346		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
   2347		int i;
   2348
   2349		/* walk through the TCs and see if it falls into any of them */
   2350		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
   2351			if ((txq - tc->offset) < tc->count)
   2352				return i;
   2353		}
   2354
   2355		/* didn't find it, just return -1 to indicate no match */
   2356		return -1;
   2357	}
   2358
   2359	return 0;
   2360}
   2361EXPORT_SYMBOL(netdev_txq_to_tc);
   2362
   2363#ifdef CONFIG_XPS
/* Static key: raised once per XPS map type that a device has installed. */
static struct static_key xps_needed __read_mostly;
/* Static key: raised in addition for XPS_RXQS type maps. */
static struct static_key xps_rxqs_needed __read_mostly;
/* Serializes all updates of the XPS maps. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS RCU pointer on the update side; xps_map_mutex must be
 * held.
 */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
   2369
   2370static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
   2371			     struct xps_dev_maps *old_maps, int tci, u16 index)
   2372{
   2373	struct xps_map *map = NULL;
   2374	int pos;
   2375
   2376	if (dev_maps)
   2377		map = xmap_dereference(dev_maps->attr_map[tci]);
   2378	if (!map)
   2379		return false;
   2380
   2381	for (pos = map->len; pos--;) {
   2382		if (map->queues[pos] != index)
   2383			continue;
   2384
   2385		if (map->len > 1) {
   2386			map->queues[pos] = map->queues[--map->len];
   2387			break;
   2388		}
   2389
   2390		if (old_maps)
   2391			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
   2392		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
   2393		kfree_rcu(map, rcu);
   2394		return false;
   2395	}
   2396
   2397	return true;
   2398}
   2399
   2400static bool remove_xps_queue_cpu(struct net_device *dev,
   2401				 struct xps_dev_maps *dev_maps,
   2402				 int cpu, u16 offset, u16 count)
   2403{
   2404	int num_tc = dev_maps->num_tc;
   2405	bool active = false;
   2406	int tci;
   2407
   2408	for (tci = cpu * num_tc; num_tc--; tci++) {
   2409		int i, j;
   2410
   2411		for (i = count, j = offset; i--; j++) {
   2412			if (!remove_xps_queue(dev_maps, NULL, tci, j))
   2413				break;
   2414		}
   2415
   2416		active |= i < 0;
   2417	}
   2418
   2419	return active;
   2420}
   2421
/* Tear down the XPS maps of kind @type on @dev: drop the static key
 * reference(s) taken for them, clear dev->xps_maps[@type] and free
 * @dev_maps after a grace period.
 */
static void reset_xps_maps(struct net_device *dev,
			   struct xps_dev_maps *dev_maps,
			   enum xps_map_type type)
{
	static_key_slow_dec_cpuslocked(&xps_needed);
	/* rx-queue maps hold a reference on a second key as well */
	if (type == XPS_RXQS)
		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);

	RCU_INIT_POINTER(dev->xps_maps[type], NULL);

	kfree_rcu(dev_maps, rcu);
}
   2434
   2435static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
   2436			   u16 offset, u16 count)
   2437{
   2438	struct xps_dev_maps *dev_maps;
   2439	bool active = false;
   2440	int i, j;
   2441
   2442	dev_maps = xmap_dereference(dev->xps_maps[type]);
   2443	if (!dev_maps)
   2444		return;
   2445
   2446	for (j = 0; j < dev_maps->nr_ids; j++)
   2447		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
   2448	if (!active)
   2449		reset_xps_maps(dev, dev_maps, type);
   2450
   2451	if (type == XPS_CPUS) {
   2452		for (i = offset + (count - 1); count--; i--)
   2453			netdev_queue_numa_node_write(
   2454				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
   2455	}
   2456}
   2457
/* Remove the tx queues [@offset, @offset + @count) of @dev from all XPS
 * maps. Returns immediately while the xps_needed key is off; otherwise the
 * work is done under cpus_read_lock() and xps_map_mutex.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
{
	if (!static_key_false(&xps_needed))
		return;

	cpus_read_lock();
	mutex_lock(&xps_map_mutex);

	/* rx-queue maps are only looked at while their key is on */
	if (static_key_false(&xps_rxqs_needed))
		clean_xps_maps(dev, XPS_RXQS, offset, count);

	clean_xps_maps(dev, XPS_CPUS, offset, count);

	mutex_unlock(&xps_map_mutex);
	cpus_read_unlock();
}
   2475
/* Reset XPS for every tx queue of @dev from @index up to num_tx_queues. */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
   2480
   2481static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
   2482				      u16 index, bool is_rxqs_map)
   2483{
   2484	struct xps_map *new_map;
   2485	int alloc_len = XPS_MIN_MAP_ALLOC;
   2486	int i, pos;
   2487
   2488	for (pos = 0; map && pos < map->len; pos++) {
   2489		if (map->queues[pos] != index)
   2490			continue;
   2491		return map;
   2492	}
   2493
   2494	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
   2495	if (map) {
   2496		if (pos < map->alloc_len)
   2497			return map;
   2498
   2499		alloc_len = map->alloc_len * 2;
   2500	}
   2501
   2502	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
   2503	 *  map
   2504	 */
   2505	if (is_rxqs_map)
   2506		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
   2507	else
   2508		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
   2509				       cpu_to_node(attr_index));
   2510	if (!new_map)
   2511		return NULL;
   2512
   2513	for (i = 0; i < pos; i++)
   2514		new_map->queues[i] = map->queues[i];
   2515	new_map->alloc_len = alloc_len;
   2516	new_map->len = pos;
   2517
   2518	return new_map;
   2519}
   2520
   2521/* Copy xps maps at a given index */
   2522static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
   2523			      struct xps_dev_maps *new_dev_maps, int index,
   2524			      int tc, bool skip_tc)
   2525{
   2526	int i, tci = index * dev_maps->num_tc;
   2527	struct xps_map *map;
   2528
   2529	/* copy maps belonging to foreign traffic classes */
   2530	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
   2531		if (i == tc && skip_tc)
   2532			continue;
   2533
   2534		/* fill in the new device map from the old device map */
   2535		map = xmap_dereference(dev_maps->attr_map[tci]);
   2536		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
   2537	}
   2538}
   2539
/* Install the XPS mapping of tx queue @index on @dev.
 *
 * @mask selects the ids (CPUs for XPS_CPUS, rx queues for XPS_RXQS) that
 * should use the queue; for XPS_CPUS on systems with more than one possible
 * CPU it is additionally restricted to online CPUs. A new map set is built,
 * published with rcu_assign_pointer() and the previous one is retired.
 * Afterwards the queue is removed from every id/traffic class that is not
 * selected.
 *
 * Returns 0 on success, -EINVAL when @dev is a subordinate device
 * (negative num_tc) or the queue maps to no traffic class, and -ENOMEM on
 * allocation failure.
 *
 * Must be called under cpus_read_lock
 */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, enum xps_map_type type)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
	const unsigned long *online_mask = NULL;
	bool active = false, copy = false;
	/* -2: no CPU seen yet, -1: CPUs from more than one node seen */
	int i, j, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	unsigned int nr_ids;

	if (dev->num_tc) {
		/* Do not allow XPS on subordinate device directly */
		num_tc = dev->num_tc;
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps[type]);
	if (type == XPS_RXQS) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		if (num_possible_cpus() > 1)
			online_mask = cpumask_bits(cpu_online_mask);
		nr_ids = nr_cpu_ids;
	}

	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	/* The old dev_maps could be larger or smaller than the one we're
	 * setting up now, as dev->num_tc or nr_ids could have been updated in
	 * between. We could try to be smart, but let's be safe instead and only
	 * copy foreign traffic classes if the two map sizes match.
	 */
	if (dev_maps &&
	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
		copy = true;

	/* allocate memory for queue storage */
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
		/* the container is allocated lazily, on the first selected id */
		if (!new_dev_maps) {
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
			if (!new_dev_maps) {
				mutex_unlock(&xps_map_mutex);
				return -ENOMEM;
			}

			new_dev_maps->nr_ids = nr_ids;
			new_dev_maps->num_tc = num_tc;
		}

		tci = j * num_tc + tc;
		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;

		/* may return the old map itself when it has room left */
		map = expand_xps_map(map, j, index, type == XPS_RXQS);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}

	/* empty selection: nothing to add, only removal below */
	if (!new_dev_maps)
		goto out_no_new_maps;

	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (type == XPS_RXQS)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}

	for (j = 0; j < nr_ids; j++) {
		bool skip_tc = false;

		tci = j * num_tc + tc;
		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
			int pos = 0;

			/* slot of our own tc was filled above, don't copy over it */
			skip_tc = true;

			map = xmap_dereference(new_dev_maps->attr_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (type == XPS_CPUS) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
#endif
		}

		if (copy)
			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
					  skip_tc);
	}

	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	/* NOTE(review): the inner loop runs for the new num_tc while the slot
	 * stride is dev_maps->num_tc. The two are equal whenever copy is true;
	 * confirm the behaviour is intended when they differ.
	 */
	for (j = 0; j < dev_maps->nr_ids; j++) {
		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
			map = xmap_dereference(dev_maps->attr_map[tci]);
			if (!map)
				continue;

			/* maps carried over into the new set must survive */
			if (copy) {
				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
				if (map == new_map)
					continue;
			}

			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
			kfree_rcu(map, rcu);
		}
	}

	old_dev_maps = dev_maps;

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	if (type == XPS_CPUS)
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = 0; j < dev_maps->nr_ids; j++) {
		tci = j * dev_maps->num_tc;

		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
			/* selected id in our own tc: the queue stays */
			if (i == tc &&
			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
				continue;

			active |= remove_xps_queue(dev_maps,
						   copy ? old_dev_maps : NULL,
						   tci, index);
		}
	}

	if (old_dev_maps)
		kfree_rcu(old_dev_maps, rcu);

	/* free map if not active */
	if (!active)
		reset_xps_maps(dev, dev_maps, type);

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for (j = 0; j < nr_ids; j++) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = copy ?
			      xmap_dereference(dev_maps->attr_map[tci]) :
			      NULL;
			/* maps shared with the old set are not ours to free */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
   2742
/* Set the CPU (XPS_CPUS) map of tx queue @index on @dev from @mask.
 * Wraps __netif_set_xps_queue() in cpus_read_lock()/cpus_read_unlock() and
 * returns its result.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	int ret;

	cpus_read_lock();
	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
	cpus_read_unlock();

	return ret;
}
EXPORT_SYMBOL(netif_set_xps_queue);
   2755
   2756#endif
   2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
   2758{
   2759	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
   2760
   2761	/* Unbind any subordinate channels */
   2762	while (txq-- != &dev->_tx[0]) {
   2763		if (txq->sb_dev)
   2764			netdev_unbind_sb_channel(dev, txq->sb_dev);
   2765	}
   2766}
   2767
/* Drop the whole traffic class configuration of @dev: XPS is reset for all
 * tx queues (when CONFIG_XPS is set), all subordinate channels are unbound,
 * num_tc is set to 0 and both the tc-to-txq and prio-to-tc tables are
 * zeroed.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	/* Reset TC configuration of device */
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
   2781
/* Assign the tx queue range [@offset, @offset + @count) to traffic class
 * @tc of @dev. With CONFIG_XPS the XPS state of that queue range is reset
 * first.
 *
 * Returns 0 on success or -EINVAL when @tc is not below dev->num_tc.
 */
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
	if (tc >= dev->num_tc)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues(dev, offset, count);
#endif
	dev->tc_to_txq[tc].count = count;
	dev->tc_to_txq[tc].offset = offset;
	return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);
   2795
/* Set the number of traffic classes of @dev to @num_tc. XPS is reset for
 * all tx queues (when CONFIG_XPS is set) and all subordinate channels are
 * unbound before the new value is stored.
 *
 * Returns 0 on success or -EINVAL when @num_tc exceeds TC_MAX_QUEUE.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
   2810
   2811void netdev_unbind_sb_channel(struct net_device *dev,
   2812			      struct net_device *sb_dev)
   2813{
   2814	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
   2815
   2816#ifdef CONFIG_XPS
   2817	netif_reset_xps_queues_gt(sb_dev, 0);
   2818#endif
   2819	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
   2820	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
   2821
   2822	while (txq-- != &dev->_tx[0]) {
   2823		if (txq->sb_dev == sb_dev)
   2824			txq->sb_dev = NULL;
   2825	}
   2826}
   2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
   2828
   2829int netdev_bind_sb_channel_queue(struct net_device *dev,
   2830				 struct net_device *sb_dev,
   2831				 u8 tc, u16 count, u16 offset)
   2832{
   2833	/* Make certain the sb_dev and dev are already configured */
   2834	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
   2835		return -EINVAL;
   2836
   2837	/* We cannot hand out queues we don't have */
   2838	if ((offset + count) > dev->real_num_tx_queues)
   2839		return -EINVAL;
   2840
   2841	/* Record the mapping */
   2842	sb_dev->tc_to_txq[tc].count = count;
   2843	sb_dev->tc_to_txq[tc].offset = offset;
   2844
   2845	/* Provide a way for Tx queue to find the tc_to_txq map or
   2846	 * XPS map for itself.
   2847	 */
   2848	while (count--)
   2849		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
   2850
   2851	return 0;
   2852}
   2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
   2854
/* Mark @dev as subordinate channel @channel by storing the negated channel
 * number in dev->num_tc.
 *
 * Returns 0 on success, -ENODEV when @dev is a multiqueue device and
 * -EINVAL when @channel is larger than S16_MAX.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	/* a negative num_tc is what identifies a subordinate device */
	dev->num_tc = -channel;

	return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
   2874
   2875/*
   2876 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
   2877 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
   2878 */
   2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
   2880{
   2881	bool disabling;
   2882	int rc;
   2883
   2884	disabling = txq < dev->real_num_tx_queues;
   2885
   2886	if (txq < 1 || txq > dev->num_tx_queues)
   2887		return -EINVAL;
   2888
   2889	if (dev->reg_state == NETREG_REGISTERED ||
   2890	    dev->reg_state == NETREG_UNREGISTERING) {
   2891		ASSERT_RTNL();
   2892
   2893		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
   2894						  txq);
   2895		if (rc)
   2896			return rc;
   2897
   2898		if (dev->num_tc)
   2899			netif_setup_tc(dev, txq);
   2900
   2901		dev_qdisc_change_real_num_tx(dev, txq);
   2902
   2903		dev->real_num_tx_queues = txq;
   2904
   2905		if (disabling) {
   2906			synchronize_net();
   2907			qdisc_reset_all_tx_gt(dev, txq);
   2908#ifdef CONFIG_XPS
   2909			netif_reset_xps_queues_gt(dev, txq);
   2910#endif
   2911		}
   2912	} else {
   2913		dev->real_num_tx_queues = txq;
   2914	}
   2915
   2916	return 0;
   2917}
   2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
   2919
   2920#ifdef CONFIG_SYSFS
   2921/**
   2922 *	netif_set_real_num_rx_queues - set actual number of RX queues used
   2923 *	@dev: Network device
   2924 *	@rxq: Actual number of RX queues
   2925 *
   2926 *	This must be called either with the rtnl_lock held or before
   2927 *	registration of the net device.  Returns 0 on success, or a
   2928 *	negative error code.  If called before registration, it always
   2929 *	succeeds.
   2930 */
   2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
   2932{
   2933	int rc;
   2934
   2935	if (rxq < 1 || rxq > dev->num_rx_queues)
   2936		return -EINVAL;
   2937
   2938	if (dev->reg_state == NETREG_REGISTERED) {
   2939		ASSERT_RTNL();
   2940
   2941		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
   2942						  rxq);
   2943		if (rc)
   2944			return rc;
   2945	}
   2946
   2947	dev->real_num_rx_queues = rxq;
   2948	return 0;
   2949}
   2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
   2951#endif
   2952
   2953/**
   2954 *	netif_set_real_num_queues - set actual number of RX and TX queues used
   2955 *	@dev: Network device
   2956 *	@txq: Actual number of TX queues
   2957 *	@rxq: Actual number of RX queues
   2958 *
   2959 *	Set the real number of both TX and RX queues.
   2960 *	Does nothing if the number of queues is already correct.
   2961 */
   2962int netif_set_real_num_queues(struct net_device *dev,
   2963			      unsigned int txq, unsigned int rxq)
   2964{
   2965	unsigned int old_rxq = dev->real_num_rx_queues;
   2966	int err;
   2967
   2968	if (txq < 1 || txq > dev->num_tx_queues ||
   2969	    rxq < 1 || rxq > dev->num_rx_queues)
   2970		return -EINVAL;
   2971
   2972	/* Start from increases, so the error path only does decreases -
   2973	 * decreases can't fail.
   2974	 */
   2975	if (rxq > dev->real_num_rx_queues) {
   2976		err = netif_set_real_num_rx_queues(dev, rxq);
   2977		if (err)
   2978			return err;
   2979	}
   2980	if (txq > dev->real_num_tx_queues) {
   2981		err = netif_set_real_num_tx_queues(dev, txq);
   2982		if (err)
   2983			goto undo_rx;
   2984	}
   2985	if (rxq < dev->real_num_rx_queues)
   2986		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
   2987	if (txq < dev->real_num_tx_queues)
   2988		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
   2989
   2990	return 0;
   2991undo_rx:
   2992	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
   2993	return err;
   2994}
   2995EXPORT_SYMBOL(netif_set_real_num_queues);
   2996
   2997/**
   2998 * netif_set_tso_max_size() - set the max size of TSO frames supported
   2999 * @dev:	netdev to update
   3000 * @size:	max skb->len of a TSO frame
   3001 *
   3002 * Set the limit on the size of TSO super-frames the device can handle.
   3003 * Unless explicitly set the stack will assume the value of
   3004 * %GSO_LEGACY_MAX_SIZE.
   3005 */
   3006void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
   3007{
   3008	dev->tso_max_size = min(GSO_MAX_SIZE, size);
   3009	if (size < READ_ONCE(dev->gso_max_size))
   3010		netif_set_gso_max_size(dev, size);
   3011}
   3012EXPORT_SYMBOL(netif_set_tso_max_size);
   3013
   3014/**
   3015 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
   3016 * @dev:	netdev to update
   3017 * @segs:	max number of TCP segments
   3018 *
   3019 * Set the limit on the number of TCP segments the device can generate from
   3020 * a single TSO super-frame.
   3021 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
   3022 */
   3023void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
   3024{
   3025	dev->tso_max_segs = segs;
   3026	if (segs < READ_ONCE(dev->gso_max_segs))
   3027		netif_set_gso_max_segs(dev, segs);
   3028}
   3029EXPORT_SYMBOL(netif_set_tso_max_segs);
   3030
   3031/**
   3032 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
   3033 * @to:		netdev to update
   3034 * @from:	netdev from which to copy the limits
   3035 */
   3036void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
   3037{
   3038	netif_set_tso_max_size(to, from->tso_max_size);
   3039	netif_set_tso_max_segs(to, from->tso_max_segs);
   3040}
   3041EXPORT_SYMBOL(netif_inherit_tso_max);
   3042
   3043/**
   3044 * netif_get_num_default_rss_queues - default number of RSS queues
   3045 *
   3046 * Default value is the number of physical cores if there are only 1 or 2, or
   3047 * divided by 2 if there are more.
   3048 */
   3049int netif_get_num_default_rss_queues(void)
   3050{
   3051	cpumask_var_t cpus;
   3052	int cpu, count = 0;
   3053
   3054	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
   3055		return 1;
   3056
   3057	cpumask_copy(cpus, cpu_online_mask);
   3058	for_each_cpu(cpu, cpus) {
   3059		++count;
   3060		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
   3061	}
   3062	free_cpumask_var(cpus);
   3063
   3064	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
   3065}
   3066EXPORT_SYMBOL(netif_get_num_default_rss_queues);
   3067
   3068static void __netif_reschedule(struct Qdisc *q)
   3069{
   3070	struct softnet_data *sd;
   3071	unsigned long flags;
   3072
   3073	local_irq_save(flags);
   3074	sd = this_cpu_ptr(&softnet_data);
   3075	q->next_sched = NULL;
   3076	*sd->output_queue_tailp = q;
   3077	sd->output_queue_tailp = &q->next_sched;
   3078	raise_softirq_irqoff(NET_TX_SOFTIRQ);
   3079	local_irq_restore(flags);
   3080}
   3081
   3082void __netif_schedule(struct Qdisc *q)
   3083{
   3084	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
   3085		__netif_reschedule(q);
   3086}
   3087EXPORT_SYMBOL(__netif_schedule);
   3088
   3089struct dev_kfree_skb_cb {
   3090	enum skb_free_reason reason;
   3091};
   3092
   3093static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
   3094{
   3095	return (struct dev_kfree_skb_cb *)skb->cb;
   3096}
   3097
   3098void netif_schedule_queue(struct netdev_queue *txq)
   3099{
   3100	rcu_read_lock();
   3101	if (!netif_xmit_stopped(txq)) {
   3102		struct Qdisc *q = rcu_dereference(txq->qdisc);
   3103
   3104		__netif_schedule(q);
   3105	}
   3106	rcu_read_unlock();
   3107}
   3108EXPORT_SYMBOL(netif_schedule_queue);
   3109
   3110void netif_tx_wake_queue(struct netdev_queue *dev_queue)
   3111{
   3112	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
   3113		struct Qdisc *q;
   3114
   3115		rcu_read_lock();
   3116		q = rcu_dereference(dev_queue->qdisc);
   3117		__netif_schedule(q);
   3118		rcu_read_unlock();
   3119	}
   3120}
   3121EXPORT_SYMBOL(netif_tx_wake_queue);
   3122
   3123void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
   3124{
   3125	unsigned long flags;
   3126
   3127	if (unlikely(!skb))
   3128		return;
   3129
   3130	if (likely(refcount_read(&skb->users) == 1)) {
   3131		smp_rmb();
   3132		refcount_set(&skb->users, 0);
   3133	} else if (likely(!refcount_dec_and_test(&skb->users))) {
   3134		return;
   3135	}
   3136	get_kfree_skb_cb(skb)->reason = reason;
   3137	local_irq_save(flags);
   3138	skb->next = __this_cpu_read(softnet_data.completion_queue);
   3139	__this_cpu_write(softnet_data.completion_queue, skb);
   3140	raise_softirq_irqoff(NET_TX_SOFTIRQ);
   3141	local_irq_restore(flags);
   3142}
   3143EXPORT_SYMBOL(__dev_kfree_skb_irq);
   3144
   3145void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
   3146{
   3147	if (in_hardirq() || irqs_disabled())
   3148		__dev_kfree_skb_irq(skb, reason);
   3149	else
   3150		dev_kfree_skb(skb);
   3151}
   3152EXPORT_SYMBOL(__dev_kfree_skb_any);
   3153
   3154
   3155/**
   3156 * netif_device_detach - mark device as removed
   3157 * @dev: network device
   3158 *
   3159 * Mark device as removed from system and therefore no longer available.
   3160 */
   3161void netif_device_detach(struct net_device *dev)
   3162{
   3163	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
   3164	    netif_running(dev)) {
   3165		netif_tx_stop_all_queues(dev);
   3166	}
   3167}
   3168EXPORT_SYMBOL(netif_device_detach);
   3169
   3170/**
   3171 * netif_device_attach - mark device as attached
   3172 * @dev: network device
   3173 *
   3174 * Mark device as attached from system and restart if needed.
   3175 */
   3176void netif_device_attach(struct net_device *dev)
   3177{
   3178	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
   3179	    netif_running(dev)) {
   3180		netif_tx_wake_all_queues(dev);
   3181		__netdev_watchdog_up(dev);
   3182	}
   3183}
   3184EXPORT_SYMBOL(netif_device_attach);
   3185
   3186/*
   3187 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
   3188 * to be used as a distribution range.
   3189 */
   3190static u16 skb_tx_hash(const struct net_device *dev,
   3191		       const struct net_device *sb_dev,
   3192		       struct sk_buff *skb)
   3193{
   3194	u32 hash;
   3195	u16 qoffset = 0;
   3196	u16 qcount = dev->real_num_tx_queues;
   3197
   3198	if (dev->num_tc) {
   3199		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
   3200
   3201		qoffset = sb_dev->tc_to_txq[tc].offset;
   3202		qcount = sb_dev->tc_to_txq[tc].count;
   3203		if (unlikely(!qcount)) {
   3204			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
   3205					     sb_dev->name, qoffset, tc);
   3206			qoffset = 0;
   3207			qcount = dev->real_num_tx_queues;
   3208		}
   3209	}
   3210
   3211	if (skb_rx_queue_recorded(skb)) {
   3212		hash = skb_get_rx_queue(skb);
   3213		if (hash >= qoffset)
   3214			hash -= qoffset;
   3215		while (unlikely(hash >= qcount))
   3216			hash -= qcount;
   3217		return hash + qoffset;
   3218	}
   3219
   3220	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
   3221}
   3222
   3223static void skb_warn_bad_offload(const struct sk_buff *skb)
   3224{
   3225	static const netdev_features_t null_features;
   3226	struct net_device *dev = skb->dev;
   3227	const char *name = "";
   3228
   3229	if (!net_ratelimit())
   3230		return;
   3231
   3232	if (dev) {
   3233		if (dev->dev.parent)
   3234			name = dev_driver_string(dev->dev.parent);
   3235		else
   3236			name = netdev_name(dev);
   3237	}
   3238	skb_dump(KERN_WARNING, skb, false);
   3239	WARN(1, "%s: caps=(%pNF, %pNF)\n",
   3240	     name, dev ? &dev->features : &null_features,
   3241	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
   3242}
   3243
   3244/*
   3245 * Invalidate hardware checksum when packet is to be mangled, and
   3246 * complete checksum manually on outgoing path.
   3247 */
   3248int skb_checksum_help(struct sk_buff *skb)
   3249{
   3250	__wsum csum;
   3251	int ret = 0, offset;
   3252
   3253	if (skb->ip_summed == CHECKSUM_COMPLETE)
   3254		goto out_set_summed;
   3255
   3256	if (unlikely(skb_is_gso(skb))) {
   3257		skb_warn_bad_offload(skb);
   3258		return -EINVAL;
   3259	}
   3260
   3261	/* Before computing a checksum, we should make sure no frag could
   3262	 * be modified by an external entity : checksum could be wrong.
   3263	 */
   3264	if (skb_has_shared_frag(skb)) {
   3265		ret = __skb_linearize(skb);
   3266		if (ret)
   3267			goto out;
   3268	}
   3269
   3270	offset = skb_checksum_start_offset(skb);
   3271	ret = -EINVAL;
   3272	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
   3273		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
   3274		goto out;
   3275	}
   3276	csum = skb_checksum(skb, offset, skb->len - offset, 0);
   3277
   3278	offset += skb->csum_offset;
   3279	if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) {
   3280		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
   3281		goto out;
   3282	}
   3283	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
   3284	if (ret)
   3285		goto out;
   3286
   3287	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
   3288out_set_summed:
   3289	skb->ip_summed = CHECKSUM_NONE;
   3290out:
   3291	return ret;
   3292}
   3293EXPORT_SYMBOL(skb_checksum_help);
   3294
   3295int skb_crc32c_csum_help(struct sk_buff *skb)
   3296{
   3297	__le32 crc32c_csum;
   3298	int ret = 0, offset, start;
   3299
   3300	if (skb->ip_summed != CHECKSUM_PARTIAL)
   3301		goto out;
   3302
   3303	if (unlikely(skb_is_gso(skb)))
   3304		goto out;
   3305
   3306	/* Before computing a checksum, we should make sure no frag could
   3307	 * be modified by an external entity : checksum could be wrong.
   3308	 */
   3309	if (unlikely(skb_has_shared_frag(skb))) {
   3310		ret = __skb_linearize(skb);
   3311		if (ret)
   3312			goto out;
   3313	}
   3314	start = skb_checksum_start_offset(skb);
   3315	offset = start + offsetof(struct sctphdr, checksum);
   3316	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
   3317		ret = -EINVAL;
   3318		goto out;
   3319	}
   3320
   3321	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
   3322	if (ret)
   3323		goto out;
   3324
   3325	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
   3326						  skb->len - start, ~(__u32)0,
   3327						  crc32c_csum_stub));
   3328	*(__le32 *)(skb->data + offset) = crc32c_csum;
   3329	skb->ip_summed = CHECKSUM_NONE;
   3330	skb->csum_not_inet = 0;
   3331out:
   3332	return ret;
   3333}
   3334
   3335__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
   3336{
   3337	__be16 type = skb->protocol;
   3338
   3339	/* Tunnel gso handlers can set protocol to ethernet. */
   3340	if (type == htons(ETH_P_TEB)) {
   3341		struct ethhdr *eth;
   3342
   3343		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
   3344			return 0;
   3345
   3346		eth = (struct ethhdr *)skb->data;
   3347		type = eth->h_proto;
   3348	}
   3349
   3350	return __vlan_get_protocol(skb, type, depth);
   3351}
   3352
   3353/* openvswitch calls this on rx path, so we need a different check.
   3354 */
   3355static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
   3356{
   3357	if (tx_path)
   3358		return skb->ip_summed != CHECKSUM_PARTIAL &&
   3359		       skb->ip_summed != CHECKSUM_UNNECESSARY;
   3360
   3361	return skb->ip_summed == CHECKSUM_NONE;
   3362}
   3363
   3364/**
   3365 *	__skb_gso_segment - Perform segmentation on skb.
   3366 *	@skb: buffer to segment
   3367 *	@features: features for the output path (see dev->features)
   3368 *	@tx_path: whether it is called in TX path
   3369 *
   3370 *	This function segments the given skb and returns a list of segments.
   3371 *
   3372 *	It may return NULL if the skb requires no segmentation.  This is
   3373 *	only possible when GSO is used for verifying header integrity.
   3374 *
   3375 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
   3376 */
   3377struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
   3378				  netdev_features_t features, bool tx_path)
   3379{
   3380	struct sk_buff *segs;
   3381
   3382	if (unlikely(skb_needs_check(skb, tx_path))) {
   3383		int err;
   3384
   3385		/* We're going to init ->check field in TCP or UDP header */
   3386		err = skb_cow_head(skb, 0);
   3387		if (err < 0)
   3388			return ERR_PTR(err);
   3389	}
   3390
   3391	/* Only report GSO partial support if it will enable us to
   3392	 * support segmentation on this frame without needing additional
   3393	 * work.
   3394	 */
   3395	if (features & NETIF_F_GSO_PARTIAL) {
   3396		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
   3397		struct net_device *dev = skb->dev;
   3398
   3399		partial_features |= dev->features & dev->gso_partial_features;
   3400		if (!skb_gso_ok(skb, features | partial_features))
   3401			features &= ~NETIF_F_GSO_PARTIAL;
   3402	}
   3403
   3404	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
   3405		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
   3406
   3407	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
   3408	SKB_GSO_CB(skb)->encap_level = 0;
   3409
   3410	skb_reset_mac_header(skb);
   3411	skb_reset_mac_len(skb);
   3412
   3413	segs = skb_mac_gso_segment(skb, features);
   3414
   3415	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
   3416		skb_warn_bad_offload(skb);
   3417
   3418	return segs;
   3419}
   3420EXPORT_SYMBOL(__skb_gso_segment);
   3421
   3422/* Take action when hardware reception checksum errors are detected. */
   3423#ifdef CONFIG_BUG
   3424static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
   3425{
   3426	netdev_err(dev, "hw csum failure\n");
   3427	skb_dump(KERN_ERR, skb, true);
   3428	dump_stack();
   3429}
   3430
   3431void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
   3432{
   3433	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
   3434}
   3435EXPORT_SYMBOL(netdev_rx_csum_fault);
   3436#endif
   3437
   3438/* XXX: check that highmem exists at all on the given machine. */
   3439static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
   3440{
   3441#ifdef CONFIG_HIGHMEM
   3442	int i;
   3443
   3444	if (!(dev->features & NETIF_F_HIGHDMA)) {
   3445		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
   3446			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
   3447
   3448			if (PageHighMem(skb_frag_page(frag)))
   3449				return 1;
   3450		}
   3451	}
   3452#endif
   3453	return 0;
   3454}
   3455
   3456/* If MPLS offload request, verify we are testing hardware MPLS features
   3457 * instead of standard features for the netdev.
   3458 */
   3459#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
   3460static netdev_features_t net_mpls_features(struct sk_buff *skb,
   3461					   netdev_features_t features,
   3462					   __be16 type)
   3463{
   3464	if (eth_p_mpls(type))
   3465		features &= skb->dev->mpls_features;
   3466
   3467	return features;
   3468}
   3469#else
   3470static netdev_features_t net_mpls_features(struct sk_buff *skb,
   3471					   netdev_features_t features,
   3472					   __be16 type)
   3473{
   3474	return features;
   3475}
   3476#endif
   3477
   3478static netdev_features_t harmonize_features(struct sk_buff *skb,
   3479	netdev_features_t features)
   3480{
   3481	__be16 type;
   3482
   3483	type = skb_network_protocol(skb, NULL);
   3484	features = net_mpls_features(skb, features, type);
   3485
   3486	if (skb->ip_summed != CHECKSUM_NONE &&
   3487	    !can_checksum_protocol(features, type)) {
   3488		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
   3489	}
   3490	if (illegal_highdma(skb->dev, skb))
   3491		features &= ~NETIF_F_SG;
   3492
   3493	return features;
   3494}
   3495
   3496netdev_features_t passthru_features_check(struct sk_buff *skb,
   3497					  struct net_device *dev,
   3498					  netdev_features_t features)
   3499{
   3500	return features;
   3501}
   3502EXPORT_SYMBOL(passthru_features_check);
   3503
   3504static netdev_features_t dflt_features_check(struct sk_buff *skb,
   3505					     struct net_device *dev,
   3506					     netdev_features_t features)
   3507{
   3508	return vlan_features_check(skb, features);
   3509}
   3510
   3511static netdev_features_t gso_features_check(const struct sk_buff *skb,
   3512					    struct net_device *dev,
   3513					    netdev_features_t features)
   3514{
   3515	u16 gso_segs = skb_shinfo(skb)->gso_segs;
   3516
   3517	if (gso_segs > READ_ONCE(dev->gso_max_segs))
   3518		return features & ~NETIF_F_GSO_MASK;
   3519
   3520	if (!skb_shinfo(skb)->gso_type) {
   3521		skb_warn_bad_offload(skb);
   3522		return features & ~NETIF_F_GSO_MASK;
   3523	}
   3524
   3525	/* Support for GSO partial features requires software
   3526	 * intervention before we can actually process the packets
   3527	 * so we need to strip support for any partial features now
   3528	 * and we can pull them back in after we have partially
   3529	 * segmented the frame.
   3530	 */
   3531	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
   3532		features &= ~dev->gso_partial_features;
   3533
   3534	/* Make sure to clear the IPv4 ID mangling feature if the
   3535	 * IPv4 header has the potential to be fragmented.
   3536	 */
   3537	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
   3538		struct iphdr *iph = skb->encapsulation ?
   3539				    inner_ip_hdr(skb) : ip_hdr(skb);
   3540
   3541		if (!(iph->frag_off & htons(IP_DF)))
   3542			features &= ~NETIF_F_TSO_MANGLEID;
   3543	}
   3544
   3545	return features;
   3546}
   3547
   3548netdev_features_t netif_skb_features(struct sk_buff *skb)
   3549{
   3550	struct net_device *dev = skb->dev;
   3551	netdev_features_t features = dev->features;
   3552
   3553	if (skb_is_gso(skb))
   3554		features = gso_features_check(skb, dev, features);
   3555
   3556	/* If encapsulation offload request, verify we are testing
   3557	 * hardware encapsulation features instead of standard
   3558	 * features for the netdev
   3559	 */
   3560	if (skb->encapsulation)
   3561		features &= dev->hw_enc_features;
   3562
   3563	if (skb_vlan_tagged(skb))
   3564		features = netdev_intersect_features(features,
   3565						     dev->vlan_features |
   3566						     NETIF_F_HW_VLAN_CTAG_TX |
   3567						     NETIF_F_HW_VLAN_STAG_TX);
   3568
   3569	if (dev->netdev_ops->ndo_features_check)
   3570		features &= dev->netdev_ops->ndo_features_check(skb, dev,
   3571								features);
   3572	else
   3573		features &= dflt_features_check(skb, dev, features);
   3574
   3575	return harmonize_features(skb, features);
   3576}
   3577EXPORT_SYMBOL(netif_skb_features);
   3578
   3579static int xmit_one(struct sk_buff *skb, struct net_device *dev,
   3580		    struct netdev_queue *txq, bool more)
   3581{
   3582	unsigned int len;
   3583	int rc;
   3584
   3585	if (dev_nit_active(dev))
   3586		dev_queue_xmit_nit(skb, dev);
   3587
   3588	len = skb->len;
   3589	trace_net_dev_start_xmit(skb, dev);
   3590	rc = netdev_start_xmit(skb, dev, txq, more);
   3591	trace_net_dev_xmit(skb, rc, dev, len);
   3592
   3593	return rc;
   3594}
   3595
   3596struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
   3597				    struct netdev_queue *txq, int *ret)
   3598{
   3599	struct sk_buff *skb = first;
   3600	int rc = NETDEV_TX_OK;
   3601
   3602	while (skb) {
   3603		struct sk_buff *next = skb->next;
   3604
   3605		skb_mark_not_on_list(skb);
   3606		rc = xmit_one(skb, dev, txq, next != NULL);
   3607		if (unlikely(!dev_xmit_complete(rc))) {
   3608			skb->next = next;
   3609			goto out;
   3610		}
   3611
   3612		skb = next;
   3613		if (netif_tx_queue_stopped(txq) && skb) {
   3614			rc = NETDEV_TX_BUSY;
   3615			break;
   3616		}
   3617	}
   3618
   3619out:
   3620	*ret = rc;
   3621	return skb;
   3622}
   3623
   3624static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
   3625					  netdev_features_t features)
   3626{
   3627	if (skb_vlan_tag_present(skb) &&
   3628	    !vlan_hw_offload_capable(features, skb->vlan_proto))
   3629		skb = __vlan_hwaccel_push_inside(skb);
   3630	return skb;
   3631}
   3632
   3633int skb_csum_hwoffload_help(struct sk_buff *skb,
   3634			    const netdev_features_t features)
   3635{
   3636	if (unlikely(skb_csum_is_sctp(skb)))
   3637		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
   3638			skb_crc32c_csum_help(skb);
   3639
   3640	if (features & NETIF_F_HW_CSUM)
   3641		return 0;
   3642
   3643	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
   3644		switch (skb->csum_offset) {
   3645		case offsetof(struct tcphdr, check):
   3646		case offsetof(struct udphdr, check):
   3647			return 0;
   3648		}
   3649	}
   3650
   3651	return skb_checksum_help(skb);
   3652}
   3653EXPORT_SYMBOL(skb_csum_hwoffload_help);
   3654
   3655static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
   3656{
   3657	netdev_features_t features;
   3658
   3659	features = netif_skb_features(skb);
   3660	skb = validate_xmit_vlan(skb, features);
   3661	if (unlikely(!skb))
   3662		goto out_null;
   3663
   3664	skb = sk_validate_xmit_skb(skb, dev);
   3665	if (unlikely(!skb))
   3666		goto out_null;
   3667
   3668	if (netif_needs_gso(skb, features)) {
   3669		struct sk_buff *segs;
   3670
   3671		segs = skb_gso_segment(skb, features);
   3672		if (IS_ERR(segs)) {
   3673			goto out_kfree_skb;
   3674		} else if (segs) {
   3675			consume_skb(skb);
   3676			skb = segs;
   3677		}
   3678	} else {
   3679		if (skb_needs_linearize(skb, features) &&
   3680		    __skb_linearize(skb))
   3681			goto out_kfree_skb;
   3682
   3683		/* If packet is not checksummed and device does not
   3684		 * support checksumming for this protocol, complete
   3685		 * checksumming here.
   3686		 */
   3687		if (skb->ip_summed == CHECKSUM_PARTIAL) {
   3688			if (skb->encapsulation)
   3689				skb_set_inner_transport_header(skb,
   3690							       skb_checksum_start_offset(skb));
   3691			else
   3692				skb_set_transport_header(skb,
   3693							 skb_checksum_start_offset(skb));
   3694			if (skb_csum_hwoffload_help(skb, features))
   3695				goto out_kfree_skb;
   3696		}
   3697	}
   3698
   3699	skb = validate_xmit_xfrm(skb, features, again);
   3700
   3701	return skb;
   3702
   3703out_kfree_skb:
   3704	kfree_skb(skb);
   3705out_null:
   3706	dev_core_stats_tx_dropped_inc(dev);
   3707	return NULL;
   3708}
   3709
   3710struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
   3711{
   3712	struct sk_buff *next, *head = NULL, *tail;
   3713
   3714	for (; skb != NULL; skb = next) {
   3715		next = skb->next;
   3716		skb_mark_not_on_list(skb);
   3717
   3718		/* in case skb wont be segmented, point to itself */
   3719		skb->prev = skb;
   3720
   3721		skb = validate_xmit_skb(skb, dev, again);
   3722		if (!skb)
   3723			continue;
   3724
   3725		if (!head)
   3726			head = skb;
   3727		else
   3728			tail->next = skb;
   3729		/* If skb was segmented, skb->prev points to
   3730		 * the last segment. If not, it still contains skb.
   3731		 */
   3732		tail = skb->prev;
   3733	}
   3734	return head;
   3735}
   3736EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
   3737
   3738static void qdisc_pkt_len_init(struct sk_buff *skb)
   3739{
   3740	const struct skb_shared_info *shinfo = skb_shinfo(skb);
   3741
   3742	qdisc_skb_cb(skb)->pkt_len = skb->len;
   3743
   3744	/* To get more precise estimation of bytes sent on wire,
   3745	 * we add to pkt_len the headers size of all segments
   3746	 */
   3747	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
   3748		unsigned int hdr_len;
   3749		u16 gso_segs = shinfo->gso_segs;
   3750
   3751		/* mac layer + network layer */
   3752		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
   3753
   3754		/* + transport layer */
   3755		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
   3756			const struct tcphdr *th;
   3757			struct tcphdr _tcphdr;
   3758
   3759			th = skb_header_pointer(skb, skb_transport_offset(skb),
   3760						sizeof(_tcphdr), &_tcphdr);
   3761			if (likely(th))
   3762				hdr_len += __tcp_hdrlen(th);
   3763		} else {
   3764			struct udphdr _udphdr;
   3765
   3766			if (skb_header_pointer(skb, skb_transport_offset(skb),
   3767					       sizeof(_udphdr), &_udphdr))
   3768				hdr_len += sizeof(struct udphdr);
   3769		}
   3770
   3771		if (shinfo->gso_type & SKB_GSO_DODGY)
   3772			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
   3773						shinfo->gso_size);
   3774
   3775		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
   3776	}
   3777}
   3778
   3779static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
   3780			     struct sk_buff **to_free,
   3781			     struct netdev_queue *txq)
   3782{
   3783	int rc;
   3784
   3785	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
   3786	if (rc == NET_XMIT_SUCCESS)
   3787		trace_qdisc_enqueue(q, txq, skb);
   3788	return rc;
   3789}
   3790
   3791static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
   3792				 struct net_device *dev,
   3793				 struct netdev_queue *txq)
   3794{
   3795	spinlock_t *root_lock = qdisc_lock(q);
   3796	struct sk_buff *to_free = NULL;
   3797	bool contended;
   3798	int rc;
   3799
   3800	qdisc_calculate_pkt_len(skb, q);
   3801
   3802	if (q->flags & TCQ_F_NOLOCK) {
   3803		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
   3804		    qdisc_run_begin(q)) {
   3805			/* Retest nolock_qdisc_is_empty() within the protection
   3806			 * of q->seqlock to protect from racing with requeuing.
   3807			 */
   3808			if (unlikely(!nolock_qdisc_is_empty(q))) {
   3809				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
   3810				__qdisc_run(q);
   3811				qdisc_run_end(q);
   3812
   3813				goto no_lock_out;
   3814			}
   3815
   3816			qdisc_bstats_cpu_update(q, skb);
   3817			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
   3818			    !nolock_qdisc_is_empty(q))
   3819				__qdisc_run(q);
   3820
   3821			qdisc_run_end(q);
   3822			return NET_XMIT_SUCCESS;
   3823		}
   3824
   3825		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
   3826		qdisc_run(q);
   3827
   3828no_lock_out:
   3829		if (unlikely(to_free))
   3830			kfree_skb_list_reason(to_free,
   3831					      SKB_DROP_REASON_QDISC_DROP);
   3832		return rc;
   3833	}
   3834
   3835	/*
   3836	 * Heuristic to force contended enqueues to serialize on a
   3837	 * separate lock before trying to get qdisc main lock.
   3838	 * This permits qdisc->running owner to get the lock more
   3839	 * often and dequeue packets faster.
   3840	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
   3841	 * and then other tasks will only enqueue packets. The packets will be
   3842	 * sent after the qdisc owner is scheduled again. To prevent this
   3843	 * scenario the task always serialize on the lock.
   3844	 */
   3845	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
   3846	if (unlikely(contended))
   3847		spin_lock(&q->busylock);
   3848
   3849	spin_lock(root_lock);
   3850	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
   3851		__qdisc_drop(skb, &to_free);
   3852		rc = NET_XMIT_DROP;
   3853	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
   3854		   qdisc_run_begin(q)) {
   3855		/*
   3856		 * This is a work-conserving queue; there are no old skbs
   3857		 * waiting to be sent out; and the qdisc is not running -
   3858		 * xmit the skb directly.
   3859		 */
   3860
   3861		qdisc_bstats_update(q, skb);
   3862
   3863		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
   3864			if (unlikely(contended)) {
   3865				spin_unlock(&q->busylock);
   3866				contended = false;
   3867			}
   3868			__qdisc_run(q);
   3869		}
   3870
   3871		qdisc_run_end(q);
   3872		rc = NET_XMIT_SUCCESS;
   3873	} else {
   3874		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
   3875		if (qdisc_run_begin(q)) {
   3876			if (unlikely(contended)) {
   3877				spin_unlock(&q->busylock);
   3878				contended = false;
   3879			}
   3880			__qdisc_run(q);
   3881			qdisc_run_end(q);
   3882		}
   3883	}
   3884	spin_unlock(root_lock);
   3885	if (unlikely(to_free))
   3886		kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
   3887	if (unlikely(contended))
   3888		spin_unlock(&q->busylock);
   3889	return rc;
   3890}
   3891
   3892#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
   3893static void skb_update_prio(struct sk_buff *skb)
   3894{
   3895	const struct netprio_map *map;
   3896	const struct sock *sk;
   3897	unsigned int prioidx;
   3898
   3899	if (skb->priority)
   3900		return;
   3901	map = rcu_dereference_bh(skb->dev->priomap);
   3902	if (!map)
   3903		return;
   3904	sk = skb_to_full_sk(skb);
   3905	if (!sk)
   3906		return;
   3907
   3908	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
   3909
   3910	if (prioidx < map->priomap_len)
   3911		skb->priority = map->priomap[prioidx];
   3912}
   3913#else
   3914#define skb_update_prio(skb)
   3915#endif
   3916
   3917/**
   3918 *	dev_loopback_xmit - loop back @skb
   3919 *	@net: network namespace this loopback is happening in
   3920 *	@sk:  sk needed to be a netfilter okfn
   3921 *	@skb: buffer to transmit
   3922 */
   3923int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
   3924{
   3925	skb_reset_mac_header(skb);
   3926	__skb_pull(skb, skb_network_offset(skb));
   3927	skb->pkt_type = PACKET_LOOPBACK;
   3928	if (skb->ip_summed == CHECKSUM_NONE)
   3929		skb->ip_summed = CHECKSUM_UNNECESSARY;
   3930	WARN_ON(!skb_dst(skb));
   3931	skb_dst_force(skb);
   3932	netif_rx(skb);
   3933	return 0;
   3934}
   3935EXPORT_SYMBOL(dev_loopback_xmit);
   3936
   3937#ifdef CONFIG_NET_EGRESS
   3938static struct sk_buff *
   3939sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
   3940{
   3941#ifdef CONFIG_NET_CLS_ACT
   3942	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
   3943	struct tcf_result cl_res;
   3944
   3945	if (!miniq)
   3946		return skb;
   3947
   3948	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
   3949	tc_skb_cb(skb)->mru = 0;
   3950	tc_skb_cb(skb)->post_ct = false;
   3951	mini_qdisc_bstats_cpu_update(miniq, skb);
   3952
   3953	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
   3954	case TC_ACT_OK:
   3955	case TC_ACT_RECLASSIFY:
   3956		skb->tc_index = TC_H_MIN(cl_res.classid);
   3957		break;
   3958	case TC_ACT_SHOT:
   3959		mini_qdisc_qstats_cpu_drop(miniq);
   3960		*ret = NET_XMIT_DROP;
   3961		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
   3962		return NULL;
   3963	case TC_ACT_STOLEN:
   3964	case TC_ACT_QUEUED:
   3965	case TC_ACT_TRAP:
   3966		*ret = NET_XMIT_SUCCESS;
   3967		consume_skb(skb);
   3968		return NULL;
   3969	case TC_ACT_REDIRECT:
   3970		/* No need to push/pop skb's mac_header here on egress! */
   3971		skb_do_redirect(skb);
   3972		*ret = NET_XMIT_SUCCESS;
   3973		return NULL;
   3974	default:
   3975		break;
   3976	}
   3977#endif /* CONFIG_NET_CLS_ACT */
   3978
   3979	return skb;
   3980}
   3981
   3982static struct netdev_queue *
   3983netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
   3984{
   3985	int qm = skb_get_queue_mapping(skb);
   3986
   3987	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
   3988}
   3989
   3990static bool netdev_xmit_txqueue_skipped(void)
   3991{
   3992	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
   3993}
   3994
/*
 * Store @skip in this CPU's softnet_data.xmit.skip_txqueue flag.
 * __dev_queue_xmit() clears the flag before running the egress hook and
 * checks it afterwards through netdev_xmit_txqueue_skipped().
 */
void netdev_xmit_skip_txqueue(bool skip)
{
	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
   4000#endif /* CONFIG_NET_EGRESS */
   4001
   4002#ifdef CONFIG_XPS
   4003static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
   4004			       struct xps_dev_maps *dev_maps, unsigned int tci)
   4005{
   4006	int tc = netdev_get_prio_tc_map(dev, skb->priority);
   4007	struct xps_map *map;
   4008	int queue_index = -1;
   4009
   4010	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
   4011		return queue_index;
   4012
   4013	tci *= dev_maps->num_tc;
   4014	tci += tc;
   4015
   4016	map = rcu_dereference(dev_maps->attr_map[tci]);
   4017	if (map) {
   4018		if (map->len == 1)
   4019			queue_index = map->queues[0];
   4020		else
   4021			queue_index = map->queues[reciprocal_scale(
   4022						skb_get_hash(skb), map->len)];
   4023		if (unlikely(queue_index >= dev->real_num_tx_queues))
   4024			queue_index = -1;
   4025	}
   4026	return queue_index;
   4027}
   4028#endif
   4029
   4030static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
   4031			 struct sk_buff *skb)
   4032{
   4033#ifdef CONFIG_XPS
   4034	struct xps_dev_maps *dev_maps;
   4035	struct sock *sk = skb->sk;
   4036	int queue_index = -1;
   4037
   4038	if (!static_key_false(&xps_needed))
   4039		return -1;
   4040
   4041	rcu_read_lock();
   4042	if (!static_key_false(&xps_rxqs_needed))
   4043		goto get_cpus_map;
   4044
   4045	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
   4046	if (dev_maps) {
   4047		int tci = sk_rx_queue_get(sk);
   4048
   4049		if (tci >= 0)
   4050			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
   4051							  tci);
   4052	}
   4053
   4054get_cpus_map:
   4055	if (queue_index < 0) {
   4056		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
   4057		if (dev_maps) {
   4058			unsigned int tci = skb->sender_cpu - 1;
   4059
   4060			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
   4061							  tci);
   4062		}
   4063	}
   4064	rcu_read_unlock();
   4065
   4066	return queue_index;
   4067#else
   4068	return -1;
   4069#endif
   4070}
   4071
/*
 * Queue selection helper that ignores all of its arguments and always
 * returns TX queue index 0.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
   4078
   4079u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
   4080		       struct net_device *sb_dev)
   4081{
   4082	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
   4083}
   4084EXPORT_SYMBOL(dev_pick_tx_cpu_id);
   4085
   4086u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
   4087		     struct net_device *sb_dev)
   4088{
   4089	struct sock *sk = skb->sk;
   4090	int queue_index = sk_tx_queue_get(sk);
   4091
   4092	sb_dev = sb_dev ? : dev;
   4093
   4094	if (queue_index < 0 || skb->ooo_okay ||
   4095	    queue_index >= dev->real_num_tx_queues) {
   4096		int new_index = get_xps_queue(dev, sb_dev, skb);
   4097
   4098		if (new_index < 0)
   4099			new_index = skb_tx_hash(dev, sb_dev, skb);
   4100
   4101		if (queue_index != new_index && sk &&
   4102		    sk_fullsock(sk) &&
   4103		    rcu_access_pointer(sk->sk_dst_cache))
   4104			sk_tx_queue_set(sk, new_index);
   4105
   4106		queue_index = new_index;
   4107	}
   4108
   4109	return queue_index;
   4110}
   4111EXPORT_SYMBOL(netdev_pick_tx);
   4112
   4113struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
   4114					 struct sk_buff *skb,
   4115					 struct net_device *sb_dev)
   4116{
   4117	int queue_index = 0;
   4118
   4119#ifdef CONFIG_XPS
   4120	u32 sender_cpu = skb->sender_cpu - 1;
   4121
   4122	if (sender_cpu >= (u32)NR_CPUS)
   4123		skb->sender_cpu = raw_smp_processor_id() + 1;
   4124#endif
   4125
   4126	if (dev->real_num_tx_queues != 1) {
   4127		const struct net_device_ops *ops = dev->netdev_ops;
   4128
   4129		if (ops->ndo_select_queue)
   4130			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
   4131		else
   4132			queue_index = netdev_pick_tx(dev, skb, sb_dev);
   4133
   4134		queue_index = netdev_cap_txqueue(dev, queue_index);
   4135	}
   4136
   4137	skb_set_queue_mapping(skb, queue_index);
   4138	return netdev_get_tx_queue(dev, queue_index);
   4139}
   4140
/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:	buffer to transmit
 * @sb_dev:	subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0				- buffer successfully transmitted
 * * positive qdisc return code	- NET_XMIT_DROP etc.
 * * negative errno		- other errors
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq = NULL;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
#endif
#ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		/* Either egress hook may consume the skb; rc then already
		 * holds the value to return.
		 */
		if (nf_hook_egress_active()) {
			skb = nf_hook_egress(skb, &rc, dev);
			if (!skb)
				goto out;
		}

		/* Clear the per-CPU flag so that it is only found set
		 * below if it was set while the egress hook ran.
		 */
		netdev_xmit_skip_txqueue(false);

		nf_skip_egress(skb, true);
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
		nf_skip_egress(skb, false);

		/* Honour the queue mapping already stored in the skb
		 * instead of running the queue selection below.
		 */
		if (netdev_xmit_txqueue_skipped())
			txq = netdev_tx_queue_mapping(dev, skb);
	}
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	if (!txq)
		txq = netdev_core_pick_tx(dev, skb, sb_dev);

	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue operation takes the packet from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible that they rely on the protection
	 * made by us here.
	 *
	 * Check this and take the lock. It is not prone to deadlocks.
	 * Or use the noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Other cpus might concurrently change txq->xmit_lock_owner
		 * to -1 or to their cpu id, but not to our id.
		 */
		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Reached when the device is down, the queue was stopped, the
	 * driver did not complete the transmit, or recursion was detected:
	 * count the drop and free whatever is left of the skb list.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	dev_core_stats_tx_dropped_inc(dev);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);
   4286
   4287int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
   4288{
   4289	struct net_device *dev = skb->dev;
   4290	struct sk_buff *orig_skb = skb;
   4291	struct netdev_queue *txq;
   4292	int ret = NETDEV_TX_BUSY;
   4293	bool again = false;
   4294
   4295	if (unlikely(!netif_running(dev) ||
   4296		     !netif_carrier_ok(dev)))
   4297		goto drop;
   4298
   4299	skb = validate_xmit_skb_list(skb, dev, &again);
   4300	if (skb != orig_skb)
   4301		goto drop;
   4302
   4303	skb_set_queue_mapping(skb, queue_id);
   4304	txq = skb_get_tx_queue(dev, skb);
   4305
   4306	local_bh_disable();
   4307
   4308	dev_xmit_recursion_inc();
   4309	HARD_TX_LOCK(dev, txq, smp_processor_id());
   4310	if (!netif_xmit_frozen_or_drv_stopped(txq))
   4311		ret = netdev_start_xmit(skb, dev, txq, false);
   4312	HARD_TX_UNLOCK(dev, txq);
   4313	dev_xmit_recursion_dec();
   4314
   4315	local_bh_enable();
   4316	return ret;
   4317drop:
   4318	dev_core_stats_tx_dropped_inc(dev);
   4319	kfree_skb_list(skb);
   4320	return NET_XMIT_DROP;
   4321}
   4322EXPORT_SYMBOL(__dev_direct_xmit);
   4323
   4324/*************************************************************************
   4325 *			Receiver routines
   4326 *************************************************************************/
   4327
   4328int netdev_max_backlog __read_mostly = 1000;
   4329EXPORT_SYMBOL(netdev_max_backlog);
   4330
   4331int netdev_tstamp_prequeue __read_mostly = 1;
   4332unsigned int sysctl_skb_defer_max __read_mostly = 64;
   4333int netdev_budget __read_mostly = 300;
   4334/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
   4335unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
   4336int weight_p __read_mostly = 64;           /* old backlog weight */
   4337int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
   4338int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
   4339int dev_rx_weight __read_mostly = 64;
   4340int dev_tx_weight __read_mostly = 64;
   4341
/*
 * Schedule @napi for polling.  Called with irq disabled.
 *
 * A NAPI instance in threaded mode that has a thread attached is
 * scheduled by waking that thread.  Every other instance is appended to
 * the poll list of @sd and NET_RX_SOFTIRQ is raised.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	struct task_struct *thread;

	lockdep_assert_irqs_disabled();

	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
		/* Paired with smp_mb__before_atomic() in
		 * napi_enable()/dev_set_threaded().
		 * Use READ_ONCE() to guarantee a complete
		 * read on napi->thread. Only call
		 * wake_up_process() when it's not NULL.
		 */
		thread = READ_ONCE(napi->thread);
		if (thread) {
			/* Avoid doing set_bit() if the thread is in
			 * INTERRUPTIBLE state, because napi_thread_wait()
			 * makes sure to proceed with napi polling
			 * if the thread is explicitly woken from here.
			 */
			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
			wake_up_process(thread);
			return;
		}
	}

	/* Not threaded, or no thread attached: use the softirq path. */
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
   4374
   4375#ifdef CONFIG_RPS
   4376
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask that get_rps_cpu() applies to a sock flow table entry to split
 * the CPU number (masked bits) from the hash bits (remaining bits).
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* NOTE(review): presumably enabled while RPS resp. RFS is configured on
 * some queue; the code flipping these keys is outside this section.
 */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
   4387
   4388static struct rps_dev_flow *
   4389set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
   4390	    struct rps_dev_flow *rflow, u16 next_cpu)
   4391{
   4392	if (next_cpu < nr_cpu_ids) {
   4393#ifdef CONFIG_RFS_ACCEL
   4394		struct netdev_rx_queue *rxqueue;
   4395		struct rps_dev_flow_table *flow_table;
   4396		struct rps_dev_flow *old_rflow;
   4397		u32 flow_id;
   4398		u16 rxq_index;
   4399		int rc;
   4400
   4401		/* Should we steer this flow to a different hardware queue? */
   4402		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
   4403		    !(dev->features & NETIF_F_NTUPLE))
   4404			goto out;
   4405		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
   4406		if (rxq_index == skb_get_rx_queue(skb))
   4407			goto out;
   4408
   4409		rxqueue = dev->_rx + rxq_index;
   4410		flow_table = rcu_dereference(rxqueue->rps_flow_table);
   4411		if (!flow_table)
   4412			goto out;
   4413		flow_id = skb_get_hash(skb) & flow_table->mask;
   4414		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
   4415							rxq_index, flow_id);
   4416		if (rc < 0)
   4417			goto out;
   4418		old_rflow = rflow;
   4419		rflow = &flow_table->flows[flow_id];
   4420		rflow->filter = rc;
   4421		if (old_rflow->filter == rflow->filter)
   4422			old_rflow->filter = RPS_NO_FILTER;
   4423	out:
   4424#endif
   4425		rflow->last_qtail =
   4426			per_cpu(softnet_data, next_cpu).input_queue_head;
   4427	}
   4428
   4429	rflow->cpu = next_cpu;
   4430	return rflow;
   4431}
   4432
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the CPU number to steer the packet to, or -1 when no steering
 * applies (no flow table and no map on the RX queue, a zero hash, an
 * out-of-range recorded RX queue, or no online candidate CPU).  When the
 * CPU comes from the flow tables, *@rflowp is set to the flow entry
 * used; otherwise *@rflowp is left untouched.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	/* Use the RX queue the packet was recorded on, else queue 0. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* The bits outside rps_cpu_mask must equal those of the hash,
		 * otherwise the entry belongs to another flow.
		 */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Fall back to the static RPS map: scale the hash onto its CPUs. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
   4532
   4533#ifdef CONFIG_RFS_ACCEL
   4534
   4535/**
   4536 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
   4537 * @dev: Device on which the filter was set
   4538 * @rxq_index: RX queue index
   4539 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
   4540 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
   4541 *
   4542 * Drivers that implement ndo_rx_flow_steer() should periodically call
   4543 * this function for each installed filter and remove the filters for
   4544 * which it returns %true.
   4545 */
   4546bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
   4547			 u32 flow_id, u16 filter_id)
   4548{
   4549	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
   4550	struct rps_dev_flow_table *flow_table;
   4551	struct rps_dev_flow *rflow;
   4552	bool expire = true;
   4553	unsigned int cpu;
   4554
   4555	rcu_read_lock();
   4556	flow_table = rcu_dereference(rxqueue->rps_flow_table);
   4557	if (flow_table && flow_id <= flow_table->mask) {
   4558		rflow = &flow_table->flows[flow_id];
   4559		cpu = READ_ONCE(rflow->cpu);
   4560		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
   4561		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
   4562			   rflow->last_qtail) <
   4563		     (int)(10 * flow_table->mask)))
   4564			expire = false;
   4565	}
   4566	rcu_read_unlock();
   4567	return expire;
   4568}
   4569EXPORT_SYMBOL(rps_may_expire_flow);
   4570
   4571#endif /* CONFIG_RFS_ACCEL */
   4572
/*
 * Called from hardirq (IPI) context.
 *
 * @data is the softnet_data whose backlog NAPI gets scheduled; the
 * received_rps counter of that structure is bumped afterwards.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
   4581
   4582#endif /* CONFIG_RPS */
   4583
/*
 * Called from hardirq (IPI) context.
 *
 * Raises NET_RX_SOFTIRQ and then clears defer_ipi_scheduled in the
 * softnet_data passed as @data.  The flag is cleared with release
 * semantics, i.e. after the softirq has been raised.
 */
static void trigger_rx_softirq(void *data)
{
	struct softnet_data *sd = data;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	smp_store_release(&sd->defer_ipi_scheduled, 0);
}
   4592
   4593/*
   4594 * Check if this softnet_data structure is another cpu one
   4595 * If yes, queue it to our IPI list and return 1
   4596 * If no, return 0
   4597 */
   4598static int napi_schedule_rps(struct softnet_data *sd)
   4599{
   4600	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
   4601
   4602#ifdef CONFIG_RPS
   4603	if (sd != mysd) {
   4604		sd->rps_ipi_next = mysd->rps_ipi_list;
   4605		mysd->rps_ipi_list = sd;
   4606
   4607		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
   4608		return 1;
   4609	}
   4610#endif /* CONFIG_RPS */
   4611	__napi_schedule_irqoff(&mysd->backlog);
   4612	return 0;
   4613}
   4614
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the bucket count of the flow limit tables;
 * the allocation code is outside this section - confirm.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
   4618
   4619static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
   4620{
   4621#ifdef CONFIG_NET_FLOW_LIMIT
   4622	struct sd_flow_limit *fl;
   4623	struct softnet_data *sd;
   4624	unsigned int old_flow, new_flow;
   4625
   4626	if (qlen < (netdev_max_backlog >> 1))
   4627		return false;
   4628
   4629	sd = this_cpu_ptr(&softnet_data);
   4630
   4631	rcu_read_lock();
   4632	fl = rcu_dereference(sd->flow_limit);
   4633	if (fl) {
   4634		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
   4635		old_flow = fl->history[fl->history_head];
   4636		fl->history[fl->history_head] = new_flow;
   4637
   4638		fl->history_head++;
   4639		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
   4640
   4641		if (likely(fl->buckets[old_flow]))
   4642			fl->buckets[old_flow]--;
   4643
   4644		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
   4645			fl->count++;
   4646			rcu_read_unlock();
   4647			return true;
   4648		}
   4649	}
   4650	rcu_read_unlock();
   4651#endif
   4652	return false;
   4653}
   4654
   4655/*
   4656 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
   4657 * queue (may be a remote CPU queue).
   4658 */
   4659static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
   4660			      unsigned int *qtail)
   4661{
   4662	enum skb_drop_reason reason;
   4663	struct softnet_data *sd;
   4664	unsigned long flags;
   4665	unsigned int qlen;
   4666
   4667	reason = SKB_DROP_REASON_NOT_SPECIFIED;
   4668	sd = &per_cpu(softnet_data, cpu);
   4669
   4670	rps_lock_irqsave(sd, &flags);
   4671	if (!netif_running(skb->dev))
   4672		goto drop;
   4673	qlen = skb_queue_len(&sd->input_pkt_queue);
   4674	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
   4675		if (qlen) {
   4676enqueue:
   4677			__skb_queue_tail(&sd->input_pkt_queue, skb);
   4678			input_queue_tail_incr_save(sd, qtail);
   4679			rps_unlock_irq_restore(sd, &flags);
   4680			return NET_RX_SUCCESS;
   4681		}
   4682
   4683		/* Schedule NAPI for backlog device
   4684		 * We can use non atomic operation since we own the queue lock
   4685		 */
   4686		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
   4687			napi_schedule_rps(sd);
   4688		goto enqueue;
   4689	}
   4690	reason = SKB_DROP_REASON_CPU_BACKLOG;
   4691
   4692drop:
   4693	sd->dropped++;
   4694	rps_unlock_irq_restore(sd, &flags);
   4695
   4696	dev_core_stats_rx_dropped_inc(skb->dev);
   4697	kfree_skb_reason(skb, reason);
   4698	return NET_RX_DROP;
   4699}
   4700
   4701static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
   4702{
   4703	struct net_device *dev = skb->dev;
   4704	struct netdev_rx_queue *rxqueue;
   4705
   4706	rxqueue = dev->_rx;
   4707
   4708	if (skb_rx_queue_recorded(skb)) {
   4709		u16 index = skb_get_rx_queue(skb);
   4710
   4711		if (unlikely(index >= dev->real_num_rx_queues)) {
   4712			WARN_ONCE(dev->real_num_rx_queues > 1,
   4713				  "%s received packet on queue %u, but number "
   4714				  "of RX queues is %u\n",
   4715				  dev->name, index, dev->real_num_rx_queues);
   4716
   4717			return rxqueue; /* Return first rxqueue */
   4718		}
   4719		rxqueue += index;
   4720	}
   4721	return rxqueue;
   4722}
   4723
/*
 * Run @xdp_prog on @skb in generic (skb based) XDP mode.
 *
 * @xdp is initialised here to describe the linear data of @skb starting
 * at the MAC header.  After the program ran, any head/tail adjustment it
 * made and any change to the Ethernet header are propagated back into
 * the skb.
 *
 * Returns the XDP action chosen by the program.  The skb is never freed
 * here; see the comment above the final switch.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
			     struct bpf_prog *xdp_prog)
{
	void *orig_data, *orig_data_end, *hard_start;
	struct netdev_rx_queue *rxqueue;
	bool orig_bcast, orig_host;
	u32 mac_len, frame_sz;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	u32 metalen, act;
	int off;

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	rxqueue = netif_get_rxqueue(skb);
	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
			 skb_headlen(skb) + mac_len, true);

	/* Snapshot what the program may change, for comparison afterwards. */
	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
						  skb->dev->dev_addr)) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		/* Re-run eth_type_trans() on the new header; pkt_type is
		 * reset to PACKET_HOST before it is called.
		 */
		__skb_push(skb, ETH_HLEN);
		skb->pkt_type = PACKET_HOST;
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
	 * before calling us again on redirect path. We do not call do_redirect
	 * as we leave that up to the caller.
	 *
	 * Caller is responsible for managing lifetime of skb (i.e. calling
	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
	 */
	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		/* Record the size of any metadata placed before the data. */
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	}

	return act;
}
   4811
/*
 * Prepare @skb for generic XDP, run @xdp_prog on it and act on the
 * verdicts that end the packet's life.
 *
 * Returns the XDP action.  For XDP_PASS, XDP_TX and XDP_REDIRECT the skb
 * is still owned by the caller.  For XDP_DROP, XDP_ABORTED and unknown
 * actions the skb has been freed here.  When making the skb linear or
 * giving it enough headroom fails, the skb is freed and XDP_DROP is
 * returned.  Redirected skbs are not run through the program at all and
 * yield XDP_PASS.
 */
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
				     struct xdp_buff *xdp,
				     struct bpf_prog *xdp_prog)
{
	u32 act = XDP_DROP;

	/* Reinjected packets coming from act_mirred or similar should
	 * not get XDP generic processing.
	 */
	if (skb_is_redirected(skb))
		return XDP_PASS;

	/* XDP packets must be linear and must have sufficient headroom
	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
	 * native XDP provides, thus we need to do it here as well.
	 */
	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
		int troom = skb->tail + skb->data_len - skb->end;

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */
		if (pskb_expand_head(skb,
				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
			goto do_drop;
		if (skb_linearize(skb))
			goto do_drop;
	}

	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
	case XDP_PASS:
		break;
	default:
		/* Unknown action: warn, then treat it like XDP_ABORTED. */
		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
	do_drop:
		/* Also the target of the preparation failures above, which
		 * arrive with act still set to XDP_DROP.
		 */
		kfree_skb(skb);
		break;
	}

	return act;
}
   4864
   4865/* When doing generic XDP we have to bypass the qdisc layer and the
   4866 * network taps in order to match in-driver-XDP behavior.
   4867 */
   4868void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
   4869{
   4870	struct net_device *dev = skb->dev;
   4871	struct netdev_queue *txq;
   4872	bool free_skb = true;
   4873	int cpu, rc;
   4874
   4875	txq = netdev_core_pick_tx(dev, skb, NULL);
   4876	cpu = smp_processor_id();
   4877	HARD_TX_LOCK(dev, txq, cpu);
   4878	if (!netif_xmit_stopped(txq)) {
   4879		rc = netdev_start_xmit(skb, dev, txq, 0);
   4880		if (dev_xmit_complete(rc))
   4881			free_skb = false;
   4882	}
   4883	HARD_TX_UNLOCK(dev, txq);
   4884	if (free_skb) {
   4885		trace_xdp_exception(dev, xdp_prog, XDP_TX);
   4886		kfree_skb(skb);
   4887	}
   4888}
   4889
/* Static key guarding the generic XDP hook in __netif_receive_skb_core().
 * generic_xdp_install() increments it when a program is attached to a device
 * that had none and decrements it when the program is removed.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
   4891
   4892int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
   4893{
   4894	if (xdp_prog) {
   4895		struct xdp_buff xdp;
   4896		u32 act;
   4897		int err;
   4898
   4899		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
   4900		if (act != XDP_PASS) {
   4901			switch (act) {
   4902			case XDP_REDIRECT:
   4903				err = xdp_do_generic_redirect(skb->dev, skb,
   4904							      &xdp, xdp_prog);
   4905				if (err)
   4906					goto out_redir;
   4907				break;
   4908			case XDP_TX:
   4909				generic_xdp_tx(skb, xdp_prog);
   4910				break;
   4911			}
   4912			return XDP_DROP;
   4913		}
   4914	}
   4915	return XDP_PASS;
   4916out_redir:
   4917	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
   4918	return XDP_DROP;
   4919}
   4920EXPORT_SYMBOL_GPL(do_xdp_generic);
   4921
   4922static int netif_rx_internal(struct sk_buff *skb)
   4923{
   4924	int ret;
   4925
   4926	net_timestamp_check(netdev_tstamp_prequeue, skb);
   4927
   4928	trace_netif_rx(skb);
   4929
   4930#ifdef CONFIG_RPS
   4931	if (static_branch_unlikely(&rps_needed)) {
   4932		struct rps_dev_flow voidflow, *rflow = &voidflow;
   4933		int cpu;
   4934
   4935		rcu_read_lock();
   4936
   4937		cpu = get_rps_cpu(skb->dev, skb, &rflow);
   4938		if (cpu < 0)
   4939			cpu = smp_processor_id();
   4940
   4941		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
   4942
   4943		rcu_read_unlock();
   4944	} else
   4945#endif
   4946	{
   4947		unsigned int qtail;
   4948
   4949		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
   4950	}
   4951	return ret;
   4952}
   4953
   4954/**
   4955 *	__netif_rx	-	Slightly optimized version of netif_rx
   4956 *	@skb: buffer to post
   4957 *
   4958 *	This behaves as netif_rx except that it does not disable bottom halves.
   4959 *	As a result this function may only be invoked from the interrupt context
   4960 *	(either hard or soft interrupt).
   4961 */
   4962int __netif_rx(struct sk_buff *skb)
   4963{
   4964	int ret;
   4965
   4966	lockdep_assert_once(hardirq_count() | softirq_count());
   4967
   4968	trace_netif_rx_entry(skb);
   4969	ret = netif_rx_internal(skb);
   4970	trace_netif_rx_exit(ret);
   4971	return ret;
   4972}
   4973EXPORT_SYMBOL(__netif_rx);
   4974
   4975/**
   4976 *	netif_rx	-	post buffer to the network code
   4977 *	@skb: buffer to post
   4978 *
   4979 *	This function receives a packet from a device driver and queues it for
   4980 *	the upper (protocol) levels to process via the backlog NAPI device. It
   4981 *	always succeeds. The buffer may be dropped during processing for
   4982 *	congestion control or by the protocol layers.
   4983 *	The network buffer is passed via the backlog NAPI device. Modern NIC
   4984 *	driver should use NAPI and GRO.
   4985 *	This function can used from interrupt and from process context. The
   4986 *	caller from process context must not disable interrupts before invoking
   4987 *	this function.
   4988 *
   4989 *	return values:
   4990 *	NET_RX_SUCCESS	(no congestion)
   4991 *	NET_RX_DROP     (packet was dropped)
   4992 *
   4993 */
   4994int netif_rx(struct sk_buff *skb)
   4995{
   4996	bool need_bh_off = !(hardirq_count() | softirq_count());
   4997	int ret;
   4998
   4999	if (need_bh_off)
   5000		local_bh_disable();
   5001	trace_netif_rx_entry(skb);
   5002	ret = netif_rx_internal(skb);
   5003	trace_netif_rx_exit(ret);
   5004	if (need_bh_off)
   5005		local_bh_enable();
   5006	return ret;
   5007}
   5008EXPORT_SYMBOL(netif_rx);
   5009
/* Transmit-side softirq work for the current CPU.
 *
 * Does three things in order:
 *  1. frees every skb parked on this CPU's completion_queue,
 *  2. runs every qdisc parked on this CPU's output_queue,
 *  3. calls xfrm_dev_backlog() for this CPU.
 *
 * @h is not used.
 * NOTE(review): presumably registered as the NET_TX softirq handler; the
 * registration is outside this part of the file.
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with IRQs off; the entries are then
		 * freed below with IRQs enabled again.
		 */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			/* skbs on this list are expected to have no users left */
			WARN_ON(refcount_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action,
						SKB_DROP_REASON_NOT_SPECIFIED);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list of scheduled qdiscs and point the tail
		 * pointer back at the (now empty) list head.
		 */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		rcu_read_lock();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;

			head = head->next_sched;

			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();

			/* Qdiscs without TCQ_F_NOLOCK are run under their
			 * qdisc lock; lockless ones are run without it.
			 */
			if (!(q->flags & TCQ_F_NOLOCK)) {
				root_lock = qdisc_lock(q);
				spin_lock(root_lock);
			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
						     &q->state))) {
				/* There is a synchronize_net() between
				 * STATE_DEACTIVATED flag being set and
				 * qdisc_reset()/some_qdisc_is_busy() in
				 * dev_deactivate(), so we can safely bail out
				 * early here to avoid data race between
				 * qdisc_deactivate() and some_qdisc_is_busy()
				 * for lockless qdisc.
				 */
				clear_bit(__QDISC_STATE_SCHED, &q->state);
				continue;
			}

			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
		}

		rcu_read_unlock();
	}

	xfrm_dev_backlog(sd);
}
   5091
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It only exists when both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled and
 * is exported GPL-only.
 * NOTE(review): who installs the pointer and who calls it is not visible
 * here - presumably the bridge sets it and LANE calls it; confirm.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
   5098
/* Run the ingress classifier (if any) of skb->dev on @skb.
 *
 * @pt_prev: pending packet_type not yet delivered to; if set it is delivered
 *           to (result stored in *@ret) and cleared before classification.
 * @another: set to true when a redirect returned -EAGAIN, i.e. the caller
 *           should run another round of processing on the returned skb.
 *
 * Returns @skb when processing should continue, or NULL when the skb was
 * dropped, consumed or redirected and must not be used by the caller.
 * Without CONFIG_NET_CLS_ACT this is a no-op that returns @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!miniq)
		return skb;

	/* Flush the pending tap/protocol delivery before classifying */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	/* Reset the per-packet classifier state and account the packet */
	qdisc_skb_cb(skb)->pkt_len = skb->len;
	tc_skb_cb(skb)->mru = 0;
	tc_skb_cb(skb)->post_ct = false;
	skb->tc_at_ingress = 1;
	mini_qdisc_bstats_cpu_update(miniq, skb);

	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		/* counted as a drop and freed with an ingress drop reason */
		mini_qdisc_qstats_cpu_drop(miniq);
		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		/* not a drop: release the skb via consume_skb() */
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		if (skb_do_redirect(skb) == -EAGAIN) {
			/* undo the push and ask the caller to go around again */
			__skb_pull(skb, skb->mac_len);
			*another = true;
			break;
		}
		return NULL;
	case TC_ACT_CONSUMED:
		/* the skb is not freed here */
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
   5160
   5161/**
   5162 *	netdev_is_rx_handler_busy - check if receive handler is registered
   5163 *	@dev: device to check
   5164 *
   5165 *	Check if a receive handler is already registered for a given device.
   5166 *	Return true if there one.
   5167 *
   5168 *	The caller must hold the rtnl_mutex.
   5169 */
   5170bool netdev_is_rx_handler_busy(struct net_device *dev)
   5171{
   5172	ASSERT_RTNL();
   5173	return dev && rtnl_dereference(dev->rx_handler);
   5174}
   5175EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
   5176
   5177/**
   5178 *	netdev_rx_handler_register - register receive handler
   5179 *	@dev: device to register a handler for
   5180 *	@rx_handler: receive handler to register
   5181 *	@rx_handler_data: data pointer that is used by rx handler
   5182 *
   5183 *	Register a receive handler for a device. This handler will then be
   5184 *	called from __netif_receive_skb. A negative errno code is returned
   5185 *	on a failure.
   5186 *
   5187 *	The caller must hold the rtnl_mutex.
   5188 *
   5189 *	For a general description of rx_handler, see enum rx_handler_result.
   5190 */
   5191int netdev_rx_handler_register(struct net_device *dev,
   5192			       rx_handler_func_t *rx_handler,
   5193			       void *rx_handler_data)
   5194{
   5195	if (netdev_is_rx_handler_busy(dev))
   5196		return -EBUSY;
   5197
   5198	if (dev->priv_flags & IFF_NO_RX_HANDLER)
   5199		return -EINVAL;
   5200
   5201	/* Note: rx_handler_data must be set before rx_handler */
   5202	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
   5203	rcu_assign_pointer(dev->rx_handler, rx_handler);
   5204
   5205	return 0;
   5206}
   5207EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
   5208
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device. The handler pointer is
 *	cleared first, then synchronize_net() is called, and only afterwards
 *	is the handler data pointer cleared.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
   5230
   5231/*
   5232 * Limit the use of PFMEMALLOC reserves to those protocols that implement
   5233 * the special handling of PFMEMALLOC skbs.
   5234 */
   5235static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
   5236{
   5237	switch (skb->protocol) {
   5238	case htons(ETH_P_ARP):
   5239	case htons(ETH_P_IP):
   5240	case htons(ETH_P_IPV6):
   5241	case htons(ETH_P_8021Q):
   5242	case htons(ETH_P_8021AD):
   5243		return true;
   5244	default:
   5245		return false;
   5246	}
   5247}
   5248
   5249static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
   5250			     int *ret, struct net_device *orig_dev)
   5251{
   5252	if (nf_hook_ingress_active(skb)) {
   5253		int ingress_retval;
   5254
   5255		if (*pt_prev) {
   5256			*ret = deliver_skb(skb, *pt_prev, orig_dev);
   5257			*pt_prev = NULL;
   5258		}
   5259
   5260		rcu_read_lock();
   5261		ingress_retval = nf_hook_ingress(skb);
   5262		rcu_read_unlock();
   5263		return ingress_retval;
   5264	}
   5265	return 0;
   5266}
   5267
/* Core of the receive path for a single skb.
 *
 * @pskb: in/out pointer to the buffer. It is written back on exit because
 *        helpers called here take the skb by reference or return a new
 *        pointer (VLAN untag, ingress hooks, rx_handler), and the skb may
 *        end up NULL.
 * @pfmemalloc: when true the packet taps are skipped and the packet is
 *        dropped unless skb_pfmemalloc_protocol() accepts its protocol.
 * @ppt_prev: set to the last matching packet_type whose handler has NOT
 *        been invoked yet, so that the caller can invoke it. It is left
 *        untouched when there is nothing left to deliver.
 *
 * Delivery uses a "one behind" scheme: each matching handler is remembered
 * in pt_prev and only invoked once the next match (or a processing stage
 * that needs the skb flushed) comes along.
 *
 * Returns NET_RX_DROP, NET_RX_SUCCESS, or the result of the last
 * deliver_skb() call made here.
 */
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct sk_buff *skb = *pskb;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* remember the device the packet arrived on; skb->dev may change */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	/* Re-entered whenever a stage below asks for the skb to be processed
	 * again (ingress redirect, VLAN device found, RX_HANDLER_ANOTHER).
	 */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (static_branch_unlikely(&generic_xdp_needed_key)) {
		int ret2;

		migrate_disable();
		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
		migrate_enable();

		/* anything but XDP_PASS means the skb is no longer ours */
		if (ret2 != XDP_PASS) {
			ret = NET_RX_DROP;
			goto out;
		}
	}

	if (eth_type_vlan(skb->protocol)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

	if (skb_skip_tc_classify(skb))
		goto skip_classify;

	if (pfmemalloc)
		goto skip_taps;

	/* global taps first, then taps bound to the current device */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_branch_unlikely(&ingress_needed_key)) {
		bool another = false;

		nf_skip_egress(skb, true);
		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
					 &another);
		if (another)
			goto another_round;
		if (!skb)
			goto out;

		nf_skip_egress(skb, false);
		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
	skb_reset_redirect(skb);
skip_classify:
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			break;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
		if (skb_vlan_tag_get_id(skb)) {
			/* Vlan id is non 0 and vlan_do_receive() above couldn't
			 * find vlan device.
			 */
			skb->pkt_type = PACKET_OTHERHOST;
		} else if (eth_type_vlan(skb->protocol)) {
			/* Outer header is 802.1P with vlan 0, inner header is
			 * 802.1Q or 802.1AD and vlan_do_receive() above could
			 * not find vlan dev for vlan id 0.
			 */
			__vlan_hwaccel_clear_tag(skb);
			skb = skb_vlan_untag(skb);
			if (unlikely(!skb))
				goto out;
			if (vlan_do_receive(&skb))
				/* After stripping off 802.1P header with vlan 0
				 * vlan dev is found for inner header.
				 */
				goto another_round;
			else if (unlikely(!skb))
				goto out;
			else
				/* We have stripped outer 802.1P vlan 0 header.
				 * But could not find vlan dev.
				 * check again for vlan id to set OTHERHOST.
				 */
				goto check_vlan_id;
		}
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		__vlan_hwaccel_clear_tag(skb);
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	/* the device changed along the way: also try its specific handlers */
	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
			goto drop;
		/* hand the final, not yet invoked handler to the caller */
		*ppt_prev = pt_prev;
	} else {
drop:
		if (!deliver_exact)
			dev_core_stats_rx_dropped_inc(skb->dev);
		else
			dev_core_stats_rx_nohandler_inc(skb->dev);
		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	/* The invariant here is that if *ppt_prev is not NULL
	 * then skb should also be non-NULL.
	 *
	 * Apparently *ppt_prev assignment above holds this invariant due to
	 * skb dereferencing near it.
	 */
	*pskb = skb;
	return ret;
}
   5471
   5472static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
   5473{
   5474	struct net_device *orig_dev = skb->dev;
   5475	struct packet_type *pt_prev = NULL;
   5476	int ret;
   5477
   5478	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
   5479	if (pt_prev)
   5480		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
   5481					 skb->dev, pt_prev, orig_dev);
   5482	return ret;
   5483}
   5484
   5485/**
   5486 *	netif_receive_skb_core - special purpose version of netif_receive_skb
   5487 *	@skb: buffer to process
   5488 *
   5489 *	More direct receive version of netif_receive_skb().  It should
   5490 *	only be used by callers that have a need to skip RPS and Generic XDP.
   5491 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
   5492 *
   5493 *	This function may only be called from softirq context and interrupts
   5494 *	should be enabled.
   5495 *
   5496 *	Return values (usually ignored):
   5497 *	NET_RX_SUCCESS: no congestion
   5498 *	NET_RX_DROP: packet was dropped
   5499 */
   5500int netif_receive_skb_core(struct sk_buff *skb)
   5501{
   5502	int ret;
   5503
   5504	rcu_read_lock();
   5505	ret = __netif_receive_skb_one_core(skb, false);
   5506	rcu_read_unlock();
   5507
   5508	return ret;
   5509}
   5510EXPORT_SYMBOL(netif_receive_skb_core);
   5511
   5512static inline void __netif_receive_skb_list_ptype(struct list_head *head,
   5513						  struct packet_type *pt_prev,
   5514						  struct net_device *orig_dev)
   5515{
   5516	struct sk_buff *skb, *next;
   5517
   5518	if (!pt_prev)
   5519		return;
   5520	if (list_empty(head))
   5521		return;
   5522	if (pt_prev->list_func != NULL)
   5523		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
   5524				   ip_list_rcv, head, pt_prev, orig_dev);
   5525	else
   5526		list_for_each_entry_safe(skb, next, head, list) {
   5527			skb_list_del_init(skb);
   5528			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
   5529		}
   5530}
   5531
   5532static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
   5533{
   5534	/* Fast-path assumptions:
   5535	 * - There is no RX handler.
   5536	 * - Only one packet_type matches.
   5537	 * If either of these fails, we will end up doing some per-packet
   5538	 * processing in-line, then handling the 'last ptype' for the whole
   5539	 * sublist.  This can't cause out-of-order delivery to any single ptype,
   5540	 * because the 'last ptype' must be constant across the sublist, and all
   5541	 * other ptypes are handled per-packet.
   5542	 */
   5543	/* Current (common) ptype of sublist */
   5544	struct packet_type *pt_curr = NULL;
   5545	/* Current (common) orig_dev of sublist */
   5546	struct net_device *od_curr = NULL;
   5547	struct list_head sublist;
   5548	struct sk_buff *skb, *next;
   5549
   5550	INIT_LIST_HEAD(&sublist);
   5551	list_for_each_entry_safe(skb, next, head, list) {
   5552		struct net_device *orig_dev = skb->dev;
   5553		struct packet_type *pt_prev = NULL;
   5554
   5555		skb_list_del_init(skb);
   5556		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
   5557		if (!pt_prev)
   5558			continue;
   5559		if (pt_curr != pt_prev || od_curr != orig_dev) {
   5560			/* dispatch old sublist */
   5561			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
   5562			/* start new sublist */
   5563			INIT_LIST_HEAD(&sublist);
   5564			pt_curr = pt_prev;
   5565			od_curr = orig_dev;
   5566		}
   5567		list_add_tail(&skb->list, &sublist);
   5568	}
   5569
   5570	/* dispatch final sublist */
   5571	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
   5572}
   5573
   5574static int __netif_receive_skb(struct sk_buff *skb)
   5575{
   5576	int ret;
   5577
   5578	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
   5579		unsigned int noreclaim_flag;
   5580
   5581		/*
   5582		 * PFMEMALLOC skbs are special, they should
   5583		 * - be delivered to SOCK_MEMALLOC sockets only
   5584		 * - stay away from userspace
   5585		 * - have bounded memory usage
   5586		 *
   5587		 * Use PF_MEMALLOC as this saves us from propagating the allocation
   5588		 * context down to all allocation sites.
   5589		 */
   5590		noreclaim_flag = memalloc_noreclaim_save();
   5591		ret = __netif_receive_skb_one_core(skb, true);
   5592		memalloc_noreclaim_restore(noreclaim_flag);
   5593	} else
   5594		ret = __netif_receive_skb_one_core(skb, false);
   5595
   5596	return ret;
   5597}
   5598
   5599static void __netif_receive_skb_list(struct list_head *head)
   5600{
   5601	unsigned long noreclaim_flag = 0;
   5602	struct sk_buff *skb, *next;
   5603	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
   5604
   5605	list_for_each_entry_safe(skb, next, head, list) {
   5606		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
   5607			struct list_head sublist;
   5608
   5609			/* Handle the previous sublist */
   5610			list_cut_before(&sublist, head, &skb->list);
   5611			if (!list_empty(&sublist))
   5612				__netif_receive_skb_list_core(&sublist, pfmemalloc);
   5613			pfmemalloc = !pfmemalloc;
   5614			/* See comments in __netif_receive_skb */
   5615			if (pfmemalloc)
   5616				noreclaim_flag = memalloc_noreclaim_save();
   5617			else
   5618				memalloc_noreclaim_restore(noreclaim_flag);
   5619		}
   5620	}
   5621	/* Handle the remaining sublist */
   5622	if (!list_empty(head))
   5623		__netif_receive_skb_list_core(head, pfmemalloc);
   5624	/* Restore pflags */
   5625	if (pfmemalloc)
   5626		memalloc_noreclaim_restore(noreclaim_flag);
   5627}
   5628
   5629static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
   5630{
   5631	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
   5632	struct bpf_prog *new = xdp->prog;
   5633	int ret = 0;
   5634
   5635	switch (xdp->command) {
   5636	case XDP_SETUP_PROG:
   5637		rcu_assign_pointer(dev->xdp_prog, new);
   5638		if (old)
   5639			bpf_prog_put(old);
   5640
   5641		if (old && !new) {
   5642			static_branch_dec(&generic_xdp_needed_key);
   5643		} else if (new && !old) {
   5644			static_branch_inc(&generic_xdp_needed_key);
   5645			dev_disable_lro(dev);
   5646			dev_disable_gro_hw(dev);
   5647		}
   5648		break;
   5649
   5650	default:
   5651		ret = -EINVAL;
   5652		break;
   5653	}
   5654
   5655	return ret;
   5656}
   5657
   5658static int netif_receive_skb_internal(struct sk_buff *skb)
   5659{
   5660	int ret;
   5661
   5662	net_timestamp_check(netdev_tstamp_prequeue, skb);
   5663
   5664	if (skb_defer_rx_timestamp(skb))
   5665		return NET_RX_SUCCESS;
   5666
   5667	rcu_read_lock();
   5668#ifdef CONFIG_RPS
   5669	if (static_branch_unlikely(&rps_needed)) {
   5670		struct rps_dev_flow voidflow, *rflow = &voidflow;
   5671		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
   5672
   5673		if (cpu >= 0) {
   5674			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
   5675			rcu_read_unlock();
   5676			return ret;
   5677		}
   5678	}
   5679#endif
   5680	ret = __netif_receive_skb(skb);
   5681	rcu_read_unlock();
   5682	return ret;
   5683}
   5684
   5685void netif_receive_skb_list_internal(struct list_head *head)
   5686{
   5687	struct sk_buff *skb, *next;
   5688	struct list_head sublist;
   5689
   5690	INIT_LIST_HEAD(&sublist);
   5691	list_for_each_entry_safe(skb, next, head, list) {
   5692		net_timestamp_check(netdev_tstamp_prequeue, skb);
   5693		skb_list_del_init(skb);
   5694		if (!skb_defer_rx_timestamp(skb))
   5695			list_add_tail(&skb->list, &sublist);
   5696	}
   5697	list_splice_init(&sublist, head);
   5698
   5699	rcu_read_lock();
   5700#ifdef CONFIG_RPS
   5701	if (static_branch_unlikely(&rps_needed)) {
   5702		list_for_each_entry_safe(skb, next, head, list) {
   5703			struct rps_dev_flow voidflow, *rflow = &voidflow;
   5704			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
   5705
   5706			if (cpu >= 0) {
   5707				/* Will be handled, remove from list */
   5708				skb_list_del_init(skb);
   5709				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
   5710			}
   5711		}
   5712	}
   5713#endif
   5714	__netif_receive_skb_list(head);
   5715	rcu_read_unlock();
   5716}
   5717
   5718/**
   5719 *	netif_receive_skb - process receive buffer from network
   5720 *	@skb: buffer to process
   5721 *
   5722 *	netif_receive_skb() is the main receive data processing function.
   5723 *	It always succeeds. The buffer may be dropped during processing
   5724 *	for congestion control or by the protocol layers.
   5725 *
   5726 *	This function may only be called from softirq context and interrupts
   5727 *	should be enabled.
   5728 *
   5729 *	Return values (usually ignored):
   5730 *	NET_RX_SUCCESS: no congestion
   5731 *	NET_RX_DROP: packet was dropped
   5732 */
   5733int netif_receive_skb(struct sk_buff *skb)
   5734{
   5735	int ret;
   5736
   5737	trace_netif_receive_skb_entry(skb);
   5738
   5739	ret = netif_receive_skb_internal(skb);
   5740	trace_netif_receive_skb_exit(ret);
   5741
   5742	return ret;
   5743}
   5744EXPORT_SYMBOL(netif_receive_skb);
   5745
   5746/**
   5747 *	netif_receive_skb_list - process many receive buffers from network
   5748 *	@head: list of skbs to process.
   5749 *
   5750 *	Since return value of netif_receive_skb() is normally ignored, and
   5751 *	wouldn't be meaningful for a list, this function returns void.
   5752 *
   5753 *	This function may only be called from softirq context and interrupts
   5754 *	should be enabled.
   5755 */
   5756void netif_receive_skb_list(struct list_head *head)
   5757{
   5758	struct sk_buff *skb;
   5759
   5760	if (list_empty(head))
   5761		return;
   5762	if (trace_netif_receive_skb_list_entry_enabled()) {
   5763		list_for_each_entry(skb, head, list)
   5764			trace_netif_receive_skb_list_entry(skb);
   5765	}
   5766	netif_receive_skb_list_internal(head);
   5767	trace_netif_receive_skb_list_exit(0);
   5768}
   5769EXPORT_SYMBOL(netif_receive_skb_list);
   5770
/* One work item per CPU, queued and waited on by flush_all_backlogs().
 * NOTE(review): presumably each item runs flush_backlog(); the work
 * initialisation is not in this part of the file - confirm.
 */
static DEFINE_PER_CPU(struct work_struct, flush_works);
   5772
   5773/* Network device is going away, flush any packets still pending */
   5774static void flush_backlog(struct work_struct *work)
   5775{
   5776	struct sk_buff *skb, *tmp;
   5777	struct softnet_data *sd;
   5778
   5779	local_bh_disable();
   5780	sd = this_cpu_ptr(&softnet_data);
   5781
   5782	rps_lock_irq_disable(sd);
   5783	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
   5784		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
   5785			__skb_unlink(skb, &sd->input_pkt_queue);
   5786			dev_kfree_skb_irq(skb);
   5787			input_queue_head_incr(sd);
   5788		}
   5789	}
   5790	rps_unlock_irq_enable(sd);
   5791
   5792	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
   5793		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
   5794			__skb_unlink(skb, &sd->process_queue);
   5795			kfree_skb(skb);
   5796			input_queue_head_incr(sd);
   5797		}
   5798	}
   5799	local_bh_enable();
   5800}
   5801
   5802static bool flush_required(int cpu)
   5803{
   5804#if IS_ENABLED(CONFIG_RPS)
   5805	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
   5806	bool do_flush;
   5807
   5808	rps_lock_irq_disable(sd);
   5809
   5810	/* as insertion into process_queue happens with the rps lock held,
   5811	 * process_queue access may race only with dequeue
   5812	 */
   5813	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
   5814		   !skb_queue_empty_lockless(&sd->process_queue);
   5815	rps_unlock_irq_enable(sd);
   5816
   5817	return do_flush;
   5818#endif
   5819	/* without RPS we can't safely check input_pkt_queue: during a
   5820	 * concurrent remote skb_queue_splice() we can detect as empty both
   5821	 * input_pkt_queue and process_queue even if the latter could end-up
   5822	 * containing a lot of packets.
   5823	 */
   5824	return true;
   5825}
   5826
   5827static void flush_all_backlogs(void)
   5828{
   5829	static cpumask_t flush_cpus;
   5830	unsigned int cpu;
   5831
   5832	/* since we are under rtnl lock protection we can use static data
   5833	 * for the cpumask and avoid allocating on stack the possibly
   5834	 * large mask
   5835	 */
   5836	ASSERT_RTNL();
   5837
   5838	cpus_read_lock();
   5839
   5840	cpumask_clear(&flush_cpus);
   5841	for_each_online_cpu(cpu) {
   5842		if (flush_required(cpu)) {
   5843			queue_work_on(cpu, system_highpri_wq,
   5844				      per_cpu_ptr(&flush_works, cpu));
   5845			cpumask_set_cpu(cpu, &flush_cpus);
   5846		}
   5847	}
   5848
   5849	/* we can have in flight packet[s] on the cpus we are not flushing,
   5850	 * synchronize_net() in unregister_netdevice_many() will take care of
   5851	 * them
   5852	 */
   5853	for_each_cpu(cpu, &flush_cpus)
   5854		flush_work(per_cpu_ptr(&flush_works, cpu));
   5855
   5856	cpus_read_unlock();
   5857}
   5858
   5859static void net_rps_send_ipi(struct softnet_data *remsd)
   5860{
   5861#ifdef CONFIG_RPS
   5862	while (remsd) {
   5863		struct softnet_data *next = remsd->rps_ipi_next;
   5864
   5865		if (cpu_online(remsd->cpu))
   5866			smp_call_function_single_async(remsd->cpu, &remsd->csd);
   5867		remsd = next;
   5868	}
   5869#endif
   5870}
   5871
   5872/*
   5873 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
   5874 * Note: called with local irq disabled, but exits with local irq enabled.
   5875 */
   5876static void net_rps_action_and_irq_enable(struct softnet_data *sd)
   5877{
   5878#ifdef CONFIG_RPS
   5879	struct softnet_data *remsd = sd->rps_ipi_list;
   5880
   5881	if (remsd) {
   5882		sd->rps_ipi_list = NULL;
   5883
   5884		local_irq_enable();
   5885
   5886		/* Send pending IPI's to kick RPS processing on remote cpus. */
   5887		net_rps_send_ipi(remsd);
   5888	} else
   5889#endif
   5890		local_irq_enable();
   5891}
   5892
   5893static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
   5894{
   5895#ifdef CONFIG_RPS
   5896	return sd->rps_ipi_list != NULL;
   5897#else
   5898	return false;
   5899#endif
   5900}
   5901
   5902static int process_backlog(struct napi_struct *napi, int quota)
   5903{
   5904	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
   5905	bool again = true;
   5906	int work = 0;
   5907
   5908	/* Check if we have pending ipi, its better to send them now,
   5909	 * not waiting net_rx_action() end.
   5910	 */
   5911	if (sd_has_rps_ipi_waiting(sd)) {
   5912		local_irq_disable();
   5913		net_rps_action_and_irq_enable(sd);
   5914	}
   5915
   5916	napi->weight = dev_rx_weight;
   5917	while (again) {
   5918		struct sk_buff *skb;
   5919
   5920		while ((skb = __skb_dequeue(&sd->process_queue))) {
   5921			rcu_read_lock();
   5922			__netif_receive_skb(skb);
   5923			rcu_read_unlock();
   5924			input_queue_head_incr(sd);
   5925			if (++work >= quota)
   5926				return work;
   5927
   5928		}
   5929
   5930		rps_lock_irq_disable(sd);
   5931		if (skb_queue_empty(&sd->input_pkt_queue)) {
   5932			/*
   5933			 * Inline a custom version of __napi_complete().
   5934			 * only current cpu owns and manipulates this napi,
   5935			 * and NAPI_STATE_SCHED is the only possible flag set
   5936			 * on backlog.
   5937			 * We can use a plain write instead of clear_bit(),
   5938			 * and we dont need an smp_mb() memory barrier.
   5939			 */
   5940			napi->state = 0;
   5941			again = false;
   5942		} else {
   5943			skb_queue_splice_tail_init(&sd->input_pkt_queue,
   5944						   &sd->process_queue);
   5945		}
   5946		rps_unlock_irq_enable(sd);
   5947	}
   5948
   5949	return work;
   5950}
   5951
   5952/**
   5953 * __napi_schedule - schedule for receive
   5954 * @n: entry to schedule
   5955 *
   5956 * The entry's receive function will be scheduled to run.
   5957 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
   5958 */
   5959void __napi_schedule(struct napi_struct *n)
   5960{
   5961	unsigned long flags;
   5962
   5963	local_irq_save(flags);
   5964	____napi_schedule(this_cpu_ptr(&softnet_data), n);
   5965	local_irq_restore(flags);
   5966}
   5967EXPORT_SYMBOL(__napi_schedule);
   5968
   5969/**
   5970 *	napi_schedule_prep - check if napi can be scheduled
   5971 *	@n: napi context
   5972 *
   5973 * Test if NAPI routine is already running, and if not mark
   5974 * it as running.  This is used as a condition variable to
   5975 * insure only one NAPI poll instance runs.  We also make
   5976 * sure there is no pending NAPI disable.
   5977 */
   5978bool napi_schedule_prep(struct napi_struct *n)
   5979{
   5980	unsigned long val, new;
   5981
   5982	do {
   5983		val = READ_ONCE(n->state);
   5984		if (unlikely(val & NAPIF_STATE_DISABLE))
   5985			return false;
   5986		new = val | NAPIF_STATE_SCHED;
   5987
   5988		/* Sets STATE_MISSED bit if STATE_SCHED was already set
   5989		 * This was suggested by Alexander Duyck, as compiler
   5990		 * emits better code than :
   5991		 * if (val & NAPIF_STATE_SCHED)
   5992		 *     new |= NAPIF_STATE_MISSED;
   5993		 */
   5994		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
   5995						   NAPIF_STATE_MISSED;
   5996	} while (cmpxchg(&n->state, val, new) != val);
   5997
   5998	return !(val & NAPIF_STATE_SCHED);
   5999}
   6000EXPORT_SYMBOL(napi_schedule_prep);
   6001
   6002/**
   6003 * __napi_schedule_irqoff - schedule for receive
   6004 * @n: entry to schedule
   6005 *
   6006 * Variant of __napi_schedule() assuming hard irqs are masked.
   6007 *
   6008 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
   6009 * because the interrupt disabled assumption might not be true
   6010 * due to force-threaded interrupts and spinlock substitution.
   6011 */
   6012void __napi_schedule_irqoff(struct napi_struct *n)
   6013{
   6014	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
   6015		____napi_schedule(this_cpu_ptr(&softnet_data), n);
   6016	else
   6017		__napi_schedule(n);
   6018}
   6019EXPORT_SYMBOL(__napi_schedule_irqoff);
   6020
   6021bool napi_complete_done(struct napi_struct *n, int work_done)
   6022{
   6023	unsigned long flags, val, new, timeout = 0;
   6024	bool ret = true;
   6025
   6026	/*
   6027	 * 1) Don't let napi dequeue from the cpu poll list
   6028	 *    just in case its running on a different cpu.
   6029	 * 2) If we are busy polling, do nothing here, we have
   6030	 *    the guarantee we will be called later.
   6031	 */
   6032	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
   6033				 NAPIF_STATE_IN_BUSY_POLL)))
   6034		return false;
   6035
   6036	if (work_done) {
   6037		if (n->gro_bitmask)
   6038			timeout = READ_ONCE(n->dev->gro_flush_timeout);
   6039		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
   6040	}
   6041	if (n->defer_hard_irqs_count > 0) {
   6042		n->defer_hard_irqs_count--;
   6043		timeout = READ_ONCE(n->dev->gro_flush_timeout);
   6044		if (timeout)
   6045			ret = false;
   6046	}
   6047	if (n->gro_bitmask) {
   6048		/* When the NAPI instance uses a timeout and keeps postponing
   6049		 * it, we need to bound somehow the time packets are kept in
   6050		 * the GRO layer
   6051		 */
   6052		napi_gro_flush(n, !!timeout);
   6053	}
   6054
   6055	gro_normal_list(n);
   6056
   6057	if (unlikely(!list_empty(&n->poll_list))) {
   6058		/* If n->poll_list is not empty, we need to mask irqs */
   6059		local_irq_save(flags);
   6060		list_del_init(&n->poll_list);
   6061		local_irq_restore(flags);
   6062	}
   6063
   6064	do {
   6065		val = READ_ONCE(n->state);
   6066
   6067		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
   6068
   6069		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
   6070			      NAPIF_STATE_SCHED_THREADED |
   6071			      NAPIF_STATE_PREFER_BUSY_POLL);
   6072
   6073		/* If STATE_MISSED was set, leave STATE_SCHED set,
   6074		 * because we will call napi->poll() one more time.
   6075		 * This C code was suggested by Alexander Duyck to help gcc.
   6076		 */
   6077		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
   6078						    NAPIF_STATE_SCHED;
   6079	} while (cmpxchg(&n->state, val, new) != val);
   6080
   6081	if (unlikely(val & NAPIF_STATE_MISSED)) {
   6082		__napi_schedule(n);
   6083		return false;
   6084	}
   6085
   6086	if (timeout)
   6087		hrtimer_start(&n->timer, ns_to_ktime(timeout),
   6088			      HRTIMER_MODE_REL_PINNED);
   6089	return ret;
   6090}
   6091EXPORT_SYMBOL(napi_complete_done);
   6092
   6093/* must be called under rcu_read_lock(), as we dont take a reference */
   6094static struct napi_struct *napi_by_id(unsigned int napi_id)
   6095{
   6096	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
   6097	struct napi_struct *napi;
   6098
   6099	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
   6100		if (napi->napi_id == napi_id)
   6101			return napi;
   6102
   6103	return NULL;
   6104}
   6105
   6106#if defined(CONFIG_NET_RX_BUSY_POLL)
   6107
   6108static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
   6109{
   6110	if (!skip_schedule) {
   6111		gro_normal_list(napi);
   6112		__napi_schedule(napi);
   6113		return;
   6114	}
   6115
   6116	if (napi->gro_bitmask) {
   6117		/* flush too old packets
   6118		 * If HZ < 1000, flush all packets.
   6119		 */
   6120		napi_gro_flush(napi, HZ >= 1000);
   6121	}
   6122
   6123	gro_normal_list(napi);
   6124	clear_bit(NAPI_STATE_SCHED, &napi->state);
   6125}
   6126
   6127static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
   6128			   u16 budget)
   6129{
   6130	bool skip_schedule = false;
   6131	unsigned long timeout;
   6132	int rc;
   6133
   6134	/* Busy polling means there is a high chance device driver hard irq
   6135	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
   6136	 * set in napi_schedule_prep().
   6137	 * Since we are about to call napi->poll() once more, we can safely
   6138	 * clear NAPI_STATE_MISSED.
   6139	 *
   6140	 * Note: x86 could use a single "lock and ..." instruction
   6141	 * to perform these two clear_bit()
   6142	 */
   6143	clear_bit(NAPI_STATE_MISSED, &napi->state);
   6144	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
   6145
   6146	local_bh_disable();
   6147
   6148	if (prefer_busy_poll) {
   6149		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
   6150		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
   6151		if (napi->defer_hard_irqs_count && timeout) {
   6152			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
   6153			skip_schedule = true;
   6154		}
   6155	}
   6156
   6157	/* All we really want here is to re-enable device interrupts.
   6158	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
   6159	 */
   6160	rc = napi->poll(napi, budget);
   6161	/* We can't gro_normal_list() here, because napi->poll() might have
   6162	 * rearmed the napi (napi_complete_done()) in which case it could
   6163	 * already be running on another CPU.
   6164	 */
   6165	trace_napi_poll(napi, rc, budget);
   6166	netpoll_poll_unlock(have_poll_lock);
   6167	if (rc == budget)
   6168		__busy_poll_stop(napi, skip_schedule);
   6169	local_bh_enable();
   6170}
   6171
   6172void napi_busy_loop(unsigned int napi_id,
   6173		    bool (*loop_end)(void *, unsigned long),
   6174		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
   6175{
   6176	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
   6177	int (*napi_poll)(struct napi_struct *napi, int budget);
   6178	void *have_poll_lock = NULL;
   6179	struct napi_struct *napi;
   6180
   6181restart:
   6182	napi_poll = NULL;
   6183
   6184	rcu_read_lock();
   6185
   6186	napi = napi_by_id(napi_id);
   6187	if (!napi)
   6188		goto out;
   6189
   6190	preempt_disable();
   6191	for (;;) {
   6192		int work = 0;
   6193
   6194		local_bh_disable();
   6195		if (!napi_poll) {
   6196			unsigned long val = READ_ONCE(napi->state);
   6197
   6198			/* If multiple threads are competing for this napi,
   6199			 * we avoid dirtying napi->state as much as we can.
   6200			 */
   6201			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
   6202				   NAPIF_STATE_IN_BUSY_POLL)) {
   6203				if (prefer_busy_poll)
   6204					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
   6205				goto count;
   6206			}
   6207			if (cmpxchg(&napi->state, val,
   6208				    val | NAPIF_STATE_IN_BUSY_POLL |
   6209					  NAPIF_STATE_SCHED) != val) {
   6210				if (prefer_busy_poll)
   6211					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
   6212				goto count;
   6213			}
   6214			have_poll_lock = netpoll_poll_lock(napi);
   6215			napi_poll = napi->poll;
   6216		}
   6217		work = napi_poll(napi, budget);
   6218		trace_napi_poll(napi, work, budget);
   6219		gro_normal_list(napi);
   6220count:
   6221		if (work > 0)
   6222			__NET_ADD_STATS(dev_net(napi->dev),
   6223					LINUX_MIB_BUSYPOLLRXPACKETS, work);
   6224		local_bh_enable();
   6225
   6226		if (!loop_end || loop_end(loop_end_arg, start_time))
   6227			break;
   6228
   6229		if (unlikely(need_resched())) {
   6230			if (napi_poll)
   6231				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
   6232			preempt_enable();
   6233			rcu_read_unlock();
   6234			cond_resched();
   6235			if (loop_end(loop_end_arg, start_time))
   6236				return;
   6237			goto restart;
   6238		}
   6239		cpu_relax();
   6240	}
   6241	if (napi_poll)
   6242		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
   6243	preempt_enable();
   6244out:
   6245	rcu_read_unlock();
   6246}
   6247EXPORT_SYMBOL(napi_busy_loop);
   6248
   6249#endif /* CONFIG_NET_RX_BUSY_POLL */
   6250
   6251static void napi_hash_add(struct napi_struct *napi)
   6252{
   6253	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
   6254		return;
   6255
   6256	spin_lock(&napi_hash_lock);
   6257
   6258	/* 0..NR_CPUS range is reserved for sender_cpu use */
   6259	do {
   6260		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
   6261			napi_gen_id = MIN_NAPI_ID;
   6262	} while (napi_by_id(napi_gen_id));
   6263	napi->napi_id = napi_gen_id;
   6264
   6265	hlist_add_head_rcu(&napi->napi_hash_node,
   6266			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
   6267
   6268	spin_unlock(&napi_hash_lock);
   6269}
   6270
   6271/* Warning : caller is responsible to make sure rcu grace period
   6272 * is respected before freeing memory containing @napi
   6273 */
   6274static void napi_hash_del(struct napi_struct *napi)
   6275{
   6276	spin_lock(&napi_hash_lock);
   6277
   6278	hlist_del_init_rcu(&napi->napi_hash_node);
   6279
   6280	spin_unlock(&napi_hash_lock);
   6281}
   6282
   6283static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
   6284{
   6285	struct napi_struct *napi;
   6286
   6287	napi = container_of(timer, struct napi_struct, timer);
   6288
   6289	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
   6290	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
   6291	 */
   6292	if (!napi_disable_pending(napi) &&
   6293	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
   6294		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
   6295		__napi_schedule_irqoff(napi);
   6296	}
   6297
   6298	return HRTIMER_NORESTART;
   6299}
   6300
   6301static void init_gro_hash(struct napi_struct *napi)
   6302{
   6303	int i;
   6304
   6305	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
   6306		INIT_LIST_HEAD(&napi->gro_hash[i].list);
   6307		napi->gro_hash[i].count = 0;
   6308	}
   6309	napi->gro_bitmask = 0;
   6310}
   6311
   6312int dev_set_threaded(struct net_device *dev, bool threaded)
   6313{
   6314	struct napi_struct *napi;
   6315	int err = 0;
   6316
   6317	if (dev->threaded == threaded)
   6318		return 0;
   6319
   6320	if (threaded) {
   6321		list_for_each_entry(napi, &dev->napi_list, dev_list) {
   6322			if (!napi->thread) {
   6323				err = napi_kthread_create(napi);
   6324				if (err) {
   6325					threaded = false;
   6326					break;
   6327				}
   6328			}
   6329		}
   6330	}
   6331
   6332	dev->threaded = threaded;
   6333
   6334	/* Make sure kthread is created before THREADED bit
   6335	 * is set.
   6336	 */
   6337	smp_mb__before_atomic();
   6338
   6339	/* Setting/unsetting threaded mode on a napi might not immediately
   6340	 * take effect, if the current napi instance is actively being
   6341	 * polled. In this case, the switch between threaded mode and
   6342	 * softirq mode will happen in the next round of napi_schedule().
   6343	 * This should not cause hiccups/stalls to the live traffic.
   6344	 */
   6345	list_for_each_entry(napi, &dev->napi_list, dev_list) {
   6346		if (threaded)
   6347			set_bit(NAPI_STATE_THREADED, &napi->state);
   6348		else
   6349			clear_bit(NAPI_STATE_THREADED, &napi->state);
   6350	}
   6351
   6352	return err;
   6353}
   6354EXPORT_SYMBOL(dev_set_threaded);
   6355
   6356void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
   6357			   int (*poll)(struct napi_struct *, int), int weight)
   6358{
   6359	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
   6360		return;
   6361
   6362	INIT_LIST_HEAD(&napi->poll_list);
   6363	INIT_HLIST_NODE(&napi->napi_hash_node);
   6364	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
   6365	napi->timer.function = napi_watchdog;
   6366	init_gro_hash(napi);
   6367	napi->skb = NULL;
   6368	INIT_LIST_HEAD(&napi->rx_list);
   6369	napi->rx_count = 0;
   6370	napi->poll = poll;
   6371	if (weight > NAPI_POLL_WEIGHT)
   6372		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
   6373				weight);
   6374	napi->weight = weight;
   6375	napi->dev = dev;
   6376#ifdef CONFIG_NETPOLL
   6377	napi->poll_owner = -1;
   6378#endif
   6379	set_bit(NAPI_STATE_SCHED, &napi->state);
   6380	set_bit(NAPI_STATE_NPSVC, &napi->state);
   6381	list_add_rcu(&napi->dev_list, &dev->napi_list);
   6382	napi_hash_add(napi);
   6383	/* Create kthread for this napi if dev->threaded is set.
   6384	 * Clear dev->threaded if kthread creation failed so that
   6385	 * threaded mode will not be enabled in napi_enable().
   6386	 */
   6387	if (dev->threaded && napi_kthread_create(napi))
   6388		dev->threaded = 0;
   6389}
   6390EXPORT_SYMBOL(netif_napi_add_weight);
   6391
   6392void napi_disable(struct napi_struct *n)
   6393{
   6394	unsigned long val, new;
   6395
   6396	might_sleep();
   6397	set_bit(NAPI_STATE_DISABLE, &n->state);
   6398
   6399	for ( ; ; ) {
   6400		val = READ_ONCE(n->state);
   6401		if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
   6402			usleep_range(20, 200);
   6403			continue;
   6404		}
   6405
   6406		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
   6407		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
   6408
   6409		if (cmpxchg(&n->state, val, new) == val)
   6410			break;
   6411	}
   6412
   6413	hrtimer_cancel(&n->timer);
   6414
   6415	clear_bit(NAPI_STATE_DISABLE, &n->state);
   6416}
   6417EXPORT_SYMBOL(napi_disable);
   6418
   6419/**
   6420 *	napi_enable - enable NAPI scheduling
   6421 *	@n: NAPI context
   6422 *
   6423 * Resume NAPI from being scheduled on this context.
   6424 * Must be paired with napi_disable.
   6425 */
   6426void napi_enable(struct napi_struct *n)
   6427{
   6428	unsigned long val, new;
   6429
   6430	do {
   6431		val = READ_ONCE(n->state);
   6432		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
   6433
   6434		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
   6435		if (n->dev->threaded && n->thread)
   6436			new |= NAPIF_STATE_THREADED;
   6437	} while (cmpxchg(&n->state, val, new) != val);
   6438}
   6439EXPORT_SYMBOL(napi_enable);
   6440
   6441static void flush_gro_hash(struct napi_struct *napi)
   6442{
   6443	int i;
   6444
   6445	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
   6446		struct sk_buff *skb, *n;
   6447
   6448		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
   6449			kfree_skb(skb);
   6450		napi->gro_hash[i].count = 0;
   6451	}
   6452}
   6453
   6454/* Must be called in process context */
   6455void __netif_napi_del(struct napi_struct *napi)
   6456{
   6457	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
   6458		return;
   6459
   6460	napi_hash_del(napi);
   6461	list_del_rcu(&napi->dev_list);
   6462	napi_free_frags(napi);
   6463
   6464	flush_gro_hash(napi);
   6465	napi->gro_bitmask = 0;
   6466
   6467	if (napi->thread) {
   6468		kthread_stop(napi->thread);
   6469		napi->thread = NULL;
   6470	}
   6471}
   6472EXPORT_SYMBOL(__netif_napi_del);
   6473
   6474static int __napi_poll(struct napi_struct *n, bool *repoll)
   6475{
   6476	int work, weight;
   6477
   6478	weight = n->weight;
   6479
   6480	/* This NAPI_STATE_SCHED test is for avoiding a race
   6481	 * with netpoll's poll_napi().  Only the entity which
   6482	 * obtains the lock and sees NAPI_STATE_SCHED set will
   6483	 * actually make the ->poll() call.  Therefore we avoid
   6484	 * accidentally calling ->poll() when NAPI is not scheduled.
   6485	 */
   6486	work = 0;
   6487	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
   6488		work = n->poll(n, weight);
   6489		trace_napi_poll(n, work, weight);
   6490	}
   6491
   6492	if (unlikely(work > weight))
   6493		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
   6494				n->poll, work, weight);
   6495
   6496	if (likely(work < weight))
   6497		return work;
   6498
   6499	/* Drivers must not modify the NAPI state if they
   6500	 * consume the entire weight.  In such cases this code
   6501	 * still "owns" the NAPI instance and therefore can
   6502	 * move the instance around on the list at-will.
   6503	 */
   6504	if (unlikely(napi_disable_pending(n))) {
   6505		napi_complete(n);
   6506		return work;
   6507	}
   6508
   6509	/* The NAPI context has more processing work, but busy-polling
   6510	 * is preferred. Exit early.
   6511	 */
   6512	if (napi_prefer_busy_poll(n)) {
   6513		if (napi_complete_done(n, work)) {
   6514			/* If timeout is not set, we need to make sure
   6515			 * that the NAPI is re-scheduled.
   6516			 */
   6517			napi_schedule(n);
   6518		}
   6519		return work;
   6520	}
   6521
   6522	if (n->gro_bitmask) {
   6523		/* flush too old packets
   6524		 * If HZ < 1000, flush all packets.
   6525		 */
   6526		napi_gro_flush(n, HZ >= 1000);
   6527	}
   6528
   6529	gro_normal_list(n);
   6530
   6531	/* Some drivers may have called napi_schedule
   6532	 * prior to exhausting their budget.
   6533	 */
   6534	if (unlikely(!list_empty(&n->poll_list))) {
   6535		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
   6536			     n->dev ? n->dev->name : "backlog");
   6537		return work;
   6538	}
   6539
   6540	*repoll = true;
   6541
   6542	return work;
   6543}
   6544
   6545static int napi_poll(struct napi_struct *n, struct list_head *repoll)
   6546{
   6547	bool do_repoll = false;
   6548	void *have;
   6549	int work;
   6550
   6551	list_del_init(&n->poll_list);
   6552
   6553	have = netpoll_poll_lock(n);
   6554
   6555	work = __napi_poll(n, &do_repoll);
   6556
   6557	if (do_repoll)
   6558		list_add_tail(&n->poll_list, repoll);
   6559
   6560	netpoll_poll_unlock(have);
   6561
   6562	return work;
   6563}
   6564
   6565static int napi_thread_wait(struct napi_struct *napi)
   6566{
   6567	bool woken = false;
   6568
   6569	set_current_state(TASK_INTERRUPTIBLE);
   6570
   6571	while (!kthread_should_stop()) {
   6572		/* Testing SCHED_THREADED bit here to make sure the current
   6573		 * kthread owns this napi and could poll on this napi.
   6574		 * Testing SCHED bit is not enough because SCHED bit might be
   6575		 * set by some other busy poll thread or by napi_disable().
   6576		 */
   6577		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
   6578			WARN_ON(!list_empty(&napi->poll_list));
   6579			__set_current_state(TASK_RUNNING);
   6580			return 0;
   6581		}
   6582
   6583		schedule();
   6584		/* woken being true indicates this thread owns this napi. */
   6585		woken = true;
   6586		set_current_state(TASK_INTERRUPTIBLE);
   6587	}
   6588	__set_current_state(TASK_RUNNING);
   6589
   6590	return -1;
   6591}
   6592
   6593static int napi_threaded_poll(void *data)
   6594{
   6595	struct napi_struct *napi = data;
   6596	void *have;
   6597
   6598	while (!napi_thread_wait(napi)) {
   6599		for (;;) {
   6600			bool repoll = false;
   6601
   6602			local_bh_disable();
   6603
   6604			have = netpoll_poll_lock(napi);
   6605			__napi_poll(napi, &repoll);
   6606			netpoll_poll_unlock(have);
   6607
   6608			local_bh_enable();
   6609
   6610			if (!repoll)
   6611				break;
   6612
   6613			cond_resched();
   6614		}
   6615	}
   6616	return 0;
   6617}
   6618
   6619static void skb_defer_free_flush(struct softnet_data *sd)
   6620{
   6621	struct sk_buff *skb, *next;
   6622	unsigned long flags;
   6623
   6624	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
   6625	if (!READ_ONCE(sd->defer_list))
   6626		return;
   6627
   6628	spin_lock_irqsave(&sd->defer_lock, flags);
   6629	skb = sd->defer_list;
   6630	sd->defer_list = NULL;
   6631	sd->defer_count = 0;
   6632	spin_unlock_irqrestore(&sd->defer_lock, flags);
   6633
   6634	while (skb != NULL) {
   6635		next = skb->next;
   6636		napi_consume_skb(skb, 1);
   6637		skb = next;
   6638	}
   6639}
   6640
   6641static __latent_entropy void net_rx_action(struct softirq_action *h)
   6642{
   6643	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
   6644	unsigned long time_limit = jiffies +
   6645		usecs_to_jiffies(netdev_budget_usecs);
   6646	int budget = netdev_budget;
   6647	LIST_HEAD(list);
   6648	LIST_HEAD(repoll);
   6649
   6650	local_irq_disable();
   6651	list_splice_init(&sd->poll_list, &list);
   6652	local_irq_enable();
   6653
   6654	for (;;) {
   6655		struct napi_struct *n;
   6656
   6657		skb_defer_free_flush(sd);
   6658
   6659		if (list_empty(&list)) {
   6660			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
   6661				goto end;
   6662			break;
   6663		}
   6664
   6665		n = list_first_entry(&list, struct napi_struct, poll_list);
   6666		budget -= napi_poll(n, &repoll);
   6667
   6668		/* If softirq window is exhausted then punt.
   6669		 * Allow this to run for 2 jiffies since which will allow
   6670		 * an average latency of 1.5/HZ.
   6671		 */
   6672		if (unlikely(budget <= 0 ||
   6673			     time_after_eq(jiffies, time_limit))) {
   6674			sd->time_squeeze++;
   6675			break;
   6676		}
   6677	}
   6678
   6679	local_irq_disable();
   6680
   6681	list_splice_tail_init(&sd->poll_list, &list);
   6682	list_splice_tail(&repoll, &list);
   6683	list_splice(&list, &sd->poll_list);
   6684	if (!list_empty(&sd->poll_list))
   6685		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
   6686
   6687	net_rps_action_and_irq_enable(sd);
   6688end:;
   6689}
   6690
   6691struct netdev_adjacent {
   6692	struct net_device *dev;
   6693	netdevice_tracker dev_tracker;
   6694
   6695	/* upper master flag, there can only be one master device per list */
   6696	bool master;
   6697
   6698	/* lookup ignore flag */
   6699	bool ignore;
   6700
   6701	/* counter for the number of times this device was added to us */
   6702	u16 ref_nr;
   6703
   6704	/* private field for the users */
   6705	void *private;
   6706
   6707	struct list_head list;
   6708	struct rcu_head rcu;
   6709};
   6710
   6711static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
   6712						 struct list_head *adj_list)
   6713{
   6714	struct netdev_adjacent *adj;
   6715
   6716	list_for_each_entry(adj, adj_list, list) {
   6717		if (adj->dev == adj_dev)
   6718			return adj;
   6719	}
   6720	return NULL;
   6721}
   6722
   6723static int ____netdev_has_upper_dev(struct net_device *upper_dev,
   6724				    struct netdev_nested_priv *priv)
   6725{
   6726	struct net_device *dev = (struct net_device *)priv->data;
   6727
   6728	return upper_dev == dev;
   6729}
   6730
   6731/**
   6732 * netdev_has_upper_dev - Check if device is linked to an upper device
   6733 * @dev: device
   6734 * @upper_dev: upper device to check
   6735 *
   6736 * Find out if a device is linked to specified upper device and return true
   6737 * in case it is. Note that this checks only immediate upper device,
   6738 * not through a complete stack of devices. The caller must hold the RTNL lock.
   6739 */
   6740bool netdev_has_upper_dev(struct net_device *dev,
   6741			  struct net_device *upper_dev)
   6742{
   6743	struct netdev_nested_priv priv = {
   6744		.data = (void *)upper_dev,
   6745	};
   6746
   6747	ASSERT_RTNL();
   6748
   6749	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
   6750					     &priv);
   6751}
   6752EXPORT_SYMBOL(netdev_has_upper_dev);
   6753
   6754/**
   6755 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
   6756 * @dev: device
   6757 * @upper_dev: upper device to check
   6758 *
   6759 * Find out if a device is linked to specified upper device and return true
   6760 * in case it is. Note that this checks the entire upper device chain.
   6761 * The caller must hold rcu lock.
   6762 */
   6763
   6764bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
   6765				  struct net_device *upper_dev)
   6766{
   6767	struct netdev_nested_priv priv = {
   6768		.data = (void *)upper_dev,
   6769	};
   6770
   6771	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
   6772					       &priv);
   6773}
   6774EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
   6775
   6776/**
   6777 * netdev_has_any_upper_dev - Check if device is linked to some device
   6778 * @dev: device
   6779 *
   6780 * Find out if a device is linked to an upper device and return true in case
   6781 * it is. The caller must hold the RTNL lock.
   6782 */
   6783bool netdev_has_any_upper_dev(struct net_device *dev)
   6784{
   6785	ASSERT_RTNL();
   6786
   6787	return !list_empty(&dev->adj_list.upper);
   6788}
   6789EXPORT_SYMBOL(netdev_has_any_upper_dev);
   6790
   6791/**
   6792 * netdev_master_upper_dev_get - Get master upper device
   6793 * @dev: device
   6794 *
   6795 * Find a master upper device and return pointer to it or NULL in case
   6796 * it's not there. The caller must hold the RTNL lock.
   6797 */
   6798struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
   6799{
   6800	struct netdev_adjacent *upper;
   6801
   6802	ASSERT_RTNL();
   6803
   6804	if (list_empty(&dev->adj_list.upper))
   6805		return NULL;
   6806
   6807	upper = list_first_entry(&dev->adj_list.upper,
   6808				 struct netdev_adjacent, list);
   6809	if (likely(upper->master))
   6810		return upper->dev;
   6811	return NULL;
   6812}
   6813EXPORT_SYMBOL(netdev_master_upper_dev_get);
   6814
   6815static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
   6816{
   6817	struct netdev_adjacent *upper;
   6818
   6819	ASSERT_RTNL();
   6820
   6821	if (list_empty(&dev->adj_list.upper))
   6822		return NULL;
   6823
   6824	upper = list_first_entry(&dev->adj_list.upper,
   6825				 struct netdev_adjacent, list);
   6826	if (likely(upper->master) && !upper->ignore)
   6827		return upper->dev;
   6828	return NULL;
   6829}
   6830
   6831/**
   6832 * netdev_has_any_lower_dev - Check if device is linked to some device
   6833 * @dev: device
   6834 *
   6835 * Find out if a device is linked to a lower device and return true in case
   6836 * it is. The caller must hold the RTNL lock.
   6837 */
   6838static bool netdev_has_any_lower_dev(struct net_device *dev)
   6839{
   6840	ASSERT_RTNL();
   6841
   6842	return !list_empty(&dev->adj_list.lower);
   6843}
   6844
   6845void *netdev_adjacent_get_private(struct list_head *adj_list)
   6846{
   6847	struct netdev_adjacent *adj;
   6848
   6849	adj = list_entry(adj_list, struct netdev_adjacent, list);
   6850
   6851	return adj->private;
   6852}
   6853EXPORT_SYMBOL(netdev_adjacent_get_private);
   6854
   6855/**
   6856 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
   6857 * @dev: device
   6858 * @iter: list_head ** of the current position
   6859 *
   6860 * Gets the next device from the dev's upper list, starting from iter
   6861 * position. The caller must hold RCU read lock.
   6862 */
   6863struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
   6864						 struct list_head **iter)
   6865{
   6866	struct netdev_adjacent *upper;
   6867
   6868	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
   6869
   6870	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
   6871
   6872	if (&upper->list == &dev->adj_list.upper)
   6873		return NULL;
   6874
   6875	*iter = &upper->list;
   6876
   6877	return upper->dev;
   6878}
   6879EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
   6880
   6881static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
   6882						  struct list_head **iter,
   6883						  bool *ignore)
   6884{
   6885	struct netdev_adjacent *upper;
   6886
   6887	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
   6888
   6889	if (&upper->list == &dev->adj_list.upper)
   6890		return NULL;
   6891
   6892	*iter = &upper->list;
   6893	*ignore = upper->ignore;
   6894
   6895	return upper->dev;
   6896}
   6897
   6898static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
   6899						    struct list_head **iter)
   6900{
   6901	struct netdev_adjacent *upper;
   6902
   6903	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
   6904
   6905	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
   6906
   6907	if (&upper->list == &dev->adj_list.upper)
   6908		return NULL;
   6909
   6910	*iter = &upper->list;
   6911
   6912	return upper->dev;
   6913}
   6914
   6915static int __netdev_walk_all_upper_dev(struct net_device *dev,
   6916				       int (*fn)(struct net_device *dev,
   6917					 struct netdev_nested_priv *priv),
   6918				       struct netdev_nested_priv *priv)
   6919{
   6920	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
   6921	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
   6922	int ret, cur = 0;
   6923	bool ignore;
   6924
   6925	now = dev;
   6926	iter = &dev->adj_list.upper;
   6927
   6928	while (1) {
   6929		if (now != dev) {
   6930			ret = fn(now, priv);
   6931			if (ret)
   6932				return ret;
   6933		}
   6934
   6935		next = NULL;
   6936		while (1) {
   6937			udev = __netdev_next_upper_dev(now, &iter, &ignore);
   6938			if (!udev)
   6939				break;
   6940			if (ignore)
   6941				continue;
   6942
   6943			next = udev;
   6944			niter = &udev->adj_list.upper;
   6945			dev_stack[cur] = now;
   6946			iter_stack[cur++] = iter;
   6947			break;
   6948		}
   6949
   6950		if (!next) {
   6951			if (!cur)
   6952				return 0;
   6953			next = dev_stack[--cur];
   6954			niter = iter_stack[cur];
   6955		}
   6956
   6957		now = next;
   6958		iter = niter;
   6959	}
   6960
   6961	return 0;
   6962}
   6963
   6964int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
   6965				  int (*fn)(struct net_device *dev,
   6966					    struct netdev_nested_priv *priv),
   6967				  struct netdev_nested_priv *priv)
   6968{
   6969	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
   6970	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
   6971	int ret, cur = 0;
   6972
   6973	now = dev;
   6974	iter = &dev->adj_list.upper;
   6975
   6976	while (1) {
   6977		if (now != dev) {
   6978			ret = fn(now, priv);
   6979			if (ret)
   6980				return ret;
   6981		}
   6982
   6983		next = NULL;
   6984		while (1) {
   6985			udev = netdev_next_upper_dev_rcu(now, &iter);
   6986			if (!udev)
   6987				break;
   6988
   6989			next = udev;
   6990			niter = &udev->adj_list.upper;
   6991			dev_stack[cur] = now;
   6992			iter_stack[cur++] = iter;
   6993			break;
   6994		}
   6995
   6996		if (!next) {
   6997			if (!cur)
   6998				return 0;
   6999			next = dev_stack[--cur];
   7000			niter = iter_stack[cur];
   7001		}
   7002
   7003		now = next;
   7004		iter = niter;
   7005	}
   7006
   7007	return 0;
   7008}
   7009EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
   7010
   7011static bool __netdev_has_upper_dev(struct net_device *dev,
   7012				   struct net_device *upper_dev)
   7013{
   7014	struct netdev_nested_priv priv = {
   7015		.flags = 0,
   7016		.data = (void *)upper_dev,
   7017	};
   7018
   7019	ASSERT_RTNL();
   7020
   7021	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
   7022					   &priv);
   7023}
   7024
   7025/**
   7026 * netdev_lower_get_next_private - Get the next ->private from the
   7027 *				   lower neighbour list
   7028 * @dev: device
   7029 * @iter: list_head ** of the current position
   7030 *
   7031 * Gets the next netdev_adjacent->private from the dev's lower neighbour
   7032 * list, starting from iter position. The caller must hold either hold the
   7033 * RTNL lock or its own locking that guarantees that the neighbour lower
   7034 * list will remain unchanged.
   7035 */
   7036void *netdev_lower_get_next_private(struct net_device *dev,
   7037				    struct list_head **iter)
   7038{
   7039	struct netdev_adjacent *lower;
   7040
   7041	lower = list_entry(*iter, struct netdev_adjacent, list);
   7042
   7043	if (&lower->list == &dev->adj_list.lower)
   7044		return NULL;
   7045
   7046	*iter = lower->list.next;
   7047
   7048	return lower->private;
   7049}
   7050EXPORT_SYMBOL(netdev_lower_get_next_private);
   7051
   7052/**
   7053 * netdev_lower_get_next_private_rcu - Get the next ->private from the
   7054 *				       lower neighbour list, RCU
   7055 *				       variant
   7056 * @dev: device
   7057 * @iter: list_head ** of the current position
   7058 *
   7059 * Gets the next netdev_adjacent->private from the dev's lower neighbour
   7060 * list, starting from iter position. The caller must hold RCU read lock.
   7061 */
   7062void *netdev_lower_get_next_private_rcu(struct net_device *dev,
   7063					struct list_head **iter)
   7064{
   7065	struct netdev_adjacent *lower;
   7066
   7067	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
   7068
   7069	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
   7070
   7071	if (&lower->list == &dev->adj_list.lower)
   7072		return NULL;
   7073
   7074	*iter = &lower->list;
   7075
   7076	return lower->private;
   7077}
   7078EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
   7079
   7080/**
   7081 * netdev_lower_get_next - Get the next device from the lower neighbour
   7082 *                         list
   7083 * @dev: device
   7084 * @iter: list_head ** of the current position
   7085 *
   7086 * Gets the next netdev_adjacent from the dev's lower neighbour
   7087 * list, starting from iter position. The caller must hold RTNL lock or
   7088 * its own locking that guarantees that the neighbour lower
   7089 * list will remain unchanged.
   7090 */
   7091void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
   7092{
   7093	struct netdev_adjacent *lower;
   7094
   7095	lower = list_entry(*iter, struct netdev_adjacent, list);
   7096
   7097	if (&lower->list == &dev->adj_list.lower)
   7098		return NULL;
   7099
   7100	*iter = lower->list.next;
   7101
   7102	return lower->dev;
   7103}
   7104EXPORT_SYMBOL(netdev_lower_get_next);
   7105
   7106static struct net_device *netdev_next_lower_dev(struct net_device *dev,
   7107						struct list_head **iter)
   7108{
   7109	struct netdev_adjacent *lower;
   7110
   7111	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
   7112
   7113	if (&lower->list == &dev->adj_list.lower)
   7114		return NULL;
   7115
   7116	*iter = &lower->list;
   7117
   7118	return lower->dev;
   7119}
   7120
   7121static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
   7122						  struct list_head **iter,
   7123						  bool *ignore)
   7124{
   7125	struct netdev_adjacent *lower;
   7126
   7127	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
   7128
   7129	if (&lower->list == &dev->adj_list.lower)
   7130		return NULL;
   7131
   7132	*iter = &lower->list;
   7133	*ignore = lower->ignore;
   7134
   7135	return lower->dev;
   7136}
   7137
   7138int netdev_walk_all_lower_dev(struct net_device *dev,
   7139			      int (*fn)(struct net_device *dev,
   7140					struct netdev_nested_priv *priv),
   7141			      struct netdev_nested_priv *priv)
   7142{
   7143	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
   7144	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
   7145	int ret, cur = 0;
   7146
   7147	now = dev;
   7148	iter = &dev->adj_list.lower;
   7149
   7150	while (1) {
   7151		if (now != dev) {
   7152			ret = fn(now, priv);
   7153			if (ret)
   7154				return ret;
   7155		}
   7156
   7157		next = NULL;
   7158		while (1) {
   7159			ldev = netdev_next_lower_dev(now, &iter);
   7160			if (!ldev)
   7161				break;
   7162
   7163			next = ldev;
   7164			niter = &ldev->adj_list.lower;
   7165			dev_stack[cur] = now;
   7166			iter_stack[cur++] = iter;
   7167			break;
   7168		}
   7169
   7170		if (!next) {
   7171			if (!cur)
   7172				return 0;
   7173			next = dev_stack[--cur];
   7174			niter = iter_stack[cur];
   7175		}
   7176
   7177		now = next;
   7178		iter = niter;
   7179	}
   7180
   7181	return 0;
   7182}
   7183EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
   7184
   7185static int __netdev_walk_all_lower_dev(struct net_device *dev,
   7186				       int (*fn)(struct net_device *dev,
   7187					 struct netdev_nested_priv *priv),
   7188				       struct netdev_nested_priv *priv)
   7189{
   7190	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
   7191	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
   7192	int ret, cur = 0;
   7193	bool ignore;
   7194
   7195	now = dev;
   7196	iter = &dev->adj_list.lower;
   7197
   7198	while (1) {
   7199		if (now != dev) {
   7200			ret = fn(now, priv);
   7201			if (ret)
   7202				return ret;
   7203		}
   7204
   7205		next = NULL;
   7206		while (1) {
   7207			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
   7208			if (!ldev)
   7209				break;
   7210			if (ignore)
   7211				continue;
   7212
   7213			next = ldev;
   7214			niter = &ldev->adj_list.lower;
   7215			dev_stack[cur] = now;
   7216			iter_stack[cur++] = iter;
   7217			break;
   7218		}
   7219
   7220		if (!next) {
   7221			if (!cur)
   7222				return 0;
   7223			next = dev_stack[--cur];
   7224			niter = iter_stack[cur];
   7225		}
   7226
   7227		now = next;
   7228		iter = niter;
   7229	}
   7230
   7231	return 0;
   7232}
   7233
   7234struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
   7235					     struct list_head **iter)
   7236{
   7237	struct netdev_adjacent *lower;
   7238
   7239	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
   7240	if (&lower->list == &dev->adj_list.lower)
   7241		return NULL;
   7242
   7243	*iter = &lower->list;
   7244
   7245	return lower->dev;
   7246}
   7247EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
   7248
   7249static u8 __netdev_upper_depth(struct net_device *dev)
   7250{
   7251	struct net_device *udev;
   7252	struct list_head *iter;
   7253	u8 max_depth = 0;
   7254	bool ignore;
   7255
   7256	for (iter = &dev->adj_list.upper,
   7257	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
   7258	     udev;
   7259	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
   7260		if (ignore)
   7261			continue;
   7262		if (max_depth < udev->upper_level)
   7263			max_depth = udev->upper_level;
   7264	}
   7265
   7266	return max_depth;
   7267}
   7268
   7269static u8 __netdev_lower_depth(struct net_device *dev)
   7270{
   7271	struct net_device *ldev;
   7272	struct list_head *iter;
   7273	u8 max_depth = 0;
   7274	bool ignore;
   7275
   7276	for (iter = &dev->adj_list.lower,
   7277	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
   7278	     ldev;
   7279	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
   7280		if (ignore)
   7281			continue;
   7282		if (max_depth < ldev->lower_level)
   7283			max_depth = ldev->lower_level;
   7284	}
   7285
   7286	return max_depth;
   7287}
   7288
   7289static int __netdev_update_upper_level(struct net_device *dev,
   7290				       struct netdev_nested_priv *__unused)
   7291{
   7292	dev->upper_level = __netdev_upper_depth(dev) + 1;
   7293	return 0;
   7294}
   7295
   7296#ifdef CONFIG_LOCKDEP
   7297static LIST_HEAD(net_unlink_list);
   7298
   7299static void net_unlink_todo(struct net_device *dev)
   7300{
   7301	if (list_empty(&dev->unlink_list))
   7302		list_add_tail(&dev->unlink_list, &net_unlink_list);
   7303}
   7304#endif
   7305
   7306static int __netdev_update_lower_level(struct net_device *dev,
   7307				       struct netdev_nested_priv *priv)
   7308{
   7309	dev->lower_level = __netdev_lower_depth(dev) + 1;
   7310
   7311#ifdef CONFIG_LOCKDEP
   7312	if (!priv)
   7313		return 0;
   7314
   7315	if (priv->flags & NESTED_SYNC_IMM)
   7316		dev->nested_level = dev->lower_level - 1;
   7317	if (priv->flags & NESTED_SYNC_TODO)
   7318		net_unlink_todo(dev);
   7319#endif
   7320	return 0;
   7321}
   7322
   7323int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
   7324				  int (*fn)(struct net_device *dev,
   7325					    struct netdev_nested_priv *priv),
   7326				  struct netdev_nested_priv *priv)
   7327{
   7328	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
   7329	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
   7330	int ret, cur = 0;
   7331
   7332	now = dev;
   7333	iter = &dev->adj_list.lower;
   7334
   7335	while (1) {
   7336		if (now != dev) {
   7337			ret = fn(now, priv);
   7338			if (ret)
   7339				return ret;
   7340		}
   7341
   7342		next = NULL;
   7343		while (1) {
   7344			ldev = netdev_next_lower_dev_rcu(now, &iter);
   7345			if (!ldev)
   7346				break;
   7347
   7348			next = ldev;
   7349			niter = &ldev->adj_list.lower;
   7350			dev_stack[cur] = now;
   7351			iter_stack[cur++] = iter;
   7352			break;
   7353		}
   7354
   7355		if (!next) {
   7356			if (!cur)
   7357				return 0;
   7358			next = dev_stack[--cur];
   7359			niter = iter_stack[cur];
   7360		}
   7361
   7362		now = next;
   7363		iter = niter;
   7364	}
   7365
   7366	return 0;
   7367}
   7368EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
   7369
   7370/**
   7371 * netdev_lower_get_first_private_rcu - Get the first ->private from the
   7372 *				       lower neighbour list, RCU
   7373 *				       variant
   7374 * @dev: device
   7375 *
   7376 * Gets the first netdev_adjacent->private from the dev's lower neighbour
   7377 * list. The caller must hold RCU read lock.
   7378 */
   7379void *netdev_lower_get_first_private_rcu(struct net_device *dev)
   7380{
   7381	struct netdev_adjacent *lower;
   7382
   7383	lower = list_first_or_null_rcu(&dev->adj_list.lower,
   7384			struct netdev_adjacent, list);
   7385	if (lower)
   7386		return lower->private;
   7387	return NULL;
   7388}
   7389EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
   7390
   7391/**
   7392 * netdev_master_upper_dev_get_rcu - Get master upper device
   7393 * @dev: device
   7394 *
   7395 * Find a master upper device and return pointer to it or NULL in case
   7396 * it's not there. The caller must hold the RCU read lock.
   7397 */
   7398struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
   7399{
   7400	struct netdev_adjacent *upper;
   7401
   7402	upper = list_first_or_null_rcu(&dev->adj_list.upper,
   7403				       struct netdev_adjacent, list);
   7404	if (upper && likely(upper->master))
   7405		return upper->dev;
   7406	return NULL;
   7407}
   7408EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
   7409
   7410static int netdev_adjacent_sysfs_add(struct net_device *dev,
   7411			      struct net_device *adj_dev,
   7412			      struct list_head *dev_list)
   7413{
   7414	char linkname[IFNAMSIZ+7];
   7415
   7416	sprintf(linkname, dev_list == &dev->adj_list.upper ?
   7417		"upper_%s" : "lower_%s", adj_dev->name);
   7418	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
   7419				 linkname);
   7420}
   7421static void netdev_adjacent_sysfs_del(struct net_device *dev,
   7422			       char *name,
   7423			       struct list_head *dev_list)
   7424{
   7425	char linkname[IFNAMSIZ+7];
   7426
   7427	sprintf(linkname, dev_list == &dev->adj_list.upper ?
   7428		"upper_%s" : "lower_%s", name);
   7429	sysfs_remove_link(&(dev->dev.kobj), linkname);
   7430}
   7431
   7432static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
   7433						 struct net_device *adj_dev,
   7434						 struct list_head *dev_list)
   7435{
   7436	return (dev_list == &dev->adj_list.upper ||
   7437		dev_list == &dev->adj_list.lower) &&
   7438		net_eq(dev_net(dev), dev_net(adj_dev));
   7439}
   7440
   7441static int __netdev_adjacent_dev_insert(struct net_device *dev,
   7442					struct net_device *adj_dev,
   7443					struct list_head *dev_list,
   7444					void *private, bool master)
   7445{
   7446	struct netdev_adjacent *adj;
   7447	int ret;
   7448
   7449	adj = __netdev_find_adj(adj_dev, dev_list);
   7450
   7451	if (adj) {
   7452		adj->ref_nr += 1;
   7453		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
   7454			 dev->name, adj_dev->name, adj->ref_nr);
   7455
   7456		return 0;
   7457	}
   7458
   7459	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
   7460	if (!adj)
   7461		return -ENOMEM;
   7462
   7463	adj->dev = adj_dev;
   7464	adj->master = master;
   7465	adj->ref_nr = 1;
   7466	adj->private = private;
   7467	adj->ignore = false;
   7468	dev_hold_track(adj_dev, &adj->dev_tracker, GFP_KERNEL);
   7469
   7470	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
   7471		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
   7472
   7473	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
   7474		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
   7475		if (ret)
   7476			goto free_adj;
   7477	}
   7478
   7479	/* Ensure that master link is always the first item in list. */
   7480	if (master) {
   7481		ret = sysfs_create_link(&(dev->dev.kobj),
   7482					&(adj_dev->dev.kobj), "master");
   7483		if (ret)
   7484			goto remove_symlinks;
   7485
   7486		list_add_rcu(&adj->list, dev_list);
   7487	} else {
   7488		list_add_tail_rcu(&adj->list, dev_list);
   7489	}
   7490
   7491	return 0;
   7492
   7493remove_symlinks:
   7494	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
   7495		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
   7496free_adj:
   7497	dev_put_track(adj_dev, &adj->dev_tracker);
   7498	kfree(adj);
   7499
   7500	return ret;
   7501}
   7502
   7503static void __netdev_adjacent_dev_remove(struct net_device *dev,
   7504					 struct net_device *adj_dev,
   7505					 u16 ref_nr,
   7506					 struct list_head *dev_list)
   7507{
   7508	struct netdev_adjacent *adj;
   7509
   7510	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
   7511		 dev->name, adj_dev->name, ref_nr);
   7512
   7513	adj = __netdev_find_adj(adj_dev, dev_list);
   7514
   7515	if (!adj) {
   7516		pr_err("Adjacency does not exist for device %s from %s\n",
   7517		       dev->name, adj_dev->name);
   7518		WARN_ON(1);
   7519		return;
   7520	}
   7521
   7522	if (adj->ref_nr > ref_nr) {
   7523		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
   7524			 dev->name, adj_dev->name, ref_nr,
   7525			 adj->ref_nr - ref_nr);
   7526		adj->ref_nr -= ref_nr;
   7527		return;
   7528	}
   7529
   7530	if (adj->master)
   7531		sysfs_remove_link(&(dev->dev.kobj), "master");
   7532
   7533	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
   7534		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
   7535
   7536	list_del_rcu(&adj->list);
   7537	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
   7538		 adj_dev->name, dev->name, adj_dev->name);
   7539	dev_put_track(adj_dev, &adj->dev_tracker);
   7540	kfree_rcu(adj, rcu);
   7541}
   7542
   7543static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
   7544					    struct net_device *upper_dev,
   7545					    struct list_head *up_list,
   7546					    struct list_head *down_list,
   7547					    void *private, bool master)
   7548{
   7549	int ret;
   7550
   7551	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
   7552					   private, master);
   7553	if (ret)
   7554		return ret;
   7555
   7556	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
   7557					   private, false);
   7558	if (ret) {
   7559		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
   7560		return ret;
   7561	}
   7562
   7563	return 0;
   7564}
   7565
   7566static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
   7567					       struct net_device *upper_dev,
   7568					       u16 ref_nr,
   7569					       struct list_head *up_list,
   7570					       struct list_head *down_list)
   7571{
   7572	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
   7573	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
   7574}
   7575
   7576static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
   7577						struct net_device *upper_dev,
   7578						void *private, bool master)
   7579{
   7580	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
   7581						&dev->adj_list.upper,
   7582						&upper_dev->adj_list.lower,
   7583						private, master);
   7584}
   7585
   7586static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
   7587						   struct net_device *upper_dev)
   7588{
   7589	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
   7590					   &dev->adj_list.upper,
   7591					   &upper_dev->adj_list.lower);
   7592}
   7593
   7594static int __netdev_upper_dev_link(struct net_device *dev,
   7595				   struct net_device *upper_dev, bool master,
   7596				   void *upper_priv, void *upper_info,
   7597				   struct netdev_nested_priv *priv,
   7598				   struct netlink_ext_ack *extack)
   7599{
   7600	struct netdev_notifier_changeupper_info changeupper_info = {
   7601		.info = {
   7602			.dev = dev,
   7603			.extack = extack,
   7604		},
   7605		.upper_dev = upper_dev,
   7606		.master = master,
   7607		.linking = true,
   7608		.upper_info = upper_info,
   7609	};
   7610	struct net_device *master_dev;
   7611	int ret = 0;
   7612
   7613	ASSERT_RTNL();
   7614
   7615	if (dev == upper_dev)
   7616		return -EBUSY;
   7617
   7618	/* To prevent loops, check if dev is not upper device to upper_dev. */
   7619	if (__netdev_has_upper_dev(upper_dev, dev))
   7620		return -EBUSY;
   7621
   7622	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
   7623		return -EMLINK;
   7624
   7625	if (!master) {
   7626		if (__netdev_has_upper_dev(dev, upper_dev))
   7627			return -EEXIST;
   7628	} else {
   7629		master_dev = __netdev_master_upper_dev_get(dev);
   7630		if (master_dev)
   7631			return master_dev == upper_dev ? -EEXIST : -EBUSY;
   7632	}
   7633
   7634	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
   7635					    &changeupper_info.info);
   7636	ret = notifier_to_errno(ret);
   7637	if (ret)
   7638		return ret;
   7639
   7640	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
   7641						   master);
   7642	if (ret)
   7643		return ret;
   7644
   7645	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
   7646					    &changeupper_info.info);
   7647	ret = notifier_to_errno(ret);
   7648	if (ret)
   7649		goto rollback;
   7650
   7651	__netdev_update_upper_level(dev, NULL);
   7652	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
   7653
   7654	__netdev_update_lower_level(upper_dev, priv);
   7655	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
   7656				    priv);
   7657
   7658	return 0;
   7659
   7660rollback:
   7661	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
   7662
   7663	return ret;
   7664}
   7665
   7666/**
   7667 * netdev_upper_dev_link - Add a link to the upper device
   7668 * @dev: device
   7669 * @upper_dev: new upper device
   7670 * @extack: netlink extended ack
   7671 *
   7672 * Adds a link to device which is upper to this one. The caller must hold
   7673 * the RTNL lock. On a failure a negative errno code is returned.
   7674 * On success the reference counts are adjusted and the function
   7675 * returns zero.
   7676 */
   7677int netdev_upper_dev_link(struct net_device *dev,
   7678			  struct net_device *upper_dev,
   7679			  struct netlink_ext_ack *extack)
   7680{
   7681	struct netdev_nested_priv priv = {
   7682		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
   7683		.data = NULL,
   7684	};
   7685
   7686	return __netdev_upper_dev_link(dev, upper_dev, false,
   7687				       NULL, NULL, &priv, extack);
   7688}
   7689EXPORT_SYMBOL(netdev_upper_dev_link);
   7690
   7691/**
   7692 * netdev_master_upper_dev_link - Add a master link to the upper device
   7693 * @dev: device
   7694 * @upper_dev: new upper device
   7695 * @upper_priv: upper device private
   7696 * @upper_info: upper info to be passed down via notifier
   7697 * @extack: netlink extended ack
   7698 *
   7699 * Adds a link to device which is upper to this one. In this case, only
   7700 * one master upper device can be linked, although other non-master devices
   7701 * might be linked as well. The caller must hold the RTNL lock.
   7702 * On a failure a negative errno code is returned. On success the reference
   7703 * counts are adjusted and the function returns zero.
   7704 */
   7705int netdev_master_upper_dev_link(struct net_device *dev,
   7706				 struct net_device *upper_dev,
   7707				 void *upper_priv, void *upper_info,
   7708				 struct netlink_ext_ack *extack)
   7709{
   7710	struct netdev_nested_priv priv = {
   7711		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
   7712		.data = NULL,
   7713	};
   7714
   7715	return __netdev_upper_dev_link(dev, upper_dev, true,
   7716				       upper_priv, upper_info, &priv, extack);
   7717}
   7718EXPORT_SYMBOL(netdev_master_upper_dev_link);
   7719
   7720static void __netdev_upper_dev_unlink(struct net_device *dev,
   7721				      struct net_device *upper_dev,
   7722				      struct netdev_nested_priv *priv)
   7723{
   7724	struct netdev_notifier_changeupper_info changeupper_info = {
   7725		.info = {
   7726			.dev = dev,
   7727		},
   7728		.upper_dev = upper_dev,
   7729		.linking = false,
   7730	};
   7731
   7732	ASSERT_RTNL();
   7733
   7734	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
   7735
   7736	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
   7737				      &changeupper_info.info);
   7738
   7739	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
   7740
   7741	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
   7742				      &changeupper_info.info);
   7743
   7744	__netdev_update_upper_level(dev, NULL);
   7745	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
   7746
   7747	__netdev_update_lower_level(upper_dev, priv);
   7748	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
   7749				    priv);
   7750}
   7751
   7752/**
   7753 * netdev_upper_dev_unlink - Removes a link to upper device
   7754 * @dev: device
   7755 * @upper_dev: new upper device
   7756 *
   7757 * Removes a link to device which is upper to this one. The caller must hold
   7758 * the RTNL lock.
   7759 */
   7760void netdev_upper_dev_unlink(struct net_device *dev,
   7761			     struct net_device *upper_dev)
   7762{
   7763	struct netdev_nested_priv priv = {
   7764		.flags = NESTED_SYNC_TODO,
   7765		.data = NULL,
   7766	};
   7767
   7768	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
   7769}
   7770EXPORT_SYMBOL(netdev_upper_dev_unlink);
   7771
   7772static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
   7773				      struct net_device *lower_dev,
   7774				      bool val)
   7775{
   7776	struct netdev_adjacent *adj;
   7777
   7778	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
   7779	if (adj)
   7780		adj->ignore = val;
   7781
   7782	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
   7783	if (adj)
   7784		adj->ignore = val;
   7785}
   7786
   7787static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
   7788					struct net_device *lower_dev)
   7789{
   7790	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
   7791}
   7792
   7793static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
   7794				       struct net_device *lower_dev)
   7795{
   7796	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
   7797}
   7798
   7799int netdev_adjacent_change_prepare(struct net_device *old_dev,
   7800				   struct net_device *new_dev,
   7801				   struct net_device *dev,
   7802				   struct netlink_ext_ack *extack)
   7803{
   7804	struct netdev_nested_priv priv = {
   7805		.flags = 0,
   7806		.data = NULL,
   7807	};
   7808	int err;
   7809
   7810	if (!new_dev)
   7811		return 0;
   7812
   7813	if (old_dev && new_dev != old_dev)
   7814		netdev_adjacent_dev_disable(dev, old_dev);
   7815	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
   7816				      extack);
   7817	if (err) {
   7818		if (old_dev && new_dev != old_dev)
   7819			netdev_adjacent_dev_enable(dev, old_dev);
   7820		return err;
   7821	}
   7822
   7823	return 0;
   7824}
   7825EXPORT_SYMBOL(netdev_adjacent_change_prepare);
   7826
   7827void netdev_adjacent_change_commit(struct net_device *old_dev,
   7828				   struct net_device *new_dev,
   7829				   struct net_device *dev)
   7830{
   7831	struct netdev_nested_priv priv = {
   7832		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
   7833		.data = NULL,
   7834	};
   7835
   7836	if (!new_dev || !old_dev)
   7837		return;
   7838
   7839	if (new_dev == old_dev)
   7840		return;
   7841
   7842	netdev_adjacent_dev_enable(dev, old_dev);
   7843	__netdev_upper_dev_unlink(old_dev, dev, &priv);
   7844}
   7845EXPORT_SYMBOL(netdev_adjacent_change_commit);
   7846
   7847void netdev_adjacent_change_abort(struct net_device *old_dev,
   7848				  struct net_device *new_dev,
   7849				  struct net_device *dev)
   7850{
   7851	struct netdev_nested_priv priv = {
   7852		.flags = 0,
   7853		.data = NULL,
   7854	};
   7855
   7856	if (!new_dev)
   7857		return;
   7858
   7859	if (old_dev && new_dev != old_dev)
   7860		netdev_adjacent_dev_enable(dev, old_dev);
   7861
   7862	__netdev_upper_dev_unlink(new_dev, dev, &priv);
   7863}
   7864EXPORT_SYMBOL(netdev_adjacent_change_abort);
   7865
   7866/**
   7867 * netdev_bonding_info_change - Dispatch event about slave change
   7868 * @dev: device
   7869 * @bonding_info: info to dispatch
   7870 *
   7871 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
   7872 * The caller must hold the RTNL lock.
   7873 */
   7874void netdev_bonding_info_change(struct net_device *dev,
   7875				struct netdev_bonding_info *bonding_info)
   7876{
   7877	struct netdev_notifier_bonding_info info = {
   7878		.info.dev = dev,
   7879	};
   7880
   7881	memcpy(&info.bonding_info, bonding_info,
   7882	       sizeof(struct netdev_bonding_info));
   7883	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
   7884				      &info.info);
   7885}
   7886EXPORT_SYMBOL(netdev_bonding_info_change);
   7887
   7888static int netdev_offload_xstats_enable_l3(struct net_device *dev,
   7889					   struct netlink_ext_ack *extack)
   7890{
   7891	struct netdev_notifier_offload_xstats_info info = {
   7892		.info.dev = dev,
   7893		.info.extack = extack,
   7894		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
   7895	};
   7896	int err;
   7897	int rc;
   7898
   7899	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
   7900					 GFP_KERNEL);
   7901	if (!dev->offload_xstats_l3)
   7902		return -ENOMEM;
   7903
   7904	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
   7905						  NETDEV_OFFLOAD_XSTATS_DISABLE,
   7906						  &info.info);
   7907	err = notifier_to_errno(rc);
   7908	if (err)
   7909		goto free_stats;
   7910
   7911	return 0;
   7912
   7913free_stats:
   7914	kfree(dev->offload_xstats_l3);
   7915	dev->offload_xstats_l3 = NULL;
   7916	return err;
   7917}
   7918
   7919int netdev_offload_xstats_enable(struct net_device *dev,
   7920				 enum netdev_offload_xstats_type type,
   7921				 struct netlink_ext_ack *extack)
   7922{
   7923	ASSERT_RTNL();
   7924
   7925	if (netdev_offload_xstats_enabled(dev, type))
   7926		return -EALREADY;
   7927
   7928	switch (type) {
   7929	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
   7930		return netdev_offload_xstats_enable_l3(dev, extack);
   7931	}
   7932
   7933	WARN_ON(1);
   7934	return -EINVAL;
   7935}
   7936EXPORT_SYMBOL(netdev_offload_xstats_enable);
   7937
   7938static void netdev_offload_xstats_disable_l3(struct net_device *dev)
   7939{
   7940	struct netdev_notifier_offload_xstats_info info = {
   7941		.info.dev = dev,
   7942		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
   7943	};
   7944
   7945	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
   7946				      &info.info);
   7947	kfree(dev->offload_xstats_l3);
   7948	dev->offload_xstats_l3 = NULL;
   7949}
   7950
   7951int netdev_offload_xstats_disable(struct net_device *dev,
   7952				  enum netdev_offload_xstats_type type)
   7953{
   7954	ASSERT_RTNL();
   7955
   7956	if (!netdev_offload_xstats_enabled(dev, type))
   7957		return -EALREADY;
   7958
   7959	switch (type) {
   7960	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
   7961		netdev_offload_xstats_disable_l3(dev);
   7962		return 0;
   7963	}
   7964
   7965	WARN_ON(1);
   7966	return -EINVAL;
   7967}
   7968EXPORT_SYMBOL(netdev_offload_xstats_disable);
   7969
   7970static void netdev_offload_xstats_disable_all(struct net_device *dev)
   7971{
   7972	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
   7973}
   7974
   7975static struct rtnl_hw_stats64 *
   7976netdev_offload_xstats_get_ptr(const struct net_device *dev,
   7977			      enum netdev_offload_xstats_type type)
   7978{
   7979	switch (type) {
   7980	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
   7981		return dev->offload_xstats_l3;
   7982	}
   7983
   7984	WARN_ON(1);
   7985	return NULL;
   7986}
   7987
   7988bool netdev_offload_xstats_enabled(const struct net_device *dev,
   7989				   enum netdev_offload_xstats_type type)
   7990{
   7991	ASSERT_RTNL();
   7992
   7993	return netdev_offload_xstats_get_ptr(dev, type);
   7994}
   7995EXPORT_SYMBOL(netdev_offload_xstats_enabled);
   7996
   7997struct netdev_notifier_offload_xstats_ru {
   7998	bool used;
   7999};
   8000
   8001struct netdev_notifier_offload_xstats_rd {
   8002	struct rtnl_hw_stats64 stats;
   8003	bool used;
   8004};
   8005
   8006static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
   8007				  const struct rtnl_hw_stats64 *src)
   8008{
   8009	dest->rx_packets	  += src->rx_packets;
   8010	dest->tx_packets	  += src->tx_packets;
   8011	dest->rx_bytes		  += src->rx_bytes;
   8012	dest->tx_bytes		  += src->tx_bytes;
   8013	dest->rx_errors		  += src->rx_errors;
   8014	dest->tx_errors		  += src->tx_errors;
   8015	dest->rx_dropped	  += src->rx_dropped;
   8016	dest->tx_dropped	  += src->tx_dropped;
   8017	dest->multicast		  += src->multicast;
   8018}
   8019
   8020static int netdev_offload_xstats_get_used(struct net_device *dev,
   8021					  enum netdev_offload_xstats_type type,
   8022					  bool *p_used,
   8023					  struct netlink_ext_ack *extack)
   8024{
   8025	struct netdev_notifier_offload_xstats_ru report_used = {};
   8026	struct netdev_notifier_offload_xstats_info info = {
   8027		.info.dev = dev,
   8028		.info.extack = extack,
   8029		.type = type,
   8030		.report_used = &report_used,
   8031	};
   8032	int rc;
   8033
   8034	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
   8035	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
   8036					   &info.info);
   8037	*p_used = report_used.used;
   8038	return notifier_to_errno(rc);
   8039}
   8040
   8041static int netdev_offload_xstats_get_stats(struct net_device *dev,
   8042					   enum netdev_offload_xstats_type type,
   8043					   struct rtnl_hw_stats64 *p_stats,
   8044					   bool *p_used,
   8045					   struct netlink_ext_ack *extack)
   8046{
   8047	struct netdev_notifier_offload_xstats_rd report_delta = {};
   8048	struct netdev_notifier_offload_xstats_info info = {
   8049		.info.dev = dev,
   8050		.info.extack = extack,
   8051		.type = type,
   8052		.report_delta = &report_delta,
   8053	};
   8054	struct rtnl_hw_stats64 *stats;
   8055	int rc;
   8056
   8057	stats = netdev_offload_xstats_get_ptr(dev, type);
   8058	if (WARN_ON(!stats))
   8059		return -EINVAL;
   8060
   8061	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
   8062					   &info.info);
   8063
   8064	/* Cache whatever we got, even if there was an error, otherwise the
   8065	 * successful stats retrievals would get lost.
   8066	 */
   8067	netdev_hw_stats64_add(stats, &report_delta.stats);
   8068
   8069	if (p_stats)
   8070		*p_stats = *stats;
   8071	*p_used = report_delta.used;
   8072
   8073	return notifier_to_errno(rc);
   8074}
   8075
   8076int netdev_offload_xstats_get(struct net_device *dev,
   8077			      enum netdev_offload_xstats_type type,
   8078			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
   8079			      struct netlink_ext_ack *extack)
   8080{
   8081	ASSERT_RTNL();
   8082
   8083	if (p_stats)
   8084		return netdev_offload_xstats_get_stats(dev, type, p_stats,
   8085						       p_used, extack);
   8086	else
   8087		return netdev_offload_xstats_get_used(dev, type, p_used,
   8088						      extack);
   8089}
   8090EXPORT_SYMBOL(netdev_offload_xstats_get);
   8091
   8092void
   8093netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
   8094				   const struct rtnl_hw_stats64 *stats)
   8095{
   8096	report_delta->used = true;
   8097	netdev_hw_stats64_add(&report_delta->stats, stats);
   8098}
   8099EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
   8100
   8101void
   8102netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
   8103{
   8104	report_used->used = true;
   8105}
   8106EXPORT_SYMBOL(netdev_offload_xstats_report_used);
   8107
   8108void netdev_offload_xstats_push_delta(struct net_device *dev,
   8109				      enum netdev_offload_xstats_type type,
   8110				      const struct rtnl_hw_stats64 *p_stats)
   8111{
   8112	struct rtnl_hw_stats64 *stats;
   8113
   8114	ASSERT_RTNL();
   8115
   8116	stats = netdev_offload_xstats_get_ptr(dev, type);
   8117	if (WARN_ON(!stats))
   8118		return;
   8119
   8120	netdev_hw_stats64_add(stats, p_stats);
   8121}
   8122EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
   8123
   8124/**
   8125 * netdev_get_xmit_slave - Get the xmit slave of master device
   8126 * @dev: device
   8127 * @skb: The packet
   8128 * @all_slaves: assume all the slaves are active
   8129 *
   8130 * The reference counters are not incremented so the caller must be
   8131 * careful with locks. The caller must hold RCU lock.
   8132 * %NULL is returned if no slave is found.
   8133 */
   8134
   8135struct net_device *netdev_get_xmit_slave(struct net_device *dev,
   8136					 struct sk_buff *skb,
   8137					 bool all_slaves)
   8138{
   8139	const struct net_device_ops *ops = dev->netdev_ops;
   8140
   8141	if (!ops->ndo_get_xmit_slave)
   8142		return NULL;
   8143	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
   8144}
   8145EXPORT_SYMBOL(netdev_get_xmit_slave);
   8146
   8147static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
   8148						  struct sock *sk)
   8149{
   8150	const struct net_device_ops *ops = dev->netdev_ops;
   8151
   8152	if (!ops->ndo_sk_get_lower_dev)
   8153		return NULL;
   8154	return ops->ndo_sk_get_lower_dev(dev, sk);
   8155}
   8156
   8157/**
   8158 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
   8159 * @dev: device
   8160 * @sk: the socket
   8161 *
   8162 * %NULL is returned if no lower device is found.
   8163 */
   8164
   8165struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
   8166					    struct sock *sk)
   8167{
   8168	struct net_device *lower;
   8169
   8170	lower = netdev_sk_get_lower_dev(dev, sk);
   8171	while (lower) {
   8172		dev = lower;
   8173		lower = netdev_sk_get_lower_dev(dev, sk);
   8174	}
   8175
   8176	return dev;
   8177}
   8178EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
   8179
   8180static void netdev_adjacent_add_links(struct net_device *dev)
   8181{
   8182	struct netdev_adjacent *iter;
   8183
   8184	struct net *net = dev_net(dev);
   8185
   8186	list_for_each_entry(iter, &dev->adj_list.upper, list) {
   8187		if (!net_eq(net, dev_net(iter->dev)))
   8188			continue;
   8189		netdev_adjacent_sysfs_add(iter->dev, dev,
   8190					  &iter->dev->adj_list.lower);
   8191		netdev_adjacent_sysfs_add(dev, iter->dev,
   8192					  &dev->adj_list.upper);
   8193	}
   8194
   8195	list_for_each_entry(iter, &dev->adj_list.lower, list) {
   8196		if (!net_eq(net, dev_net(iter->dev)))
   8197			continue;
   8198		netdev_adjacent_sysfs_add(iter->dev, dev,
   8199					  &iter->dev->adj_list.upper);
   8200		netdev_adjacent_sysfs_add(dev, iter->dev,
   8201					  &dev->adj_list.lower);
   8202	}
   8203}
   8204
   8205static void netdev_adjacent_del_links(struct net_device *dev)
   8206{
   8207	struct netdev_adjacent *iter;
   8208
   8209	struct net *net = dev_net(dev);
   8210
   8211	list_for_each_entry(iter, &dev->adj_list.upper, list) {
   8212		if (!net_eq(net, dev_net(iter->dev)))
   8213			continue;
   8214		netdev_adjacent_sysfs_del(iter->dev, dev->name,
   8215					  &iter->dev->adj_list.lower);
   8216		netdev_adjacent_sysfs_del(dev, iter->dev->name,
   8217					  &dev->adj_list.upper);
   8218	}
   8219
   8220	list_for_each_entry(iter, &dev->adj_list.lower, list) {
   8221		if (!net_eq(net, dev_net(iter->dev)))
   8222			continue;
   8223		netdev_adjacent_sysfs_del(iter->dev, dev->name,
   8224					  &iter->dev->adj_list.upper);
   8225		netdev_adjacent_sysfs_del(dev, iter->dev->name,
   8226					  &dev->adj_list.lower);
   8227	}
   8228}
   8229
   8230void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
   8231{
   8232	struct netdev_adjacent *iter;
   8233
   8234	struct net *net = dev_net(dev);
   8235
   8236	list_for_each_entry(iter, &dev->adj_list.upper, list) {
   8237		if (!net_eq(net, dev_net(iter->dev)))
   8238			continue;
   8239		netdev_adjacent_sysfs_del(iter->dev, oldname,
   8240					  &iter->dev->adj_list.lower);
   8241		netdev_adjacent_sysfs_add(iter->dev, dev,
   8242					  &iter->dev->adj_list.lower);
   8243	}
   8244
   8245	list_for_each_entry(iter, &dev->adj_list.lower, list) {
   8246		if (!net_eq(net, dev_net(iter->dev)))
   8247			continue;
   8248		netdev_adjacent_sysfs_del(iter->dev, oldname,
   8249					  &iter->dev->adj_list.upper);
   8250		netdev_adjacent_sysfs_add(iter->dev, dev,
   8251					  &iter->dev->adj_list.upper);
   8252	}
   8253}
   8254
   8255void *netdev_lower_dev_get_private(struct net_device *dev,
   8256				   struct net_device *lower_dev)
   8257{
   8258	struct netdev_adjacent *lower;
   8259
   8260	if (!lower_dev)
   8261		return NULL;
   8262	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
   8263	if (!lower)
   8264		return NULL;
   8265
   8266	return lower->private;
   8267}
   8268EXPORT_SYMBOL(netdev_lower_dev_get_private);
   8269
   8270
   8271/**
   8272 * netdev_lower_state_changed - Dispatch event about lower device state change
   8273 * @lower_dev: device
   8274 * @lower_state_info: state to dispatch
   8275 *
   8276 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
   8277 * The caller must hold the RTNL lock.
   8278 */
   8279void netdev_lower_state_changed(struct net_device *lower_dev,
   8280				void *lower_state_info)
   8281{
   8282	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
   8283		.info.dev = lower_dev,
   8284	};
   8285
   8286	ASSERT_RTNL();
   8287	changelowerstate_info.lower_state_info = lower_state_info;
   8288	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
   8289				      &changelowerstate_info.info);
   8290}
   8291EXPORT_SYMBOL(netdev_lower_state_changed);
   8292
   8293static void dev_change_rx_flags(struct net_device *dev, int flags)
   8294{
   8295	const struct net_device_ops *ops = dev->netdev_ops;
   8296
   8297	if (ops->ndo_change_rx_flags)
   8298		ops->ndo_change_rx_flags(dev, flags);
   8299}
   8300
   8301static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
   8302{
   8303	unsigned int old_flags = dev->flags;
   8304	kuid_t uid;
   8305	kgid_t gid;
   8306
   8307	ASSERT_RTNL();
   8308
   8309	dev->flags |= IFF_PROMISC;
   8310	dev->promiscuity += inc;
   8311	if (dev->promiscuity == 0) {
   8312		/*
   8313		 * Avoid overflow.
   8314		 * If inc causes overflow, untouch promisc and return error.
   8315		 */
   8316		if (inc < 0)
   8317			dev->flags &= ~IFF_PROMISC;
   8318		else {
   8319			dev->promiscuity -= inc;
   8320			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
   8321			return -EOVERFLOW;
   8322		}
   8323	}
   8324	if (dev->flags != old_flags) {
   8325		pr_info("device %s %s promiscuous mode\n",
   8326			dev->name,
   8327			dev->flags & IFF_PROMISC ? "entered" : "left");
   8328		if (audit_enabled) {
   8329			current_uid_gid(&uid, &gid);
   8330			audit_log(audit_context(), GFP_ATOMIC,
   8331				  AUDIT_ANOM_PROMISCUOUS,
   8332				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
   8333				  dev->name, (dev->flags & IFF_PROMISC),
   8334				  (old_flags & IFF_PROMISC),
   8335				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
   8336				  from_kuid(&init_user_ns, uid),
   8337				  from_kgid(&init_user_ns, gid),
   8338				  audit_get_sessionid(current));
   8339		}
   8340
   8341		dev_change_rx_flags(dev, IFF_PROMISC);
   8342	}
   8343	if (notify)
   8344		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
   8345	return 0;
   8346}
   8347
   8348/**
   8349 *	dev_set_promiscuity	- update promiscuity count on a device
   8350 *	@dev: device
   8351 *	@inc: modifier
   8352 *
   8353 *	Add or remove promiscuity from a device. While the count in the device
   8354 *	remains above zero the interface remains promiscuous. Once it hits zero
   8355 *	the device reverts back to normal filtering operation. A negative inc
   8356 *	value is used to drop promiscuity on the device.
   8357 *	Return 0 if successful or a negative errno code on error.
   8358 */
   8359int dev_set_promiscuity(struct net_device *dev, int inc)
   8360{
   8361	unsigned int old_flags = dev->flags;
   8362	int err;
   8363
   8364	err = __dev_set_promiscuity(dev, inc, true);
   8365	if (err < 0)
   8366		return err;
   8367	if (dev->flags != old_flags)
   8368		dev_set_rx_mode(dev);
   8369	return err;
   8370}
   8371EXPORT_SYMBOL(dev_set_promiscuity);
   8372
   8373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
   8374{
   8375	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
   8376
   8377	ASSERT_RTNL();
   8378
   8379	dev->flags |= IFF_ALLMULTI;
   8380	dev->allmulti += inc;
   8381	if (dev->allmulti == 0) {
   8382		/*
   8383		 * Avoid overflow.
   8384		 * If inc causes overflow, untouch allmulti and return error.
   8385		 */
   8386		if (inc < 0)
   8387			dev->flags &= ~IFF_ALLMULTI;
   8388		else {
   8389			dev->allmulti -= inc;
   8390			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
   8391			return -EOVERFLOW;
   8392		}
   8393	}
   8394	if (dev->flags ^ old_flags) {
   8395		dev_change_rx_flags(dev, IFF_ALLMULTI);
   8396		dev_set_rx_mode(dev);
   8397		if (notify)
   8398			__dev_notify_flags(dev, old_flags,
   8399					   dev->gflags ^ old_gflags);
   8400	}
   8401	return 0;
   8402}
   8403
   8404/**
   8405 *	dev_set_allmulti	- update allmulti count on a device
   8406 *	@dev: device
   8407 *	@inc: modifier
   8408 *
   8409 *	Add or remove reception of all multicast frames to a device. While the
   8410 *	count in the device remains above zero the interface remains listening
   8411 *	to all interfaces. Once it hits zero the device reverts back to normal
   8412 *	filtering operation. A negative @inc value is used to drop the counter
   8413 *	when releasing a resource needing all multicasts.
   8414 *	Return 0 if successful or a negative errno code on error.
   8415 */
   8416
   8417int dev_set_allmulti(struct net_device *dev, int inc)
   8418{
   8419	return __dev_set_allmulti(dev, inc, true);
   8420}
   8421EXPORT_SYMBOL(dev_set_allmulti);
   8422
   8423/*
   8424 *	Upload unicast and multicast address lists to device and
   8425 *	configure RX filtering. When the device doesn't support unicast
   8426 *	filtering it is put in promiscuous mode while unicast addresses
   8427 *	are present.
   8428 */
   8429void __dev_set_rx_mode(struct net_device *dev)
   8430{
   8431	const struct net_device_ops *ops = dev->netdev_ops;
   8432
   8433	/* dev_open will call this function so the list will stay sane. */
   8434	if (!(dev->flags&IFF_UP))
   8435		return;
   8436
   8437	if (!netif_device_present(dev))
   8438		return;
   8439
   8440	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
   8441		/* Unicast addresses changes may only happen under the rtnl,
   8442		 * therefore calling __dev_set_promiscuity here is safe.
   8443		 */
   8444		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
   8445			__dev_set_promiscuity(dev, 1, false);
   8446			dev->uc_promisc = true;
   8447		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
   8448			__dev_set_promiscuity(dev, -1, false);
   8449			dev->uc_promisc = false;
   8450		}
   8451	}
   8452
   8453	if (ops->ndo_set_rx_mode)
   8454		ops->ndo_set_rx_mode(dev);
   8455}
   8456
   8457void dev_set_rx_mode(struct net_device *dev)
   8458{
   8459	netif_addr_lock_bh(dev);
   8460	__dev_set_rx_mode(dev);
   8461	netif_addr_unlock_bh(dev);
   8462}
   8463
   8464/**
   8465 *	dev_get_flags - get flags reported to userspace
   8466 *	@dev: device
   8467 *
   8468 *	Get the combination of flag bits exported through APIs to userspace.
   8469 */
   8470unsigned int dev_get_flags(const struct net_device *dev)
   8471{
   8472	unsigned int flags;
   8473
   8474	flags = (dev->flags & ~(IFF_PROMISC |
   8475				IFF_ALLMULTI |
   8476				IFF_RUNNING |
   8477				IFF_LOWER_UP |
   8478				IFF_DORMANT)) |
   8479		(dev->gflags & (IFF_PROMISC |
   8480				IFF_ALLMULTI));
   8481
   8482	if (netif_running(dev)) {
   8483		if (netif_oper_up(dev))
   8484			flags |= IFF_RUNNING;
   8485		if (netif_carrier_ok(dev))
   8486			flags |= IFF_LOWER_UP;
   8487		if (netif_dormant(dev))
   8488			flags |= IFF_DORMANT;
   8489	}
   8490
   8491	return flags;
   8492}
   8493EXPORT_SYMBOL(dev_get_flags);
   8494
   8495int __dev_change_flags(struct net_device *dev, unsigned int flags,
   8496		       struct netlink_ext_ack *extack)
   8497{
   8498	unsigned int old_flags = dev->flags;
   8499	int ret;
   8500
   8501	ASSERT_RTNL();
   8502
   8503	/*
   8504	 *	Set the flags on our device.
   8505	 */
   8506
   8507	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
   8508			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
   8509			       IFF_AUTOMEDIA)) |
   8510		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
   8511				    IFF_ALLMULTI));
   8512
   8513	/*
   8514	 *	Load in the correct multicast list now the flags have changed.
   8515	 */
   8516
   8517	if ((old_flags ^ flags) & IFF_MULTICAST)
   8518		dev_change_rx_flags(dev, IFF_MULTICAST);
   8519
   8520	dev_set_rx_mode(dev);
   8521
   8522	/*
   8523	 *	Have we downed the interface. We handle IFF_UP ourselves
   8524	 *	according to user attempts to set it, rather than blindly
   8525	 *	setting it.
   8526	 */
   8527
   8528	ret = 0;
   8529	if ((old_flags ^ flags) & IFF_UP) {
   8530		if (old_flags & IFF_UP)
   8531			__dev_close(dev);
   8532		else
   8533			ret = __dev_open(dev, extack);
   8534	}
   8535
   8536	if ((flags ^ dev->gflags) & IFF_PROMISC) {
   8537		int inc = (flags & IFF_PROMISC) ? 1 : -1;
   8538		unsigned int old_flags = dev->flags;
   8539
   8540		dev->gflags ^= IFF_PROMISC;
   8541
   8542		if (__dev_set_promiscuity(dev, inc, false) >= 0)
   8543			if (dev->flags != old_flags)
   8544				dev_set_rx_mode(dev);
   8545	}
   8546
   8547	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
   8548	 * is important. Some (broken) drivers set IFF_PROMISC, when
   8549	 * IFF_ALLMULTI is requested not asking us and not reporting.
   8550	 */
   8551	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
   8552		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
   8553
   8554		dev->gflags ^= IFF_ALLMULTI;
   8555		__dev_set_allmulti(dev, inc, false);
   8556	}
   8557
   8558	return ret;
   8559}
   8560
   8561void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
   8562			unsigned int gchanges)
   8563{
   8564	unsigned int changes = dev->flags ^ old_flags;
   8565
   8566	if (gchanges)
   8567		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
   8568
   8569	if (changes & IFF_UP) {
   8570		if (dev->flags & IFF_UP)
   8571			call_netdevice_notifiers(NETDEV_UP, dev);
   8572		else
   8573			call_netdevice_notifiers(NETDEV_DOWN, dev);
   8574	}
   8575
   8576	if (dev->flags & IFF_UP &&
   8577	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
   8578		struct netdev_notifier_change_info change_info = {
   8579			.info = {
   8580				.dev = dev,
   8581			},
   8582			.flags_changed = changes,
   8583		};
   8584
   8585		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
   8586	}
   8587}
   8588
   8589/**
   8590 *	dev_change_flags - change device settings
   8591 *	@dev: device
   8592 *	@flags: device state flags
   8593 *	@extack: netlink extended ack
   8594 *
   8595 *	Change settings on device based state flags. The flags are
   8596 *	in the userspace exported format.
   8597 */
   8598int dev_change_flags(struct net_device *dev, unsigned int flags,
   8599		     struct netlink_ext_ack *extack)
   8600{
   8601	int ret;
   8602	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
   8603
   8604	ret = __dev_change_flags(dev, flags, extack);
   8605	if (ret < 0)
   8606		return ret;
   8607
   8608	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
   8609	__dev_notify_flags(dev, old_flags, changes);
   8610	return ret;
   8611}
   8612EXPORT_SYMBOL(dev_change_flags);
   8613
   8614int __dev_set_mtu(struct net_device *dev, int new_mtu)
   8615{
   8616	const struct net_device_ops *ops = dev->netdev_ops;
   8617
   8618	if (ops->ndo_change_mtu)
   8619		return ops->ndo_change_mtu(dev, new_mtu);
   8620
   8621	/* Pairs with all the lockless reads of dev->mtu in the stack */
   8622	WRITE_ONCE(dev->mtu, new_mtu);
   8623	return 0;
   8624}
   8625EXPORT_SYMBOL(__dev_set_mtu);
   8626
   8627int dev_validate_mtu(struct net_device *dev, int new_mtu,
   8628		     struct netlink_ext_ack *extack)
   8629{
   8630	/* MTU must be positive, and in range */
   8631	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
   8632		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
   8633		return -EINVAL;
   8634	}
   8635
   8636	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
   8637		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
   8638		return -EINVAL;
   8639	}
   8640	return 0;
   8641}
   8642
   8643/**
   8644 *	dev_set_mtu_ext - Change maximum transfer unit
   8645 *	@dev: device
   8646 *	@new_mtu: new transfer unit
   8647 *	@extack: netlink extended ack
   8648 *
   8649 *	Change the maximum transfer size of the network device.
   8650 */
   8651int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
   8652		    struct netlink_ext_ack *extack)
   8653{
   8654	int err, orig_mtu;
   8655
   8656	if (new_mtu == dev->mtu)
   8657		return 0;
   8658
   8659	err = dev_validate_mtu(dev, new_mtu, extack);
   8660	if (err)
   8661		return err;
   8662
   8663	if (!netif_device_present(dev))
   8664		return -ENODEV;
   8665
   8666	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
   8667	err = notifier_to_errno(err);
   8668	if (err)
   8669		return err;
   8670
   8671	orig_mtu = dev->mtu;
   8672	err = __dev_set_mtu(dev, new_mtu);
   8673
   8674	if (!err) {
   8675		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
   8676						   orig_mtu);
   8677		err = notifier_to_errno(err);
   8678		if (err) {
   8679			/* setting mtu back and notifying everyone again,
   8680			 * so that they have a chance to revert changes.
   8681			 */
   8682			__dev_set_mtu(dev, orig_mtu);
   8683			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
   8684						     new_mtu);
   8685		}
   8686	}
   8687	return err;
   8688}
   8689
   8690int dev_set_mtu(struct net_device *dev, int new_mtu)
   8691{
   8692	struct netlink_ext_ack extack;
   8693	int err;
   8694
   8695	memset(&extack, 0, sizeof(extack));
   8696	err = dev_set_mtu_ext(dev, new_mtu, &extack);
   8697	if (err && extack._msg)
   8698		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
   8699	return err;
   8700}
   8701EXPORT_SYMBOL(dev_set_mtu);
   8702
   8703/**
   8704 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
   8705 *	@dev: device
   8706 *	@new_len: new tx queue length
   8707 */
   8708int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
   8709{
   8710	unsigned int orig_len = dev->tx_queue_len;
   8711	int res;
   8712
   8713	if (new_len != (unsigned int)new_len)
   8714		return -ERANGE;
   8715
   8716	if (new_len != orig_len) {
   8717		dev->tx_queue_len = new_len;
   8718		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
   8719		res = notifier_to_errno(res);
   8720		if (res)
   8721			goto err_rollback;
   8722		res = dev_qdisc_change_tx_queue_len(dev);
   8723		if (res)
   8724			goto err_rollback;
   8725	}
   8726
   8727	return 0;
   8728
   8729err_rollback:
   8730	netdev_err(dev, "refused to change device tx_queue_len\n");
   8731	dev->tx_queue_len = orig_len;
   8732	return res;
   8733}
   8734
   8735/**
   8736 *	dev_set_group - Change group this device belongs to
   8737 *	@dev: device
   8738 *	@new_group: group this device should belong to
   8739 */
   8740void dev_set_group(struct net_device *dev, int new_group)
   8741{
   8742	dev->group = new_group;
   8743}
   8744
   8745/**
   8746 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
   8747 *	@dev: device
   8748 *	@addr: new address
   8749 *	@extack: netlink extended ack
   8750 */
   8751int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
   8752			      struct netlink_ext_ack *extack)
   8753{
   8754	struct netdev_notifier_pre_changeaddr_info info = {
   8755		.info.dev = dev,
   8756		.info.extack = extack,
   8757		.dev_addr = addr,
   8758	};
   8759	int rc;
   8760
   8761	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
   8762	return notifier_to_errno(rc);
   8763}
   8764EXPORT_SYMBOL(dev_pre_changeaddr_notify);
   8765
   8766/**
   8767 *	dev_set_mac_address - Change Media Access Control Address
   8768 *	@dev: device
   8769 *	@sa: new address
   8770 *	@extack: netlink extended ack
   8771 *
   8772 *	Change the hardware (MAC) address of the device
   8773 */
   8774int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
   8775			struct netlink_ext_ack *extack)
   8776{
   8777	const struct net_device_ops *ops = dev->netdev_ops;
   8778	int err;
   8779
   8780	if (!ops->ndo_set_mac_address)
   8781		return -EOPNOTSUPP;
   8782	if (sa->sa_family != dev->type)
   8783		return -EINVAL;
   8784	if (!netif_device_present(dev))
   8785		return -ENODEV;
   8786	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
   8787	if (err)
   8788		return err;
   8789	err = ops->ndo_set_mac_address(dev, sa);
   8790	if (err)
   8791		return err;
   8792	dev->addr_assign_type = NET_ADDR_SET;
   8793	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
   8794	add_device_randomness(dev->dev_addr, dev->addr_len);
   8795	return 0;
   8796}
   8797EXPORT_SYMBOL(dev_set_mac_address);
   8798
   8799static DECLARE_RWSEM(dev_addr_sem);
   8800
   8801int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
   8802			     struct netlink_ext_ack *extack)
   8803{
   8804	int ret;
   8805
   8806	down_write(&dev_addr_sem);
   8807	ret = dev_set_mac_address(dev, sa, extack);
   8808	up_write(&dev_addr_sem);
   8809	return ret;
   8810}
   8811EXPORT_SYMBOL(dev_set_mac_address_user);
   8812
   8813int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
   8814{
   8815	size_t size = sizeof(sa->sa_data);
   8816	struct net_device *dev;
   8817	int ret = 0;
   8818
   8819	down_read(&dev_addr_sem);
   8820	rcu_read_lock();
   8821
   8822	dev = dev_get_by_name_rcu(net, dev_name);
   8823	if (!dev) {
   8824		ret = -ENODEV;
   8825		goto unlock;
   8826	}
   8827	if (!dev->addr_len)
   8828		memset(sa->sa_data, 0, size);
   8829	else
   8830		memcpy(sa->sa_data, dev->dev_addr,
   8831		       min_t(size_t, size, dev->addr_len));
   8832	sa->sa_family = dev->type;
   8833
   8834unlock:
   8835	rcu_read_unlock();
   8836	up_read(&dev_addr_sem);
   8837	return ret;
   8838}
   8839EXPORT_SYMBOL(dev_get_mac_address);
   8840
   8841/**
   8842 *	dev_change_carrier - Change device carrier
   8843 *	@dev: device
   8844 *	@new_carrier: new value
   8845 *
   8846 *	Change device carrier
   8847 */
   8848int dev_change_carrier(struct net_device *dev, bool new_carrier)
   8849{
   8850	const struct net_device_ops *ops = dev->netdev_ops;
   8851
   8852	if (!ops->ndo_change_carrier)
   8853		return -EOPNOTSUPP;
   8854	if (!netif_device_present(dev))
   8855		return -ENODEV;
   8856	return ops->ndo_change_carrier(dev, new_carrier);
   8857}
   8858
   8859/**
   8860 *	dev_get_phys_port_id - Get device physical port ID
   8861 *	@dev: device
   8862 *	@ppid: port ID
   8863 *
   8864 *	Get device physical port ID
   8865 */
   8866int dev_get_phys_port_id(struct net_device *dev,
   8867			 struct netdev_phys_item_id *ppid)
   8868{
   8869	const struct net_device_ops *ops = dev->netdev_ops;
   8870
   8871	if (!ops->ndo_get_phys_port_id)
   8872		return -EOPNOTSUPP;
   8873	return ops->ndo_get_phys_port_id(dev, ppid);
   8874}
   8875
   8876/**
   8877 *	dev_get_phys_port_name - Get device physical port name
   8878 *	@dev: device
   8879 *	@name: port name
   8880 *	@len: limit of bytes to copy to name
   8881 *
   8882 *	Get device physical port name
   8883 */
   8884int dev_get_phys_port_name(struct net_device *dev,
   8885			   char *name, size_t len)
   8886{
   8887	const struct net_device_ops *ops = dev->netdev_ops;
   8888	int err;
   8889
   8890	if (ops->ndo_get_phys_port_name) {
   8891		err = ops->ndo_get_phys_port_name(dev, name, len);
   8892		if (err != -EOPNOTSUPP)
   8893			return err;
   8894	}
   8895	return devlink_compat_phys_port_name_get(dev, name, len);
   8896}
   8897
   8898/**
   8899 *	dev_get_port_parent_id - Get the device's port parent identifier
   8900 *	@dev: network device
   8901 *	@ppid: pointer to a storage for the port's parent identifier
   8902 *	@recurse: allow/disallow recursion to lower devices
   8903 *
   8904 *	Get the devices's port parent identifier
   8905 */
   8906int dev_get_port_parent_id(struct net_device *dev,
   8907			   struct netdev_phys_item_id *ppid,
   8908			   bool recurse)
   8909{
   8910	const struct net_device_ops *ops = dev->netdev_ops;
   8911	struct netdev_phys_item_id first = { };
   8912	struct net_device *lower_dev;
   8913	struct list_head *iter;
   8914	int err;
   8915
   8916	if (ops->ndo_get_port_parent_id) {
   8917		err = ops->ndo_get_port_parent_id(dev, ppid);
   8918		if (err != -EOPNOTSUPP)
   8919			return err;
   8920	}
   8921
   8922	err = devlink_compat_switch_id_get(dev, ppid);
   8923	if (!recurse || err != -EOPNOTSUPP)
   8924		return err;
   8925
   8926	netdev_for_each_lower_dev(dev, lower_dev, iter) {
   8927		err = dev_get_port_parent_id(lower_dev, ppid, true);
   8928		if (err)
   8929			break;
   8930		if (!first.id_len)
   8931			first = *ppid;
   8932		else if (memcmp(&first, ppid, sizeof(*ppid)))
   8933			return -EOPNOTSUPP;
   8934	}
   8935
   8936	return err;
   8937}
   8938EXPORT_SYMBOL(dev_get_port_parent_id);
   8939
   8940/**
   8941 *	netdev_port_same_parent_id - Indicate if two network devices have
   8942 *	the same port parent identifier
   8943 *	@a: first network device
   8944 *	@b: second network device
   8945 */
   8946bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
   8947{
   8948	struct netdev_phys_item_id a_id = { };
   8949	struct netdev_phys_item_id b_id = { };
   8950
   8951	if (dev_get_port_parent_id(a, &a_id, true) ||
   8952	    dev_get_port_parent_id(b, &b_id, true))
   8953		return false;
   8954
   8955	return netdev_phys_item_id_same(&a_id, &b_id);
   8956}
   8957EXPORT_SYMBOL(netdev_port_same_parent_id);
   8958
   8959/**
   8960 *	dev_change_proto_down - set carrier according to proto_down.
   8961 *
   8962 *	@dev: device
   8963 *	@proto_down: new value
   8964 */
   8965int dev_change_proto_down(struct net_device *dev, bool proto_down)
   8966{
   8967	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
   8968		return -EOPNOTSUPP;
   8969	if (!netif_device_present(dev))
   8970		return -ENODEV;
   8971	if (proto_down)
   8972		netif_carrier_off(dev);
   8973	else
   8974		netif_carrier_on(dev);
   8975	dev->proto_down = proto_down;
   8976	return 0;
   8977}
   8978
   8979/**
   8980 *	dev_change_proto_down_reason - proto down reason
   8981 *
   8982 *	@dev: device
   8983 *	@mask: proto down mask
   8984 *	@value: proto down value
   8985 */
   8986void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
   8987				  u32 value)
   8988{
   8989	int b;
   8990
   8991	if (!mask) {
   8992		dev->proto_down_reason = value;
   8993	} else {
   8994		for_each_set_bit(b, &mask, 32) {
   8995			if (value & (1 << b))
   8996				dev->proto_down_reason |= BIT(b);
   8997			else
   8998				dev->proto_down_reason &= ~BIT(b);
   8999		}
   9000	}
   9001}
   9002
/* XDP-specific BPF link: the generic bpf_link plus the net device it is
 * attached to and the flags it was created with.
 */
struct bpf_xdp_link {
	struct bpf_link link;
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	int flags; /* passed as XDP flags when the link is attached/updated */
};
   9008
   9009static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
   9010{
   9011	if (flags & XDP_FLAGS_HW_MODE)
   9012		return XDP_MODE_HW;
   9013	if (flags & XDP_FLAGS_DRV_MODE)
   9014		return XDP_MODE_DRV;
   9015	if (flags & XDP_FLAGS_SKB_MODE)
   9016		return XDP_MODE_SKB;
   9017	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
   9018}
   9019
   9020static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
   9021{
   9022	switch (mode) {
   9023	case XDP_MODE_SKB:
   9024		return generic_xdp_install;
   9025	case XDP_MODE_DRV:
   9026	case XDP_MODE_HW:
   9027		return dev->netdev_ops->ndo_bpf;
   9028	default:
   9029		return NULL;
   9030	}
   9031}
   9032
   9033static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
   9034					 enum bpf_xdp_mode mode)
   9035{
   9036	return dev->xdp_state[mode].link;
   9037}
   9038
   9039static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
   9040				     enum bpf_xdp_mode mode)
   9041{
   9042	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
   9043
   9044	if (link)
   9045		return link->link.prog;
   9046	return dev->xdp_state[mode].prog;
   9047}
   9048
   9049u8 dev_xdp_prog_count(struct net_device *dev)
   9050{
   9051	u8 count = 0;
   9052	int i;
   9053
   9054	for (i = 0; i < __MAX_XDP_MODE; i++)
   9055		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
   9056			count++;
   9057	return count;
   9058}
   9059EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
   9060
   9061u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
   9062{
   9063	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
   9064
   9065	return prog ? prog->aux->id : 0;
   9066}
   9067
   9068static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
   9069			     struct bpf_xdp_link *link)
   9070{
   9071	dev->xdp_state[mode].link = link;
   9072	dev->xdp_state[mode].prog = NULL;
   9073}
   9074
   9075static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
   9076			     struct bpf_prog *prog)
   9077{
   9078	dev->xdp_state[mode].link = NULL;
   9079	dev->xdp_state[mode].prog = prog;
   9080}
   9081
   9082static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
   9083			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
   9084			   u32 flags, struct bpf_prog *prog)
   9085{
   9086	struct netdev_bpf xdp;
   9087	int err;
   9088
   9089	memset(&xdp, 0, sizeof(xdp));
   9090	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
   9091	xdp.extack = extack;
   9092	xdp.flags = flags;
   9093	xdp.prog = prog;
   9094
   9095	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
   9096	 * "moved" into driver), so they don't increment it on their own, but
   9097	 * they do decrement refcnt when program is detached or replaced.
   9098	 * Given net_device also owns link/prog, we need to bump refcnt here
   9099	 * to prevent drivers from underflowing it.
   9100	 */
   9101	if (prog)
   9102		bpf_prog_inc(prog);
   9103	err = bpf_op(dev, &xdp);
   9104	if (err) {
   9105		if (prog)
   9106			bpf_prog_put(prog);
   9107		return err;
   9108	}
   9109
   9110	if (mode != XDP_MODE_HW)
   9111		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
   9112
   9113	return 0;
   9114}
   9115
   9116static void dev_xdp_uninstall(struct net_device *dev)
   9117{
   9118	struct bpf_xdp_link *link;
   9119	struct bpf_prog *prog;
   9120	enum bpf_xdp_mode mode;
   9121	bpf_op_t bpf_op;
   9122
   9123	ASSERT_RTNL();
   9124
   9125	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
   9126		prog = dev_xdp_prog(dev, mode);
   9127		if (!prog)
   9128			continue;
   9129
   9130		bpf_op = dev_xdp_bpf_op(dev, mode);
   9131		if (!bpf_op)
   9132			continue;
   9133
   9134		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
   9135
   9136		/* auto-detach link from net device */
   9137		link = dev_xdp_link(dev, mode);
   9138		if (link)
   9139			link->dev = NULL;
   9140		else
   9141			bpf_prog_put(prog);
   9142
   9143		dev_xdp_set_link(dev, mode, NULL);
   9144	}
   9145}
   9146
/* Attach either a BPF link or a bare program to @dev, or detach the current
 * program when neither @link nor @new_prog is given.
 *
 * @dev: target device
 * @extack: netlink extended ack used for error messages (may be NULL)
 * @link: link to attach; mutually exclusive with @new_prog/@old_prog
 * @new_prog: program to attach, or NULL
 * @old_prog: program the caller expects to be attached, or NULL; requires
 *	      XDP_FLAGS_REPLACE in @flags
 * @flags: XDP flags; at most one mode bit may be set
 *
 * Must be called with RTNL held.  Returns 0 on success or a negative errno.
 * On success the reference on the previously attached program is dropped.
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
			  struct bpf_prog *old_prog, u32 flags)
{
	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
	struct bpf_prog *cur_prog;
	struct net_device *upper;
	struct list_head *iter;
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err;

	ASSERT_RTNL();

	/* either link or prog attachment, never both */
	if (link && (new_prog || old_prog))
		return -EINVAL;
	/* link supports only XDP mode flags */
	if (link && (flags & ~XDP_FLAGS_MODES)) {
		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
		return -EINVAL;
	}
	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
	if (num_modes > 1) {
		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
		return -EINVAL;
	}
	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
		NL_SET_ERR_MSG(extack,
			       "More than one program loaded, unset mode is ambiguous");
		return -EINVAL;
	}
	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
		return -EINVAL;
	}

	mode = dev_xdp_mode(dev, flags);
	/* can't replace attached link */
	if (dev_xdp_link(dev, mode)) {
		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
		return -EBUSY;
	}

	/* don't allow if an upper device already has a program */
	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		if (dev_xdp_prog_count(upper) > 0) {
			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
			return -EEXIST;
		}
	}

	cur_prog = dev_xdp_prog(dev, mode);
	/* can't replace attached prog with link */
	if (link && cur_prog) {
		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
		return -EBUSY;
	}
	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
		NL_SET_ERR_MSG(extack, "Active program does not match expected");
		return -EEXIST;
	}

	/* put effective new program into new_prog */
	if (link)
		new_prog = link->link.prog;

	/* the checks below only apply when something is being attached */
	if (new_prog) {
		bool offload = mode == XDP_MODE_HW;
		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
					       ? XDP_MODE_DRV : XDP_MODE_SKB;

		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
			NL_SET_ERR_MSG(extack, "XDP program already attached");
			return -EBUSY;
		}
		if (!offload && dev_xdp_prog(dev, other_mode)) {
			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
			return -EEXIST;
		}
		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
			return -EINVAL;
		}
	}

	/* don't call drivers if the effective program didn't change */
	if (new_prog != cur_prog) {
		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op) {
			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
			return -EOPNOTSUPP;
		}

		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
		if (err)
			return err;
	}

	/* record the new state; each setter clears the other slot */
	if (link)
		dev_xdp_set_link(dev, mode, link);
	else
		dev_xdp_set_prog(dev, mode, new_prog);
	/* drop the reference held on the program that was just replaced */
	if (cur_prog)
		bpf_prog_put(cur_prog);

	return 0;
}
   9265
   9266static int dev_xdp_attach_link(struct net_device *dev,
   9267			       struct netlink_ext_ack *extack,
   9268			       struct bpf_xdp_link *link)
   9269{
   9270	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
   9271}
   9272
   9273static int dev_xdp_detach_link(struct net_device *dev,
   9274			       struct netlink_ext_ack *extack,
   9275			       struct bpf_xdp_link *link)
   9276{
   9277	enum bpf_xdp_mode mode;
   9278	bpf_op_t bpf_op;
   9279
   9280	ASSERT_RTNL();
   9281
   9282	mode = dev_xdp_mode(dev, link->flags);
   9283	if (dev_xdp_link(dev, mode) != link)
   9284		return -EINVAL;
   9285
   9286	bpf_op = dev_xdp_bpf_op(dev, mode);
   9287	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
   9288	dev_xdp_set_link(dev, mode, NULL);
   9289	return 0;
   9290}
   9291
   9292static void bpf_xdp_link_release(struct bpf_link *link)
   9293{
   9294	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
   9295
   9296	rtnl_lock();
   9297
   9298	/* if racing with net_device's tear down, xdp_link->dev might be
   9299	 * already NULL, in which case link was already auto-detached
   9300	 */
   9301	if (xdp_link->dev) {
   9302		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
   9303		xdp_link->dev = NULL;
   9304	}
   9305
   9306	rtnl_unlock();
   9307}
   9308
   9309static int bpf_xdp_link_detach(struct bpf_link *link)
   9310{
   9311	bpf_xdp_link_release(link);
   9312	return 0;
   9313}
   9314
   9315static void bpf_xdp_link_dealloc(struct bpf_link *link)
   9316{
   9317	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
   9318
   9319	kfree(xdp_link);
   9320}
   9321
   9322static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
   9323				     struct seq_file *seq)
   9324{
   9325	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
   9326	u32 ifindex = 0;
   9327
   9328	rtnl_lock();
   9329	if (xdp_link->dev)
   9330		ifindex = xdp_link->dev->ifindex;
   9331	rtnl_unlock();
   9332
   9333	seq_printf(seq, "ifindex:\t%u\n", ifindex);
   9334}
   9335
   9336static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
   9337				       struct bpf_link_info *info)
   9338{
   9339	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
   9340	u32 ifindex = 0;
   9341
   9342	rtnl_lock();
   9343	if (xdp_link->dev)
   9344		ifindex = xdp_link->dev->ifindex;
   9345	rtnl_unlock();
   9346
   9347	info->xdp.ifindex = ifindex;
   9348	return 0;
   9349}
   9350
/* bpf_link_ops.update_prog: swap the program behind an attached XDP link.
 *
 * @link: link being updated
 * @new_prog: program to install
 * @old_prog: if non-NULL, the program the caller expects to be current
 *
 * Returns 0 on success, -ENOLINK if the link is no longer attached to a
 * device, -EPERM if @old_prog does not match the current program, -EINVAL
 * if the program type or expected attach type differs, or the error from
 * dev_xdp_install().
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
			       struct bpf_prog *old_prog)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err = 0;

	rtnl_lock();

	/* link might have been auto-released already, so fail */
	if (!xdp_link->dev) {
		err = -ENOLINK;
		goto out_unlock;
	}

	if (old_prog && link->prog != old_prog) {
		err = -EPERM;
		goto out_unlock;
	}
	/* from here on old_prog is the program currently behind the link */
	old_prog = link->prog;
	if (old_prog->type != new_prog->type ||
	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (old_prog == new_prog) {
		/* no-op, don't disturb drivers */
		bpf_prog_put(new_prog);
		goto out_unlock;
	}

	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
			      xdp_link->flags, new_prog);
	if (err)
		goto out_unlock;

	/* publish the new program and drop the reference on the old one */
	old_prog = xchg(&link->prog, new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	rtnl_unlock();
	return err;
}
   9398
/* Link operations installed on every XDP link by bpf_xdp_link_attach(). */
static const struct bpf_link_ops bpf_xdp_link_lops = {
	.release = bpf_xdp_link_release,
	.dealloc = bpf_xdp_link_dealloc,
	.detach = bpf_xdp_link_detach,
	.show_fdinfo = bpf_xdp_link_show_fdinfo,
	.fill_link_info = bpf_xdp_link_fill_link_info,
	.update_prog = bpf_xdp_link_update,
};
   9407
/* Create an XDP BPF link for @prog on the device named by
 * attr->link_create.target_ifindex in the caller's network namespace.
 *
 * Returns the value of bpf_link_settle() on success (NOTE(review):
 * presumably the new link's fd - confirm against bpf_link_settle()), or a
 * negative errno: -EINVAL if the device is not found, -ENOMEM if the link
 * cannot be allocated, or the error from bpf_link_prime() or
 * dev_xdp_attach_link().
 */
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct net *net = current->nsproxy->net_ns;
	struct bpf_link_primer link_primer;
	struct bpf_xdp_link *link;
	struct net_device *dev;
	int err, fd;

	rtnl_lock();
	/* takes a device reference; dropped with dev_put() on every path below */
	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
	if (!dev) {
		rtnl_unlock();
		return -EINVAL;
	}

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link) {
		err = -ENOMEM;
		goto unlock;
	}

	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
	link->dev = dev;
	link->flags = attr->link_create.flags;

	err = bpf_link_prime(&link->link, &link_primer);
	if (err) {
		kfree(link);
		goto unlock;
	}

	err = dev_xdp_attach_link(dev, NULL, link);
	rtnl_unlock();

	if (err) {
		/* clear dev before cleanup so the link is seen as not attached */
		link->dev = NULL;
		bpf_link_cleanup(&link_primer);
		goto out_put_dev;
	}

	fd = bpf_link_settle(&link_primer);
	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
	dev_put(dev);
	return fd;

unlock:
	rtnl_unlock();

out_put_dev:
	dev_put(dev);
	return err;
}
   9460
   9461/**
   9462 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
   9463 *	@dev: device
   9464 *	@extack: netlink extended ack
   9465 *	@fd: new program fd or negative value to clear
   9466 *	@expected_fd: old program fd that userspace expects to replace or clear
   9467 *	@flags: xdp-related flags
   9468 *
   9469 *	Set or clear a bpf program for a device
   9470 */
   9471int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
   9472		      int fd, int expected_fd, u32 flags)
   9473{
   9474	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
   9475	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
   9476	int err;
   9477
   9478	ASSERT_RTNL();
   9479
   9480	if (fd >= 0) {
   9481		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
   9482						 mode != XDP_MODE_SKB);
   9483		if (IS_ERR(new_prog))
   9484			return PTR_ERR(new_prog);
   9485	}
   9486
   9487	if (expected_fd >= 0) {
   9488		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
   9489						 mode != XDP_MODE_SKB);
   9490		if (IS_ERR(old_prog)) {
   9491			err = PTR_ERR(old_prog);
   9492			old_prog = NULL;
   9493			goto err_out;
   9494		}
   9495	}
   9496
   9497	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
   9498
   9499err_out:
   9500	if (err && new_prog)
   9501		bpf_prog_put(new_prog);
   9502	if (old_prog)
   9503		bpf_prog_put(old_prog);
   9504	return err;
   9505}
   9506
   9507/**
   9508 *	dev_new_index	-	allocate an ifindex
   9509 *	@net: the applicable net namespace
   9510 *
   9511 *	Returns a suitable unique value for a new device interface
   9512 *	number.  The caller must hold the rtnl semaphore or the
   9513 *	dev_base_lock to be sure it remains unique.
   9514 */
   9515static int dev_new_index(struct net *net)
   9516{
   9517	int ifindex = net->ifindex;
   9518
   9519	for (;;) {
   9520		if (++ifindex <= 0)
   9521			ifindex = 1;
   9522		if (!__dev_get_by_index(net, ifindex))
   9523			return net->ifindex = ifindex;
   9524	}
   9525}
   9526
/* Delayed registration/unregistration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
   9530
   9531static void net_set_todo(struct net_device *dev)
   9532{
   9533	list_add_tail(&dev->todo_list, &net_todo_list);
   9534	atomic_inc(&dev_net(dev)->dev_unreg_count);
   9535}
   9536
   9537static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
   9538	struct net_device *upper, netdev_features_t features)
   9539{
   9540	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
   9541	netdev_features_t feature;
   9542	int feature_bit;
   9543
   9544	for_each_netdev_feature(upper_disables, feature_bit) {
   9545		feature = __NETIF_F_BIT(feature_bit);
   9546		if (!(upper->wanted_features & feature)
   9547		    && (features & feature)) {
   9548			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
   9549				   &feature, upper->name);
   9550			features &= ~feature;
   9551		}
   9552	}
   9553
   9554	return features;
   9555}
   9556
   9557static void netdev_sync_lower_features(struct net_device *upper,
   9558	struct net_device *lower, netdev_features_t features)
   9559{
   9560	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
   9561	netdev_features_t feature;
   9562	int feature_bit;
   9563
   9564	for_each_netdev_feature(upper_disables, feature_bit) {
   9565		feature = __NETIF_F_BIT(feature_bit);
   9566		if (!(features & feature) && (lower->features & feature)) {
   9567			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
   9568				   &feature, lower->name);
   9569			lower->wanted_features &= ~feature;
   9570			__netdev_update_features(lower);
   9571
   9572			if (unlikely(lower->features & feature))
   9573				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
   9574					    &feature, lower->name);
   9575			else
   9576				netdev_features_change(lower);
   9577		}
   9578	}
   9579}
   9580
/* Return @features with every bit removed whose prerequisite feature is not
 * also set, or which conflicts with another requested feature.  @dev is used
 * for the log messages and for its gso_partial_features mask.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* IPv4 TSO needs either generic HW checksumming or IPv4 checksumming */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	/* IPv6 TSO needs either generic HW checksumming or IPv6 checksumming */
	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

	if (!(features & NETIF_F_RXCSUM)) {
		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
		 * successfully merged by hardware must also have the
		 * checksum verified by hardware.  If the user does not
		 * want to enable RXCSUM, logically, we should disable GRO_HW.
		 */
		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* LRO/HW-GRO features cannot be combined with RX-FCS */
	if (features & NETIF_F_RXFCS) {
		if (features & NETIF_F_LRO) {
			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_LRO;
		}

		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* HW-GRO and LRO are mutually exclusive; HW-GRO wins */
	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
		features &= ~NETIF_F_LRO;
	}

	/* TLS TX offload needs HW_CSUM, or both IP_CSUM and IPV6_CSUM */
	if (features & NETIF_F_HW_TLS_TX) {
		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		bool hw_csum = features & NETIF_F_HW_CSUM;

		if (!ip_csum && !hw_csum) {
			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
			features &= ~NETIF_F_HW_TLS_TX;
		}
	}

	/* TLS RX offload needs RXCSUM */
	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
		features &= ~NETIF_F_HW_TLS_RX;
	}

	return features;
}
   9680
/* Recompute @dev's feature set and apply it.
 *
 * The new set starts from netdev_get_wanted_features(), is filtered by the
 * driver's ndo_fix_features hook (if any), by netdev_fix_features() and by
 * every upper device, and is handed to ndo_set_features when it differs
 * from dev->features.  Features disabled here are then also disabled on
 * lower devices.
 *
 * Must be called with RTNL held.
 *
 * Returns -1 if ndo_set_features failed; otherwise 0 when err ends up
 * negative (features unchanged, or a vlan_get_rx_*_filter_info() error was
 * OR-ed into err) and 1 in all other cases.  netdev_update_features() sends
 * a change notification whenever the result is non-zero.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* stays -1 when nothing changed, which skips the update block below */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	/* only when ndo_set_features returned 0 (or the driver has no hook) */
	if (!err) {
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		/* same ordering as above: set features before "get", after "drop" */
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		dev->features = features;
	}

	return err < 0 ? 0 : 1;
}
   9772
   9773/**
   9774 *	netdev_update_features - recalculate device features
   9775 *	@dev: the device to check
   9776 *
   9777 *	Recalculate dev->features set and send notifications if it
   9778 *	has changed. Should be called after driver or hardware dependent
   9779 *	conditions might have changed that influence the features.
   9780 */
   9781void netdev_update_features(struct net_device *dev)
   9782{
   9783	if (__netdev_update_features(dev))
   9784		netdev_features_change(dev);
   9785}
   9786EXPORT_SYMBOL(netdev_update_features);
   9787
   9788/**
   9789 *	netdev_change_features - recalculate device features
   9790 *	@dev: the device to check
   9791 *
   9792 *	Recalculate dev->features set and send notifications even
   9793 *	if they have not changed. Should be called instead of
   9794 *	netdev_update_features() if also dev->vlan_features might
   9795 *	have changed to allow the changes to be propagated to stacked
   9796 *	VLAN devices.
   9797 */
   9798void netdev_change_features(struct net_device *dev)
   9799{
   9800	__netdev_update_features(dev);
   9801	netdev_features_change(dev);
   9802}
   9803EXPORT_SYMBOL(netdev_change_features);
   9804
   9805/**
   9806 *	netif_stacked_transfer_operstate -	transfer operstate
   9807 *	@rootdev: the root or lower level device to transfer state from
   9808 *	@dev: the device to transfer operstate to
   9809 *
   9810 *	Transfer operational state from root to device. This is normally
   9811 *	called when a stacking relationship exists between the root
   9812 *	device and the device(a leaf device).
   9813 */
   9814void netif_stacked_transfer_operstate(const struct net_device *rootdev,
   9815					struct net_device *dev)
   9816{
   9817	if (rootdev->operstate == IF_OPER_DORMANT)
   9818		netif_dormant_on(dev);
   9819	else
   9820		netif_dormant_off(dev);
   9821
   9822	if (rootdev->operstate == IF_OPER_TESTING)
   9823		netif_testing_on(dev);
   9824	else
   9825		netif_testing_off(dev);
   9826
   9827	if (netif_carrier_ok(rootdev))
   9828		netif_carrier_on(dev);
   9829	else
   9830		netif_carrier_off(dev);
   9831}
   9832EXPORT_SYMBOL(netif_stacked_transfer_operstate);
   9833
   9834static int netif_alloc_rx_queues(struct net_device *dev)
   9835{
   9836	unsigned int i, count = dev->num_rx_queues;
   9837	struct netdev_rx_queue *rx;
   9838	size_t sz = count * sizeof(*rx);
   9839	int err = 0;
   9840
   9841	BUG_ON(count < 1);
   9842
   9843	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
   9844	if (!rx)
   9845		return -ENOMEM;
   9846
   9847	dev->_rx = rx;
   9848
   9849	for (i = 0; i < count; i++) {
   9850		rx[i].dev = dev;
   9851
   9852		/* XDP RX-queue setup */
   9853		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
   9854		if (err < 0)
   9855			goto err_rxq_info;
   9856	}
   9857	return 0;
   9858
   9859err_rxq_info:
   9860	/* Rollback successful reg's and free other resources */
   9861	while (i--)
   9862		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
   9863	kvfree(dev->_rx);
   9864	dev->_rx = NULL;
   9865	return err;
   9866}
   9867
   9868static void netif_free_rx_queues(struct net_device *dev)
   9869{
   9870	unsigned int i, count = dev->num_rx_queues;
   9871
   9872	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
   9873	if (!dev->_rx)
   9874		return;
   9875
   9876	for (i = 0; i < count; i++)
   9877		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
   9878
   9879	kvfree(dev->_rx);
   9880}
   9881
   9882static void netdev_init_one_queue(struct net_device *dev,
   9883				  struct netdev_queue *queue, void *_unused)
   9884{
   9885	/* Initialize queue lock */
   9886	spin_lock_init(&queue->_xmit_lock);
   9887	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
   9888	queue->xmit_lock_owner = -1;
   9889	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
   9890	queue->dev = dev;
   9891#ifdef CONFIG_BQL
   9892	dql_init(&queue->dql, HZ);
   9893#endif
   9894}
   9895
   9896static void netif_free_tx_queues(struct net_device *dev)
   9897{
   9898	kvfree(dev->_tx);
   9899}
   9900
   9901static int netif_alloc_netdev_queues(struct net_device *dev)
   9902{
   9903	unsigned int count = dev->num_tx_queues;
   9904	struct netdev_queue *tx;
   9905	size_t sz = count * sizeof(*tx);
   9906
   9907	if (count < 1 || count > 0xffff)
   9908		return -EINVAL;
   9909
   9910	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
   9911	if (!tx)
   9912		return -ENOMEM;
   9913
   9914	dev->_tx = tx;
   9915
   9916	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
   9917	spin_lock_init(&dev->tx_global_lock);
   9918
   9919	return 0;
   9920}
   9921
   9922void netif_tx_stop_all_queues(struct net_device *dev)
   9923{
   9924	unsigned int i;
   9925
   9926	for (i = 0; i < dev->num_tx_queues; i++) {
   9927		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
   9928
   9929		netif_tx_stop_queue(txq);
   9930	}
   9931}
   9932EXPORT_SYMBOL(netif_tx_stop_all_queues);
   9933
   9934/**
   9935 * register_netdevice() - register a network device
   9936 * @dev: device to register
   9937 *
   9938 * Take a prepared network device structure and make it externally accessible.
   9939 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
   9940 * Callers must hold the rtnl lock - you may want register_netdev()
   9941 * instead of this.
        *
        * Return: 0 on success or a negative errno. Failures that occur after
        * ndo_init() has succeeded but before the %NETDEV_REGISTER notification
        * unwind through ndo_uninit() and priv_destructor(). If a
        * %NETDEV_REGISTER notifier fails, the device is unregistered again and
        * the caller is expected to call free_netdev() itself.
   9942 */
   9943int register_netdevice(struct net_device *dev)
   9944{
   9945	int ret;
   9946	struct net *net = dev_net(dev);
   9947
   9948	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
   9949		     NETDEV_FEATURE_COUNT);
   9950	BUG_ON(dev_boot_phase);
   9951	ASSERT_RTNL();
   9952
   9953	might_sleep();
   9954
   9955	/* When net_device's are persistent, this will be fatal. */
   9956	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
   9957	BUG_ON(!net);
   9958
	/* Nothing has been set up yet, so a bad ethtool_ops can return directly */
   9959	ret = ethtool_check_ops(dev->ethtool_ops);
   9960	if (ret)
   9961		return ret;
   9962
   9963	spin_lock_init(&dev->addr_list_lock);
   9964	netdev_set_addr_lockdep_class(dev);
   9965
   9966	ret = dev_get_valid_name(net, dev, dev->name);
   9967	if (ret < 0)
   9968		goto out;
   9969
	/* Preset the error code for the name node allocation below */
   9970	ret = -ENOMEM;
   9971	dev->name_node = netdev_name_node_head_alloc(dev);
   9972	if (!dev->name_node)
   9973		goto out;
   9974
   9975	/* Init, if this function is available */
   9976	if (dev->netdev_ops->ndo_init) {
   9977		ret = dev->netdev_ops->ndo_init(dev);
   9978		if (ret) {
			/* A positive return from ndo_init() is reported as -EIO */
   9979			if (ret > 0)
   9980				ret = -EIO;
			/* ndo_init() failed, so ndo_uninit() is not called */
   9981			goto err_free_name;
   9982		}
   9983	}
   9984
	/* Advertising CTAG filtering requires both the add and kill VID hooks */
   9985	if (((dev->hw_features | dev->features) &
   9986	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
   9987	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
   9988	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
   9989		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
   9990		ret = -EINVAL;
   9991		goto err_uninit;
   9992	}
   9993
	/* Allocate an ifindex when none was preset; a preset one that
	 * __dev_get_by_index() already finds in @net fails with -EBUSY.
	 */
   9994	ret = -EBUSY;
   9995	if (!dev->ifindex)
   9996		dev->ifindex = dev_new_index(net);
   9997	else if (__dev_get_by_index(net, dev->ifindex))
   9998		goto err_uninit;
   9999
  10000	/* Transfer changeable features to wanted_features and enable
  10001	 * software offloads (GSO and GRO).
  10002	 */
  10003	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
  10004	dev->features |= NETIF_F_SOFT_FEATURES;
  10005
  10006	if (dev->udp_tunnel_nic_info) {
  10007		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
  10008		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
  10009	}
  10010
  10011	dev->wanted_features = dev->features & dev->hw_features;
  10012
	/* Added after wanted_features was computed, so it is offered in
	 * hw_features without being part of the wanted set.
	 */
  10013	if (!(dev->flags & IFF_LOOPBACK))
  10014		dev->hw_features |= NETIF_F_NOCACHE_COPY;
  10015
  10016	/* If IPv4 TCP segmentation offload is supported we should also
  10017	 * allow the device to enable segmenting the frame with the option
  10018	 * of ignoring a static IP ID value.  This doesn't enable the
  10019	 * feature itself but allows the user to enable it later.
  10020	 */
  10021	if (dev->hw_features & NETIF_F_TSO)
  10022		dev->hw_features |= NETIF_F_TSO_MANGLEID;
  10023	if (dev->vlan_features & NETIF_F_TSO)
  10024		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
  10025	if (dev->mpls_features & NETIF_F_TSO)
  10026		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
  10027	if (dev->hw_enc_features & NETIF_F_TSO)
  10028		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
  10029
  10030	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
  10031	 */
  10032	dev->vlan_features |= NETIF_F_HIGHDMA;
  10033
  10034	/* Make NETIF_F_SG inheritable to tunnel devices.
  10035	 */
  10036	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
  10037
  10038	/* Make NETIF_F_SG inheritable to MPLS.
  10039	 */
  10040	dev->mpls_features |= NETIF_F_SG;
  10041
	/* Give NETDEV_POST_INIT notifiers a chance to veto the registration */
  10042	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
  10043	ret = notifier_to_errno(ret);
  10044	if (ret)
  10045		goto err_uninit;
  10046
	/* reg_state is written under dev_base_lock; a kobject registration
	 * failure leaves the device in NETREG_UNREGISTERED before unwinding.
	 */
  10047	ret = netdev_register_kobject(dev);
  10048	write_lock(&dev_base_lock);
  10049	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
  10050	write_unlock(&dev_base_lock);
  10051	if (ret)
  10052		goto err_uninit;
  10053
  10054	__netdev_update_features(dev);
  10055
  10056	/*
  10057	 *	Default initial state at registry is that the
  10058	 *	device is present.
  10059	 */
  10060
  10061	set_bit(__LINK_STATE_PRESENT, &dev->state);
  10062
  10063	linkwatch_init_dev(dev);
  10064
  10065	dev_init_scheduler(dev);
  10066
	/* Take the tracked "registered" reference, then add the device to the lists */
  10067	dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
  10068	list_netdevice(dev);
  10069
  10070	add_device_randomness(dev->dev_addr, dev->addr_len);
  10071
  10072	/* If the device has permanent device address, driver should
  10073	 * set dev_addr and also addr_assign_type should be set to
  10074	 * NET_ADDR_PERM (default value).
  10075	 */
  10076	if (dev->addr_assign_type == NET_ADDR_PERM)
  10077		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
  10078
  10079	/* Notify protocols, that a new device appeared. */
  10080	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
  10081	ret = notifier_to_errno(ret);
  10082	if (ret) {
		/* The device is already listed, so it has to go through the
		 * regular unregister path instead of the err_* unwind below.
		 */
  10083		/* Expect explicit free_netdev() on failure */
  10084		dev->needs_free_netdev = false;
  10085		unregister_netdevice_queue(dev, NULL);
  10086		goto out;
  10087	}
  10088	/*
  10089	 *	Prevent userspace races by waiting until the network
  10090	 *	device is fully setup before sending notifications.
  10091	 */
  10092	if (!dev->rtnl_link_ops ||
  10093	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
  10094		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
  10095
  10096out:
  10097	return ret;
  10098
	/* Undo a successful ndo_init() and run the driver's private destructor,
	 * then fall through to release the name node.
	 */
  10099err_uninit:
  10100	if (dev->netdev_ops->ndo_uninit)
  10101		dev->netdev_ops->ndo_uninit(dev);
  10102	if (dev->priv_destructor)
  10103		dev->priv_destructor(dev);
  10104err_free_name:
  10105	netdev_name_node_free(dev->name_node);
  10106	goto out;
  10107}
  10108EXPORT_SYMBOL(register_netdevice);
  10109
  10110/**
  10111 *	init_dummy_netdev	- init a dummy network device for NAPI
  10112 *	@dev: device to init
  10113 *
  10114 *	This takes a network device structure and initialize the minimum
  10115 *	amount of fields so it can be used to schedule NAPI polls without
  10116 *	registering a full blown interface. This is to be used by drivers
  10117 *	that need to tie several hardware interfaces to a single NAPI
  10118 *	poll scheduler due to HW limitations.
  10119 */
  10120int init_dummy_netdev(struct net_device *dev)
  10121{
  10122	/* Clear everything. Note we don't initialize spinlocks
  10123	 * are they aren't supposed to be taken by any of the
  10124	 * NAPI code and this dummy netdev is supposed to be
  10125	 * only ever used for NAPI polls
  10126	 */
  10127	memset(dev, 0, sizeof(struct net_device));
  10128
  10129	/* make sure we BUG if trying to hit standard
  10130	 * register/unregister code path
  10131	 */
  10132	dev->reg_state = NETREG_DUMMY;
  10133
  10134	/* NAPI wants this */
  10135	INIT_LIST_HEAD(&dev->napi_list);
  10136
  10137	/* a dummy interface is started by default */
  10138	set_bit(__LINK_STATE_PRESENT, &dev->state);
  10139	set_bit(__LINK_STATE_START, &dev->state);
  10140
  10141	/* napi_busy_loop stats accounting wants this */
  10142	dev_net_set(dev, &init_net);
  10143
  10144	/* Note : We dont allocate pcpu_refcnt for dummy devices,
  10145	 * because users of this 'device' dont need to change
  10146	 * its refcount.
  10147	 */
  10148
  10149	return 0;
  10150}
  10151EXPORT_SYMBOL_GPL(init_dummy_netdev);
  10152
  10153
  10154/**
  10155 *	register_netdev	- register a network device
  10156 *	@dev: device to register
  10157 *
  10158 *	Take a completed network device structure and add it to the kernel
  10159 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
  10160 *	chain. 0 is returned on success. A negative errno code is returned
  10161 *	on a failure to set up the device, or if the name is a duplicate.
  10162 *
  10163 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
  10164 *	and expands the device name if you passed a format string to
  10165 *	alloc_netdev.
  10166 */
  10167int register_netdev(struct net_device *dev)
  10168{
  10169	int err;
  10170
  10171	if (rtnl_lock_killable())
  10172		return -EINTR;
  10173	err = register_netdevice(dev);
  10174	rtnl_unlock();
  10175	return err;
  10176}
  10177EXPORT_SYMBOL(register_netdev);
  10178
  10179int netdev_refcnt_read(const struct net_device *dev)
  10180{
  10181#ifdef CONFIG_PCPU_DEV_REFCNT
  10182	int i, refcnt = 0;
  10183
  10184	for_each_possible_cpu(i)
  10185		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
  10186	return refcnt;
  10187#else
  10188	return refcount_read(&dev->dev_refcnt);
  10189#endif
  10190}
  10191EXPORT_SYMBOL(netdev_refcnt_read);
  10192
  10193int netdev_unregister_timeout_secs __read_mostly = 10;
  10194
  10195#define WAIT_REFS_MIN_MSECS 1
  10196#define WAIT_REFS_MAX_MSECS 250
  10197/**
  10198 * netdev_wait_allrefs_any - wait until all references are gone.
  10199 * @list: list of net_devices to wait on
  10200 *
  10201 * This is called when unregistering network devices.
  10202 *
  10203 * Any protocol or device that holds a reference should register
  10204 * for netdevice notification, and cleanup and put back the
  10205 * reference if they receive an UNREGISTER event.
  10206 * We can get stuck here if buggy protocols don't correctly
  10207 * call dev_put.
  10208 */
  10209static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
  10210{
  10211	unsigned long rebroadcast_time, warning_time;
  10212	struct net_device *dev;
  10213	int wait = 0;
  10214
  10215	rebroadcast_time = warning_time = jiffies;
  10216
  10217	list_for_each_entry(dev, list, todo_list)
  10218		if (netdev_refcnt_read(dev) == 1)
  10219			return dev;
  10220
  10221	while (true) {
  10222		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
  10223			rtnl_lock();
  10224
  10225			/* Rebroadcast unregister notification */
  10226			list_for_each_entry(dev, list, todo_list)
  10227				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
  10228
  10229			__rtnl_unlock();
  10230			rcu_barrier();
  10231			rtnl_lock();
  10232
  10233			list_for_each_entry(dev, list, todo_list)
  10234				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
  10235					     &dev->state)) {
  10236					/* We must not have linkwatch events
  10237					 * pending on unregister. If this
  10238					 * happens, we simply run the queue
  10239					 * unscheduled, resulting in a noop
  10240					 * for this device.
  10241					 */
  10242					linkwatch_run_queue();
  10243					break;
  10244				}
  10245
  10246			__rtnl_unlock();
  10247
  10248			rebroadcast_time = jiffies;
  10249		}
  10250
  10251		if (!wait) {
  10252			rcu_barrier();
  10253			wait = WAIT_REFS_MIN_MSECS;
  10254		} else {
  10255			msleep(wait);
  10256			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
  10257		}
  10258
  10259		list_for_each_entry(dev, list, todo_list)
  10260			if (netdev_refcnt_read(dev) == 1)
  10261				return dev;
  10262
  10263		if (time_after(jiffies, warning_time +
  10264			       netdev_unregister_timeout_secs * HZ)) {
  10265			list_for_each_entry(dev, list, todo_list) {
  10266				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
  10267					 dev->name, netdev_refcnt_read(dev));
  10268				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
  10269			}
  10270
  10271			warning_time = jiffies;
  10272		}
  10273	}
  10274}
  10275
  10276/* The sequence is:
  10277 *
  10278 *	rtnl_lock();
  10279 *	...
  10280 *	register_netdevice(x1);
  10281 *	register_netdevice(x2);
  10282 *	...
  10283 *	unregister_netdevice(y1);
  10284 *	unregister_netdevice(y2);
  10285 *      ...
  10286 *	rtnl_unlock();
  10287 *	free_netdev(y1);
  10288 *	free_netdev(y2);
  10289 *
  10290 * We are invoked by rtnl_unlock().
  10291 * This allows us to deal with problems:
  10292 * 1) We can delete sysfs objects which invoke hotplug
  10293 *    without deadlocking with linkwatch via keventd.
  10294 * 2) Since we run with the RTNL semaphore not held, we can sleep
  10295 *    safely in order to wait for the netdev refcnt to drop to zero.
  10296 *
  10297 * We must not return until all unregister events added during
  10298 * the interval the lock was held have been completed.
        *
        * The function itself drops the RTNL via __rtnl_unlock() once it has
        * taken a private snapshot of net_todo_list.
  10299 */
  10300void netdev_run_todo(void)
  10301{
  10302	struct net_device *dev, *tmp;
  10303	struct list_head list;
  10304#ifdef CONFIG_LOCKDEP
  10305	struct list_head unlink_list;
  10306
	/* Drain net_unlink_list and recompute nested_level from lower_level
	 * for every device on it. The loop-local dev shadows the outer one.
	 */
  10307	list_replace_init(&net_unlink_list, &unlink_list);
  10308
  10309	while (!list_empty(&unlink_list)) {
  10310		struct net_device *dev = list_first_entry(&unlink_list,
  10311							  struct net_device,
  10312							  unlink_list);
  10313		list_del_init(&dev->unlink_list);
  10314		dev->nested_level = dev->lower_level - 1;
  10315	}
  10316#endif
  10317
  10318	/* Snapshot list, allow later requests */
  10319	list_replace_init(&net_todo_list, &list);
  10320
  10321	__rtnl_unlock();
  10322
  10323	/* Wait for rcu callbacks to finish before next phase */
  10324	if (!list_empty(&list))
  10325		rcu_barrier();
  10326
	/* Phase 1: every queued device must be NETREG_UNREGISTERING. Move it
	 * to NETREG_UNREGISTERED (under dev_base_lock) and call
	 * linkwatch_forget_dev(); anything in another state is warned about
	 * and dropped from the list.
	 */
  10327	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
  10328		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
  10329			netdev_WARN(dev, "run_todo but not unregistering\n");
  10330			list_del(&dev->todo_list);
  10331			continue;
  10332		}
  10333
  10334		write_lock(&dev_base_lock);
  10335		dev->reg_state = NETREG_UNREGISTERED;
  10336		write_unlock(&dev_base_lock);
  10337		linkwatch_forget_dev(dev);
  10338	}
  10339
	/* Phase 2: take devices off the list one at a time, each as soon as
	 * its reference count has dropped to 1, and tear it down.
	 */
  10340	while (!list_empty(&list)) {
  10341		dev = netdev_wait_allrefs_any(&list);
  10342		list_del(&dev->todo_list);
  10343
  10344		/* paranoia */
  10345		BUG_ON(netdev_refcnt_read(dev) != 1);
  10346		BUG_ON(!list_empty(&dev->ptype_all));
  10347		BUG_ON(!list_empty(&dev->ptype_specific));
  10348		WARN_ON(rcu_access_pointer(dev->ip_ptr));
  10349		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
  10350#if IS_ENABLED(CONFIG_DECNET)
  10351		WARN_ON(dev->dn_ptr);
  10352#endif
  10353		if (dev->priv_destructor)
  10354			dev->priv_destructor(dev);
		/* NOTE(review): dev is still dereferenced after free_netdev().
		 * For a NETREG_UNREGISTERED device free_netdev() only does a
		 * put_device(); presumably the kobject_put() at the end of this
		 * iteration drops the last reference - confirm.
		 */
  10355		if (dev->needs_free_netdev)
  10356			free_netdev(dev);
  10357
		/* Wake waiters on netdev_unregistering_wq once the netns has
		 * no devices left in the unregister pipeline.
		 */
  10358		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
  10359			wake_up(&netdev_unregistering_wq);
  10360
  10361		/* Free network device */
  10362		kobject_put(&dev->dev.kobj);
  10363	}
  10364}
  10365
  10366/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
  10367 * all the same fields in the same order as net_device_stats, with only
  10368 * the type differing, but rtnl_link_stats64 may have additional fields
  10369 * at the end for newer counters.
  10370 */
  10371void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
  10372			     const struct net_device_stats *netdev_stats)
  10373{
  10374#if BITS_PER_LONG == 64
  10375	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
  10376	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
  10377	/* zero out counters that only exist in rtnl_link_stats64 */
  10378	memset((char *)stats64 + sizeof(*netdev_stats), 0,
  10379	       sizeof(*stats64) - sizeof(*netdev_stats));
  10380#else
  10381	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
  10382	const unsigned long *src = (const unsigned long *)netdev_stats;
  10383	u64 *dst = (u64 *)stats64;
  10384
  10385	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
  10386	for (i = 0; i < n; i++)
  10387		dst[i] = src[i];
  10388	/* zero out counters that only exist in rtnl_link_stats64 */
  10389	memset((char *)stats64 + n * sizeof(u64), 0,
  10390	       sizeof(*stats64) - n * sizeof(u64));
  10391#endif
  10392}
  10393EXPORT_SYMBOL(netdev_stats_to_stats64);
  10394
  10395struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
  10396{
  10397	struct net_device_core_stats __percpu *p;
  10398
  10399	p = alloc_percpu_gfp(struct net_device_core_stats,
  10400			     GFP_ATOMIC | __GFP_NOWARN);
  10401
  10402	if (p && cmpxchg(&dev->core_stats, NULL, p))
  10403		free_percpu(p);
  10404
  10405	/* This READ_ONCE() pairs with the cmpxchg() above */
  10406	return READ_ONCE(dev->core_stats);
  10407}
  10408EXPORT_SYMBOL(netdev_core_stats_alloc);
  10409
  10410/**
  10411 *	dev_get_stats	- get network device statistics
  10412 *	@dev: device to get statistics from
  10413 *	@storage: place to store stats
  10414 *
  10415 *	Get network statistics from device. Return @storage.
  10416 *	The device driver may provide its own method by setting
  10417 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
  10418 *	otherwise the internal statistics structure is used.
  10419 */
  10420struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
  10421					struct rtnl_link_stats64 *storage)
  10422{
  10423	const struct net_device_ops *ops = dev->netdev_ops;
  10424	const struct net_device_core_stats __percpu *p;
  10425
  10426	if (ops->ndo_get_stats64) {
  10427		memset(storage, 0, sizeof(*storage));
  10428		ops->ndo_get_stats64(dev, storage);
  10429	} else if (ops->ndo_get_stats) {
  10430		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
  10431	} else {
  10432		netdev_stats_to_stats64(storage, &dev->stats);
  10433	}
  10434
  10435	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
  10436	p = READ_ONCE(dev->core_stats);
  10437	if (p) {
  10438		const struct net_device_core_stats *core_stats;
  10439		int i;
  10440
  10441		for_each_possible_cpu(i) {
  10442			core_stats = per_cpu_ptr(p, i);
  10443			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
  10444			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
  10445			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
  10446			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
  10447		}
  10448	}
  10449	return storage;
  10450}
  10451EXPORT_SYMBOL(dev_get_stats);
  10452
  10453/**
  10454 *	dev_fetch_sw_netstats - get per-cpu network device statistics
  10455 *	@s: place to store stats
  10456 *	@netstats: per-cpu network stats to read from
  10457 *
  10458 *	Read per-cpu network statistics and populate the related fields in @s.
  10459 */
  10460void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
  10461			   const struct pcpu_sw_netstats __percpu *netstats)
  10462{
  10463	int cpu;
  10464
  10465	for_each_possible_cpu(cpu) {
  10466		const struct pcpu_sw_netstats *stats;
  10467		struct pcpu_sw_netstats tmp;
  10468		unsigned int start;
  10469
  10470		stats = per_cpu_ptr(netstats, cpu);
  10471		do {
  10472			start = u64_stats_fetch_begin_irq(&stats->syncp);
  10473			tmp.rx_packets = stats->rx_packets;
  10474			tmp.rx_bytes   = stats->rx_bytes;
  10475			tmp.tx_packets = stats->tx_packets;
  10476			tmp.tx_bytes   = stats->tx_bytes;
  10477		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
  10478
  10479		s->rx_packets += tmp.rx_packets;
  10480		s->rx_bytes   += tmp.rx_bytes;
  10481		s->tx_packets += tmp.tx_packets;
  10482		s->tx_bytes   += tmp.tx_bytes;
  10483	}
  10484}
  10485EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
  10486
  10487/**
  10488 *	dev_get_tstats64 - ndo_get_stats64 implementation
  10489 *	@dev: device to get statistics from
  10490 *	@s: place to store stats
  10491 *
  10492 *	Populate @s from dev->stats and dev->tstats. Can be used as
  10493 *	ndo_get_stats64() callback.
  10494 */
  10495void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
  10496{
  10497	netdev_stats_to_stats64(s, &dev->stats);
  10498	dev_fetch_sw_netstats(s, dev->tstats);
  10499}
  10500EXPORT_SYMBOL_GPL(dev_get_tstats64);
  10501
  10502struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
  10503{
  10504	struct netdev_queue *queue = dev_ingress_queue(dev);
  10505
  10506#ifdef CONFIG_NET_CLS_ACT
  10507	if (queue)
  10508		return queue;
  10509	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
  10510	if (!queue)
  10511		return NULL;
  10512	netdev_init_one_queue(dev, queue, NULL);
  10513	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
  10514	queue->qdisc_sleeping = &noop_qdisc;
  10515	rcu_assign_pointer(dev->ingress_queue, queue);
  10516#endif
  10517	return queue;
  10518}
  10519
  10520static const struct ethtool_ops default_ethtool_ops;
  10521
  10522void netdev_set_default_ethtool_ops(struct net_device *dev,
  10523				    const struct ethtool_ops *ops)
  10524{
  10525	if (dev->ethtool_ops == &default_ethtool_ops)
  10526		dev->ethtool_ops = ops;
  10527}
  10528EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
  10529
  10530void netdev_freemem(struct net_device *dev)
  10531{
  10532	char *addr = (char *)dev - dev->padded;
  10533
  10534	kvfree(addr);
  10535}
  10536
  10537/**
  10538 * alloc_netdev_mqs - allocate network device
  10539 * @sizeof_priv: size of private data to allocate space for
  10540 * @name: device name format string
  10541 * @name_assign_type: origin of device name
  10542 * @setup: callback to initialize device
  10543 * @txqs: the number of TX subqueues to allocate
  10544 * @rxqs: the number of RX subqueues to allocate
  10545 *
  10546 * Allocates a struct net_device with private data area for driver use
  10547 * and performs basic initialization.  Also allocates subqueue structs
  10548 * for each queue on the device.
        *
        * @name must be shorter than sizeof(dev->name); a longer name trips a
        * BUG_ON().
        *
        * Return: the new device, or NULL if @txqs or @rxqs is zero or if any
        * step of the allocation/initialization fails.
  10549 */
  10550struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
  10551		unsigned char name_assign_type,
  10552		void (*setup)(struct net_device *),
  10553		unsigned int txqs, unsigned int rxqs)
  10554{
  10555	struct net_device *dev;
  10556	unsigned int alloc_size;
  10557	struct net_device *p;
  10558
	/* Guarantees that the strcpy() into dev->name further down fits */
  10559	BUG_ON(strlen(name) >= sizeof(dev->name));
  10560
  10561	if (txqs < 1) {
  10562		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
  10563		return NULL;
  10564	}
  10565
  10566	if (rxqs < 1) {
  10567		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
  10568		return NULL;
  10569	}
  10570
  10571	alloc_size = sizeof(struct net_device);
  10572	if (sizeof_priv) {
  10573		/* ensure 32-byte alignment of private area */
  10574		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
  10575		alloc_size += sizeof_priv;
  10576	}
  10577	/* ensure 32-byte alignment of whole construct */
  10578	alloc_size += NETDEV_ALIGN - 1;
  10579
  10580	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
  10581	if (!p)
  10582		return NULL;
  10583
	/* dev is the aligned pointer inside the raw allocation p; the offset
	 * is kept in dev->padded so netdev_freemem() can recover p.
	 */
  10584	dev = PTR_ALIGN(p, NETDEV_ALIGN);
  10585	dev->padded = (char *)dev - (char *)p;
  10586
  10587	ref_tracker_dir_init(&dev->refcnt_tracker, 128);
	/* Either way the device starts out holding one reference */
  10588#ifdef CONFIG_PCPU_DEV_REFCNT
  10589	dev->pcpu_refcnt = alloc_percpu(int);
  10590	if (!dev->pcpu_refcnt)
  10591		goto free_dev;
  10592	__dev_hold(dev);
  10593#else
  10594	refcount_set(&dev->dev_refcnt, 1);
  10595#endif
  10596
  10597	if (dev_addr_init(dev))
  10598		goto free_pcpu;
  10599
  10600	dev_mc_init(dev);
  10601	dev_uc_init(dev);
  10602
  10603	dev_net_set(dev, &init_net);
  10604
  10605	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
  10606	dev->gso_max_segs = GSO_MAX_SEGS;
  10607	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
  10608	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
  10609	dev->tso_max_segs = TSO_MAX_SEGS;
  10610	dev->upper_level = 1;
  10611	dev->lower_level = 1;
  10612#ifdef CONFIG_LOCKDEP
  10613	dev->nested_level = 0;
  10614	INIT_LIST_HEAD(&dev->unlink_list);
  10615#endif
  10616
  10617	INIT_LIST_HEAD(&dev->napi_list);
  10618	INIT_LIST_HEAD(&dev->unreg_list);
  10619	INIT_LIST_HEAD(&dev->close_list);
  10620	INIT_LIST_HEAD(&dev->link_watch_list);
  10621	INIT_LIST_HEAD(&dev->adj_list.upper);
  10622	INIT_LIST_HEAD(&dev->adj_list.lower);
  10623	INIT_LIST_HEAD(&dev->ptype_all);
  10624	INIT_LIST_HEAD(&dev->ptype_specific);
  10625	INIT_LIST_HEAD(&dev->net_notifier_list);
  10626#ifdef CONFIG_NET_SCHED
  10627	hash_init(dev->qdisc_hash);
  10628#endif
  10629	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* Caller-supplied initialization; the defaults applied below
	 * (tx_queue_len, ethtool_ops) only kick in for fields it left unset.
	 */
  10630	setup(dev);
  10631
	/* A zero tx_queue_len after setup() marks the device IFF_NO_QUEUE
	 * and falls back to DEFAULT_TX_QUEUE_LEN.
	 */
  10632	if (!dev->tx_queue_len) {
  10633		dev->priv_flags |= IFF_NO_QUEUE;
  10634		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
  10635	}
  10636
  10637	dev->num_tx_queues = txqs;
  10638	dev->real_num_tx_queues = txqs;
  10639	if (netif_alloc_netdev_queues(dev))
  10640		goto free_all;
  10641
  10642	dev->num_rx_queues = rxqs;
  10643	dev->real_num_rx_queues = rxqs;
  10644	if (netif_alloc_rx_queues(dev))
  10645		goto free_all;
  10646
  10647	strcpy(dev->name, name);
  10648	dev->name_assign_type = name_assign_type;
  10649	dev->group = INIT_NETDEV_GROUP;
  10650	if (!dev->ethtool_ops)
  10651		dev->ethtool_ops = &default_ethtool_ops;
  10652
  10653	nf_hook_netdev_init(dev);
  10654
  10655	return dev;
  10656
	/* Queue allocation failed: hand the partially built device to
	 * free_netdev(). Presumably reg_state is still NETREG_UNINITIALIZED
	 * here (zeroed allocation), so it frees the memory directly - confirm.
	 */
  10657free_all:
  10658	free_netdev(dev);
  10659	return NULL;
  10660
	/* Early failures: only the per-cpu refcount (if any) and the raw
	 * allocation are released on these paths.
	 */
  10661free_pcpu:
  10662#ifdef CONFIG_PCPU_DEV_REFCNT
  10663	free_percpu(dev->pcpu_refcnt);
  10664free_dev:
  10665#endif
  10666	netdev_freemem(dev);
  10667	return NULL;
  10668}
  10669EXPORT_SYMBOL(alloc_netdev_mqs);
  10670
  10671/**
  10672 * free_netdev - free network device
  10673 * @dev: device
  10674 *
  10675 * This function does the last stage of destroying an allocated device
  10676 * interface. The reference to the device object is released. If this
  10677 * is the last reference then it will be freed. Must be called in process
  10678 * context.
        *
        * If @dev is still %NETREG_UNREGISTERING, nothing is freed here: the
        * device is only flagged with needs_free_netdev so that the free
        * happens later (netdev_run_todo() checks that flag).
  10679 */
  10680void free_netdev(struct net_device *dev)
  10681{
  10682	struct napi_struct *p, *n;
  10683
  10684	might_sleep();
  10685
  10686	/* When called immediately after register_netdevice() failed the unwind
  10687	 * handling may still be dismantling the device. Handle that case by
  10688	 * deferring the free.
  10689	 */
  10690	if (dev->reg_state == NETREG_UNREGISTERING) {
  10691		ASSERT_RTNL();
  10692		dev->needs_free_netdev = true;
  10693		return;
  10694	}
  10695
	/* Release the TX and RX queue arrays */
  10696	netif_free_tx_queues(dev);
  10697	netif_free_rx_queues(dev);
  10698
	/* Condition is hard-coded to 1: no lockdep check is made for this
	 * dereference. kfree(NULL) is fine if no ingress queue was created.
	 */
  10699	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
  10700
  10701	/* Flush device addresses */
  10702	dev_addr_flush(dev);
  10703
	/* Delete any NAPI instances still linked on the device */
  10704	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
  10705		netif_napi_del(p);
  10706
  10707	ref_tracker_dir_exit(&dev->refcnt_tracker);
	/* Per-cpu data: pointers are cleared after freeing since @dev itself
	 * may outlive this call (see put_device() below).
	 */
  10708#ifdef CONFIG_PCPU_DEV_REFCNT
  10709	free_percpu(dev->pcpu_refcnt);
  10710	dev->pcpu_refcnt = NULL;
  10711#endif
  10712	free_percpu(dev->core_stats);
  10713	dev->core_stats = NULL;
  10714	free_percpu(dev->xdp_bulkq);
  10715	dev->xdp_bulkq = NULL;
  10716
  10717	/*  Compatibility with error handling in drivers */
	/* Never registered: free the memory directly instead of going
	 * through put_device().
	 */
  10718	if (dev->reg_state == NETREG_UNINITIALIZED) {
  10719		netdev_freemem(dev);
  10720		return;
  10721	}
  10722
	/* Any state other than NETREG_UNREGISTERED is a bug at this point */
  10723	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
  10724	dev->reg_state = NETREG_RELEASED;
  10725
  10726	/* will free via device release */
  10727	put_device(&dev->dev);
  10728}
  10729EXPORT_SYMBOL(free_netdev);
  10730
  10731/**
  10732 *	synchronize_net -  Synchronize with packet receive processing
  10733 *
  10734 *	Wait for packets currently being received to be done.
  10735 *	Does not block later packets from starting.
  10736 */
  10737void synchronize_net(void)
  10738{
  10739	might_sleep();
  10740	if (rtnl_is_locked())
  10741		synchronize_rcu_expedited();
  10742	else
  10743		synchronize_rcu();
  10744}
  10745EXPORT_SYMBOL(synchronize_net);
  10746
  10747/**
  10748 *	unregister_netdevice_queue - remove device from the kernel
  10749 *	@dev: device
  10750 *	@head: list
  10751 *
  10752 *	This function shuts down a device interface and removes it
  10753 *	from the kernel tables.
  10754 *	If head not NULL, device is queued to be unregistered later.
  10755 *
  10756 *	Callers must hold the rtnl semaphore.  You may want
  10757 *	unregister_netdev() instead of this.
  10758 */
  10759
  10760void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
  10761{
  10762	ASSERT_RTNL();
  10763
  10764	if (head) {
  10765		list_move_tail(&dev->unreg_list, head);
  10766	} else {
  10767		LIST_HEAD(single);
  10768
  10769		list_add(&dev->unreg_list, &single);
  10770		unregister_netdevice_many(&single);
  10771	}
  10772}
  10773EXPORT_SYMBOL(unregister_netdevice_queue);
  10774
  10775/**
  10776 *	unregister_netdevice_many - unregister many devices
  10777 *	@head: list of devices
  10778 *
  10779 *  Note: As most callers use a stack allocated list_head,
  10780 *  we force a list_del() to make sure stack wont be corrupted later.
  10781 */
  10782void unregister_netdevice_many(struct list_head *head)
  10783{
  10784	struct net_device *dev, *tmp;
  10785	LIST_HEAD(close_head);
  10786
  10787	BUG_ON(dev_boot_phase);
  10788	ASSERT_RTNL();
  10789
  10790	if (list_empty(head))
  10791		return;
  10792
  10793	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
  10794		/* Some devices call without registering
  10795		 * for initialization unwind. Remove those
  10796		 * devices and proceed with the remaining.
  10797		 */
  10798		if (dev->reg_state == NETREG_UNINITIALIZED) {
  10799			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
  10800				 dev->name, dev);
  10801
  10802			WARN_ON(1);
  10803			list_del(&dev->unreg_list);
  10804			continue;
  10805		}
  10806		dev->dismantle = true;
  10807		BUG_ON(dev->reg_state != NETREG_REGISTERED);
  10808	}
  10809
  10810	/* If device is running, close it first. */
  10811	list_for_each_entry(dev, head, unreg_list)
  10812		list_add_tail(&dev->close_list, &close_head);
  10813	dev_close_many(&close_head, true);
  10814
  10815	list_for_each_entry(dev, head, unreg_list) {
  10816		/* And unlink it from device chain. */
  10817		write_lock(&dev_base_lock);
  10818		unlist_netdevice(dev, false);
  10819		dev->reg_state = NETREG_UNREGISTERING;
  10820		write_unlock(&dev_base_lock);
  10821	}
  10822	flush_all_backlogs();
  10823
  10824	synchronize_net();
  10825
  10826	list_for_each_entry(dev, head, unreg_list) {
  10827		struct sk_buff *skb = NULL;
  10828
  10829		/* Shutdown queueing discipline. */
  10830		dev_shutdown(dev);
  10831
  10832		dev_xdp_uninstall(dev);
  10833
  10834		netdev_offload_xstats_disable_all(dev);
  10835
  10836		/* Notify protocols, that we are about to destroy
  10837		 * this device. They should clean all the things.
  10838		 */
  10839		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
  10840
  10841		if (!dev->rtnl_link_ops ||
  10842		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
  10843			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
  10844						     GFP_KERNEL, NULL, 0);
  10845
  10846		/*
  10847		 *	Flush the unicast and multicast chains
  10848		 */
  10849		dev_uc_flush(dev);
  10850		dev_mc_flush(dev);
  10851
  10852		netdev_name_node_alt_flush(dev);
  10853		netdev_name_node_free(dev->name_node);
  10854
  10855		if (dev->netdev_ops->ndo_uninit)
  10856			dev->netdev_ops->ndo_uninit(dev);
  10857
  10858		if (skb)
  10859			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
  10860
  10861		/* Notifier chain MUST detach us all upper devices. */
  10862		WARN_ON(netdev_has_any_upper_dev(dev));
  10863		WARN_ON(netdev_has_any_lower_dev(dev));
  10864
  10865		/* Remove entries from kobject tree */
  10866		netdev_unregister_kobject(dev);
  10867#ifdef CONFIG_XPS
  10868		/* Remove XPS queueing entries */
  10869		netif_reset_xps_queues_gt(dev, 0);
  10870#endif
  10871	}
  10872
  10873	synchronize_net();
  10874
  10875	list_for_each_entry(dev, head, unreg_list) {
  10876		dev_put_track(dev, &dev->dev_registered_tracker);
  10877		net_set_todo(dev);
  10878	}
  10879
  10880	list_del(head);
  10881}
  10882EXPORT_SYMBOL(unregister_netdevice_many);
  10883
  10884/**
  10885 *	unregister_netdev - remove device from the kernel
  10886 *	@dev: device
  10887 *
  10888 *	This function shuts down a device interface and removes it
  10889 *	from the kernel tables.
  10890 *
  10891 *	This is just a wrapper for unregister_netdevice that takes
  10892 *	the rtnl semaphore.  In general you want to use this and not
  10893 *	unregister_netdevice.
  10894 */
  10895void unregister_netdev(struct net_device *dev)
  10896{
  10897	rtnl_lock();
  10898	unregister_netdevice(dev);
  10899	rtnl_unlock();
  10900}
  10901EXPORT_SYMBOL(unregister_netdev);
  10902
  10903/**
  10904 *	__dev_change_net_namespace - move device to different nethost namespace
  10905 *	@dev: device
  10906 *	@net: network namespace
  10907 *	@pat: If not NULL name pattern to try if the current device name
  10908 *	      is already taken in the destination network namespace.
  10909 *	@new_ifindex: If not zero, specifies device index in the target
  10910 *	              namespace.
  10911 *
  10912 *	This function shuts down a device interface and moves it
  10913 *	to a new network namespace. On success 0 is returned, on
  10914 *	a failure a netagive errno code is returned.
  10915 *
  10916 *	Callers must hold the rtnl semaphore.
  10917 */
  10918
  10919int __dev_change_net_namespace(struct net_device *dev, struct net *net,
  10920			       const char *pat, int new_ifindex)
  10921{
  10922	struct net *net_old = dev_net(dev);
  10923	int err, new_nsid;
  10924
  10925	ASSERT_RTNL();
  10926
  10927	/* Don't allow namespace local devices to be moved. */
  10928	err = -EINVAL;
  10929	if (dev->features & NETIF_F_NETNS_LOCAL)
  10930		goto out;
  10931
  10932	/* Ensure the device has been registrered */
  10933	if (dev->reg_state != NETREG_REGISTERED)
  10934		goto out;
  10935
  10936	/* Get out if there is nothing todo */
  10937	err = 0;
  10938	if (net_eq(net_old, net))
  10939		goto out;
  10940
  10941	/* Pick the destination device name, and ensure
  10942	 * we can use it in the destination network namespace.
  10943	 */
  10944	err = -EEXIST;
  10945	if (netdev_name_in_use(net, dev->name)) {
  10946		/* We get here if we can't use the current device name */
  10947		if (!pat)
  10948			goto out;
  10949		err = dev_get_valid_name(net, dev, pat);
  10950		if (err < 0)
  10951			goto out;
  10952	}
  10953
  10954	/* Check that new_ifindex isn't used yet. */
  10955	err = -EBUSY;
  10956	if (new_ifindex && __dev_get_by_index(net, new_ifindex))
  10957		goto out;
  10958
  10959	/*
  10960	 * And now a mini version of register_netdevice unregister_netdevice.
  10961	 */
  10962
  10963	/* If device is running close it first. */
  10964	dev_close(dev);
  10965
  10966	/* And unlink it from device chain */
  10967	unlist_netdevice(dev, true);
  10968
  10969	synchronize_net();
  10970
  10971	/* Shutdown queueing discipline. */
  10972	dev_shutdown(dev);
  10973
  10974	/* Notify protocols, that we are about to destroy
  10975	 * this device. They should clean all the things.
  10976	 *
  10977	 * Note that dev->reg_state stays at NETREG_REGISTERED.
  10978	 * This is wanted because this way 8021q and macvlan know
  10979	 * the device is just moving and can keep their slaves up.
  10980	 */
  10981	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
  10982	rcu_barrier();
  10983
  10984	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
  10985	/* If there is an ifindex conflict assign a new one */
  10986	if (!new_ifindex) {
  10987		if (__dev_get_by_index(net, dev->ifindex))
  10988			new_ifindex = dev_new_index(net);
  10989		else
  10990			new_ifindex = dev->ifindex;
  10991	}
  10992
  10993	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
  10994			    new_ifindex);
  10995
  10996	/*
  10997	 *	Flush the unicast and multicast chains
  10998	 */
  10999	dev_uc_flush(dev);
  11000	dev_mc_flush(dev);
  11001
  11002	/* Send a netdev-removed uevent to the old namespace */
  11003	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
  11004	netdev_adjacent_del_links(dev);
  11005
  11006	/* Move per-net netdevice notifiers that are following the netdevice */
  11007	move_netdevice_notifiers_dev_net(dev, net);
  11008
  11009	/* Actually switch the network namespace */
  11010	dev_net_set(dev, net);
  11011	dev->ifindex = new_ifindex;
  11012
  11013	/* Send a netdev-add uevent to the new namespace */
  11014	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
  11015	netdev_adjacent_add_links(dev);
  11016
  11017	/* Fixup kobjects */
  11018	err = device_rename(&dev->dev, dev->name);
  11019	WARN_ON(err);
  11020
  11021	/* Adapt owner in case owning user namespace of target network
  11022	 * namespace is different from the original one.
  11023	 */
  11024	err = netdev_change_owner(dev, net_old, net);
  11025	WARN_ON(err);
  11026
  11027	/* Add the device back in the hashes */
  11028	list_netdevice(dev);
  11029
  11030	/* Notify protocols, that a new device appeared. */
  11031	call_netdevice_notifiers(NETDEV_REGISTER, dev);
  11032
  11033	/*
  11034	 *	Prevent userspace races by waiting until the network
  11035	 *	device is fully setup before sending notifications.
  11036	 */
  11037	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
  11038
  11039	synchronize_net();
  11040	err = 0;
  11041out:
  11042	return err;
  11043}
  11044EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
  11045
  11046static int dev_cpu_dead(unsigned int oldcpu)
  11047{
  11048	struct sk_buff **list_skb;
  11049	struct sk_buff *skb;
  11050	unsigned int cpu;
  11051	struct softnet_data *sd, *oldsd, *remsd = NULL;
  11052
  11053	local_irq_disable();
  11054	cpu = smp_processor_id();
  11055	sd = &per_cpu(softnet_data, cpu);
  11056	oldsd = &per_cpu(softnet_data, oldcpu);
  11057
  11058	/* Find end of our completion_queue. */
  11059	list_skb = &sd->completion_queue;
  11060	while (*list_skb)
  11061		list_skb = &(*list_skb)->next;
  11062	/* Append completion queue from offline CPU. */
  11063	*list_skb = oldsd->completion_queue;
  11064	oldsd->completion_queue = NULL;
  11065
  11066	/* Append output queue from offline CPU. */
  11067	if (oldsd->output_queue) {
  11068		*sd->output_queue_tailp = oldsd->output_queue;
  11069		sd->output_queue_tailp = oldsd->output_queue_tailp;
  11070		oldsd->output_queue = NULL;
  11071		oldsd->output_queue_tailp = &oldsd->output_queue;
  11072	}
  11073	/* Append NAPI poll list from offline CPU, with one exception :
  11074	 * process_backlog() must be called by cpu owning percpu backlog.
  11075	 * We properly handle process_queue & input_pkt_queue later.
  11076	 */
  11077	while (!list_empty(&oldsd->poll_list)) {
  11078		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
  11079							    struct napi_struct,
  11080							    poll_list);
  11081
  11082		list_del_init(&napi->poll_list);
  11083		if (napi->poll == process_backlog)
  11084			napi->state = 0;
  11085		else
  11086			____napi_schedule(sd, napi);
  11087	}
  11088
  11089	raise_softirq_irqoff(NET_TX_SOFTIRQ);
  11090	local_irq_enable();
  11091
  11092#ifdef CONFIG_RPS
  11093	remsd = oldsd->rps_ipi_list;
  11094	oldsd->rps_ipi_list = NULL;
  11095#endif
  11096	/* send out pending IPI's on offline CPU */
  11097	net_rps_send_ipi(remsd);
  11098
  11099	/* Process offline CPU's input_pkt_queue */
  11100	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
  11101		netif_rx(skb);
  11102		input_queue_head_incr(oldsd);
  11103	}
  11104	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
  11105		netif_rx(skb);
  11106		input_queue_head_incr(oldsd);
  11107	}
  11108
  11109	return 0;
  11110}
  11111
  11112/**
  11113 *	netdev_increment_features - increment feature set by one
  11114 *	@all: current feature set
  11115 *	@one: new feature set
  11116 *	@mask: mask feature set
  11117 *
  11118 *	Computes a new feature set after adding a device with feature set
  11119 *	@one to the master device with current feature set @all.  Will not
  11120 *	enable anything that is off in @mask. Returns the new feature set.
  11121 */
  11122netdev_features_t netdev_increment_features(netdev_features_t all,
  11123	netdev_features_t one, netdev_features_t mask)
  11124{
  11125	if (mask & NETIF_F_HW_CSUM)
  11126		mask |= NETIF_F_CSUM_MASK;
  11127	mask |= NETIF_F_VLAN_CHALLENGED;
  11128
  11129	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
  11130	all &= one | ~NETIF_F_ALL_FOR_ALL;
  11131
  11132	/* If one device supports hw checksumming, set for all. */
  11133	if (all & NETIF_F_HW_CSUM)
  11134		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
  11135
  11136	return all;
  11137}
  11138EXPORT_SYMBOL(netdev_increment_features);
  11139
  11140static struct hlist_head * __net_init netdev_create_hash(void)
  11141{
  11142	int i;
  11143	struct hlist_head *hash;
  11144
  11145	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
  11146	if (hash != NULL)
  11147		for (i = 0; i < NETDEV_HASHENTRIES; i++)
  11148			INIT_HLIST_HEAD(&hash[i]);
  11149
  11150	return hash;
  11151}
  11152
  11153/* Initialize per network namespace state */
  11154static int __net_init netdev_init(struct net *net)
  11155{
  11156	BUILD_BUG_ON(GRO_HASH_BUCKETS >
  11157		     8 * sizeof_field(struct napi_struct, gro_bitmask));
  11158
  11159	INIT_LIST_HEAD(&net->dev_base_head);
  11160
  11161	net->dev_name_head = netdev_create_hash();
  11162	if (net->dev_name_head == NULL)
  11163		goto err_name;
  11164
  11165	net->dev_index_head = netdev_create_hash();
  11166	if (net->dev_index_head == NULL)
  11167		goto err_idx;
  11168
  11169	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
  11170
  11171	return 0;
  11172
  11173err_idx:
  11174	kfree(net->dev_name_head);
  11175err_name:
  11176	return -ENOMEM;
  11177}
  11178
  11179/**
  11180 *	netdev_drivername - network driver for the device
  11181 *	@dev: network device
  11182 *
  11183 *	Determine network driver for device.
  11184 */
  11185const char *netdev_drivername(const struct net_device *dev)
  11186{
  11187	const struct device_driver *driver;
  11188	const struct device *parent;
  11189	const char *empty = "";
  11190
  11191	parent = dev->dev.parent;
  11192	if (!parent)
  11193		return empty;
  11194
  11195	driver = parent->driver;
  11196	if (driver && driver->name)
  11197		return driver->name;
  11198	return empty;
  11199}
  11200
  11201static void __netdev_printk(const char *level, const struct net_device *dev,
  11202			    struct va_format *vaf)
  11203{
  11204	if (dev && dev->dev.parent) {
  11205		dev_printk_emit(level[1] - '0',
  11206				dev->dev.parent,
  11207				"%s %s %s%s: %pV",
  11208				dev_driver_string(dev->dev.parent),
  11209				dev_name(dev->dev.parent),
  11210				netdev_name(dev), netdev_reg_state(dev),
  11211				vaf);
  11212	} else if (dev) {
  11213		printk("%s%s%s: %pV",
  11214		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
  11215	} else {
  11216		printk("%s(NULL net_device): %pV", level, vaf);
  11217	}
  11218}
  11219
  11220void netdev_printk(const char *level, const struct net_device *dev,
  11221		   const char *format, ...)
  11222{
  11223	struct va_format vaf;
  11224	va_list args;
  11225
  11226	va_start(args, format);
  11227
  11228	vaf.fmt = format;
  11229	vaf.va = &args;
  11230
  11231	__netdev_printk(level, dev, &vaf);
  11232
  11233	va_end(args);
  11234}
  11235EXPORT_SYMBOL(netdev_printk);
  11236
  11237#define define_netdev_printk_level(func, level)			\
  11238void func(const struct net_device *dev, const char *fmt, ...)	\
  11239{								\
  11240	struct va_format vaf;					\
  11241	va_list args;						\
  11242								\
  11243	va_start(args, fmt);					\
  11244								\
  11245	vaf.fmt = fmt;						\
  11246	vaf.va = &args;						\
  11247								\
  11248	__netdev_printk(level, dev, &vaf);			\
  11249								\
  11250	va_end(args);						\
  11251}								\
  11252EXPORT_SYMBOL(func);
  11253
  11254define_netdev_printk_level(netdev_emerg, KERN_EMERG);
  11255define_netdev_printk_level(netdev_alert, KERN_ALERT);
  11256define_netdev_printk_level(netdev_crit, KERN_CRIT);
  11257define_netdev_printk_level(netdev_err, KERN_ERR);
  11258define_netdev_printk_level(netdev_warn, KERN_WARNING);
  11259define_netdev_printk_level(netdev_notice, KERN_NOTICE);
  11260define_netdev_printk_level(netdev_info, KERN_INFO);
  11261
  11262static void __net_exit netdev_exit(struct net *net)
  11263{
  11264	kfree(net->dev_name_head);
  11265	kfree(net->dev_index_head);
  11266	if (net != &init_net)
  11267		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
  11268}
  11269
  11270static struct pernet_operations __net_initdata netdev_net_ops = {
  11271	.init = netdev_init,
  11272	.exit = netdev_exit,
  11273};
  11274
  11275static void __net_exit default_device_exit_net(struct net *net)
  11276{
  11277	struct net_device *dev, *aux;
  11278	/*
  11279	 * Push all migratable network devices back to the
  11280	 * initial network namespace
  11281	 */
  11282	ASSERT_RTNL();
  11283	for_each_netdev_safe(net, dev, aux) {
  11284		int err;
  11285		char fb_name[IFNAMSIZ];
  11286
  11287		/* Ignore unmoveable devices (i.e. loopback) */
  11288		if (dev->features & NETIF_F_NETNS_LOCAL)
  11289			continue;
  11290
  11291		/* Leave virtual devices for the generic cleanup */
  11292		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
  11293			continue;
  11294
  11295		/* Push remaining network devices to init_net */
  11296		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
  11297		if (netdev_name_in_use(&init_net, fb_name))
  11298			snprintf(fb_name, IFNAMSIZ, "dev%%d");
  11299		err = dev_change_net_namespace(dev, &init_net, fb_name);
  11300		if (err) {
  11301			pr_emerg("%s: failed to move %s to init_net: %d\n",
  11302				 __func__, dev->name, err);
  11303			BUG();
  11304		}
  11305	}
  11306}
  11307
  11308static void __net_exit default_device_exit_batch(struct list_head *net_list)
  11309{
  11310	/* At exit all network devices most be removed from a network
  11311	 * namespace.  Do this in the reverse order of registration.
  11312	 * Do this across as many network namespaces as possible to
  11313	 * improve batching efficiency.
  11314	 */
  11315	struct net_device *dev;
  11316	struct net *net;
  11317	LIST_HEAD(dev_kill_list);
  11318
  11319	rtnl_lock();
  11320	list_for_each_entry(net, net_list, exit_list) {
  11321		default_device_exit_net(net);
  11322		cond_resched();
  11323	}
  11324
  11325	list_for_each_entry(net, net_list, exit_list) {
  11326		for_each_netdev_reverse(net, dev) {
  11327			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
  11328				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
  11329			else
  11330				unregister_netdevice_queue(dev, &dev_kill_list);
  11331		}
  11332	}
  11333	unregister_netdevice_many(&dev_kill_list);
  11334	rtnl_unlock();
  11335}
  11336
  11337static struct pernet_operations __net_initdata default_device_ops = {
  11338	.exit_batch = default_device_exit_batch,
  11339};
  11340
  11341/*
  11342 *	Initialize the DEV module. At boot time this walks the device list and
  11343 *	unhooks any devices that fail to initialise (normally hardware not
  11344 *	present) and leaves us with a valid list of present and active devices.
  11345 *
  11346 */
  11347
  11348/*
  11349 *       This is called single threaded during boot, so no need
  11350 *       to take the rtnl semaphore.
  11351 */
  11352static int __init net_dev_init(void)
  11353{
  11354	int i, rc = -ENOMEM;
  11355
  11356	BUG_ON(!dev_boot_phase);
  11357
  11358	if (dev_proc_init())
  11359		goto out;
  11360
  11361	if (netdev_kobject_init())
  11362		goto out;
  11363
  11364	INIT_LIST_HEAD(&ptype_all);
  11365	for (i = 0; i < PTYPE_HASH_SIZE; i++)
  11366		INIT_LIST_HEAD(&ptype_base[i]);
  11367
  11368	if (register_pernet_subsys(&netdev_net_ops))
  11369		goto out;
  11370
  11371	/*
  11372	 *	Initialise the packet receive queues.
  11373	 */
  11374
  11375	for_each_possible_cpu(i) {
  11376		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
  11377		struct softnet_data *sd = &per_cpu(softnet_data, i);
  11378
  11379		INIT_WORK(flush, flush_backlog);
  11380
  11381		skb_queue_head_init(&sd->input_pkt_queue);
  11382		skb_queue_head_init(&sd->process_queue);
  11383#ifdef CONFIG_XFRM_OFFLOAD
  11384		skb_queue_head_init(&sd->xfrm_backlog);
  11385#endif
  11386		INIT_LIST_HEAD(&sd->poll_list);
  11387		sd->output_queue_tailp = &sd->output_queue;
  11388#ifdef CONFIG_RPS
  11389		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
  11390		sd->cpu = i;
  11391#endif
  11392		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
  11393		spin_lock_init(&sd->defer_lock);
  11394
  11395		init_gro_hash(&sd->backlog);
  11396		sd->backlog.poll = process_backlog;
  11397		sd->backlog.weight = weight_p;
  11398	}
  11399
  11400	dev_boot_phase = 0;
  11401
  11402	/* The loopback device is special if any other network devices
  11403	 * is present in a network namespace the loopback device must
  11404	 * be present. Since we now dynamically allocate and free the
  11405	 * loopback device ensure this invariant is maintained by
  11406	 * keeping the loopback device as the first device on the
  11407	 * list of network devices.  Ensuring the loopback devices
  11408	 * is the first device that appears and the last network device
  11409	 * that disappears.
  11410	 */
  11411	if (register_pernet_device(&loopback_net_ops))
  11412		goto out;
  11413
  11414	if (register_pernet_device(&default_device_ops))
  11415		goto out;
  11416
  11417	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
  11418	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
  11419
  11420	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
  11421				       NULL, dev_cpu_dead);
  11422	WARN_ON(rc < 0);
  11423	rc = 0;
  11424out:
  11425	return rc;
  11426}
  11427
  11428subsys_initcall(net_dev_init);