cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

route.c (170763B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *	Linux INET6 implementation
      4 *	FIB front-end.
      5 *
      6 *	Authors:
      7 *	Pedro Roque		<roque@di.fc.ul.pt>
      8 */
      9
     10/*	Changes:
     11 *
     12 *	YOSHIFUJI Hideaki @USAGI
     13 *		reworked default router selection.
     14 *		- respect outgoing interface
     15 *		- select from (probably) reachable routers (i.e.
     16 *		routers in REACHABLE, STALE, DELAY or PROBE states).
     17 *		- always select the same router if it is (probably)
     18 *		reachable.  otherwise, round-robin the list.
     19 *	Ville Nuorvala
     20 *		Fixed routing subtrees.
     21 */
     22
     23#define pr_fmt(fmt) "IPv6: " fmt
     24
     25#include <linux/capability.h>
     26#include <linux/errno.h>
     27#include <linux/export.h>
     28#include <linux/types.h>
     29#include <linux/times.h>
     30#include <linux/socket.h>
     31#include <linux/sockios.h>
     32#include <linux/net.h>
     33#include <linux/route.h>
     34#include <linux/netdevice.h>
     35#include <linux/in6.h>
     36#include <linux/mroute6.h>
     37#include <linux/init.h>
     38#include <linux/if_arp.h>
     39#include <linux/proc_fs.h>
     40#include <linux/seq_file.h>
     41#include <linux/nsproxy.h>
     42#include <linux/slab.h>
     43#include <linux/jhash.h>
     44#include <linux/siphash.h>
     45#include <net/net_namespace.h>
     46#include <net/snmp.h>
     47#include <net/ipv6.h>
     48#include <net/ip6_fib.h>
     49#include <net/ip6_route.h>
     50#include <net/ndisc.h>
     51#include <net/addrconf.h>
     52#include <net/tcp.h>
     53#include <linux/rtnetlink.h>
     54#include <net/dst.h>
     55#include <net/dst_metadata.h>
     56#include <net/xfrm.h>
     57#include <net/netevent.h>
     58#include <net/netlink.h>
     59#include <net/rtnh.h>
     60#include <net/lwtunnel.h>
     61#include <net/ip_tunnels.h>
     62#include <net/l3mdev.h>
     63#include <net/ip.h>
     64#include <linux/uaccess.h>
     65#include <linux/btf_ids.h>
     66
     67#ifdef CONFIG_SYSCTL
     68#include <linux/sysctl.h>
     69#endif
     70
     71static int ip6_rt_type_to_error(u8 fib6_type);
     72
     73#define CREATE_TRACE_POINTS
     74#include <trace/events/fib6.h>
     75EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
     76#undef CREATE_TRACE_POINTS
     77
     78enum rt6_nud_state {
     79	RT6_NUD_FAIL_HARD = -3,
     80	RT6_NUD_FAIL_PROBE = -2,
     81	RT6_NUD_FAIL_DO_RR = -1,
     82	RT6_NUD_SUCCEED = 1
     83};
     84
     85INDIRECT_CALLABLE_SCOPE
     86struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
     87static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
     88INDIRECT_CALLABLE_SCOPE
     89unsigned int		ip6_mtu(const struct dst_entry *dst);
     90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
     91static void		ip6_dst_destroy(struct dst_entry *);
     92static void		ip6_dst_ifdown(struct dst_entry *,
     93				       struct net_device *dev, int how);
     94static int		 ip6_dst_gc(struct dst_ops *ops);
     95
     96static int		ip6_pkt_discard(struct sk_buff *skb);
     97static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
     98static int		ip6_pkt_prohibit(struct sk_buff *skb);
     99static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
    100static void		ip6_link_failure(struct sk_buff *skb);
    101static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
    102					   struct sk_buff *skb, u32 mtu,
    103					   bool confirm_neigh);
    104static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
    105					struct sk_buff *skb);
    106static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
    107			   int strict);
    108static size_t rt6_nlmsg_size(struct fib6_info *f6i);
    109static int rt6_fill_node(struct net *net, struct sk_buff *skb,
    110			 struct fib6_info *rt, struct dst_entry *dst,
    111			 struct in6_addr *dest, struct in6_addr *src,
    112			 int iif, int type, u32 portid, u32 seq,
    113			 unsigned int flags);
    114static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
    115					   const struct in6_addr *daddr,
    116					   const struct in6_addr *saddr);
    117
    118#ifdef CONFIG_IPV6_ROUTE_INFO
    119static struct fib6_info *rt6_add_route_info(struct net *net,
    120					   const struct in6_addr *prefix, int prefixlen,
    121					   const struct in6_addr *gwaddr,
    122					   struct net_device *dev,
    123					   unsigned int pref);
    124static struct fib6_info *rt6_get_route_info(struct net *net,
    125					   const struct in6_addr *prefix, int prefixlen,
    126					   const struct in6_addr *gwaddr,
    127					   struct net_device *dev);
    128#endif
    129
    130struct uncached_list {
    131	spinlock_t		lock;
    132	struct list_head	head;
    133	struct list_head	quarantine;
    134};
    135
    136static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
    137
    138void rt6_uncached_list_add(struct rt6_info *rt)
    139{
    140	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
    141
    142	rt->rt6i_uncached_list = ul;
    143
    144	spin_lock_bh(&ul->lock);
    145	list_add_tail(&rt->rt6i_uncached, &ul->head);
    146	spin_unlock_bh(&ul->lock);
    147}
    148
    149void rt6_uncached_list_del(struct rt6_info *rt)
    150{
    151	if (!list_empty(&rt->rt6i_uncached)) {
    152		struct uncached_list *ul = rt->rt6i_uncached_list;
    153
    154		spin_lock_bh(&ul->lock);
    155		list_del_init(&rt->rt6i_uncached);
    156		spin_unlock_bh(&ul->lock);
    157	}
    158}
    159
    160static void rt6_uncached_list_flush_dev(struct net_device *dev)
    161{
    162	int cpu;
    163
    164	for_each_possible_cpu(cpu) {
    165		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
    166		struct rt6_info *rt, *safe;
    167
    168		if (list_empty(&ul->head))
    169			continue;
    170
    171		spin_lock_bh(&ul->lock);
    172		list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
    173			struct inet6_dev *rt_idev = rt->rt6i_idev;
    174			struct net_device *rt_dev = rt->dst.dev;
    175			bool handled = false;
    176
    177			if (rt_idev->dev == dev) {
    178				rt->rt6i_idev = in6_dev_get(blackhole_netdev);
    179				in6_dev_put(rt_idev);
    180				handled = true;
    181			}
    182
    183			if (rt_dev == dev) {
    184				rt->dst.dev = blackhole_netdev;
    185				dev_replace_track(rt_dev, blackhole_netdev,
    186						  &rt->dst.dev_tracker,
    187						  GFP_ATOMIC);
    188				handled = true;
    189			}
    190			if (handled)
    191				list_move(&rt->rt6i_uncached,
    192					  &ul->quarantine);
    193		}
    194		spin_unlock_bh(&ul->lock);
    195	}
    196}
    197
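/* Illustrative note, not part of the original file: when a device goes
 * away, rt6_uncached_list_flush_dev() above retargets any uncached dst
 * that still references it at blackhole_netdev (both rt6i_idev and
 * dst.dev) and parks the entry on the per-cpu quarantine list, so late
 * users of the dst see a harmless always-up device instead of a
 * dangling pointer.
 */
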
    198static inline const void *choose_neigh_daddr(const struct in6_addr *p,
    199					     struct sk_buff *skb,
    200					     const void *daddr)
    201{
    202	if (!ipv6_addr_any(p))
    203		return (const void *) p;
    204	else if (skb)
    205		return &ipv6_hdr(skb)->daddr;
    206	return daddr;
    207}
    208
    209struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
    210				   struct net_device *dev,
    211				   struct sk_buff *skb,
    212				   const void *daddr)
    213{
    214	struct neighbour *n;
    215
    216	daddr = choose_neigh_daddr(gw, skb, daddr);
    217	n = __ipv6_neigh_lookup(dev, daddr);
    218	if (n)
    219		return n;
    220
    221	n = neigh_create(&nd_tbl, daddr, dev);
    222	return IS_ERR(n) ? NULL : n;
    223}
    224
    225static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
    226					      struct sk_buff *skb,
    227					      const void *daddr)
    228{
    229	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
    230
    231	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
    232				dst->dev, skb, daddr);
    233}
    234
    235static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
    236{
    237	struct net_device *dev = dst->dev;
    238	struct rt6_info *rt = (struct rt6_info *)dst;
    239
    240	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
    241	if (!daddr)
    242		return;
    243	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
    244		return;
    245	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
    246		return;
    247	__ipv6_confirm_neigh(dev, daddr);
    248}
    249
    250static struct dst_ops ip6_dst_ops_template = {
    251	.family			=	AF_INET6,
    252	.gc			=	ip6_dst_gc,
    253	.gc_thresh		=	1024,
    254	.check			=	ip6_dst_check,
    255	.default_advmss		=	ip6_default_advmss,
    256	.mtu			=	ip6_mtu,
    257	.cow_metrics		=	dst_cow_metrics_generic,
    258	.destroy		=	ip6_dst_destroy,
    259	.ifdown			=	ip6_dst_ifdown,
    260	.negative_advice	=	ip6_negative_advice,
    261	.link_failure		=	ip6_link_failure,
    262	.update_pmtu		=	ip6_rt_update_pmtu,
    263	.redirect		=	rt6_do_redirect,
    264	.local_out		=	__ip6_local_out,
    265	.neigh_lookup		=	ip6_dst_neigh_lookup,
    266	.confirm_neigh		=	ip6_confirm_neigh,
    267};
    268
    269static struct dst_ops ip6_dst_blackhole_ops = {
    270	.family			= AF_INET6,
    271	.default_advmss		= ip6_default_advmss,
    272	.neigh_lookup		= ip6_dst_neigh_lookup,
    273	.check			= ip6_dst_check,
    274	.destroy		= ip6_dst_destroy,
    275	.cow_metrics		= dst_cow_metrics_generic,
    276	.update_pmtu		= dst_blackhole_update_pmtu,
    277	.redirect		= dst_blackhole_redirect,
    278	.mtu			= dst_blackhole_mtu,
    279};
    280
    281static const u32 ip6_template_metrics[RTAX_MAX] = {
    282	[RTAX_HOPLIMIT - 1] = 0,
    283};
    284
    285static const struct fib6_info fib6_null_entry_template = {
    286	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
    287	.fib6_protocol  = RTPROT_KERNEL,
    288	.fib6_metric	= ~(u32)0,
    289	.fib6_ref	= REFCOUNT_INIT(1),
    290	.fib6_type	= RTN_UNREACHABLE,
    291	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
    292};
    293
    294static const struct rt6_info ip6_null_entry_template = {
    295	.dst = {
    296		.__refcnt	= ATOMIC_INIT(1),
    297		.__use		= 1,
    298		.obsolete	= DST_OBSOLETE_FORCE_CHK,
    299		.error		= -ENETUNREACH,
    300		.input		= ip6_pkt_discard,
    301		.output		= ip6_pkt_discard_out,
    302	},
    303	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
    304};
    305
    306#ifdef CONFIG_IPV6_MULTIPLE_TABLES
    307
    308static const struct rt6_info ip6_prohibit_entry_template = {
    309	.dst = {
    310		.__refcnt	= ATOMIC_INIT(1),
    311		.__use		= 1,
    312		.obsolete	= DST_OBSOLETE_FORCE_CHK,
    313		.error		= -EACCES,
    314		.input		= ip6_pkt_prohibit,
    315		.output		= ip6_pkt_prohibit_out,
    316	},
    317	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
    318};
    319
    320static const struct rt6_info ip6_blk_hole_entry_template = {
    321	.dst = {
    322		.__refcnt	= ATOMIC_INIT(1),
    323		.__use		= 1,
    324		.obsolete	= DST_OBSOLETE_FORCE_CHK,
    325		.error		= -EINVAL,
    326		.input		= dst_discard,
    327		.output		= dst_discard_out,
    328	},
    329	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
    330};
    331
    332#endif
    333
    334static void rt6_info_init(struct rt6_info *rt)
    335{
    336	memset_after(rt, 0, dst);
    337	INIT_LIST_HEAD(&rt->rt6i_uncached);
    338}
    339
    340/* allocate dst with ip6_dst_ops */
    341struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
    342			       int flags)
    343{
    344	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
    345					1, DST_OBSOLETE_FORCE_CHK, flags);
    346
    347	if (rt) {
    348		rt6_info_init(rt);
    349		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
    350	}
    351
    352	return rt;
    353}
    354EXPORT_SYMBOL(ip6_dst_alloc);
    355
    356static void ip6_dst_destroy(struct dst_entry *dst)
    357{
    358	struct rt6_info *rt = (struct rt6_info *)dst;
    359	struct fib6_info *from;
    360	struct inet6_dev *idev;
    361
    362	ip_dst_metrics_put(dst);
    363	rt6_uncached_list_del(rt);
    364
    365	idev = rt->rt6i_idev;
    366	if (idev) {
    367		rt->rt6i_idev = NULL;
    368		in6_dev_put(idev);
    369	}
    370
    371	from = xchg((__force struct fib6_info **)&rt->from, NULL);
    372	fib6_info_release(from);
    373}
    374
    375static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
    376			   int how)
    377{
    378	struct rt6_info *rt = (struct rt6_info *)dst;
    379	struct inet6_dev *idev = rt->rt6i_idev;
    380
    381	if (idev && idev->dev != blackhole_netdev) {
    382		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
    383
    384		if (blackhole_idev) {
    385			rt->rt6i_idev = blackhole_idev;
    386			in6_dev_put(idev);
    387		}
    388	}
    389}
    390
    391static bool __rt6_check_expired(const struct rt6_info *rt)
    392{
    393	if (rt->rt6i_flags & RTF_EXPIRES)
    394		return time_after(jiffies, rt->dst.expires);
    395	else
    396		return false;
    397}
    398
    399static bool rt6_check_expired(const struct rt6_info *rt)
    400{
    401	struct fib6_info *from;
    402
    403	from = rcu_dereference(rt->from);
    404
    405	if (rt->rt6i_flags & RTF_EXPIRES) {
    406		if (time_after(jiffies, rt->dst.expires))
    407			return true;
    408	} else if (from) {
    409		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
    410			fib6_check_expired(from);
    411	}
    412	return false;
    413}
    414
    415void fib6_select_path(const struct net *net, struct fib6_result *res,
    416		      struct flowi6 *fl6, int oif, bool have_oif_match,
    417		      const struct sk_buff *skb, int strict)
    418{
    419	struct fib6_info *sibling, *next_sibling;
    420	struct fib6_info *match = res->f6i;
    421
    422	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
    423		goto out;
    424
    425	if (match->nh && have_oif_match && res->nh)
    426		return;
    427
     428	/* We might have already computed the hash for ICMPv6 errors. In that
     429	 * case it will always be non-zero. Otherwise now is the time to do it.
    430	 */
    431	if (!fl6->mp_hash &&
    432	    (!match->nh || nexthop_is_multipath(match->nh)))
    433		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
    434
    435	if (unlikely(match->nh)) {
    436		nexthop_path_fib6_result(res, fl6->mp_hash);
    437		return;
    438	}
    439
    440	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
    441		goto out;
    442
    443	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
    444				 fib6_siblings) {
    445		const struct fib6_nh *nh = sibling->fib6_nh;
    446		int nh_upper_bound;
    447
    448		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
    449		if (fl6->mp_hash > nh_upper_bound)
    450			continue;
    451		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
    452			break;
    453		match = sibling;
    454		break;
    455	}
    456
    457out:
    458	res->f6i = match;
    459	res->nh = match->fib6_nh;
    460}
    461
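/* Illustrative sketch, not part of the original file: fib6_select_path()
 * implements hash-threshold multipath (cf. RFC 2992). Each sibling owns
 * a slice of the (roughly 31-bit) mp_hash space bounded by its
 * fib_nh_upper_bound; e.g. with two siblings weighted 1 and 3 the bounds
 * are about 2^31/4 and 2^31, so flows hashing into the lowest quarter
 * take the first nexthop and all others take the second.
 */
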
    462/*
    463 *	Route lookup. rcu_read_lock() should be held.
    464 */
    465
    466static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
    467			       const struct in6_addr *saddr, int oif, int flags)
    468{
    469	const struct net_device *dev;
    470
    471	if (nh->fib_nh_flags & RTNH_F_DEAD)
    472		return false;
    473
    474	dev = nh->fib_nh_dev;
    475	if (oif) {
    476		if (dev->ifindex == oif)
    477			return true;
    478	} else {
    479		if (ipv6_chk_addr(net, saddr, dev,
    480				  flags & RT6_LOOKUP_F_IFACE))
    481			return true;
    482	}
    483
    484	return false;
    485}
    486
    487struct fib6_nh_dm_arg {
    488	struct net		*net;
    489	const struct in6_addr	*saddr;
    490	int			oif;
    491	int			flags;
    492	struct fib6_nh		*nh;
    493};
    494
    495static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
    496{
    497	struct fib6_nh_dm_arg *arg = _arg;
    498
    499	arg->nh = nh;
    500	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
    501				  arg->flags);
    502}
    503
    504/* returns fib6_nh from nexthop or NULL */
    505static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
    506					struct fib6_result *res,
    507					const struct in6_addr *saddr,
    508					int oif, int flags)
    509{
    510	struct fib6_nh_dm_arg arg = {
    511		.net   = net,
    512		.saddr = saddr,
    513		.oif   = oif,
    514		.flags = flags,
    515	};
    516
    517	if (nexthop_is_blackhole(nh))
    518		return NULL;
    519
    520	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
    521		return arg.nh;
    522
    523	return NULL;
    524}
    525
    526static void rt6_device_match(struct net *net, struct fib6_result *res,
    527			     const struct in6_addr *saddr, int oif, int flags)
    528{
    529	struct fib6_info *f6i = res->f6i;
    530	struct fib6_info *spf6i;
    531	struct fib6_nh *nh;
    532
    533	if (!oif && ipv6_addr_any(saddr)) {
    534		if (unlikely(f6i->nh)) {
    535			nh = nexthop_fib6_nh(f6i->nh);
    536			if (nexthop_is_blackhole(f6i->nh))
    537				goto out_blackhole;
    538		} else {
    539			nh = f6i->fib6_nh;
    540		}
    541		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
    542			goto out;
    543	}
    544
    545	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
    546		bool matched = false;
    547
    548		if (unlikely(spf6i->nh)) {
    549			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
    550					      oif, flags);
    551			if (nh)
    552				matched = true;
    553		} else {
    554			nh = spf6i->fib6_nh;
    555			if (__rt6_device_match(net, nh, saddr, oif, flags))
    556				matched = true;
    557		}
    558		if (matched) {
    559			res->f6i = spf6i;
    560			goto out;
    561		}
    562	}
    563
    564	if (oif && flags & RT6_LOOKUP_F_IFACE) {
    565		res->f6i = net->ipv6.fib6_null_entry;
    566		nh = res->f6i->fib6_nh;
    567		goto out;
    568	}
    569
    570	if (unlikely(f6i->nh)) {
    571		nh = nexthop_fib6_nh(f6i->nh);
    572		if (nexthop_is_blackhole(f6i->nh))
    573			goto out_blackhole;
    574	} else {
    575		nh = f6i->fib6_nh;
    576	}
    577
    578	if (nh->fib_nh_flags & RTNH_F_DEAD) {
    579		res->f6i = net->ipv6.fib6_null_entry;
    580		nh = res->f6i->fib6_nh;
    581	}
    582out:
    583	res->nh = nh;
    584	res->fib6_type = res->f6i->fib6_type;
    585	res->fib6_flags = res->f6i->fib6_flags;
    586	return;
    587
    588out_blackhole:
    589	res->fib6_flags |= RTF_REJECT;
    590	res->fib6_type = RTN_BLACKHOLE;
    591	res->nh = nh;
    592}
    593
    594#ifdef CONFIG_IPV6_ROUTER_PREF
    595struct __rt6_probe_work {
    596	struct work_struct work;
    597	struct in6_addr target;
    598	struct net_device *dev;
    599	netdevice_tracker dev_tracker;
    600};
    601
    602static void rt6_probe_deferred(struct work_struct *w)
    603{
    604	struct in6_addr mcaddr;
    605	struct __rt6_probe_work *work =
    606		container_of(w, struct __rt6_probe_work, work);
    607
    608	addrconf_addr_solict_mult(&work->target, &mcaddr);
    609	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
    610	dev_put_track(work->dev, &work->dev_tracker);
    611	kfree(work);
    612}
    613
    614static void rt6_probe(struct fib6_nh *fib6_nh)
    615{
    616	struct __rt6_probe_work *work = NULL;
    617	const struct in6_addr *nh_gw;
    618	unsigned long last_probe;
    619	struct neighbour *neigh;
    620	struct net_device *dev;
    621	struct inet6_dev *idev;
    622
    623	/*
    624	 * Okay, this does not seem to be appropriate
    625	 * for now, however, we need to check if it
    626	 * is really so; aka Router Reachability Probing.
    627	 *
    628	 * Router Reachability Probe MUST be rate-limited
    629	 * to no more than one per minute.
    630	 */
    631	if (!fib6_nh->fib_nh_gw_family)
    632		return;
    633
    634	nh_gw = &fib6_nh->fib_nh_gw6;
    635	dev = fib6_nh->fib_nh_dev;
    636	rcu_read_lock_bh();
    637	last_probe = READ_ONCE(fib6_nh->last_probe);
    638	idev = __in6_dev_get(dev);
    639	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
    640	if (neigh) {
    641		if (neigh->nud_state & NUD_VALID)
    642			goto out;
    643
    644		write_lock(&neigh->lock);
    645		if (!(neigh->nud_state & NUD_VALID) &&
    646		    time_after(jiffies,
    647			       neigh->updated + idev->cnf.rtr_probe_interval)) {
    648			work = kmalloc(sizeof(*work), GFP_ATOMIC);
    649			if (work)
    650				__neigh_set_probe_once(neigh);
    651		}
    652		write_unlock(&neigh->lock);
    653	} else if (time_after(jiffies, last_probe +
    654				       idev->cnf.rtr_probe_interval)) {
    655		work = kmalloc(sizeof(*work), GFP_ATOMIC);
    656	}
    657
    658	if (!work || cmpxchg(&fib6_nh->last_probe,
    659			     last_probe, jiffies) != last_probe) {
    660		kfree(work);
    661	} else {
    662		INIT_WORK(&work->work, rt6_probe_deferred);
    663		work->target = *nh_gw;
    664		dev_hold_track(dev, &work->dev_tracker, GFP_ATOMIC);
    665		work->dev = dev;
    666		schedule_work(&work->work);
    667	}
    668
    669out:
    670	rcu_read_unlock_bh();
    671}
    672#else
    673static inline void rt6_probe(struct fib6_nh *fib6_nh)
    674{
    675}
    676#endif
    677
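/* Illustrative note, not part of the original file: rt6_probe() rate
 * limits itself without a lock. Every prober re-reads last_probe, and
 * only the CPU whose cmpxchg() successfully advances it to the current
 * jiffies gets to schedule the deferred neighbour solicitation; all
 * losers just kfree() their work item.
 */
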
    678/*
    679 * Default Router Selection (RFC 2461 6.3.6)
    680 */
    681static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
    682{
    683	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
    684	struct neighbour *neigh;
    685
    686	rcu_read_lock_bh();
    687	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
    688					  &fib6_nh->fib_nh_gw6);
    689	if (neigh) {
    690		read_lock(&neigh->lock);
    691		if (neigh->nud_state & NUD_VALID)
    692			ret = RT6_NUD_SUCCEED;
    693#ifdef CONFIG_IPV6_ROUTER_PREF
    694		else if (!(neigh->nud_state & NUD_FAILED))
    695			ret = RT6_NUD_SUCCEED;
    696		else
    697			ret = RT6_NUD_FAIL_PROBE;
    698#endif
    699		read_unlock(&neigh->lock);
    700	} else {
    701		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
    702		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
    703	}
    704	rcu_read_unlock_bh();
    705
    706	return ret;
    707}
    708
    709static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
    710			   int strict)
    711{
    712	int m = 0;
    713
    714	if (!oif || nh->fib_nh_dev->ifindex == oif)
    715		m = 2;
    716
    717	if (!m && (strict & RT6_LOOKUP_F_IFACE))
    718		return RT6_NUD_FAIL_HARD;
    719#ifdef CONFIG_IPV6_ROUTER_PREF
    720	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
    721#endif
    722	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
    723	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
    724		int n = rt6_check_neigh(nh);
    725		if (n < 0)
    726			return n;
    727	}
    728	return m;
    729}
    730
    731static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
    732		       int oif, int strict, int *mpri, bool *do_rr)
    733{
    734	bool match_do_rr = false;
    735	bool rc = false;
    736	int m;
    737
    738	if (nh->fib_nh_flags & RTNH_F_DEAD)
    739		goto out;
    740
    741	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
    742	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
    743	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
    744		goto out;
    745
    746	m = rt6_score_route(nh, fib6_flags, oif, strict);
    747	if (m == RT6_NUD_FAIL_DO_RR) {
    748		match_do_rr = true;
    749		m = 0; /* lowest valid score */
    750	} else if (m == RT6_NUD_FAIL_HARD) {
    751		goto out;
    752	}
    753
    754	if (strict & RT6_LOOKUP_F_REACHABLE)
    755		rt6_probe(nh);
    756
    757	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
    758	if (m > *mpri) {
    759		*do_rr = match_do_rr;
    760		*mpri = m;
    761		rc = true;
    762	}
    763out:
    764	return rc;
    765}
    766
    767struct fib6_nh_frl_arg {
    768	u32		flags;
    769	int		oif;
    770	int		strict;
    771	int		*mpri;
    772	bool		*do_rr;
    773	struct fib6_nh	*nh;
    774};
    775
    776static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
    777{
    778	struct fib6_nh_frl_arg *arg = _arg;
    779
    780	arg->nh = nh;
    781	return find_match(nh, arg->flags, arg->oif, arg->strict,
    782			  arg->mpri, arg->do_rr);
    783}
    784
    785static void __find_rr_leaf(struct fib6_info *f6i_start,
    786			   struct fib6_info *nomatch, u32 metric,
    787			   struct fib6_result *res, struct fib6_info **cont,
    788			   int oif, int strict, bool *do_rr, int *mpri)
    789{
    790	struct fib6_info *f6i;
    791
    792	for (f6i = f6i_start;
    793	     f6i && f6i != nomatch;
    794	     f6i = rcu_dereference(f6i->fib6_next)) {
    795		bool matched = false;
    796		struct fib6_nh *nh;
    797
    798		if (cont && f6i->fib6_metric != metric) {
    799			*cont = f6i;
    800			return;
    801		}
    802
    803		if (fib6_check_expired(f6i))
    804			continue;
    805
    806		if (unlikely(f6i->nh)) {
    807			struct fib6_nh_frl_arg arg = {
    808				.flags  = f6i->fib6_flags,
    809				.oif    = oif,
    810				.strict = strict,
    811				.mpri   = mpri,
    812				.do_rr  = do_rr
    813			};
    814
    815			if (nexthop_is_blackhole(f6i->nh)) {
    816				res->fib6_flags = RTF_REJECT;
    817				res->fib6_type = RTN_BLACKHOLE;
    818				res->f6i = f6i;
    819				res->nh = nexthop_fib6_nh(f6i->nh);
    820				return;
    821			}
    822			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
    823						     &arg)) {
    824				matched = true;
    825				nh = arg.nh;
    826			}
    827		} else {
    828			nh = f6i->fib6_nh;
    829			if (find_match(nh, f6i->fib6_flags, oif, strict,
    830				       mpri, do_rr))
    831				matched = true;
    832		}
    833		if (matched) {
    834			res->f6i = f6i;
    835			res->nh = nh;
    836			res->fib6_flags = f6i->fib6_flags;
    837			res->fib6_type = f6i->fib6_type;
    838		}
    839	}
    840}
    841
    842static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
    843			 struct fib6_info *rr_head, int oif, int strict,
    844			 bool *do_rr, struct fib6_result *res)
    845{
    846	u32 metric = rr_head->fib6_metric;
    847	struct fib6_info *cont = NULL;
    848	int mpri = -1;
    849
    850	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
    851		       oif, strict, do_rr, &mpri);
    852
    853	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
    854		       oif, strict, do_rr, &mpri);
    855
    856	if (res->f6i || !cont)
    857		return;
    858
    859	__find_rr_leaf(cont, NULL, metric, res, NULL,
    860		       oif, strict, do_rr, &mpri);
    861}
    862
    863static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
    864		       struct fib6_result *res, int strict)
    865{
    866	struct fib6_info *leaf = rcu_dereference(fn->leaf);
    867	struct fib6_info *rt0;
    868	bool do_rr = false;
    869	int key_plen;
    870
     871	/* make sure this function or its helpers set f6i */
    872	res->f6i = NULL;
    873
    874	if (!leaf || leaf == net->ipv6.fib6_null_entry)
    875		goto out;
    876
    877	rt0 = rcu_dereference(fn->rr_ptr);
    878	if (!rt0)
    879		rt0 = leaf;
    880
    881	/* Double check to make sure fn is not an intermediate node
     882	 * and fn->leaf does not point to its child's leaf
    883	 * (This might happen if all routes under fn are deleted from
    884	 * the tree and fib6_repair_tree() is called on the node.)
    885	 */
    886	key_plen = rt0->fib6_dst.plen;
    887#ifdef CONFIG_IPV6_SUBTREES
    888	if (rt0->fib6_src.plen)
    889		key_plen = rt0->fib6_src.plen;
    890#endif
    891	if (fn->fn_bit != key_plen)
    892		goto out;
    893
    894	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
    895	if (do_rr) {
    896		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
    897
    898		/* no entries matched; do round-robin */
    899		if (!next || next->fib6_metric != rt0->fib6_metric)
    900			next = leaf;
    901
    902		if (next != rt0) {
    903			spin_lock_bh(&leaf->fib6_table->tb6_lock);
    904			/* make sure next is not being deleted from the tree */
    905			if (next->fib6_node)
    906				rcu_assign_pointer(fn->rr_ptr, next);
    907			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
    908		}
    909	}
    910
    911out:
    912	if (!res->f6i) {
    913		res->f6i = net->ipv6.fib6_null_entry;
    914		res->nh = res->f6i->fib6_nh;
    915		res->fib6_flags = res->f6i->fib6_flags;
    916		res->fib6_type = res->f6i->fib6_type;
    917	}
    918}
    919
    920static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
    921{
    922	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
    923	       res->nh->fib_nh_gw_family;
    924}
    925
    926#ifdef CONFIG_IPV6_ROUTE_INFO
    927int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
    928		  const struct in6_addr *gwaddr)
    929{
    930	struct net *net = dev_net(dev);
    931	struct route_info *rinfo = (struct route_info *) opt;
    932	struct in6_addr prefix_buf, *prefix;
    933	unsigned int pref;
    934	unsigned long lifetime;
    935	struct fib6_info *rt;
    936
    937	if (len < sizeof(struct route_info)) {
    938		return -EINVAL;
    939	}
    940
    941	/* Sanity check for prefix_len and length */
    942	if (rinfo->length > 3) {
    943		return -EINVAL;
    944	} else if (rinfo->prefix_len > 128) {
    945		return -EINVAL;
    946	} else if (rinfo->prefix_len > 64) {
    947		if (rinfo->length < 2) {
    948			return -EINVAL;
    949		}
    950	} else if (rinfo->prefix_len > 0) {
    951		if (rinfo->length < 1) {
    952			return -EINVAL;
    953		}
    954	}
    955
    956	pref = rinfo->route_pref;
    957	if (pref == ICMPV6_ROUTER_PREF_INVALID)
    958		return -EINVAL;
    959
    960	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
    961
    962	if (rinfo->length == 3)
    963		prefix = (struct in6_addr *)rinfo->prefix;
    964	else {
    965		/* this function is safe */
    966		ipv6_addr_prefix(&prefix_buf,
    967				 (struct in6_addr *)rinfo->prefix,
    968				 rinfo->prefix_len);
    969		prefix = &prefix_buf;
    970	}
    971
    972	if (rinfo->prefix_len == 0)
    973		rt = rt6_get_dflt_router(net, gwaddr, dev);
    974	else
    975		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
    976					gwaddr, dev);
    977
    978	if (rt && !lifetime) {
    979		ip6_del_rt(net, rt, false);
    980		rt = NULL;
    981	}
    982
    983	if (!rt && lifetime)
    984		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
    985					dev, pref);
    986	else if (rt)
    987		rt->fib6_flags = RTF_ROUTEINFO |
    988				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
    989
    990	if (rt) {
    991		if (!addrconf_finite_timeout(lifetime))
    992			fib6_clean_expires(rt);
    993		else
    994			fib6_set_expires(rt, jiffies + HZ * lifetime);
    995
    996		fib6_info_release(rt);
    997	}
    998	return 0;
    999}
   1000#endif
   1001
   1002/*
   1003 *	Misc support functions
   1004 */
   1005
   1006/* called with rcu_lock held */
   1007static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
   1008{
   1009	struct net_device *dev = res->nh->fib_nh_dev;
   1010
   1011	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
   1012		/* for copies of local routes, dst->dev needs to be the
   1013		 * device if it is a master device, the master device if
    1014	 * the device is enslaved, and the loopback as the default
   1015		 */
   1016		if (netif_is_l3_slave(dev) &&
   1017		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
   1018			dev = l3mdev_master_dev_rcu(dev);
   1019		else if (!netif_is_l3_master(dev))
   1020			dev = dev_net(dev)->loopback_dev;
   1021		/* last case is netif_is_l3_master(dev) is true in which
   1022		 * case we want dev returned to be dev
   1023		 */
   1024	}
   1025
   1026	return dev;
   1027}
   1028
   1029static const int fib6_prop[RTN_MAX + 1] = {
   1030	[RTN_UNSPEC]	= 0,
   1031	[RTN_UNICAST]	= 0,
   1032	[RTN_LOCAL]	= 0,
   1033	[RTN_BROADCAST]	= 0,
   1034	[RTN_ANYCAST]	= 0,
   1035	[RTN_MULTICAST]	= 0,
   1036	[RTN_BLACKHOLE]	= -EINVAL,
   1037	[RTN_UNREACHABLE] = -EHOSTUNREACH,
   1038	[RTN_PROHIBIT]	= -EACCES,
   1039	[RTN_THROW]	= -EAGAIN,
   1040	[RTN_NAT]	= -EINVAL,
   1041	[RTN_XRESOLVE]	= -EINVAL,
   1042};
   1043
   1044static int ip6_rt_type_to_error(u8 fib6_type)
   1045{
   1046	return fib6_prop[fib6_type];
   1047}
   1048
   1049static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
   1050{
   1051	unsigned short flags = 0;
   1052
   1053	if (rt->dst_nocount)
   1054		flags |= DST_NOCOUNT;
   1055	if (rt->dst_nopolicy)
   1056		flags |= DST_NOPOLICY;
   1057
   1058	return flags;
   1059}
   1060
   1061static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
   1062{
   1063	rt->dst.error = ip6_rt_type_to_error(fib6_type);
   1064
   1065	switch (fib6_type) {
   1066	case RTN_BLACKHOLE:
   1067		rt->dst.output = dst_discard_out;
   1068		rt->dst.input = dst_discard;
   1069		break;
   1070	case RTN_PROHIBIT:
   1071		rt->dst.output = ip6_pkt_prohibit_out;
   1072		rt->dst.input = ip6_pkt_prohibit;
   1073		break;
   1074	case RTN_THROW:
   1075	case RTN_UNREACHABLE:
   1076	default:
   1077		rt->dst.output = ip6_pkt_discard_out;
   1078		rt->dst.input = ip6_pkt_discard;
   1079		break;
   1080	}
   1081}
   1082
   1083static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
   1084{
   1085	struct fib6_info *f6i = res->f6i;
   1086
   1087	if (res->fib6_flags & RTF_REJECT) {
   1088		ip6_rt_init_dst_reject(rt, res->fib6_type);
   1089		return;
   1090	}
   1091
   1092	rt->dst.error = 0;
   1093	rt->dst.output = ip6_output;
   1094
   1095	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
   1096		rt->dst.input = ip6_input;
   1097	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
   1098		rt->dst.input = ip6_mc_input;
   1099	} else {
   1100		rt->dst.input = ip6_forward;
   1101	}
   1102
   1103	if (res->nh->fib_nh_lws) {
   1104		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
   1105		lwtunnel_set_redirect(&rt->dst);
   1106	}
   1107
   1108	rt->dst.lastuse = jiffies;
   1109}
   1110
   1111/* Caller must already hold reference to @from */
   1112static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
   1113{
   1114	rt->rt6i_flags &= ~RTF_EXPIRES;
   1115	rcu_assign_pointer(rt->from, from);
   1116	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
   1117}
   1118
   1119/* Caller must already hold reference to f6i in result */
   1120static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
   1121{
   1122	const struct fib6_nh *nh = res->nh;
   1123	const struct net_device *dev = nh->fib_nh_dev;
   1124	struct fib6_info *f6i = res->f6i;
   1125
   1126	ip6_rt_init_dst(rt, res);
   1127
   1128	rt->rt6i_dst = f6i->fib6_dst;
   1129	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
   1130	rt->rt6i_flags = res->fib6_flags;
   1131	if (nh->fib_nh_gw_family) {
   1132		rt->rt6i_gateway = nh->fib_nh_gw6;
   1133		rt->rt6i_flags |= RTF_GATEWAY;
   1134	}
   1135	rt6_set_from(rt, f6i);
   1136#ifdef CONFIG_IPV6_SUBTREES
   1137	rt->rt6i_src = f6i->fib6_src;
   1138#endif
   1139}
   1140
   1141static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
   1142					struct in6_addr *saddr)
   1143{
   1144	struct fib6_node *pn, *sn;
   1145	while (1) {
   1146		if (fn->fn_flags & RTN_TL_ROOT)
   1147			return NULL;
   1148		pn = rcu_dereference(fn->parent);
   1149		sn = FIB6_SUBTREE(pn);
   1150		if (sn && sn != fn)
   1151			fn = fib6_node_lookup(sn, NULL, saddr);
   1152		else
   1153			fn = pn;
   1154		if (fn->fn_flags & RTN_RTINFO)
   1155			return fn;
   1156	}
   1157}
   1158
   1159static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
   1160{
   1161	struct rt6_info *rt = *prt;
   1162
   1163	if (dst_hold_safe(&rt->dst))
   1164		return true;
   1165	if (net) {
   1166		rt = net->ipv6.ip6_null_entry;
   1167		dst_hold(&rt->dst);
   1168	} else {
   1169		rt = NULL;
   1170	}
   1171	*prt = rt;
   1172	return false;
   1173}
   1174
   1175/* called with rcu_lock held */
   1176static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
   1177{
   1178	struct net_device *dev = res->nh->fib_nh_dev;
   1179	struct fib6_info *f6i = res->f6i;
   1180	unsigned short flags;
   1181	struct rt6_info *nrt;
   1182
   1183	if (!fib6_info_hold_safe(f6i))
   1184		goto fallback;
   1185
   1186	flags = fib6_info_dst_flags(f6i);
   1187	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
   1188	if (!nrt) {
   1189		fib6_info_release(f6i);
   1190		goto fallback;
   1191	}
   1192
   1193	ip6_rt_copy_init(nrt, res);
   1194	return nrt;
   1195
   1196fallback:
   1197	nrt = dev_net(dev)->ipv6.ip6_null_entry;
   1198	dst_hold(&nrt->dst);
   1199	return nrt;
   1200}
   1201
   1202INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
   1203					     struct fib6_table *table,
   1204					     struct flowi6 *fl6,
   1205					     const struct sk_buff *skb,
   1206					     int flags)
   1207{
   1208	struct fib6_result res = {};
   1209	struct fib6_node *fn;
   1210	struct rt6_info *rt;
   1211
   1212	rcu_read_lock();
   1213	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
   1214restart:
   1215	res.f6i = rcu_dereference(fn->leaf);
   1216	if (!res.f6i)
   1217		res.f6i = net->ipv6.fib6_null_entry;
   1218	else
   1219		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
   1220				 flags);
   1221
   1222	if (res.f6i == net->ipv6.fib6_null_entry) {
   1223		fn = fib6_backtrack(fn, &fl6->saddr);
   1224		if (fn)
   1225			goto restart;
   1226
   1227		rt = net->ipv6.ip6_null_entry;
   1228		dst_hold(&rt->dst);
   1229		goto out;
   1230	} else if (res.fib6_flags & RTF_REJECT) {
   1231		goto do_create;
   1232	}
   1233
   1234	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
   1235			 fl6->flowi6_oif != 0, skb, flags);
   1236
   1237	/* Search through exception table */
   1238	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
   1239	if (rt) {
   1240		if (ip6_hold_safe(net, &rt))
   1241			dst_use_noref(&rt->dst, jiffies);
   1242	} else {
   1243do_create:
   1244		rt = ip6_create_rt_rcu(&res);
   1245	}
   1246
   1247out:
   1248	trace_fib6_table_lookup(net, &res, table, fl6);
   1249
   1250	rcu_read_unlock();
   1251
   1252	return rt;
   1253}
   1254
   1255struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
   1256				   const struct sk_buff *skb, int flags)
   1257{
   1258	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
   1259}
   1260EXPORT_SYMBOL_GPL(ip6_route_lookup);
   1261
   1262struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
   1263			    const struct in6_addr *saddr, int oif,
   1264			    const struct sk_buff *skb, int strict)
   1265{
   1266	struct flowi6 fl6 = {
   1267		.flowi6_oif = oif,
   1268		.daddr = *daddr,
   1269	};
   1270	struct dst_entry *dst;
   1271	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
   1272
   1273	if (saddr) {
   1274		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
   1275		flags |= RT6_LOOKUP_F_HAS_SADDR;
   1276	}
   1277
   1278	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
   1279	if (dst->error == 0)
   1280		return (struct rt6_info *) dst;
   1281
   1282	dst_release(dst);
   1283
   1284	return NULL;
   1285}
   1286EXPORT_SYMBOL(rt6_lookup);
   1287
   1288/* ip6_ins_rt is called with FREE table->tb6_lock.
    1289 * It takes a new route entry; if the addition fails for any reason,
    1290 * the route is released.
   1291 * Caller must hold dst before calling it.
   1292 */
   1293
   1294static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
   1295			struct netlink_ext_ack *extack)
   1296{
   1297	int err;
   1298	struct fib6_table *table;
   1299
   1300	table = rt->fib6_table;
   1301	spin_lock_bh(&table->tb6_lock);
   1302	err = fib6_add(&table->tb6_root, rt, info, extack);
   1303	spin_unlock_bh(&table->tb6_lock);
   1304
   1305	return err;
   1306}
   1307
   1308int ip6_ins_rt(struct net *net, struct fib6_info *rt)
   1309{
   1310	struct nl_info info = {	.nl_net = net, };
   1311
   1312	return __ip6_ins_rt(rt, &info, NULL);
   1313}
   1314
   1315static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
   1316					   const struct in6_addr *daddr,
   1317					   const struct in6_addr *saddr)
   1318{
   1319	struct fib6_info *f6i = res->f6i;
   1320	struct net_device *dev;
   1321	struct rt6_info *rt;
   1322
   1323	/*
   1324	 *	Clone the route.
   1325	 */
   1326
   1327	if (!fib6_info_hold_safe(f6i))
   1328		return NULL;
   1329
   1330	dev = ip6_rt_get_dev_rcu(res);
   1331	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
   1332	if (!rt) {
   1333		fib6_info_release(f6i);
   1334		return NULL;
   1335	}
   1336
   1337	ip6_rt_copy_init(rt, res);
   1338	rt->rt6i_flags |= RTF_CACHE;
   1339	rt->rt6i_dst.addr = *daddr;
   1340	rt->rt6i_dst.plen = 128;
   1341
   1342	if (!rt6_is_gw_or_nonexthop(res)) {
   1343		if (f6i->fib6_dst.plen != 128 &&
   1344		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
   1345			rt->rt6i_flags |= RTF_ANYCAST;
   1346#ifdef CONFIG_IPV6_SUBTREES
   1347		if (rt->rt6i_src.plen && saddr) {
   1348			rt->rt6i_src.addr = *saddr;
   1349			rt->rt6i_src.plen = 128;
   1350		}
   1351#endif
   1352	}
   1353
   1354	return rt;
   1355}
   1356
   1357static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
   1358{
   1359	struct fib6_info *f6i = res->f6i;
   1360	unsigned short flags = fib6_info_dst_flags(f6i);
   1361	struct net_device *dev;
   1362	struct rt6_info *pcpu_rt;
   1363
   1364	if (!fib6_info_hold_safe(f6i))
   1365		return NULL;
   1366
   1367	rcu_read_lock();
   1368	dev = ip6_rt_get_dev_rcu(res);
   1369	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
   1370	rcu_read_unlock();
   1371	if (!pcpu_rt) {
   1372		fib6_info_release(f6i);
   1373		return NULL;
   1374	}
   1375	ip6_rt_copy_init(pcpu_rt, res);
   1376	pcpu_rt->rt6i_flags |= RTF_PCPU;
   1377
   1378	if (f6i->nh)
   1379		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
   1380
   1381	return pcpu_rt;
   1382}
   1383
   1384static bool rt6_is_valid(const struct rt6_info *rt6)
   1385{
   1386	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
   1387}
   1388
   1389/* It should be called with rcu_read_lock() acquired */
   1390static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
   1391{
   1392	struct rt6_info *pcpu_rt;
   1393
   1394	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
   1395
   1396	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
   1397		struct rt6_info *prev, **p;
   1398
   1399		p = this_cpu_ptr(res->nh->rt6i_pcpu);
   1400		prev = xchg(p, NULL);
   1401		if (prev) {
   1402			dst_dev_put(&prev->dst);
   1403			dst_release(&prev->dst);
   1404		}
   1405
   1406		pcpu_rt = NULL;
   1407	}
   1408
   1409	return pcpu_rt;
   1410}
   1411
   1412static struct rt6_info *rt6_make_pcpu_route(struct net *net,
   1413					    const struct fib6_result *res)
   1414{
   1415	struct rt6_info *pcpu_rt, *prev, **p;
   1416
   1417	pcpu_rt = ip6_rt_pcpu_alloc(res);
   1418	if (!pcpu_rt)
   1419		return NULL;
   1420
   1421	p = this_cpu_ptr(res->nh->rt6i_pcpu);
   1422	prev = cmpxchg(p, NULL, pcpu_rt);
   1423	BUG_ON(prev);
   1424
   1425	if (res->f6i->fib6_destroying) {
   1426		struct fib6_info *from;
   1427
   1428		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
   1429		fib6_info_release(from);
   1430	}
   1431
   1432	return pcpu_rt;
   1433}
   1434
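/* Illustrative note, not part of the original file: each fib6_nh caches
 * one rt6_info per CPU. rt6_get_pcpu_route() drops a cached entry whose
 * sernum (set only for nexthop-object routes) has fallen behind the
 * namespace generation id, and rt6_make_pcpu_route() installs a fresh
 * entry with cmpxchg(); the BUG_ON(prev) documents the invariant that
 * the per-cpu slot is empty at that point.
 */
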
   1435/* exception hash table implementation
   1436 */
   1437static DEFINE_SPINLOCK(rt6_exception_lock);
   1438
   1439/* Remove rt6_ex from hash table and free the memory
   1440 * Caller must hold rt6_exception_lock
   1441 */
   1442static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
   1443				 struct rt6_exception *rt6_ex)
   1444{
   1445	struct fib6_info *from;
   1446	struct net *net;
   1447
   1448	if (!bucket || !rt6_ex)
   1449		return;
   1450
   1451	net = dev_net(rt6_ex->rt6i->dst.dev);
   1452	net->ipv6.rt6_stats->fib_rt_cache--;
   1453
    1454	/* completely purge the exception to allow the held resources to be
    1455	 * released: some [sk] cache may keep the dst around for an unlimited time
   1456	 */
   1457	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
   1458	fib6_info_release(from);
   1459	dst_dev_put(&rt6_ex->rt6i->dst);
   1460
   1461	hlist_del_rcu(&rt6_ex->hlist);
   1462	dst_release(&rt6_ex->rt6i->dst);
   1463	kfree_rcu(rt6_ex, rcu);
   1464	WARN_ON_ONCE(!bucket->depth);
   1465	bucket->depth--;
   1466}
   1467
   1468/* Remove oldest rt6_ex in bucket and free the memory
   1469 * Caller must hold rt6_exception_lock
   1470 */
   1471static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
   1472{
   1473	struct rt6_exception *rt6_ex, *oldest = NULL;
   1474
   1475	if (!bucket)
   1476		return;
   1477
   1478	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
   1479		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
   1480			oldest = rt6_ex;
   1481	}
   1482	rt6_remove_exception(bucket, oldest);
   1483}
   1484
   1485static u32 rt6_exception_hash(const struct in6_addr *dst,
   1486			      const struct in6_addr *src)
   1487{
   1488	static siphash_aligned_key_t rt6_exception_key;
   1489	struct {
   1490		struct in6_addr dst;
   1491		struct in6_addr src;
   1492	} __aligned(SIPHASH_ALIGNMENT) combined = {
   1493		.dst = *dst,
   1494	};
   1495	u64 val;
   1496
   1497	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
   1498
   1499#ifdef CONFIG_IPV6_SUBTREES
   1500	if (src)
   1501		combined.src = *src;
   1502#endif
   1503	val = siphash(&combined, sizeof(combined), &rt6_exception_key);
   1504
   1505	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
   1506}
   1507
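/* Illustrative note, not part of the original file: the bucket index is
 * a keyed siphash over the (dst, src) pair folded down to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, i.e. callers locate a bucket
 * roughly as
 *
 *	bucket = base_bucket + rt6_exception_hash(daddr, saddr);
 *
 * The per-boot random key keeps bucket placement unpredictable to
 * remote senders, hampering hash-flooding of the exception table.
 */
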
   1508/* Helper function to find the cached rt in the hash table
   1509 * and update bucket pointer to point to the bucket for this
   1510 * (daddr, saddr) pair
   1511 * Caller must hold rt6_exception_lock
   1512 */
   1513static struct rt6_exception *
   1514__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
   1515			      const struct in6_addr *daddr,
   1516			      const struct in6_addr *saddr)
   1517{
   1518	struct rt6_exception *rt6_ex;
   1519	u32 hval;
   1520
   1521	if (!(*bucket) || !daddr)
   1522		return NULL;
   1523
   1524	hval = rt6_exception_hash(daddr, saddr);
   1525	*bucket += hval;
   1526
   1527	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
   1528		struct rt6_info *rt6 = rt6_ex->rt6i;
   1529		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
   1530
   1531#ifdef CONFIG_IPV6_SUBTREES
   1532		if (matched && saddr)
   1533			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
   1534#endif
   1535		if (matched)
   1536			return rt6_ex;
   1537	}
   1538	return NULL;
   1539}
   1540
   1541/* Helper function to find the cached rt in the hash table
   1542 * and update bucket pointer to point to the bucket for this
   1543 * (daddr, saddr) pair
   1544 * Caller must hold rcu_read_lock()
   1545 */
   1546static struct rt6_exception *
   1547__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
   1548			 const struct in6_addr *daddr,
   1549			 const struct in6_addr *saddr)
   1550{
   1551	struct rt6_exception *rt6_ex;
   1552	u32 hval;
   1553
   1554	WARN_ON_ONCE(!rcu_read_lock_held());
   1555
   1556	if (!(*bucket) || !daddr)
   1557		return NULL;
   1558
   1559	hval = rt6_exception_hash(daddr, saddr);
   1560	*bucket += hval;
   1561
   1562	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
   1563		struct rt6_info *rt6 = rt6_ex->rt6i;
   1564		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
   1565
   1566#ifdef CONFIG_IPV6_SUBTREES
   1567		if (matched && saddr)
   1568			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
   1569#endif
   1570		if (matched)
   1571			return rt6_ex;
   1572	}
   1573	return NULL;
   1574}
   1575
   1576static unsigned int fib6_mtu(const struct fib6_result *res)
   1577{
   1578	const struct fib6_nh *nh = res->nh;
   1579	unsigned int mtu;
   1580
   1581	if (res->f6i->fib6_pmtu) {
   1582		mtu = res->f6i->fib6_pmtu;
   1583	} else {
   1584		struct net_device *dev = nh->fib_nh_dev;
   1585		struct inet6_dev *idev;
   1586
   1587		rcu_read_lock();
   1588		idev = __in6_dev_get(dev);
   1589		mtu = idev->cnf.mtu6;
   1590		rcu_read_unlock();
   1591	}
   1592
   1593	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
   1594
   1595	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
   1596}
   1597
   1598#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
   1599
   1600/* used when the flushed bit is not relevant, only access to the bucket
    1601 * (i.e., all bucket users except rt6_insert_exception);
   1602 *
   1603 * called under rcu lock; sometimes called with rt6_exception_lock held
   1604 */
   1605static
   1606struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
   1607						       spinlock_t *lock)
   1608{
   1609	struct rt6_exception_bucket *bucket;
   1610
   1611	if (lock)
   1612		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
   1613						   lockdep_is_held(lock));
   1614	else
   1615		bucket = rcu_dereference(nh->rt6i_exception_bucket);
   1616
   1617	/* remove bucket flushed bit if set */
   1618	if (bucket) {
   1619		unsigned long p = (unsigned long)bucket;
   1620
   1621		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
   1622		bucket = (struct rt6_exception_bucket *)p;
   1623	}
   1624
   1625	return bucket;
   1626}
   1627
   1628static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
   1629{
   1630	unsigned long p = (unsigned long)bucket;
   1631
   1632	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
   1633}
   1634
   1635/* called with rt6_exception_lock held */
   1636static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
   1637					      spinlock_t *lock)
   1638{
   1639	struct rt6_exception_bucket *bucket;
   1640	unsigned long p;
   1641
   1642	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
   1643					   lockdep_is_held(lock));
   1644
   1645	p = (unsigned long)bucket;
   1646	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
   1647	bucket = (struct rt6_exception_bucket *)p;
   1648	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
   1649}
   1650
   1651static int rt6_insert_exception(struct rt6_info *nrt,
   1652				const struct fib6_result *res)
   1653{
   1654	struct net *net = dev_net(nrt->dst.dev);
   1655	struct rt6_exception_bucket *bucket;
   1656	struct fib6_info *f6i = res->f6i;
   1657	struct in6_addr *src_key = NULL;
   1658	struct rt6_exception *rt6_ex;
   1659	struct fib6_nh *nh = res->nh;
   1660	int max_depth;
   1661	int err = 0;
   1662
   1663	spin_lock_bh(&rt6_exception_lock);
   1664
   1665	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
   1666					  lockdep_is_held(&rt6_exception_lock));
   1667	if (!bucket) {
   1668		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
   1669				 GFP_ATOMIC);
   1670		if (!bucket) {
   1671			err = -ENOMEM;
   1672			goto out;
   1673		}
   1674		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
   1675	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
   1676		err = -EINVAL;
   1677		goto out;
   1678	}
   1679
   1680#ifdef CONFIG_IPV6_SUBTREES
   1681	/* fib6_src.plen != 0 indicates f6i is in subtree
   1682	 * and exception table is indexed by a hash of
   1683	 * both fib6_dst and fib6_src.
   1684	 * Otherwise, the exception table is indexed by
   1685	 * a hash of only fib6_dst.
   1686	 */
   1687	if (f6i->fib6_src.plen)
   1688		src_key = &nrt->rt6i_src.addr;
   1689#endif
   1690	/* rt6_mtu_change() might lower mtu on f6i.
   1691	 * Only insert this exception route if its mtu
   1692	 * is less than f6i's mtu value.
   1693	 */
   1694	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
   1695		err = -EINVAL;
   1696		goto out;
   1697	}
   1698
   1699	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
   1700					       src_key);
   1701	if (rt6_ex)
   1702		rt6_remove_exception(bucket, rt6_ex);
   1703
   1704	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
   1705	if (!rt6_ex) {
   1706		err = -ENOMEM;
   1707		goto out;
   1708	}
   1709	rt6_ex->rt6i = nrt;
   1710	rt6_ex->stamp = jiffies;
   1711	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
   1712	bucket->depth++;
   1713	net->ipv6.rt6_stats->fib_rt_cache++;
   1714
    1715	/* Randomize the max depth to mitigate side-channel attacks. */
   1716	max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
   1717	while (bucket->depth > max_depth)
   1718		rt6_exception_remove_oldest(bucket);
   1719
   1720out:
   1721	spin_unlock_bh(&rt6_exception_lock);
   1722
   1723	/* Update fn->fn_sernum to invalidate all cached dst */
   1724	if (!err) {
   1725		spin_lock_bh(&f6i->fib6_table->tb6_lock);
   1726		fib6_update_sernum(net, f6i);
   1727		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
   1728		fib6_force_start_gc(net);
   1729	}
   1730
   1731	return err;
   1732}
   1733
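/* Illustrative note, not part of the original file: with the randomized
 * limit in rt6_insert_exception() above, max_depth is drawn uniformly
 * from [FIB6_MAX_DEPTH, 2 * FIB6_MAX_DEPTH), so a remote sender cannot
 * reliably infer bucket occupancy from eviction behaviour; eviction
 * always removes the entry with the oldest stamp first.
 */
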
   1734static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
   1735{
   1736	struct rt6_exception_bucket *bucket;
   1737	struct rt6_exception *rt6_ex;
   1738	struct hlist_node *tmp;
   1739	int i;
   1740
   1741	spin_lock_bh(&rt6_exception_lock);
   1742
   1743	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
   1744	if (!bucket)
   1745		goto out;
   1746
    1747	/* Prevent rt6_insert_exception() from recreating the bucket list */
   1748	if (!from)
   1749		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
   1750
   1751	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
   1752		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
   1753			if (!from ||
   1754			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
   1755				rt6_remove_exception(bucket, rt6_ex);
   1756		}
   1757		WARN_ON_ONCE(!from && bucket->depth);
   1758		bucket++;
   1759	}
   1760out:
   1761	spin_unlock_bh(&rt6_exception_lock);
   1762}
   1763
   1764static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
   1765{
   1766	struct fib6_info *f6i = arg;
   1767
   1768	fib6_nh_flush_exceptions(nh, f6i);
   1769
   1770	return 0;
   1771}
   1772
   1773void rt6_flush_exceptions(struct fib6_info *f6i)
   1774{
   1775	if (f6i->nh)
   1776		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
   1777					 f6i);
   1778	else
   1779		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
   1780}
   1781
   1782/* Find cached rt in the hash table inside passed in rt
   1783 * Caller has to hold rcu_read_lock()
   1784 */
   1785static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
   1786					   const struct in6_addr *daddr,
   1787					   const struct in6_addr *saddr)
   1788{
   1789	const struct in6_addr *src_key = NULL;
   1790	struct rt6_exception_bucket *bucket;
   1791	struct rt6_exception *rt6_ex;
   1792	struct rt6_info *ret = NULL;
   1793
   1794#ifdef CONFIG_IPV6_SUBTREES
    1795	/* fib6_src.plen != 0 indicates f6i is in subtree
   1796	 * and exception table is indexed by a hash of
   1797	 * both fib6_dst and fib6_src.
   1798	 * However, the src addr used to create the hash
   1799	 * might not be exactly the passed in saddr which
   1800	 * is a /128 addr from the flow.
   1801	 * So we need to use f6i->fib6_src to redo lookup
   1802	 * if the passed in saddr does not find anything.
   1803	 * (See the logic in ip6_rt_cache_alloc() on how
   1804	 * rt->rt6i_src is updated.)
   1805	 */
   1806	if (res->f6i->fib6_src.plen)
   1807		src_key = saddr;
   1808find_ex:
   1809#endif
   1810	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
   1811	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
   1812
   1813	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
   1814		ret = rt6_ex->rt6i;
   1815
   1816#ifdef CONFIG_IPV6_SUBTREES
   1817	/* Use fib6_src as src_key and redo lookup */
   1818	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
   1819		src_key = &res->f6i->fib6_src.addr;
   1820		goto find_ex;
   1821	}
   1822#endif
   1823
   1824	return ret;
   1825}
   1826
    1827	/* Remove the passed-in cached rt from the hash table that contains it */
   1828static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
   1829				    const struct rt6_info *rt)
   1830{
   1831	const struct in6_addr *src_key = NULL;
   1832	struct rt6_exception_bucket *bucket;
   1833	struct rt6_exception *rt6_ex;
   1834	int err;
   1835
   1836	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
   1837		return -ENOENT;
   1838
   1839	spin_lock_bh(&rt6_exception_lock);
   1840	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
   1841
   1842#ifdef CONFIG_IPV6_SUBTREES
   1843	/* rt6i_src.plen != 0 indicates 'from' is in subtree
   1844	 * and exception table is indexed by a hash of
   1845	 * both rt6i_dst and rt6i_src.
   1846	 * Otherwise, the exception table is indexed by
   1847	 * a hash of only rt6i_dst.
   1848	 */
   1849	if (plen)
   1850		src_key = &rt->rt6i_src.addr;
   1851#endif
   1852	rt6_ex = __rt6_find_exception_spinlock(&bucket,
   1853					       &rt->rt6i_dst.addr,
   1854					       src_key);
   1855	if (rt6_ex) {
   1856		rt6_remove_exception(bucket, rt6_ex);
   1857		err = 0;
   1858	} else {
   1859		err = -ENOENT;
   1860	}
   1861
   1862	spin_unlock_bh(&rt6_exception_lock);
   1863	return err;
   1864}
   1865
   1866struct fib6_nh_excptn_arg {
   1867	struct rt6_info	*rt;
   1868	int		plen;
   1869};
   1870
   1871static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
   1872{
   1873	struct fib6_nh_excptn_arg *arg = _arg;
   1874	int err;
   1875
   1876	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
   1877	if (err == 0)
   1878		return 1;
   1879
   1880	return 0;
   1881}
   1882
   1883static int rt6_remove_exception_rt(struct rt6_info *rt)
   1884{
   1885	struct fib6_info *from;
   1886
   1887	from = rcu_dereference(rt->from);
   1888	if (!from || !(rt->rt6i_flags & RTF_CACHE))
   1889		return -EINVAL;
   1890
   1891	if (from->nh) {
   1892		struct fib6_nh_excptn_arg arg = {
   1893			.rt = rt,
   1894			.plen = from->fib6_src.plen
   1895		};
   1896		int rc;
   1897
   1898		/* rc = 1 means an entry was found */
   1899		rc = nexthop_for_each_fib6_nh(from->nh,
   1900					      rt6_nh_remove_exception_rt,
   1901					      &arg);
   1902		return rc ? 0 : -ENOENT;
   1903	}
   1904
   1905	return fib6_nh_remove_exception(from->fib6_nh,
   1906					from->fib6_src.plen, rt);
   1907}
   1908
    1909	/* Find the rt6_ex which contains the passed-in rt cache and
    1910	 * refresh its stamp
    1911	 */
   1912static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
   1913				     const struct rt6_info *rt)
   1914{
   1915	const struct in6_addr *src_key = NULL;
   1916	struct rt6_exception_bucket *bucket;
   1917	struct rt6_exception *rt6_ex;
   1918
   1919	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
   1920#ifdef CONFIG_IPV6_SUBTREES
   1921	/* rt6i_src.plen != 0 indicates 'from' is in subtree
   1922	 * and exception table is indexed by a hash of
   1923	 * both rt6i_dst and rt6i_src.
   1924	 * Otherwise, the exception table is indexed by
   1925	 * a hash of only rt6i_dst.
   1926	 */
   1927	if (plen)
   1928		src_key = &rt->rt6i_src.addr;
   1929#endif
   1930	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
   1931	if (rt6_ex)
   1932		rt6_ex->stamp = jiffies;
   1933}
   1934
   1935struct fib6_nh_match_arg {
   1936	const struct net_device *dev;
   1937	const struct in6_addr	*gw;
   1938	struct fib6_nh		*match;
   1939};
   1940
    1941	/* determine if fib6_nh has the given device and gateway */
   1942static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
   1943{
   1944	struct fib6_nh_match_arg *arg = _arg;
   1945
   1946	if (arg->dev != nh->fib_nh_dev ||
   1947	    (arg->gw && !nh->fib_nh_gw_family) ||
   1948	    (!arg->gw && nh->fib_nh_gw_family) ||
   1949	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
   1950		return 0;
   1951
   1952	arg->match = nh;
   1953
   1954	/* found a match, break the loop */
   1955	return 1;
   1956}
   1957
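        	/* Refresh the aging stamp on the exception entry backing @rt so
        	 * that rt6_age_exceptions() does not prune it as unused. If the
        	 * origin fib6_info uses a nexthop group, first find the fib6_nh
        	 * matching the cached route's device and gateway.
        	 */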
   1958static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
   1959{
   1960	struct fib6_info *from;
   1961	struct fib6_nh *fib6_nh;
   1962
   1963	rcu_read_lock();
   1964
   1965	from = rcu_dereference(rt->from);
   1966	if (!from || !(rt->rt6i_flags & RTF_CACHE))
   1967		goto unlock;
   1968
   1969	if (from->nh) {
   1970		struct fib6_nh_match_arg arg = {
   1971			.dev = rt->dst.dev,
   1972			.gw = &rt->rt6i_gateway,
   1973		};
   1974
   1975		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
   1976
   1977		if (!arg.match)
   1978			goto unlock;
   1979		fib6_nh = arg.match;
   1980	} else {
   1981		fib6_nh = from->fib6_nh;
   1982	}
   1983	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
   1984unlock:
   1985	rcu_read_unlock();
   1986}
   1987
   1988static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
   1989					 struct rt6_info *rt, int mtu)
   1990{
   1991	/* If the new MTU is lower than the route PMTU, this new MTU will be the
   1992	 * lowest MTU in the path: always allow updating the route PMTU to
   1993	 * reflect PMTU decreases.
   1994	 *
   1995	 * If the new MTU is higher, and the route PMTU is equal to the local
   1996	 * MTU, this means the old MTU is the lowest in the path, so allow
   1997	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
   1998	 * handle this.
   1999	 */
   2000
   2001	if (dst_mtu(&rt->dst) >= mtu)
   2002		return true;
   2003
   2004	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
   2005		return true;
   2006
   2007	return false;
   2008}
   2009
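        	/* Walk every exception bucket of @nh and update the cached MTU
        	 * wherever rt6_mtu_change_route_allowed() permits it. Called with
        	 * rt6_exception_lock held.
        	 */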
   2010static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
   2011				       const struct fib6_nh *nh, int mtu)
   2012{
   2013	struct rt6_exception_bucket *bucket;
   2014	struct rt6_exception *rt6_ex;
   2015	int i;
   2016
   2017	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
   2018	if (!bucket)
   2019		return;
   2020
   2021	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
   2022		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
   2023			struct rt6_info *entry = rt6_ex->rt6i;
   2024
   2025			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
   2026			 * route), the metrics of its rt->from have already
   2027			 * been updated.
   2028			 */
   2029			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
   2030			    rt6_mtu_change_route_allowed(idev, entry, mtu))
   2031				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
   2032		}
   2033		bucket++;
   2034	}
   2035}
   2036
   2037#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
   2038
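        	/* Remove every RTF_CACHE clone of @nh whose gateway
        	 * is @gateway.
        	 */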
   2039static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
   2040					    const struct in6_addr *gateway)
   2041{
   2042	struct rt6_exception_bucket *bucket;
   2043	struct rt6_exception *rt6_ex;
   2044	struct hlist_node *tmp;
   2045	int i;
   2046
   2047	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
   2048		return;
   2049
   2050	spin_lock_bh(&rt6_exception_lock);
   2051	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
   2052	if (bucket) {
   2053		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
   2054			hlist_for_each_entry_safe(rt6_ex, tmp,
   2055						  &bucket->chain, hlist) {
   2056				struct rt6_info *entry = rt6_ex->rt6i;
   2057
   2058				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
   2059				    RTF_CACHE_GATEWAY &&
   2060				    ipv6_addr_equal(gateway,
   2061						    &entry->rt6i_gateway)) {
   2062					rt6_remove_exception(bucket, rt6_ex);
   2063				}
   2064			}
   2065			bucket++;
   2066		}
   2067	}
   2068
   2069	spin_unlock_bh(&rt6_exception_lock);
   2070}
   2071
   2072static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
   2073				      struct rt6_exception *rt6_ex,
   2074				      struct fib6_gc_args *gc_args,
   2075				      unsigned long now)
   2076{
   2077	struct rt6_info *rt = rt6_ex->rt6i;
   2078
    2079		/* we are pruning and obsoleting aged-out and non-gateway exceptions
    2080		 * even if others still have references to them, so that on the next
    2081		 * dst_check() such references can be dropped.
    2082		 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
    2083		 * expired, independently of their aging, as per RFC 8201 section 4
    2084		 */
   2085	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
   2086		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
   2087			RT6_TRACE("aging clone %p\n", rt);
   2088			rt6_remove_exception(bucket, rt6_ex);
   2089			return;
   2090		}
   2091	} else if (time_after(jiffies, rt->dst.expires)) {
   2092		RT6_TRACE("purging expired route %p\n", rt);
   2093		rt6_remove_exception(bucket, rt6_ex);
   2094		return;
   2095	}
   2096
   2097	if (rt->rt6i_flags & RTF_GATEWAY) {
   2098		struct neighbour *neigh;
   2099
   2100		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
   2101
   2102		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
   2103			RT6_TRACE("purging route %p via non-router but gateway\n",
   2104				  rt);
   2105			rt6_remove_exception(bucket, rt6_ex);
   2106			return;
   2107		}
   2108	}
   2109
   2110	gc_args->more++;
   2111}
   2112
   2113static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
   2114				   struct fib6_gc_args *gc_args,
   2115				   unsigned long now)
   2116{
   2117	struct rt6_exception_bucket *bucket;
   2118	struct rt6_exception *rt6_ex;
   2119	struct hlist_node *tmp;
   2120	int i;
   2121
   2122	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
   2123		return;
   2124
   2125	rcu_read_lock_bh();
   2126	spin_lock(&rt6_exception_lock);
   2127	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
   2128	if (bucket) {
   2129		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
   2130			hlist_for_each_entry_safe(rt6_ex, tmp,
   2131						  &bucket->chain, hlist) {
   2132				rt6_age_examine_exception(bucket, rt6_ex,
   2133							  gc_args, now);
   2134			}
   2135			bucket++;
   2136		}
   2137	}
   2138	spin_unlock(&rt6_exception_lock);
   2139	rcu_read_unlock_bh();
   2140}
   2141
   2142struct fib6_nh_age_excptn_arg {
   2143	struct fib6_gc_args	*gc_args;
   2144	unsigned long		now;
   2145};
   2146
   2147static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
   2148{
   2149	struct fib6_nh_age_excptn_arg *arg = _arg;
   2150
   2151	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
   2152	return 0;
   2153}
   2154
   2155void rt6_age_exceptions(struct fib6_info *f6i,
   2156			struct fib6_gc_args *gc_args,
   2157			unsigned long now)
   2158{
   2159	if (f6i->nh) {
   2160		struct fib6_nh_age_excptn_arg arg = {
   2161			.gc_args = gc_args,
   2162			.now = now
   2163		};
   2164
   2165		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
   2166					 &arg);
   2167	} else {
   2168		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
   2169	}
   2170}
   2171
   2172/* must be called with rcu lock held */
   2173int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
   2174		      struct flowi6 *fl6, struct fib6_result *res, int strict)
   2175{
   2176	struct fib6_node *fn, *saved_fn;
   2177
   2178	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
   2179	saved_fn = fn;
   2180
   2181redo_rt6_select:
   2182	rt6_select(net, fn, oif, res, strict);
   2183	if (res->f6i == net->ipv6.fib6_null_entry) {
   2184		fn = fib6_backtrack(fn, &fl6->saddr);
   2185		if (fn)
   2186			goto redo_rt6_select;
   2187		else if (strict & RT6_LOOKUP_F_REACHABLE) {
   2188			/* also consider unreachable route */
   2189			strict &= ~RT6_LOOKUP_F_REACHABLE;
   2190			fn = saved_fn;
   2191			goto redo_rt6_select;
   2192		}
   2193	}
   2194
   2195	trace_fib6_table_lookup(net, res, table, fl6);
   2196
   2197	return 0;
   2198}
   2199
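        	/* Core policy-routing lookup: resolve the fib6 entry for @fl6,
        	 * prefer a clone from the exception table, and otherwise hand out
        	 * (or create) a per-cpu copy of the route. Unless the caller
        	 * passes RT6_LOOKUP_F_DST_NOREF, a reference is taken on the
        	 * returned dst.
        	 */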
   2200struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
   2201			       int oif, struct flowi6 *fl6,
   2202			       const struct sk_buff *skb, int flags)
   2203{
   2204	struct fib6_result res = {};
   2205	struct rt6_info *rt = NULL;
   2206	int strict = 0;
   2207
   2208	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
   2209		     !rcu_read_lock_held());
   2210
   2211	strict |= flags & RT6_LOOKUP_F_IFACE;
   2212	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
   2213	if (net->ipv6.devconf_all->forwarding == 0)
   2214		strict |= RT6_LOOKUP_F_REACHABLE;
   2215
   2216	rcu_read_lock();
   2217
   2218	fib6_table_lookup(net, table, oif, fl6, &res, strict);
   2219	if (res.f6i == net->ipv6.fib6_null_entry)
   2220		goto out;
   2221
   2222	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
   2223
    2224		/* Search through exception table */
   2225	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
   2226	if (rt) {
   2227		goto out;
   2228	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
   2229			    !res.nh->fib_nh_gw_family)) {
    2230			/* Create a RTF_CACHE clone which will not be
    2231			 * owned by the fib6 tree.  It is for the special case where
    2232			 * the daddr in the skb during the neighbor look-up is different
    2233			 * from the fl6->daddr used to look up the route here.
    2234			 */
   2235		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
   2236
   2237		if (rt) {
   2238			/* 1 refcnt is taken during ip6_rt_cache_alloc().
   2239			 * As rt6_uncached_list_add() does not consume refcnt,
   2240			 * this refcnt is always returned to the caller even
   2241			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
   2242			 */
   2243			rt6_uncached_list_add(rt);
   2244			rcu_read_unlock();
   2245
   2246			return rt;
   2247		}
   2248	} else {
   2249		/* Get a percpu copy */
   2250		local_bh_disable();
   2251		rt = rt6_get_pcpu_route(&res);
   2252
   2253		if (!rt)
   2254			rt = rt6_make_pcpu_route(net, &res);
   2255
   2256		local_bh_enable();
   2257	}
   2258out:
   2259	if (!rt)
   2260		rt = net->ipv6.ip6_null_entry;
   2261	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
   2262		ip6_hold_safe(net, &rt);
   2263	rcu_read_unlock();
   2264
   2265	return rt;
   2266}
   2267EXPORT_SYMBOL_GPL(ip6_pol_route);
   2268
   2269INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
   2270					    struct fib6_table *table,
   2271					    struct flowi6 *fl6,
   2272					    const struct sk_buff *skb,
   2273					    int flags)
   2274{
   2275	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
   2276}
   2277
   2278struct dst_entry *ip6_route_input_lookup(struct net *net,
   2279					 struct net_device *dev,
   2280					 struct flowi6 *fl6,
   2281					 const struct sk_buff *skb,
   2282					 int flags)
   2283{
   2284	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
   2285		flags |= RT6_LOOKUP_F_IFACE;
   2286
   2287	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
   2288}
   2289EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
   2290
   2291static void ip6_multipath_l3_keys(const struct sk_buff *skb,
   2292				  struct flow_keys *keys,
   2293				  struct flow_keys *flkeys)
   2294{
   2295	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
   2296	const struct ipv6hdr *key_iph = outer_iph;
   2297	struct flow_keys *_flkeys = flkeys;
   2298	const struct ipv6hdr *inner_iph;
   2299	const struct icmp6hdr *icmph;
   2300	struct ipv6hdr _inner_iph;
   2301	struct icmp6hdr _icmph;
   2302
   2303	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
   2304		goto out;
   2305
   2306	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
   2307				   sizeof(_icmph), &_icmph);
   2308	if (!icmph)
   2309		goto out;
   2310
   2311	if (!icmpv6_is_err(icmph->icmp6_type))
   2312		goto out;
   2313
   2314	inner_iph = skb_header_pointer(skb,
   2315				       skb_transport_offset(skb) + sizeof(*icmph),
   2316				       sizeof(_inner_iph), &_inner_iph);
   2317	if (!inner_iph)
   2318		goto out;
   2319
   2320	key_iph = inner_iph;
   2321	_flkeys = NULL;
   2322out:
   2323	if (_flkeys) {
   2324		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
   2325		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
   2326		keys->tags.flow_label = _flkeys->tags.flow_label;
   2327		keys->basic.ip_proto = _flkeys->basic.ip_proto;
   2328	} else {
   2329		keys->addrs.v6addrs.src = key_iph->saddr;
   2330		keys->addrs.v6addrs.dst = key_iph->daddr;
   2331		keys->tags.flow_label = ip6_flowlabel(key_iph);
   2332		keys->basic.ip_proto = key_iph->nexthdr;
   2333	}
   2334}
   2335
   2336static u32 rt6_multipath_custom_hash_outer(const struct net *net,
   2337					   const struct sk_buff *skb,
   2338					   bool *p_has_inner)
   2339{
   2340	u32 hash_fields = ip6_multipath_hash_fields(net);
   2341	struct flow_keys keys, hash_keys;
   2342
   2343	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
   2344		return 0;
   2345
   2346	memset(&hash_keys, 0, sizeof(hash_keys));
   2347	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
   2348
   2349	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2350	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
   2351		hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
   2352	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
   2353		hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
   2354	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
   2355		hash_keys.basic.ip_proto = keys.basic.ip_proto;
   2356	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
   2357		hash_keys.tags.flow_label = keys.tags.flow_label;
   2358	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
   2359		hash_keys.ports.src = keys.ports.src;
   2360	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
   2361		hash_keys.ports.dst = keys.ports.dst;
   2362
   2363	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
   2364	return flow_hash_from_keys(&hash_keys);
   2365}
   2366
   2367static u32 rt6_multipath_custom_hash_inner(const struct net *net,
   2368					   const struct sk_buff *skb,
   2369					   bool has_inner)
   2370{
   2371	u32 hash_fields = ip6_multipath_hash_fields(net);
   2372	struct flow_keys keys, hash_keys;
   2373
   2374	/* We assume the packet carries an encapsulation, but if none was
   2375	 * encountered during dissection of the outer flow, then there is no
   2376	 * point in calling the flow dissector again.
   2377	 */
   2378	if (!has_inner)
   2379		return 0;
   2380
   2381	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
   2382		return 0;
   2383
   2384	memset(&hash_keys, 0, sizeof(hash_keys));
   2385	skb_flow_dissect_flow_keys(skb, &keys, 0);
   2386
   2387	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
   2388		return 0;
   2389
   2390	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
   2391		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
   2392		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
   2393			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
   2394		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
   2395			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
   2396	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
   2397		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2398		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
   2399			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
   2400		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
   2401			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
   2402		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
   2403			hash_keys.tags.flow_label = keys.tags.flow_label;
   2404	}
   2405
   2406	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
   2407		hash_keys.basic.ip_proto = keys.basic.ip_proto;
   2408	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
   2409		hash_keys.ports.src = keys.ports.src;
   2410	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
   2411		hash_keys.ports.dst = keys.ports.dst;
   2412
   2413	return flow_hash_from_keys(&hash_keys);
   2414}
   2415
   2416static u32 rt6_multipath_custom_hash_skb(const struct net *net,
   2417					 const struct sk_buff *skb)
   2418{
   2419	u32 mhash, mhash_inner;
   2420	bool has_inner = true;
   2421
   2422	mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
   2423	mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
   2424
   2425	return jhash_2words(mhash, mhash_inner, 0);
   2426}
   2427
   2428static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
   2429					 const struct flowi6 *fl6)
   2430{
   2431	u32 hash_fields = ip6_multipath_hash_fields(net);
   2432	struct flow_keys hash_keys;
   2433
   2434	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
   2435		return 0;
   2436
   2437	memset(&hash_keys, 0, sizeof(hash_keys));
   2438	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2439	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
   2440		hash_keys.addrs.v6addrs.src = fl6->saddr;
   2441	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
   2442		hash_keys.addrs.v6addrs.dst = fl6->daddr;
   2443	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
   2444		hash_keys.basic.ip_proto = fl6->flowi6_proto;
   2445	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
   2446		hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
   2447	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
   2448		hash_keys.ports.src = fl6->fl6_sport;
   2449	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
   2450		hash_keys.ports.dst = fl6->fl6_dport;
   2451
   2452	return flow_hash_from_keys(&hash_keys);
   2453}
   2454
    2455	/* If skb is set it will be used, and fl6 can then be NULL */
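        	/* The policy is selected per netns, e.g. via
        	 *	sysctl -w net.ipv6.fib_multipath_hash_policy=1
        	 * 0 - L3 (addresses, flow label, protocol)
        	 * 1 - L4 five-tuple
        	 * 2 - L3, or inner L3 if the packet is encapsulated
        	 * 3 - custom field set from net.ipv6.fib_multipath_hash_fields
        	 */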
   2456u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
   2457		       const struct sk_buff *skb, struct flow_keys *flkeys)
   2458{
   2459	struct flow_keys hash_keys;
   2460	u32 mhash = 0;
   2461
   2462	switch (ip6_multipath_hash_policy(net)) {
   2463	case 0:
   2464		memset(&hash_keys, 0, sizeof(hash_keys));
   2465		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2466		if (skb) {
   2467			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
   2468		} else {
   2469			hash_keys.addrs.v6addrs.src = fl6->saddr;
   2470			hash_keys.addrs.v6addrs.dst = fl6->daddr;
   2471			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
   2472			hash_keys.basic.ip_proto = fl6->flowi6_proto;
   2473		}
   2474		mhash = flow_hash_from_keys(&hash_keys);
   2475		break;
   2476	case 1:
   2477		if (skb) {
   2478			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
   2479			struct flow_keys keys;
   2480
    2481				/* short-circuit if we already have an L4 hash present */
   2482			if (skb->l4_hash)
   2483				return skb_get_hash_raw(skb) >> 1;
   2484
   2485			memset(&hash_keys, 0, sizeof(hash_keys));
   2486
   2487			if (!flkeys) {
   2488				skb_flow_dissect_flow_keys(skb, &keys, flag);
   2489				flkeys = &keys;
   2490			}
   2491			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2492			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
   2493			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
   2494			hash_keys.ports.src = flkeys->ports.src;
   2495			hash_keys.ports.dst = flkeys->ports.dst;
   2496			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
   2497		} else {
   2498			memset(&hash_keys, 0, sizeof(hash_keys));
   2499			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2500			hash_keys.addrs.v6addrs.src = fl6->saddr;
   2501			hash_keys.addrs.v6addrs.dst = fl6->daddr;
   2502			hash_keys.ports.src = fl6->fl6_sport;
   2503			hash_keys.ports.dst = fl6->fl6_dport;
   2504			hash_keys.basic.ip_proto = fl6->flowi6_proto;
   2505		}
   2506		mhash = flow_hash_from_keys(&hash_keys);
   2507		break;
   2508	case 2:
   2509		memset(&hash_keys, 0, sizeof(hash_keys));
   2510		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2511		if (skb) {
   2512			struct flow_keys keys;
   2513
   2514			if (!flkeys) {
   2515				skb_flow_dissect_flow_keys(skb, &keys, 0);
   2516				flkeys = &keys;
   2517			}
   2518
   2519			/* Inner can be v4 or v6 */
   2520			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
   2521				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
   2522				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
   2523				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
   2524			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
   2525				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2526				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
   2527				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
   2528				hash_keys.tags.flow_label = flkeys->tags.flow_label;
   2529				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
   2530			} else {
   2531				/* Same as case 0 */
   2532				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2533				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
   2534			}
   2535		} else {
   2536			/* Same as case 0 */
   2537			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
   2538			hash_keys.addrs.v6addrs.src = fl6->saddr;
   2539			hash_keys.addrs.v6addrs.dst = fl6->daddr;
   2540			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
   2541			hash_keys.basic.ip_proto = fl6->flowi6_proto;
   2542		}
   2543		mhash = flow_hash_from_keys(&hash_keys);
   2544		break;
   2545	case 3:
   2546		if (skb)
   2547			mhash = rt6_multipath_custom_hash_skb(net, skb);
   2548		else
   2549			mhash = rt6_multipath_custom_hash_fl6(net, fl6);
   2550		break;
   2551	}
   2552
   2553	return mhash >> 1;
   2554}
   2555
    2556	/* Called with rcu read lock held */
   2557void ip6_route_input(struct sk_buff *skb)
   2558{
   2559	const struct ipv6hdr *iph = ipv6_hdr(skb);
   2560	struct net *net = dev_net(skb->dev);
   2561	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
   2562	struct ip_tunnel_info *tun_info;
   2563	struct flowi6 fl6 = {
   2564		.flowi6_iif = skb->dev->ifindex,
   2565		.daddr = iph->daddr,
   2566		.saddr = iph->saddr,
   2567		.flowlabel = ip6_flowinfo(iph),
   2568		.flowi6_mark = skb->mark,
   2569		.flowi6_proto = iph->nexthdr,
   2570	};
   2571	struct flow_keys *flkeys = NULL, _flkeys;
   2572
   2573	tun_info = skb_tunnel_info(skb);
   2574	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
   2575		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
   2576
   2577	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
   2578		flkeys = &_flkeys;
   2579
   2580	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
   2581		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
   2582	skb_dst_drop(skb);
   2583	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
   2584						      &fl6, skb, flags));
   2585}
   2586
   2587INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
   2588					     struct fib6_table *table,
   2589					     struct flowi6 *fl6,
   2590					     const struct sk_buff *skb,
   2591					     int flags)
   2592{
   2593	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
   2594}
   2595
   2596struct dst_entry *ip6_route_output_flags_noref(struct net *net,
   2597					       const struct sock *sk,
   2598					       struct flowi6 *fl6, int flags)
   2599{
   2600	bool any_src;
   2601
   2602	if (ipv6_addr_type(&fl6->daddr) &
   2603	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
   2604		struct dst_entry *dst;
   2605
    2606			/* This function does not take a refcnt on the dst */
   2607		dst = l3mdev_link_scope_lookup(net, fl6);
   2608		if (dst)
   2609			return dst;
   2610	}
   2611
   2612	fl6->flowi6_iif = LOOPBACK_IFINDEX;
   2613
   2614	flags |= RT6_LOOKUP_F_DST_NOREF;
   2615	any_src = ipv6_addr_any(&fl6->saddr);
   2616	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
   2617	    (fl6->flowi6_oif && any_src))
   2618		flags |= RT6_LOOKUP_F_IFACE;
   2619
   2620	if (!any_src)
   2621		flags |= RT6_LOOKUP_F_HAS_SADDR;
   2622	else if (sk)
   2623		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
   2624
   2625	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
   2626}
   2627EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
   2628
   2629struct dst_entry *ip6_route_output_flags(struct net *net,
   2630					 const struct sock *sk,
   2631					 struct flowi6 *fl6,
   2632					 int flags)
   2633{
   2634	struct dst_entry *dst;
   2635	struct rt6_info *rt6;
   2636
   2637	rcu_read_lock();
   2638	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
   2639	rt6 = (struct rt6_info *)dst;
   2640	/* For dst cached in uncached_list, refcnt is already taken. */
   2641	if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
   2642		dst = &net->ipv6.ip6_null_entry->dst;
   2643		dst_hold(dst);
   2644	}
   2645	rcu_read_unlock();
   2646
   2647	return dst;
   2648}
   2649EXPORT_SYMBOL_GPL(ip6_route_output_flags);
   2650
   2651struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
   2652{
   2653	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
   2654	struct net_device *loopback_dev = net->loopback_dev;
   2655	struct dst_entry *new = NULL;
   2656
   2657	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
   2658		       DST_OBSOLETE_DEAD, 0);
   2659	if (rt) {
   2660		rt6_info_init(rt);
   2661		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
   2662
   2663		new = &rt->dst;
   2664		new->__use = 1;
   2665		new->input = dst_discard;
   2666		new->output = dst_discard_out;
   2667
   2668		dst_copy_metrics(new, &ort->dst);
   2669
   2670		rt->rt6i_idev = in6_dev_get(loopback_dev);
   2671		rt->rt6i_gateway = ort->rt6i_gateway;
   2672		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
   2673
   2674		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
   2675#ifdef CONFIG_IPV6_SUBTREES
   2676		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
   2677#endif
   2678	}
   2679
   2680	dst_release(dst_orig);
   2681	return new ? new : ERR_PTR(-ENOMEM);
   2682}
   2683
   2684/*
   2685 *	Destination cache support functions
   2686 */
   2687
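        	/* A cached dst is only valid while the fib6_info it was derived
        	 * from keeps the cookie (tree sernum) it was created with and has
        	 * not expired.
        	 */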
   2688static bool fib6_check(struct fib6_info *f6i, u32 cookie)
   2689{
   2690	u32 rt_cookie = 0;
   2691
   2692	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
   2693		return false;
   2694
   2695	if (fib6_check_expired(f6i))
   2696		return false;
   2697
   2698	return true;
   2699}
   2700
   2701static struct dst_entry *rt6_check(struct rt6_info *rt,
   2702				   struct fib6_info *from,
   2703				   u32 cookie)
   2704{
   2705	u32 rt_cookie = 0;
   2706
   2707	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
   2708	    rt_cookie != cookie)
   2709		return NULL;
   2710
   2711	if (rt6_check_expired(rt))
   2712		return NULL;
   2713
   2714	return &rt->dst;
   2715}
   2716
   2717static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
   2718					    struct fib6_info *from,
   2719					    u32 cookie)
   2720{
   2721	if (!__rt6_check_expired(rt) &&
   2722	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
   2723	    fib6_check(from, cookie))
   2724		return &rt->dst;
   2725	else
   2726		return NULL;
   2727}
   2728
   2729INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
   2730							u32 cookie)
   2731{
   2732	struct dst_entry *dst_ret;
   2733	struct fib6_info *from;
   2734	struct rt6_info *rt;
   2735
   2736	rt = container_of(dst, struct rt6_info, dst);
   2737
   2738	if (rt->sernum)
   2739		return rt6_is_valid(rt) ? dst : NULL;
   2740
   2741	rcu_read_lock();
   2742
    2743		/* All IPv6 dsts are created with ->obsolete set to the value
    2744		 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
    2745		 * down into this function.
    2746		 */
   2747
   2748	from = rcu_dereference(rt->from);
   2749
   2750	if (from && (rt->rt6i_flags & RTF_PCPU ||
   2751	    unlikely(!list_empty(&rt->rt6i_uncached))))
   2752		dst_ret = rt6_dst_from_check(rt, from, cookie);
   2753	else
   2754		dst_ret = rt6_check(rt, from, cookie);
   2755
   2756	rcu_read_unlock();
   2757
   2758	return dst_ret;
   2759}
   2760EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
   2761
   2762static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
   2763{
   2764	struct rt6_info *rt = (struct rt6_info *) dst;
   2765
   2766	if (rt) {
   2767		if (rt->rt6i_flags & RTF_CACHE) {
   2768			rcu_read_lock();
   2769			if (rt6_check_expired(rt)) {
   2770				rt6_remove_exception_rt(rt);
   2771				dst = NULL;
   2772			}
   2773			rcu_read_unlock();
   2774		} else {
   2775			dst_release(dst);
   2776			dst = NULL;
   2777		}
   2778	}
   2779	return dst;
   2780}
   2781
   2782static void ip6_link_failure(struct sk_buff *skb)
   2783{
   2784	struct rt6_info *rt;
   2785
   2786	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
   2787
   2788	rt = (struct rt6_info *) skb_dst(skb);
   2789	if (rt) {
   2790		rcu_read_lock();
   2791		if (rt->rt6i_flags & RTF_CACHE) {
   2792			rt6_remove_exception_rt(rt);
   2793		} else {
   2794			struct fib6_info *from;
   2795			struct fib6_node *fn;
   2796
   2797			from = rcu_dereference(rt->from);
   2798			if (from) {
   2799				fn = rcu_dereference(from->fib6_node);
   2800				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
   2801					WRITE_ONCE(fn->fn_sernum, -1);
   2802			}
   2803		}
   2804		rcu_read_unlock();
   2805	}
   2806}
   2807
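        	/* Give the cached route its own expiry: seed it from the origin
        	 * fib6_info the first time, then pull it in to at most @timeout
        	 * jiffies from now.
        	 */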
   2808static void rt6_update_expires(struct rt6_info *rt0, int timeout)
   2809{
   2810	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
   2811		struct fib6_info *from;
   2812
   2813		rcu_read_lock();
   2814		from = rcu_dereference(rt0->from);
   2815		if (from)
   2816			rt0->dst.expires = from->expires;
   2817		rcu_read_unlock();
   2818	}
   2819
   2820	dst_set_expires(&rt0->dst, timeout);
   2821	rt0->rt6i_flags |= RTF_EXPIRES;
   2822}
   2823
   2824static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
   2825{
   2826	struct net *net = dev_net(rt->dst.dev);
   2827
   2828	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
   2829	rt->rt6i_flags |= RTF_MODIFIED;
   2830	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
   2831}
   2832
   2833static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
   2834{
   2835	return !(rt->rt6i_flags & RTF_CACHE) &&
   2836		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
   2837}
   2838
   2839static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
   2840				 const struct ipv6hdr *iph, u32 mtu,
   2841				 bool confirm_neigh)
   2842{
   2843	const struct in6_addr *daddr, *saddr;
   2844	struct rt6_info *rt6 = (struct rt6_info *)dst;
   2845
    2846		/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU).
    2847		 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
    2848		 * [see also comment in rt6_mtu_change_route()]
    2849		 */
   2850
   2851	if (iph) {
   2852		daddr = &iph->daddr;
   2853		saddr = &iph->saddr;
   2854	} else if (sk) {
   2855		daddr = &sk->sk_v6_daddr;
   2856		saddr = &inet6_sk(sk)->saddr;
   2857	} else {
   2858		daddr = NULL;
   2859		saddr = NULL;
   2860	}
   2861
   2862	if (confirm_neigh)
   2863		dst_confirm_neigh(dst, daddr);
   2864
   2865	if (mtu < IPV6_MIN_MTU)
   2866		return;
   2867	if (mtu >= dst_mtu(dst))
   2868		return;
   2869
   2870	if (!rt6_cache_allowed_for_pmtu(rt6)) {
   2871		rt6_do_update_pmtu(rt6, mtu);
   2872		/* update rt6_ex->stamp for cache */
   2873		if (rt6->rt6i_flags & RTF_CACHE)
   2874			rt6_update_exception_stamp_rt(rt6);
   2875	} else if (daddr) {
   2876		struct fib6_result res = {};
   2877		struct rt6_info *nrt6;
   2878
   2879		rcu_read_lock();
   2880		res.f6i = rcu_dereference(rt6->from);
   2881		if (!res.f6i)
   2882			goto out_unlock;
   2883
   2884		res.fib6_flags = res.f6i->fib6_flags;
   2885		res.fib6_type = res.f6i->fib6_type;
   2886
   2887		if (res.f6i->nh) {
   2888			struct fib6_nh_match_arg arg = {
   2889				.dev = dst->dev,
   2890				.gw = &rt6->rt6i_gateway,
   2891			};
   2892
   2893			nexthop_for_each_fib6_nh(res.f6i->nh,
   2894						 fib6_nh_find_match, &arg);
   2895
    2896				/* fib6_info uses a nexthop that does not have a fib6_nh
    2897				 * matching the dst->dev + gw. Should be impossible.
    2898				 */
   2899			if (!arg.match)
   2900				goto out_unlock;
   2901
   2902			res.nh = arg.match;
   2903		} else {
   2904			res.nh = res.f6i->fib6_nh;
   2905		}
   2906
   2907		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
   2908		if (nrt6) {
   2909			rt6_do_update_pmtu(nrt6, mtu);
   2910			if (rt6_insert_exception(nrt6, &res))
   2911				dst_release_immediate(&nrt6->dst);
   2912		}
   2913out_unlock:
   2914		rcu_read_unlock();
   2915	}
   2916}
   2917
   2918static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
   2919			       struct sk_buff *skb, u32 mtu,
   2920			       bool confirm_neigh)
   2921{
   2922	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
   2923			     confirm_neigh);
   2924}
   2925
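        	/* Update the PMTU towards the destination of the IPv6 header at
        	 * skb->data. @mtu is big-endian, as carried in an ICMPv6 Packet
        	 * Too Big message.
        	 */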
   2926void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
   2927		     int oif, u32 mark, kuid_t uid)
   2928{
   2929	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
   2930	struct dst_entry *dst;
   2931	struct flowi6 fl6 = {
   2932		.flowi6_oif = oif,
   2933		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
   2934		.daddr = iph->daddr,
   2935		.saddr = iph->saddr,
   2936		.flowlabel = ip6_flowinfo(iph),
   2937		.flowi6_uid = uid,
   2938	};
   2939
   2940	dst = ip6_route_output(net, NULL, &fl6);
   2941	if (!dst->error)
   2942		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
   2943	dst_release(dst);
   2944}
   2945EXPORT_SYMBOL_GPL(ip6_update_pmtu);
   2946
   2947void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
   2948{
   2949	int oif = sk->sk_bound_dev_if;
   2950	struct dst_entry *dst;
   2951
   2952	if (!oif && skb->dev)
   2953		oif = l3mdev_master_ifindex(skb->dev);
   2954
   2955	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
   2956
   2957	dst = __sk_dst_get(sk);
   2958	if (!dst || !dst->obsolete ||
   2959	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
   2960		return;
   2961
   2962	bh_lock_sock(sk);
   2963	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
   2964		ip6_datagram_dst_update(sk, false);
   2965	bh_unlock_sock(sk);
   2966}
   2967EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
   2968
   2969void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
   2970			   const struct flowi6 *fl6)
   2971{
   2972#ifdef CONFIG_IPV6_SUBTREES
   2973	struct ipv6_pinfo *np = inet6_sk(sk);
   2974#endif
   2975
   2976	ip6_dst_store(sk, dst,
   2977		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
   2978		      &sk->sk_v6_daddr : NULL,
   2979#ifdef CONFIG_IPV6_SUBTREES
   2980		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
   2981		      &np->saddr :
   2982#endif
   2983		      NULL);
   2984}
   2985
   2986static bool ip6_redirect_nh_match(const struct fib6_result *res,
   2987				  struct flowi6 *fl6,
   2988				  const struct in6_addr *gw,
   2989				  struct rt6_info **ret)
   2990{
   2991	const struct fib6_nh *nh = res->nh;
   2992
   2993	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
   2994	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
   2995		return false;
   2996
    2997		/* rt_cache's gateway might be different from its 'parent'
    2998		 * in the case of an IP redirect.
    2999		 * So we keep searching in the exception table if the gateway
    3000		 * is different.
    3001		 */
   3002	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
   3003		struct rt6_info *rt_cache;
   3004
   3005		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
   3006		if (rt_cache &&
   3007		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
   3008			*ret = rt_cache;
   3009			return true;
   3010		}
   3011		return false;
   3012	}
   3013	return true;
   3014}
   3015
   3016struct fib6_nh_rd_arg {
   3017	struct fib6_result	*res;
   3018	struct flowi6		*fl6;
   3019	const struct in6_addr	*gw;
   3020	struct rt6_info		**ret;
   3021};
   3022
   3023static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
   3024{
   3025	struct fib6_nh_rd_arg *arg = _arg;
   3026
   3027	arg->res->nh = nh;
   3028	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
   3029}
   3030
   3031/* Handle redirects */
   3032struct ip6rd_flowi {
   3033	struct flowi6 fl6;
   3034	struct in6_addr gateway;
   3035};
   3036
   3037INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
   3038					     struct fib6_table *table,
   3039					     struct flowi6 *fl6,
   3040					     const struct sk_buff *skb,
   3041					     int flags)
   3042{
   3043	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
   3044	struct rt6_info *ret = NULL;
   3045	struct fib6_result res = {};
   3046	struct fib6_nh_rd_arg arg = {
   3047		.res = &res,
   3048		.fl6 = fl6,
   3049		.gw  = &rdfl->gateway,
   3050		.ret = &ret
   3051	};
   3052	struct fib6_info *rt;
   3053	struct fib6_node *fn;
   3054
    3055		/* Get the "current" route for this destination and
    3056		 * check if the redirect has come from the appropriate router.
    3057		 *
    3058		 * RFC 4861 specifies that redirects should only be
    3059		 * accepted if they come from the nexthop to the target.
    3060		 * Due to the way the routes are chosen, this notion
    3061		 * is a bit fuzzy and one might need to check all possible
    3062		 * routes.
    3063		 */
   3064
   3065	rcu_read_lock();
   3066	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
   3067restart:
   3068	for_each_fib6_node_rt_rcu(fn) {
   3069		res.f6i = rt;
   3070		if (fib6_check_expired(rt))
   3071			continue;
   3072		if (rt->fib6_flags & RTF_REJECT)
   3073			break;
   3074		if (unlikely(rt->nh)) {
   3075			if (nexthop_is_blackhole(rt->nh))
   3076				continue;
    3077				/* on match, res->nh is filled in and potentially *ret */
   3078			if (nexthop_for_each_fib6_nh(rt->nh,
   3079						     fib6_nh_redirect_match,
   3080						     &arg))
   3081				goto out;
   3082		} else {
   3083			res.nh = rt->fib6_nh;
   3084			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
   3085						  &ret))
   3086				goto out;
   3087		}
   3088	}
   3089
   3090	if (!rt)
   3091		rt = net->ipv6.fib6_null_entry;
   3092	else if (rt->fib6_flags & RTF_REJECT) {
   3093		ret = net->ipv6.ip6_null_entry;
   3094		goto out;
   3095	}
   3096
   3097	if (rt == net->ipv6.fib6_null_entry) {
   3098		fn = fib6_backtrack(fn, &fl6->saddr);
   3099		if (fn)
   3100			goto restart;
   3101	}
   3102
   3103	res.f6i = rt;
   3104	res.nh = rt->fib6_nh;
   3105out:
   3106	if (ret) {
   3107		ip6_hold_safe(net, &ret);
   3108	} else {
   3109		res.fib6_flags = res.f6i->fib6_flags;
   3110		res.fib6_type = res.f6i->fib6_type;
   3111		ret = ip6_create_rt_rcu(&res);
   3112	}
   3113
   3114	rcu_read_unlock();
   3115
   3116	trace_fib6_table_lookup(net, &res, table, fl6);
   3117	return ret;
   3118};
   3119
   3120static struct dst_entry *ip6_route_redirect(struct net *net,
   3121					    const struct flowi6 *fl6,
   3122					    const struct sk_buff *skb,
   3123					    const struct in6_addr *gateway)
   3124{
   3125	int flags = RT6_LOOKUP_F_HAS_SADDR;
   3126	struct ip6rd_flowi rdfl;
   3127
   3128	rdfl.fl6 = *fl6;
   3129	rdfl.gateway = *gateway;
   3130
   3131	return fib6_rule_lookup(net, &rdfl.fl6, skb,
   3132				flags, __ip6_route_redirect);
   3133}
   3134
   3135void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
   3136		  kuid_t uid)
   3137{
   3138	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
   3139	struct dst_entry *dst;
   3140	struct flowi6 fl6 = {
   3141		.flowi6_iif = LOOPBACK_IFINDEX,
   3142		.flowi6_oif = oif,
   3143		.flowi6_mark = mark,
   3144		.daddr = iph->daddr,
   3145		.saddr = iph->saddr,
   3146		.flowlabel = ip6_flowinfo(iph),
   3147		.flowi6_uid = uid,
   3148	};
   3149
   3150	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
   3151	rt6_do_redirect(dst, NULL, skb);
   3152	dst_release(dst);
   3153}
   3154EXPORT_SYMBOL_GPL(ip6_redirect);
   3155
   3156void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
   3157{
   3158	const struct ipv6hdr *iph = ipv6_hdr(skb);
   3159	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
   3160	struct dst_entry *dst;
   3161	struct flowi6 fl6 = {
   3162		.flowi6_iif = LOOPBACK_IFINDEX,
   3163		.flowi6_oif = oif,
   3164		.daddr = msg->dest,
   3165		.saddr = iph->daddr,
   3166		.flowi6_uid = sock_net_uid(net, NULL),
   3167	};
   3168
   3169	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
   3170	rt6_do_redirect(dst, NULL, skb);
   3171	dst_release(dst);
   3172}
   3173
   3174void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
   3175{
   3176	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
   3177		     sk->sk_uid);
   3178}
   3179EXPORT_SYMBOL_GPL(ip6_sk_redirect);
   3180
   3181static unsigned int ip6_default_advmss(const struct dst_entry *dst)
   3182{
   3183	struct net_device *dev = dst->dev;
   3184	unsigned int mtu = dst_mtu(dst);
   3185	struct net *net = dev_net(dev);
   3186
   3187	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
   3188
   3189	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
   3190		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
   3191
   3192	/*
   3193	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
   3194	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
   3195	 * IPV6_MAXPLEN is also valid and means: "any MSS,
   3196	 * rely only on pmtu discovery"
   3197	 */
   3198	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
   3199		mtu = IPV6_MAXPLEN;
   3200	return mtu;
   3201}
   3202
   3203INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
   3204{
   3205	return ip6_dst_mtu_maybe_forward(dst, false);
   3206}
   3207EXPORT_INDIRECT_CALLABLE(ip6_mtu);
   3208
   3209/* MTU selection:
   3210 * 1. mtu on route is locked - use it
   3211 * 2. mtu from nexthop exception
   3212 * 3. mtu from egress device
   3213 *
   3214 * based on ip6_dst_mtu_forward and exception logic of
   3215 * rt6_find_cached_rt; called with rcu_read_lock
   3216 */
   3217u32 ip6_mtu_from_fib6(const struct fib6_result *res,
   3218		      const struct in6_addr *daddr,
   3219		      const struct in6_addr *saddr)
   3220{
   3221	const struct fib6_nh *nh = res->nh;
   3222	struct fib6_info *f6i = res->f6i;
   3223	struct inet6_dev *idev;
   3224	struct rt6_info *rt;
   3225	u32 mtu = 0;
   3226
   3227	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
   3228		mtu = f6i->fib6_pmtu;
   3229		if (mtu)
   3230			goto out;
   3231	}
   3232
   3233	rt = rt6_find_cached_rt(res, daddr, saddr);
   3234	if (unlikely(rt)) {
   3235		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
   3236	} else {
   3237		struct net_device *dev = nh->fib_nh_dev;
   3238
   3239		mtu = IPV6_MIN_MTU;
   3240		idev = __in6_dev_get(dev);
   3241		if (idev && idev->cnf.mtu6 > mtu)
   3242			mtu = idev->cnf.mtu6;
   3243	}
   3244
   3245	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
   3246out:
   3247	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
   3248}
   3249
   3250struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
   3251				  struct flowi6 *fl6)
   3252{
   3253	struct dst_entry *dst;
   3254	struct rt6_info *rt;
   3255	struct inet6_dev *idev = in6_dev_get(dev);
   3256	struct net *net = dev_net(dev);
   3257
   3258	if (unlikely(!idev))
   3259		return ERR_PTR(-ENODEV);
   3260
   3261	rt = ip6_dst_alloc(net, dev, 0);
   3262	if (unlikely(!rt)) {
   3263		in6_dev_put(idev);
   3264		dst = ERR_PTR(-ENOMEM);
   3265		goto out;
   3266	}
   3267
   3268	rt->dst.input = ip6_input;
   3269	rt->dst.output  = ip6_output;
   3270	rt->rt6i_gateway  = fl6->daddr;
   3271	rt->rt6i_dst.addr = fl6->daddr;
   3272	rt->rt6i_dst.plen = 128;
   3273	rt->rt6i_idev     = idev;
   3274	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
   3275
    3276		/* Add this dst into uncached_list so that rt6_disable_ip() can
    3277		 * properly release the net_device
    3278		 */
   3279	rt6_uncached_list_add(rt);
   3280
   3281	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
   3282
   3283out:
   3284	return dst;
   3285}
   3286
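        	/* dst_ops garbage collector, invoked from dst allocation once
        	 * gc_thresh is exceeded: run the fib6 GC with an adaptive expiry
        	 * and report whether we are still above ip6_rt_max_size.
        	 */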
   3287static int ip6_dst_gc(struct dst_ops *ops)
   3288{
   3289	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
   3290	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
   3291	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
   3292	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
   3293	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
   3294	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
   3295	unsigned int val;
   3296	int entries;
   3297
   3298	entries = dst_entries_get_fast(ops);
   3299	if (entries > rt_max_size)
   3300		entries = dst_entries_get_slow(ops);
   3301
   3302	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
   3303	    entries <= rt_max_size)
   3304		goto out;
   3305
   3306	fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
   3307	entries = dst_entries_get_slow(ops);
   3308	if (entries < ops->gc_thresh)
   3309		atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
   3310out:
   3311	val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
   3312	atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
   3313	return entries > rt_max_size;
   3314}
   3315
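        	/* Route-configuration helper: look up @gw_addr in table @tbid,
        	 * ignoring link state, so callers can validate that a configured
        	 * nexthop gateway is actually reachable.
        	 */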
   3316static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
   3317			       const struct in6_addr *gw_addr, u32 tbid,
   3318			       int flags, struct fib6_result *res)
   3319{
   3320	struct flowi6 fl6 = {
   3321		.flowi6_oif = cfg->fc_ifindex,
   3322		.daddr = *gw_addr,
   3323		.saddr = cfg->fc_prefsrc,
   3324	};
   3325	struct fib6_table *table;
   3326	int err;
   3327
   3328	table = fib6_get_table(net, tbid);
   3329	if (!table)
   3330		return -EINVAL;
   3331
   3332	if (!ipv6_addr_any(&cfg->fc_prefsrc))
   3333		flags |= RT6_LOOKUP_F_HAS_SADDR;
   3334
   3335	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
   3336
   3337	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
   3338	if (!err && res->f6i != net->ipv6.fib6_null_entry)
   3339		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
   3340				 cfg->fc_ifindex != 0, NULL, flags);
   3341
   3342	return err;
   3343}
   3344
   3345static int ip6_route_check_nh_onlink(struct net *net,
   3346				     struct fib6_config *cfg,
   3347				     const struct net_device *dev,
   3348				     struct netlink_ext_ack *extack)
   3349{
   3350	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
   3351	const struct in6_addr *gw_addr = &cfg->fc_gateway;
   3352	struct fib6_result res = {};
   3353	int err;
   3354
   3355	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
   3356	if (!err && !(res.fib6_flags & RTF_REJECT) &&
   3357	    /* ignore match if it is the default route */
   3358	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
   3359	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
   3360		NL_SET_ERR_MSG(extack,
   3361			       "Nexthop has invalid gateway or device mismatch");
   3362		err = -EINVAL;
   3363	}
   3364
   3365	return err;
   3366}
   3367
   3368static int ip6_route_check_nh(struct net *net,
   3369			      struct fib6_config *cfg,
   3370			      struct net_device **_dev,
   3371			      struct inet6_dev **idev)
   3372{
   3373	const struct in6_addr *gw_addr = &cfg->fc_gateway;
   3374	struct net_device *dev = _dev ? *_dev : NULL;
   3375	int flags = RT6_LOOKUP_F_IFACE;
   3376	struct fib6_result res = {};
   3377	int err = -EHOSTUNREACH;
   3378
   3379	if (cfg->fc_table) {
   3380		err = ip6_nh_lookup_table(net, cfg, gw_addr,
   3381					  cfg->fc_table, flags, &res);
    3382			/* gw_addr must not itself require a gateway or resolve to a
    3383			 * reject route. If a device is given, it must match the result.
    3384			 */
   3385		if (err || res.fib6_flags & RTF_REJECT ||
   3386		    res.nh->fib_nh_gw_family ||
   3387		    (dev && dev != res.nh->fib_nh_dev))
   3388			err = -EHOSTUNREACH;
   3389	}
   3390
   3391	if (err < 0) {
   3392		struct flowi6 fl6 = {
   3393			.flowi6_oif = cfg->fc_ifindex,
   3394			.daddr = *gw_addr,
   3395		};
   3396
   3397		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
   3398		if (err || res.fib6_flags & RTF_REJECT ||
   3399		    res.nh->fib_nh_gw_family)
   3400			err = -EHOSTUNREACH;
   3401
   3402		if (err)
   3403			return err;
   3404
   3405		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
   3406				 cfg->fc_ifindex != 0, NULL, flags);
   3407	}
   3408
   3409	err = 0;
   3410	if (dev) {
   3411		if (dev != res.nh->fib_nh_dev)
   3412			err = -EHOSTUNREACH;
   3413	} else {
   3414		*_dev = dev = res.nh->fib_nh_dev;
   3415		dev_hold(dev);
   3416		*idev = in6_dev_get(dev);
   3417	}
   3418
   3419	return err;
   3420}
   3421
   3422static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
   3423			   struct net_device **_dev, struct inet6_dev **idev,
   3424			   struct netlink_ext_ack *extack)
   3425{
   3426	const struct in6_addr *gw_addr = &cfg->fc_gateway;
   3427	int gwa_type = ipv6_addr_type(gw_addr);
   3428	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
   3429	const struct net_device *dev = *_dev;
   3430	bool need_addr_check = !dev;
   3431	int err = -EINVAL;
   3432
    3433		/* if gw_addr is local we will fail to detect this in case the
    3434		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
    3435		 * will return the already-added prefix route via the interface
    3436		 * the prefix route was assigned to, which might be non-loopback.
    3437		 */
   3438	if (dev &&
   3439	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
   3440		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
   3441		goto out;
   3442	}
   3443
   3444	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
    3445			/* IPv6 strictly inhibits using non-link-local
    3446			 * addresses as nexthop addresses.
    3447			 * Otherwise, a router will not be able to send redirects.
    3448			 * This is very good, but in some (rare!) circumstances
    3449			 * (SIT, PtP, NBMA NOARP links) it is handy to allow
    3450			 * some exceptions. --ANK
    3451			 * We allow IPv4-mapped nexthops to support RFC4798-type
    3452			 * addressing
    3453			 */
   3454		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
   3455			NL_SET_ERR_MSG(extack, "Invalid gateway address");
   3456			goto out;
   3457		}
   3458
   3459		rcu_read_lock();
   3460
   3461		if (cfg->fc_flags & RTNH_F_ONLINK)
   3462			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
   3463		else
   3464			err = ip6_route_check_nh(net, cfg, _dev, idev);
   3465
   3466		rcu_read_unlock();
   3467
   3468		if (err)
   3469			goto out;
   3470	}
   3471
   3472	/* reload in case device was changed */
   3473	dev = *_dev;
   3474
   3475	err = -EINVAL;
   3476	if (!dev) {
   3477		NL_SET_ERR_MSG(extack, "Egress device not specified");
   3478		goto out;
   3479	} else if (dev->flags & IFF_LOOPBACK) {
   3480		NL_SET_ERR_MSG(extack,
   3481			       "Egress device can not be loopback device for this route");
   3482		goto out;
   3483	}
   3484
   3485	/* if we did not check gw_addr above, do so now that the
   3486	 * egress device has been resolved.
   3487	 */
   3488	if (need_addr_check &&
   3489	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
   3490		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
   3491		goto out;
   3492	}
   3493
   3494	err = 0;
   3495out:
   3496	return err;
   3497}
   3498
   3499static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
   3500{
   3501	if ((flags & RTF_REJECT) ||
   3502	    (dev && (dev->flags & IFF_LOOPBACK) &&
   3503	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
   3504	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
   3505		return true;
   3506
   3507	return false;
   3508}
   3509
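        /* Initialize a fib6_nh from a route config. References are taken
         * on the nexthop device and its inet6_dev; on failure every
         * reference acquired here and any lwtunnel state are released.
         */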
   3510int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
   3511		 struct fib6_config *cfg, gfp_t gfp_flags,
   3512		 struct netlink_ext_ack *extack)
   3513{
   3514	struct net_device *dev = NULL;
   3515	struct inet6_dev *idev = NULL;
   3516	int addr_type;
   3517	int err;
   3518
   3519	fib6_nh->fib_nh_family = AF_INET6;
   3520#ifdef CONFIG_IPV6_ROUTER_PREF
   3521	fib6_nh->last_probe = jiffies;
   3522#endif
   3523	if (cfg->fc_is_fdb) {
   3524		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
   3525		fib6_nh->fib_nh_gw_family = AF_INET6;
   3526		return 0;
   3527	}
   3528
   3529	err = -ENODEV;
   3530	if (cfg->fc_ifindex) {
   3531		dev = dev_get_by_index(net, cfg->fc_ifindex);
   3532		if (!dev)
   3533			goto out;
   3534		idev = in6_dev_get(dev);
   3535		if (!idev)
   3536			goto out;
   3537	}
   3538
   3539	if (cfg->fc_flags & RTNH_F_ONLINK) {
   3540		if (!dev) {
   3541			NL_SET_ERR_MSG(extack,
   3542				       "Nexthop device required for onlink");
   3543			goto out;
   3544		}
   3545
   3546		if (!(dev->flags & IFF_UP)) {
   3547			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
   3548			err = -ENETDOWN;
   3549			goto out;
   3550		}
   3551
   3552		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
   3553	}
   3554
   3555	fib6_nh->fib_nh_weight = 1;
   3556
   3557	/* We cannot add true routes via loopback here,
   3558	 * they would result in kernel looping; promote them to reject routes
   3559	 */
   3560	addr_type = ipv6_addr_type(&cfg->fc_dst);
   3561	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
   3562		/* hold loopback dev/idev if we haven't done so. */
   3563		if (dev != net->loopback_dev) {
   3564			if (dev) {
   3565				dev_put(dev);
   3566				in6_dev_put(idev);
   3567			}
   3568			dev = net->loopback_dev;
   3569			dev_hold(dev);
   3570			idev = in6_dev_get(dev);
   3571			if (!idev) {
   3572				err = -ENODEV;
   3573				goto out;
   3574			}
   3575		}
   3576		goto pcpu_alloc;
   3577	}
   3578
   3579	if (cfg->fc_flags & RTF_GATEWAY) {
   3580		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
   3581		if (err)
   3582			goto out;
   3583
   3584		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
   3585		fib6_nh->fib_nh_gw_family = AF_INET6;
   3586	}
   3587
   3588	err = -ENODEV;
   3589	if (!dev)
   3590		goto out;
   3591
   3592	if (idev->cnf.disable_ipv6) {
   3593		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
   3594		err = -EACCES;
   3595		goto out;
   3596	}
   3597
   3598	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
   3599		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
   3600		err = -ENETDOWN;
   3601		goto out;
   3602	}
   3603
   3604	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
   3605	    !netif_carrier_ok(dev))
   3606		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
   3607
   3608	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
   3609				 cfg->fc_encap_type, cfg, gfp_flags, extack);
   3610	if (err)
   3611		goto out;
   3612
   3613pcpu_alloc:
   3614	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
   3615	if (!fib6_nh->rt6i_pcpu) {
   3616		err = -ENOMEM;
   3617		goto out;
   3618	}
   3619
   3620	fib6_nh->fib_nh_dev = dev;
   3621	netdev_tracker_alloc(dev, &fib6_nh->fib_nh_dev_tracker, gfp_flags);
   3622
   3623	fib6_nh->fib_nh_oif = dev->ifindex;
   3624	err = 0;
   3625out:
   3626	if (idev)
   3627		in6_dev_put(idev);
   3628
   3629	if (err) {
   3630		lwtstate_put(fib6_nh->fib_nh_lws);
   3631		fib6_nh->fib_nh_lws = NULL;
   3632		dev_put(dev);
   3633	}
   3634
   3635	return err;
   3636}
   3637
   3638void fib6_nh_release(struct fib6_nh *fib6_nh)
   3639{
   3640	struct rt6_exception_bucket *bucket;
   3641
   3642	rcu_read_lock();
   3643
   3644	fib6_nh_flush_exceptions(fib6_nh, NULL);
   3645	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
   3646	if (bucket) {
   3647		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
   3648		kfree(bucket);
   3649	}
   3650
   3651	rcu_read_unlock();
   3652
   3653	fib6_nh_release_dsts(fib6_nh);
   3654	free_percpu(fib6_nh->rt6i_pcpu);
   3655
   3656	fib_nh_common_release(&fib6_nh->nh_common);
   3657}
   3658
   3659void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
   3660{
   3661	int cpu;
   3662
   3663	if (!fib6_nh->rt6i_pcpu)
   3664		return;
   3665
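        	/* xchg() takes ownership of each per-CPU entry, so every
        	 * cached dst is released exactly once even if this races
        	 * with another flush.
        	 */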
   3666	for_each_possible_cpu(cpu) {
   3667		struct rt6_info *pcpu_rt, **ppcpu_rt;
   3668
   3669		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
   3670		pcpu_rt = xchg(ppcpu_rt, NULL);
   3671		if (pcpu_rt) {
   3672			dst_dev_put(&pcpu_rt->dst);
   3673			dst_release(&pcpu_rt->dst);
   3674		}
   3675	}
   3676}
   3677
   3678static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
   3679					      gfp_t gfp_flags,
   3680					      struct netlink_ext_ack *extack)
   3681{
   3682	struct net *net = cfg->fc_nlinfo.nl_net;
   3683	struct fib6_info *rt = NULL;
   3684	struct nexthop *nh = NULL;
   3685	struct fib6_table *table;
   3686	struct fib6_nh *fib6_nh;
   3687	int err = -EINVAL;
   3688	int addr_type;
   3689
   3690	/* RTF_PCPU is an internal flag; can not be set by userspace */
   3691	if (cfg->fc_flags & RTF_PCPU) {
   3692		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
   3693		goto out;
   3694	}
   3695
   3696	/* RTF_CACHE is an internal flag; can not be set by userspace */
   3697	if (cfg->fc_flags & RTF_CACHE) {
   3698		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
   3699		goto out;
   3700	}
   3701
   3702	if (cfg->fc_type > RTN_MAX) {
   3703		NL_SET_ERR_MSG(extack, "Invalid route type");
   3704		goto out;
   3705	}
   3706
   3707	if (cfg->fc_dst_len > 128) {
   3708		NL_SET_ERR_MSG(extack, "Invalid prefix length");
   3709		goto out;
   3710	}
   3711	if (cfg->fc_src_len > 128) {
   3712		NL_SET_ERR_MSG(extack, "Invalid source address length");
   3713		goto out;
   3714	}
   3715#ifndef CONFIG_IPV6_SUBTREES
   3716	if (cfg->fc_src_len) {
   3717		NL_SET_ERR_MSG(extack,
   3718			       "Specifying source address requires IPV6_SUBTREES to be enabled");
   3719		goto out;
   3720	}
   3721#endif
   3722	if (cfg->fc_nh_id) {
   3723		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
   3724		if (!nh) {
   3725			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
   3726			goto out;
   3727		}
   3728		err = fib6_check_nexthop(nh, cfg, extack);
   3729		if (err)
   3730			goto out;
   3731	}
   3732
   3733	err = -ENOBUFS;
   3734	if (cfg->fc_nlinfo.nlh &&
   3735	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
   3736		table = fib6_get_table(net, cfg->fc_table);
   3737		if (!table) {
   3738			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
   3739			table = fib6_new_table(net, cfg->fc_table);
   3740		}
   3741	} else {
   3742		table = fib6_new_table(net, cfg->fc_table);
   3743	}
   3744
   3745	if (!table)
   3746		goto out;
   3747
   3748	err = -ENOMEM;
   3749	rt = fib6_info_alloc(gfp_flags, !nh);
   3750	if (!rt)
   3751		goto out;
   3752
   3753	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
   3754					       extack);
   3755	if (IS_ERR(rt->fib6_metrics)) {
   3756		err = PTR_ERR(rt->fib6_metrics);
   3757		/* Do not leave garbage there. */
   3758		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
   3759		goto out_free;
   3760	}
   3761
   3762	if (cfg->fc_flags & RTF_ADDRCONF)
   3763		rt->dst_nocount = true;
   3764
   3765	if (cfg->fc_flags & RTF_EXPIRES)
   3766		fib6_set_expires(rt, jiffies +
   3767				clock_t_to_jiffies(cfg->fc_expires));
   3768	else
   3769		fib6_clean_expires(rt);
   3770
   3771	if (cfg->fc_protocol == RTPROT_UNSPEC)
   3772		cfg->fc_protocol = RTPROT_BOOT;
   3773	rt->fib6_protocol = cfg->fc_protocol;
   3774
   3775	rt->fib6_table = table;
   3776	rt->fib6_metric = cfg->fc_metric;
   3777	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
   3778	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
   3779
   3780	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
   3781	rt->fib6_dst.plen = cfg->fc_dst_len;
   3782
   3783#ifdef CONFIG_IPV6_SUBTREES
   3784	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
   3785	rt->fib6_src.plen = cfg->fc_src_len;
   3786#endif
   3787	if (nh) {
   3788		if (rt->fib6_src.plen) {
   3789			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
   3790			goto out_free;
   3791		}
   3792		if (!nexthop_get(nh)) {
   3793			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
   3794			goto out_free;
   3795		}
   3796		rt->nh = nh;
   3797		fib6_nh = nexthop_fib6_nh(rt->nh);
   3798	} else {
   3799		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
   3800		if (err)
   3801			goto out;
   3802
   3803		fib6_nh = rt->fib6_nh;
   3804
   3805		/* We cannot add true routes via loopback here, they would
   3806		 * result in kernel looping; promote them to reject routes
   3807		 */
   3808		addr_type = ipv6_addr_type(&cfg->fc_dst);
   3809		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
   3810				   addr_type))
   3811			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
   3812	}
   3813
   3814	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
   3815		struct net_device *dev = fib6_nh->fib_nh_dev;
   3816
   3817		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
   3818			NL_SET_ERR_MSG(extack, "Invalid source address");
   3819			err = -EINVAL;
   3820			goto out;
   3821		}
   3822		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
   3823		rt->fib6_prefsrc.plen = 128;
   3824	} else
   3825		rt->fib6_prefsrc.plen = 0;
   3826
   3827	return rt;
   3828out:
   3829	fib6_info_release(rt);
   3830	return ERR_PTR(err);
   3831out_free:
   3832	ip_fib_metrics_put(rt->fib6_metrics);
   3833	kfree(rt);
   3834	return ERR_PTR(err);
   3835}
   3836
   3837int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
   3838		  struct netlink_ext_ack *extack)
   3839{
   3840	struct fib6_info *rt;
   3841	int err;
   3842
   3843	rt = ip6_route_info_create(cfg, gfp_flags, extack);
   3844	if (IS_ERR(rt))
   3845		return PTR_ERR(rt);
   3846
   3847	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
   3848	fib6_info_release(rt);
   3849
   3850	return err;
   3851}
   3852
   3853static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
   3854{
   3855	struct net *net = info->nl_net;
   3856	struct fib6_table *table;
   3857	int err;
   3858
   3859	if (rt == net->ipv6.fib6_null_entry) {
   3860		err = -ENOENT;
   3861		goto out;
   3862	}
   3863
   3864	table = rt->fib6_table;
   3865	spin_lock_bh(&table->tb6_lock);
   3866	err = fib6_del(rt, info);
   3867	spin_unlock_bh(&table->tb6_lock);
   3868
   3869out:
   3870	fib6_info_release(rt);
   3871	return err;
   3872}
   3873
   3874int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
   3875{
   3876	struct nl_info info = {
   3877		.nl_net = net,
   3878		.skip_notify = skip_notify
   3879	};
   3880
   3881	return __ip6_del_rt(rt, &info);
   3882}
   3883
   3884static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
   3885{
   3886	struct nl_info *info = &cfg->fc_nlinfo;
   3887	struct net *net = info->nl_net;
   3888	struct sk_buff *skb = NULL;
   3889	struct fib6_table *table;
   3890	int err = -ENOENT;
   3891
   3892	if (rt == net->ipv6.fib6_null_entry)
   3893		goto out_put;
   3894	table = rt->fib6_table;
   3895	spin_lock_bh(&table->tb6_lock);
   3896
   3897	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
   3898		struct fib6_info *sibling, *next_sibling;
   3899		struct fib6_node *fn;
   3900
   3901		/* prefer to send a single notification with all hops */
   3902		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
   3903		if (skb) {
   3904			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
   3905
   3906			if (rt6_fill_node(net, skb, rt, NULL,
   3907					  NULL, NULL, 0, RTM_DELROUTE,
   3908					  info->portid, seq, 0) < 0) {
   3909				kfree_skb(skb);
   3910				skb = NULL;
   3911			} else
   3912				info->skip_notify = 1;
   3913		}
   3914
   3915		/* 'rt' points to the first sibling route. If it is not the
   3916		 * leaf, then we do not need to send a notification. Otherwise,
   3917		 * we need to check if the last sibling has a next route or not
   3918		 * and emit a replace or delete notification, respectively.
   3919		 */
   3920		info->skip_notify_kernel = 1;
   3921		fn = rcu_dereference_protected(rt->fib6_node,
   3922					    lockdep_is_held(&table->tb6_lock));
   3923		if (rcu_access_pointer(fn->leaf) == rt) {
   3924			struct fib6_info *last_sibling, *replace_rt;
   3925
   3926			last_sibling = list_last_entry(&rt->fib6_siblings,
   3927						       struct fib6_info,
   3928						       fib6_siblings);
   3929			replace_rt = rcu_dereference_protected(
   3930					    last_sibling->fib6_next,
   3931					    lockdep_is_held(&table->tb6_lock));
   3932			if (replace_rt)
   3933				call_fib6_entry_notifiers_replace(net,
   3934								  replace_rt);
   3935			else
   3936				call_fib6_multipath_entry_notifiers(net,
   3937						       FIB_EVENT_ENTRY_DEL,
   3938						       rt, rt->fib6_nsiblings,
   3939						       NULL);
   3940		}
   3941		list_for_each_entry_safe(sibling, next_sibling,
   3942					 &rt->fib6_siblings,
   3943					 fib6_siblings) {
   3944			err = fib6_del(sibling, info);
   3945			if (err)
   3946				goto out_unlock;
   3947		}
   3948	}
   3949
   3950	err = fib6_del(rt, info);
   3951out_unlock:
   3952	spin_unlock_bh(&table->tb6_lock);
   3953out_put:
   3954	fib6_info_release(rt);
   3955
   3956	if (skb) {
   3957		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
   3958			    info->nlh, gfp_any());
   3959	}
   3960	return err;
   3961}
   3962
   3963static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
   3964{
   3965	int rc = -ESRCH;
   3966
   3967	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
   3968		goto out;
   3969
   3970	if (cfg->fc_flags & RTF_GATEWAY &&
   3971	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
   3972		goto out;
   3973
   3974	rc = rt6_remove_exception_rt(rt);
   3975out:
   3976	return rc;
   3977}
   3978
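        /* RTF_CACHE entries live in a nexthop's exception table rather
         * than in the FIB tree, so deleting one means removing the
         * matching exception.
         */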
   3979static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
   3980			     struct fib6_nh *nh)
   3981{
   3982	struct fib6_result res = {
   3983		.f6i = rt,
   3984		.nh = nh,
   3985	};
   3986	struct rt6_info *rt_cache;
   3987
   3988	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
   3989	if (rt_cache)
   3990		return __ip6_del_cached_rt(rt_cache, cfg);
   3991
   3992	return 0;
   3993}
   3994
   3995struct fib6_nh_del_cached_rt_arg {
   3996	struct fib6_config *cfg;
   3997	struct fib6_info *f6i;
   3998};
   3999
   4000static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
   4001{
   4002	struct fib6_nh_del_cached_rt_arg *arg = _arg;
   4003	int rc;
   4004
   4005	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
   4006	return rc != -ESRCH ? rc : 0;
   4007}
   4008
   4009static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
   4010{
   4011	struct fib6_nh_del_cached_rt_arg arg = {
   4012		.cfg = cfg,
   4013		.f6i = f6i
   4014	};
   4015
   4016	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
   4017}
   4018
   4019static int ip6_route_del(struct fib6_config *cfg,
   4020			 struct netlink_ext_ack *extack)
   4021{
   4022	struct fib6_table *table;
   4023	struct fib6_info *rt;
   4024	struct fib6_node *fn;
   4025	int err = -ESRCH;
   4026
   4027	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
   4028	if (!table) {
   4029		NL_SET_ERR_MSG(extack, "FIB table does not exist");
   4030		return err;
   4031	}
   4032
   4033	rcu_read_lock();
   4034
   4035	fn = fib6_locate(&table->tb6_root,
   4036			 &cfg->fc_dst, cfg->fc_dst_len,
   4037			 &cfg->fc_src, cfg->fc_src_len,
   4038			 !(cfg->fc_flags & RTF_CACHE));
   4039
   4040	if (fn) {
   4041		for_each_fib6_node_rt_rcu(fn) {
   4042			struct fib6_nh *nh;
   4043
   4044			if (rt->nh && cfg->fc_nh_id &&
   4045			    rt->nh->id != cfg->fc_nh_id)
   4046				continue;
   4047
   4048			if (cfg->fc_flags & RTF_CACHE) {
   4049				int rc = 0;
   4050
   4051				if (rt->nh) {
   4052					rc = ip6_del_cached_rt_nh(cfg, rt);
   4053				} else if (cfg->fc_nh_id) {
   4054					continue;
   4055				} else {
   4056					nh = rt->fib6_nh;
   4057					rc = ip6_del_cached_rt(cfg, rt, nh);
   4058				}
   4059				if (rc != -ESRCH) {
   4060					rcu_read_unlock();
   4061					return rc;
   4062				}
   4063				continue;
   4064			}
   4065
   4066			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
   4067				continue;
   4068			if (cfg->fc_protocol &&
   4069			    cfg->fc_protocol != rt->fib6_protocol)
   4070				continue;
   4071
   4072			if (rt->nh) {
   4073				if (!fib6_info_hold_safe(rt))
   4074					continue;
   4075				rcu_read_unlock();
   4076
   4077				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
   4078			}
   4079			if (cfg->fc_nh_id)
   4080				continue;
   4081
   4082			nh = rt->fib6_nh;
   4083			if (cfg->fc_ifindex &&
   4084			    (!nh->fib_nh_dev ||
   4085			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
   4086				continue;
   4087			if (cfg->fc_flags & RTF_GATEWAY &&
   4088			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
   4089				continue;
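        			/* Take a reference before leaving the RCU section;
        			 * the deletion itself runs under the table lock.
        			 */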
   4090			if (!fib6_info_hold_safe(rt))
   4091				continue;
   4092			rcu_read_unlock();
   4093
    4094			/* if a gateway was specified, delete only that one hop */
   4095			if (cfg->fc_flags & RTF_GATEWAY)
   4096				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
   4097
   4098			return __ip6_del_rt_siblings(rt, cfg);
   4099		}
   4100	}
   4101	rcu_read_unlock();
   4102
   4103	return err;
   4104}
   4105
   4106static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
   4107{
   4108	struct netevent_redirect netevent;
   4109	struct rt6_info *rt, *nrt = NULL;
   4110	struct fib6_result res = {};
   4111	struct ndisc_options ndopts;
   4112	struct inet6_dev *in6_dev;
   4113	struct neighbour *neigh;
   4114	struct rd_msg *msg;
   4115	int optlen, on_link;
   4116	u8 *lladdr;
   4117
   4118	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
   4119	optlen -= sizeof(*msg);
   4120
   4121	if (optlen < 0) {
   4122		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
   4123		return;
   4124	}
   4125
   4126	msg = (struct rd_msg *)icmp6_hdr(skb);
   4127
   4128	if (ipv6_addr_is_multicast(&msg->dest)) {
   4129		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
   4130		return;
   4131	}
   4132
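        	/* Per RFC 4861, a redirect whose target equals its
        	 * destination announces an on-link destination; otherwise
        	 * the target must be a link-local router address.
        	 */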
   4133	on_link = 0;
   4134	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
   4135		on_link = 1;
   4136	} else if (ipv6_addr_type(&msg->target) !=
   4137		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
   4138		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
   4139		return;
   4140	}
   4141
   4142	in6_dev = __in6_dev_get(skb->dev);
   4143	if (!in6_dev)
   4144		return;
   4145	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
   4146		return;
   4147
   4148	/* RFC2461 8.1:
   4149	 *	The IP source address of the Redirect MUST be the same as the current
   4150	 *	first-hop router for the specified ICMP Destination Address.
   4151	 */
   4152
   4153	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
   4154		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
   4155		return;
   4156	}
   4157
   4158	lladdr = NULL;
   4159	if (ndopts.nd_opts_tgt_lladdr) {
   4160		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
   4161					     skb->dev);
   4162		if (!lladdr) {
   4163			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
   4164			return;
   4165		}
   4166	}
   4167
   4168	rt = (struct rt6_info *) dst;
   4169	if (rt->rt6i_flags & RTF_REJECT) {
   4170		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
   4171		return;
   4172	}
   4173
    4174	/* Redirect received -> path was valid.
    4175	 * Redirects are sent only in response to data packets,
    4176	 * so this nexthop is apparently reachable. --ANK
    4177	 */
   4178	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
   4179
   4180	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
   4181	if (!neigh)
   4182		return;
   4183
   4184	/*
   4185	 *	We have finally decided to accept it.
   4186	 */
   4187
   4188	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
   4189		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
   4190		     NEIGH_UPDATE_F_OVERRIDE|
   4191		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
   4192				     NEIGH_UPDATE_F_ISROUTER)),
   4193		     NDISC_REDIRECT, &ndopts);
   4194
   4195	rcu_read_lock();
   4196	res.f6i = rcu_dereference(rt->from);
   4197	if (!res.f6i)
   4198		goto out;
   4199
   4200	if (res.f6i->nh) {
   4201		struct fib6_nh_match_arg arg = {
   4202			.dev = dst->dev,
   4203			.gw = &rt->rt6i_gateway,
   4204		};
   4205
   4206		nexthop_for_each_fib6_nh(res.f6i->nh,
   4207					 fib6_nh_find_match, &arg);
   4208
    4209		/* The fib6_info uses a nexthop object none of whose
    4210		 * fib6_nh entries use dst->dev. This should be impossible.
    4211		 */
   4212		if (!arg.match)
   4213			goto out;
   4214		res.nh = arg.match;
   4215	} else {
   4216		res.nh = res.f6i->fib6_nh;
   4217	}
   4218
   4219	res.fib6_flags = res.f6i->fib6_flags;
   4220	res.fib6_type = res.f6i->fib6_type;
   4221	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
   4222	if (!nrt)
   4223		goto out;
   4224
   4225	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
   4226	if (on_link)
   4227		nrt->rt6i_flags &= ~RTF_GATEWAY;
   4228
   4229	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
   4230
   4231	/* rt6_insert_exception() will take care of duplicated exceptions */
   4232	if (rt6_insert_exception(nrt, &res)) {
   4233		dst_release_immediate(&nrt->dst);
   4234		goto out;
   4235	}
   4236
   4237	netevent.old = &rt->dst;
   4238	netevent.new = &nrt->dst;
   4239	netevent.daddr = &msg->dest;
   4240	netevent.neigh = neigh;
   4241	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
   4242
   4243out:
   4244	rcu_read_unlock();
   4245	neigh_release(neigh);
   4246}
   4247
   4248#ifdef CONFIG_IPV6_ROUTE_INFO
   4249static struct fib6_info *rt6_get_route_info(struct net *net,
   4250					   const struct in6_addr *prefix, int prefixlen,
   4251					   const struct in6_addr *gwaddr,
   4252					   struct net_device *dev)
   4253{
   4254	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
   4255	int ifindex = dev->ifindex;
   4256	struct fib6_node *fn;
   4257	struct fib6_info *rt = NULL;
   4258	struct fib6_table *table;
   4259
   4260	table = fib6_get_table(net, tb_id);
   4261	if (!table)
   4262		return NULL;
   4263
   4264	rcu_read_lock();
   4265	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
   4266	if (!fn)
   4267		goto out;
   4268
   4269	for_each_fib6_node_rt_rcu(fn) {
   4270		/* these routes do not use nexthops */
   4271		if (rt->nh)
   4272			continue;
   4273		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
   4274			continue;
   4275		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
   4276		    !rt->fib6_nh->fib_nh_gw_family)
   4277			continue;
   4278		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
   4279			continue;
   4280		if (!fib6_info_hold_safe(rt))
   4281			continue;
   4282		break;
   4283	}
   4284out:
   4285	rcu_read_unlock();
   4286	return rt;
   4287}
   4288
   4289static struct fib6_info *rt6_add_route_info(struct net *net,
   4290					   const struct in6_addr *prefix, int prefixlen,
   4291					   const struct in6_addr *gwaddr,
   4292					   struct net_device *dev,
   4293					   unsigned int pref)
   4294{
   4295	struct fib6_config cfg = {
   4296		.fc_metric	= IP6_RT_PRIO_USER,
   4297		.fc_ifindex	= dev->ifindex,
   4298		.fc_dst_len	= prefixlen,
   4299		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
   4300				  RTF_UP | RTF_PREF(pref),
   4301		.fc_protocol = RTPROT_RA,
   4302		.fc_type = RTN_UNICAST,
   4303		.fc_nlinfo.portid = 0,
   4304		.fc_nlinfo.nlh = NULL,
   4305		.fc_nlinfo.nl_net = net,
   4306	};
   4307
   4308	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
   4309	cfg.fc_dst = *prefix;
   4310	cfg.fc_gateway = *gwaddr;
   4311
   4312	/* We should treat it as a default route if prefix length is 0. */
   4313	if (!prefixlen)
   4314		cfg.fc_flags |= RTF_DEFAULT;
   4315
   4316	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
   4317
   4318	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
   4319}
   4320#endif
   4321
   4322struct fib6_info *rt6_get_dflt_router(struct net *net,
   4323				     const struct in6_addr *addr,
   4324				     struct net_device *dev)
   4325{
   4326	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
   4327	struct fib6_info *rt;
   4328	struct fib6_table *table;
   4329
   4330	table = fib6_get_table(net, tb_id);
   4331	if (!table)
   4332		return NULL;
   4333
   4334	rcu_read_lock();
   4335	for_each_fib6_node_rt_rcu(&table->tb6_root) {
   4336		struct fib6_nh *nh;
   4337
   4338		/* RA routes do not use nexthops */
   4339		if (rt->nh)
   4340			continue;
   4341
   4342		nh = rt->fib6_nh;
   4343		if (dev == nh->fib_nh_dev &&
   4344		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
   4345		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
   4346			break;
   4347	}
   4348	if (rt && !fib6_info_hold_safe(rt))
   4349		rt = NULL;
   4350	rcu_read_unlock();
   4351	return rt;
   4352}
   4353
   4354struct fib6_info *rt6_add_dflt_router(struct net *net,
   4355				     const struct in6_addr *gwaddr,
   4356				     struct net_device *dev,
   4357				     unsigned int pref,
   4358				     u32 defrtr_usr_metric)
   4359{
   4360	struct fib6_config cfg = {
   4361		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
   4362		.fc_metric	= defrtr_usr_metric,
   4363		.fc_ifindex	= dev->ifindex,
   4364		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
   4365				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
   4366		.fc_protocol = RTPROT_RA,
   4367		.fc_type = RTN_UNICAST,
   4368		.fc_nlinfo.portid = 0,
   4369		.fc_nlinfo.nlh = NULL,
   4370		.fc_nlinfo.nl_net = net,
   4371	};
   4372
   4373	cfg.fc_gateway = *gwaddr;
   4374
   4375	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
   4376		struct fib6_table *table;
   4377
   4378		table = fib6_get_table(dev_net(dev), cfg.fc_table);
   4379		if (table)
   4380			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
   4381	}
   4382
   4383	return rt6_get_dflt_router(net, gwaddr, dev);
   4384}
   4385
   4386static void __rt6_purge_dflt_routers(struct net *net,
   4387				     struct fib6_table *table)
   4388{
   4389	struct fib6_info *rt;
   4390
   4391restart:
   4392	rcu_read_lock();
   4393	for_each_fib6_node_rt_rcu(&table->tb6_root) {
   4394		struct net_device *dev = fib6_info_nh_dev(rt);
   4395		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
   4396
   4397		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
   4398		    (!idev || idev->cnf.accept_ra != 2) &&
   4399		    fib6_info_hold_safe(rt)) {
   4400			rcu_read_unlock();
   4401			ip6_del_rt(net, rt, false);
   4402			goto restart;
   4403		}
   4404	}
   4405	rcu_read_unlock();
   4406
   4407	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
   4408}
   4409
   4410void rt6_purge_dflt_routers(struct net *net)
   4411{
   4412	struct fib6_table *table;
   4413	struct hlist_head *head;
   4414	unsigned int h;
   4415
   4416	rcu_read_lock();
   4417
   4418	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
   4419		head = &net->ipv6.fib_table_hash[h];
   4420		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
   4421			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
   4422				__rt6_purge_dflt_routers(net, table);
   4423		}
   4424	}
   4425
   4426	rcu_read_unlock();
   4427}
   4428
   4429static void rtmsg_to_fib6_config(struct net *net,
   4430				 struct in6_rtmsg *rtmsg,
   4431				 struct fib6_config *cfg)
   4432{
   4433	*cfg = (struct fib6_config){
   4434		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
   4435			 : RT6_TABLE_MAIN,
   4436		.fc_ifindex = rtmsg->rtmsg_ifindex,
   4437		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
   4438		.fc_expires = rtmsg->rtmsg_info,
   4439		.fc_dst_len = rtmsg->rtmsg_dst_len,
   4440		.fc_src_len = rtmsg->rtmsg_src_len,
   4441		.fc_flags = rtmsg->rtmsg_flags,
   4442		.fc_type = rtmsg->rtmsg_type,
   4443
   4444		.fc_nlinfo.nl_net = net,
   4445
   4446		.fc_dst = rtmsg->rtmsg_dst,
   4447		.fc_src = rtmsg->rtmsg_src,
   4448		.fc_gateway = rtmsg->rtmsg_gateway,
   4449	};
   4450}
   4451
   4452int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
   4453{
   4454	struct fib6_config cfg;
   4455	int err;
   4456
   4457	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
   4458		return -EINVAL;
   4459	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
   4460		return -EPERM;
   4461
   4462	rtmsg_to_fib6_config(net, rtmsg, &cfg);
   4463
   4464	rtnl_lock();
   4465	switch (cmd) {
   4466	case SIOCADDRT:
   4467		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
   4468		break;
   4469	case SIOCDELRT:
   4470		err = ip6_route_del(&cfg, NULL);
   4471		break;
   4472	}
   4473	rtnl_unlock();
   4474	return err;
   4475}
   4476
   4477/*
   4478 *	Drop the packet on the floor
   4479 */
   4480
   4481static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
   4482{
   4483	struct dst_entry *dst = skb_dst(skb);
   4484	struct net *net = dev_net(dst->dev);
   4485	struct inet6_dev *idev;
   4486	SKB_DR(reason);
   4487	int type;
   4488
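        	/* For l3mdev (VRF) and loopback dsts, attribute the error to
        	 * the original ingress device rather than to the dst device.
        	 */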
   4489	if (netif_is_l3_master(skb->dev) ||
   4490	    dst->dev == net->loopback_dev)
   4491		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
   4492	else
   4493		idev = ip6_dst_idev(dst);
   4494
   4495	switch (ipstats_mib_noroutes) {
   4496	case IPSTATS_MIB_INNOROUTES:
   4497		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
   4498		if (type == IPV6_ADDR_ANY) {
   4499			SKB_DR_SET(reason, IP_INADDRERRORS);
   4500			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
   4501			break;
   4502		}
   4503		SKB_DR_SET(reason, IP_INNOROUTES);
   4504		fallthrough;
   4505	case IPSTATS_MIB_OUTNOROUTES:
   4506		SKB_DR_OR(reason, IP_OUTNOROUTES);
   4507		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
   4508		break;
   4509	}
   4510
    4511	/* Start over by dropping the dst for the l3mdev case */
   4512	if (netif_is_l3_master(skb->dev))
   4513		skb_dst_drop(skb);
   4514
   4515	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
   4516	kfree_skb_reason(skb, reason);
   4517	return 0;
   4518}
   4519
   4520static int ip6_pkt_discard(struct sk_buff *skb)
   4521{
   4522	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
   4523}
   4524
   4525static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
   4526{
   4527	skb->dev = skb_dst(skb)->dev;
   4528	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
   4529}
   4530
   4531static int ip6_pkt_prohibit(struct sk_buff *skb)
   4532{
   4533	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
   4534}
   4535
   4536static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
   4537{
   4538	skb->dev = skb_dst(skb)->dev;
   4539	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
   4540}
   4541
   4542/*
   4543 *	Allocate a dst for local (unicast / anycast) address.
   4544 */
   4545
   4546struct fib6_info *addrconf_f6i_alloc(struct net *net,
   4547				     struct inet6_dev *idev,
   4548				     const struct in6_addr *addr,
   4549				     bool anycast, gfp_t gfp_flags)
   4550{
   4551	struct fib6_config cfg = {
   4552		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
   4553		.fc_ifindex = idev->dev->ifindex,
   4554		.fc_flags = RTF_UP | RTF_NONEXTHOP,
   4555		.fc_dst = *addr,
   4556		.fc_dst_len = 128,
   4557		.fc_protocol = RTPROT_KERNEL,
   4558		.fc_nlinfo.nl_net = net,
   4559		.fc_ignore_dev_down = true,
   4560	};
   4561	struct fib6_info *f6i;
   4562
   4563	if (anycast) {
   4564		cfg.fc_type = RTN_ANYCAST;
   4565		cfg.fc_flags |= RTF_ANYCAST;
   4566	} else {
   4567		cfg.fc_type = RTN_LOCAL;
   4568		cfg.fc_flags |= RTF_LOCAL;
   4569	}
   4570
   4571	f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
   4572	if (!IS_ERR(f6i)) {
   4573		f6i->dst_nocount = true;
   4574
   4575		if (!anycast &&
   4576		    (net->ipv6.devconf_all->disable_policy ||
   4577		     idev->cnf.disable_policy))
   4578			f6i->dst_nopolicy = true;
   4579	}
   4580
   4581	return f6i;
   4582}
   4583
   4584/* remove deleted ip from prefsrc entries */
   4585struct arg_dev_net_ip {
   4586	struct net_device *dev;
   4587	struct net *net;
   4588	struct in6_addr *addr;
   4589};
   4590
   4591static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
   4592{
   4593	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
   4594	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
   4595	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
   4596
   4597	if (!rt->nh &&
   4598	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
   4599	    rt != net->ipv6.fib6_null_entry &&
   4600	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
   4601		spin_lock_bh(&rt6_exception_lock);
   4602		/* remove prefsrc entry */
   4603		rt->fib6_prefsrc.plen = 0;
   4604		spin_unlock_bh(&rt6_exception_lock);
   4605	}
   4606	return 0;
   4607}
   4608
   4609void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
   4610{
   4611	struct net *net = dev_net(ifp->idev->dev);
   4612	struct arg_dev_net_ip adni = {
   4613		.dev = ifp->idev->dev,
   4614		.net = net,
   4615		.addr = &ifp->addr,
   4616	};
   4617	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
   4618}
   4619
   4620#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
   4621
    4622/* Remove routers and update dst entries when a gateway turns into a host. */
   4623static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
   4624{
   4625	struct in6_addr *gateway = (struct in6_addr *)arg;
   4626	struct fib6_nh *nh;
   4627
   4628	/* RA routes do not use nexthops */
   4629	if (rt->nh)
   4630		return 0;
   4631
   4632	nh = rt->fib6_nh;
   4633	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
   4634	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
   4635		return -1;
   4636
    4637	/* Further clean up cached routes in the exception table.
    4638	 * This is needed because a cached route may have a different
    4639	 * gateway than its 'parent' in the case of an IP redirect.
    4640	 */
   4641	fib6_nh_exceptions_clean_tohost(nh, gateway);
   4642
   4643	return 0;
   4644}
   4645
   4646void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
   4647{
   4648	fib6_clean_all(net, fib6_clean_tohost, gateway);
   4649}
   4650
   4651struct arg_netdev_event {
   4652	const struct net_device *dev;
   4653	union {
   4654		unsigned char nh_flags;
   4655		unsigned long event;
   4656	};
   4657};
   4658
   4659static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
   4660{
   4661	struct fib6_info *iter;
   4662	struct fib6_node *fn;
   4663
   4664	fn = rcu_dereference_protected(rt->fib6_node,
   4665			lockdep_is_held(&rt->fib6_table->tb6_lock));
   4666	iter = rcu_dereference_protected(fn->leaf,
   4667			lockdep_is_held(&rt->fib6_table->tb6_lock));
   4668	while (iter) {
   4669		if (iter->fib6_metric == rt->fib6_metric &&
   4670		    rt6_qualify_for_ecmp(iter))
   4671			return iter;
   4672		iter = rcu_dereference_protected(iter->fib6_next,
   4673				lockdep_is_held(&rt->fib6_table->tb6_lock));
   4674	}
   4675
   4676	return NULL;
   4677}
   4678
   4679/* only called for fib entries with builtin fib6_nh */
   4680static bool rt6_is_dead(const struct fib6_info *rt)
   4681{
   4682	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
   4683	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
   4684	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
   4685		return true;
   4686
   4687	return false;
   4688}
   4689
   4690static int rt6_multipath_total_weight(const struct fib6_info *rt)
   4691{
   4692	struct fib6_info *iter;
   4693	int total = 0;
   4694
   4695	if (!rt6_is_dead(rt))
   4696		total += rt->fib6_nh->fib_nh_weight;
   4697
   4698	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
   4699		if (!rt6_is_dead(iter))
   4700			total += iter->fib6_nh->fib_nh_weight;
   4701	}
   4702
   4703	return total;
   4704}
   4705
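        /* Multipath selection compares a 31-bit flow hash against each
         * nexthop's upper bound, roughly (cumulative weight / total) * 2^31 - 1;
         * dead nexthops get -1 so they are never chosen.
         */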
   4706static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
   4707{
   4708	int upper_bound = -1;
   4709
   4710	if (!rt6_is_dead(rt)) {
   4711		*weight += rt->fib6_nh->fib_nh_weight;
   4712		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
   4713						    total) - 1;
   4714	}
   4715	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
   4716}
   4717
   4718static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
   4719{
   4720	struct fib6_info *iter;
   4721	int weight = 0;
   4722
   4723	rt6_upper_bound_set(rt, &weight, total);
   4724
   4725	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
   4726		rt6_upper_bound_set(iter, &weight, total);
   4727}
   4728
   4729void rt6_multipath_rebalance(struct fib6_info *rt)
   4730{
   4731	struct fib6_info *first;
   4732	int total;
   4733
   4734	/* In case the entire multipath route was marked for flushing,
   4735	 * then there is no need to rebalance upon the removal of every
   4736	 * sibling route.
   4737	 */
   4738	if (!rt->fib6_nsiblings || rt->should_flush)
   4739		return;
   4740
   4741	/* During lookup routes are evaluated in order, so we need to
   4742	 * make sure upper bounds are assigned from the first sibling
   4743	 * onwards.
   4744	 */
   4745	first = rt6_multipath_first_sibling(rt);
   4746	if (WARN_ON_ONCE(!first))
   4747		return;
   4748
   4749	total = rt6_multipath_total_weight(first);
   4750	rt6_multipath_upper_bound_set(first, total);
   4751}
   4752
   4753static int fib6_ifup(struct fib6_info *rt, void *p_arg)
   4754{
   4755	const struct arg_netdev_event *arg = p_arg;
   4756	struct net *net = dev_net(arg->dev);
   4757
   4758	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
   4759	    rt->fib6_nh->fib_nh_dev == arg->dev) {
   4760		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
   4761		fib6_update_sernum_upto_root(net, rt);
   4762		rt6_multipath_rebalance(rt);
   4763	}
   4764
   4765	return 0;
   4766}
   4767
   4768void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
   4769{
   4770	struct arg_netdev_event arg = {
   4771		.dev = dev,
   4772		{
   4773			.nh_flags = nh_flags,
   4774		},
   4775	};
   4776
   4777	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
   4778		arg.nh_flags |= RTNH_F_LINKDOWN;
   4779
   4780	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
   4781}
   4782
   4783/* only called for fib entries with inline fib6_nh */
   4784static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
   4785				   const struct net_device *dev)
   4786{
   4787	struct fib6_info *iter;
   4788
   4789	if (rt->fib6_nh->fib_nh_dev == dev)
   4790		return true;
   4791	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
   4792		if (iter->fib6_nh->fib_nh_dev == dev)
   4793			return true;
   4794
   4795	return false;
   4796}
   4797
   4798static void rt6_multipath_flush(struct fib6_info *rt)
   4799{
   4800	struct fib6_info *iter;
   4801
   4802	rt->should_flush = 1;
   4803	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
   4804		iter->should_flush = 1;
   4805}
   4806
   4807static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
   4808					     const struct net_device *down_dev)
   4809{
   4810	struct fib6_info *iter;
   4811	unsigned int dead = 0;
   4812
   4813	if (rt->fib6_nh->fib_nh_dev == down_dev ||
   4814	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
   4815		dead++;
   4816	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
   4817		if (iter->fib6_nh->fib_nh_dev == down_dev ||
   4818		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
   4819			dead++;
   4820
   4821	return dead;
   4822}
   4823
   4824static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
   4825				       const struct net_device *dev,
   4826				       unsigned char nh_flags)
   4827{
   4828	struct fib6_info *iter;
   4829
   4830	if (rt->fib6_nh->fib_nh_dev == dev)
   4831		rt->fib6_nh->fib_nh_flags |= nh_flags;
   4832	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
   4833		if (iter->fib6_nh->fib_nh_dev == dev)
   4834			iter->fib6_nh->fib_nh_flags |= nh_flags;
   4835}
   4836
   4837/* called with write lock held for table with rt */
   4838static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
   4839{
   4840	const struct arg_netdev_event *arg = p_arg;
   4841	const struct net_device *dev = arg->dev;
   4842	struct net *net = dev_net(dev);
   4843
   4844	if (rt == net->ipv6.fib6_null_entry || rt->nh)
   4845		return 0;
   4846
   4847	switch (arg->event) {
   4848	case NETDEV_UNREGISTER:
   4849		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
   4850	case NETDEV_DOWN:
   4851		if (rt->should_flush)
   4852			return -1;
   4853		if (!rt->fib6_nsiblings)
   4854			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
   4855		if (rt6_multipath_uses_dev(rt, dev)) {
   4856			unsigned int count;
   4857
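        			/* fib6_nsiblings does not count rt itself, so all
        			 * nexthops are dead once count reaches nsiblings + 1.
        			 */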
   4858			count = rt6_multipath_dead_count(rt, dev);
   4859			if (rt->fib6_nsiblings + 1 == count) {
   4860				rt6_multipath_flush(rt);
   4861				return -1;
   4862			}
   4863			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
   4864						   RTNH_F_LINKDOWN);
   4865			fib6_update_sernum(net, rt);
   4866			rt6_multipath_rebalance(rt);
   4867		}
   4868		return -2;
   4869	case NETDEV_CHANGE:
   4870		if (rt->fib6_nh->fib_nh_dev != dev ||
   4871		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
   4872			break;
   4873		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
   4874		rt6_multipath_rebalance(rt);
   4875		break;
   4876	}
   4877
   4878	return 0;
   4879}
   4880
   4881void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
   4882{
   4883	struct arg_netdev_event arg = {
   4884		.dev = dev,
   4885		{
   4886			.event = event,
   4887		},
   4888	};
   4889	struct net *net = dev_net(dev);
   4890
   4891	if (net->ipv6.sysctl.skip_notify_on_dev_down)
   4892		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
   4893	else
   4894		fib6_clean_all(net, fib6_ifdown, &arg);
   4895}
   4896
   4897void rt6_disable_ip(struct net_device *dev, unsigned long event)
   4898{
   4899	rt6_sync_down_dev(dev, event);
   4900	rt6_uncached_list_flush_dev(dev);
   4901	neigh_ifdown(&nd_tbl, dev);
   4902}
   4903
   4904struct rt6_mtu_change_arg {
   4905	struct net_device *dev;
   4906	unsigned int mtu;
   4907	struct fib6_info *f6i;
   4908};
   4909
   4910static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
   4911{
   4912	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
   4913	struct fib6_info *f6i = arg->f6i;
   4914
    4915	/* For an administrative MTU increase there is no way to discover
    4916	 * an IPv6 PMTU increase, so the PMTU has to be updated here.
    4917	 * Since RFC 1981 doesn't cover administrative MTU increases,
    4918	 * updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
    4919	 */
   4920	if (nh->fib_nh_dev == arg->dev) {
   4921		struct inet6_dev *idev = __in6_dev_get(arg->dev);
   4922		u32 mtu = f6i->fib6_pmtu;
   4923
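        		/* Clamp the route MTU down to the new device MTU, or
        		 * raise it if it was merely tracking the old device MTU.
        		 */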
   4924		if (mtu >= arg->mtu ||
   4925		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
   4926			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
   4927
   4928		spin_lock_bh(&rt6_exception_lock);
   4929		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
   4930		spin_unlock_bh(&rt6_exception_lock);
   4931	}
   4932
   4933	return 0;
   4934}
   4935
   4936static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
   4937{
   4938	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
   4939	struct inet6_dev *idev;
   4940
    4941	/* In IPv6 PMTU discovery is not optional,
    4942	   so a locked RTAX_MTU cannot disable it.
    4943	   We still use this lock to block changes
    4944	   caused by addrconf/ndisc.
    4945	*/
   4946
   4947	idev = __in6_dev_get(arg->dev);
   4948	if (!idev)
   4949		return 0;
   4950
   4951	if (fib6_metric_locked(f6i, RTAX_MTU))
   4952		return 0;
   4953
   4954	arg->f6i = f6i;
   4955	if (f6i->nh) {
   4956		/* fib6_nh_mtu_change only returns 0, so this is safe */
   4957		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
   4958						arg);
   4959	}
   4960
   4961	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
   4962}
   4963
   4964void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
   4965{
   4966	struct rt6_mtu_change_arg arg = {
   4967		.dev = dev,
   4968		.mtu = mtu,
   4969	};
   4970
   4971	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
   4972}
   4973
   4974static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
   4975	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
   4976	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
   4977	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
   4978	[RTA_OIF]               = { .type = NLA_U32 },
   4979	[RTA_IIF]		= { .type = NLA_U32 },
   4980	[RTA_PRIORITY]          = { .type = NLA_U32 },
   4981	[RTA_METRICS]           = { .type = NLA_NESTED },
   4982	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
   4983	[RTA_PREF]              = { .type = NLA_U8 },
   4984	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
   4985	[RTA_ENCAP]		= { .type = NLA_NESTED },
   4986	[RTA_EXPIRES]		= { .type = NLA_U32 },
   4987	[RTA_UID]		= { .type = NLA_U32 },
   4988	[RTA_MARK]		= { .type = NLA_U32 },
   4989	[RTA_TABLE]		= { .type = NLA_U32 },
   4990	[RTA_IP_PROTO]		= { .type = NLA_U8 },
   4991	[RTA_SPORT]		= { .type = NLA_U16 },
   4992	[RTA_DPORT]		= { .type = NLA_U16 },
   4993	[RTA_NH_ID]		= { .type = NLA_U32 },
   4994};
   4995
   4996static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
   4997			      struct fib6_config *cfg,
   4998			      struct netlink_ext_ack *extack)
   4999{
   5000	struct rtmsg *rtm;
   5001	struct nlattr *tb[RTA_MAX+1];
   5002	unsigned int pref;
   5003	int err;
   5004
   5005	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
   5006				     rtm_ipv6_policy, extack);
   5007	if (err < 0)
   5008		goto errout;
   5009
   5010	err = -EINVAL;
   5011	rtm = nlmsg_data(nlh);
   5012
   5013	if (rtm->rtm_tos) {
   5014		NL_SET_ERR_MSG(extack,
   5015			       "Invalid dsfield (tos): option not available for IPv6");
   5016		goto errout;
   5017	}
   5018
   5019	*cfg = (struct fib6_config){
   5020		.fc_table = rtm->rtm_table,
   5021		.fc_dst_len = rtm->rtm_dst_len,
   5022		.fc_src_len = rtm->rtm_src_len,
   5023		.fc_flags = RTF_UP,
   5024		.fc_protocol = rtm->rtm_protocol,
   5025		.fc_type = rtm->rtm_type,
   5026
   5027		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
   5028		.fc_nlinfo.nlh = nlh,
   5029		.fc_nlinfo.nl_net = sock_net(skb->sk),
   5030	};
   5031
   5032	if (rtm->rtm_type == RTN_UNREACHABLE ||
   5033	    rtm->rtm_type == RTN_BLACKHOLE ||
   5034	    rtm->rtm_type == RTN_PROHIBIT ||
   5035	    rtm->rtm_type == RTN_THROW)
   5036		cfg->fc_flags |= RTF_REJECT;
   5037
   5038	if (rtm->rtm_type == RTN_LOCAL)
   5039		cfg->fc_flags |= RTF_LOCAL;
   5040
   5041	if (rtm->rtm_flags & RTM_F_CLONED)
   5042		cfg->fc_flags |= RTF_CACHE;
   5043
   5044	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
   5045
   5046	if (tb[RTA_NH_ID]) {
   5047		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
   5048		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
   5049			NL_SET_ERR_MSG(extack,
   5050				       "Nexthop specification and nexthop id are mutually exclusive");
   5051			goto errout;
   5052		}
   5053		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
   5054	}
   5055
   5056	if (tb[RTA_GATEWAY]) {
   5057		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
   5058		cfg->fc_flags |= RTF_GATEWAY;
   5059	}
   5060	if (tb[RTA_VIA]) {
   5061		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
   5062		goto errout;
   5063	}
   5064
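        	/* RTA_DST/RTA_SRC carry only the (plen + 7) / 8 bytes covered
        	 * by the prefix length, not necessarily a full in6_addr.
        	 */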
   5065	if (tb[RTA_DST]) {
   5066		int plen = (rtm->rtm_dst_len + 7) >> 3;
   5067
   5068		if (nla_len(tb[RTA_DST]) < plen)
   5069			goto errout;
   5070
   5071		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
   5072	}
   5073
   5074	if (tb[RTA_SRC]) {
   5075		int plen = (rtm->rtm_src_len + 7) >> 3;
   5076
   5077		if (nla_len(tb[RTA_SRC]) < plen)
   5078			goto errout;
   5079
   5080		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
   5081	}
   5082
   5083	if (tb[RTA_PREFSRC])
   5084		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
   5085
   5086	if (tb[RTA_OIF])
   5087		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
   5088
   5089	if (tb[RTA_PRIORITY])
   5090		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
   5091
   5092	if (tb[RTA_METRICS]) {
   5093		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
   5094		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
   5095	}
   5096
   5097	if (tb[RTA_TABLE])
   5098		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
   5099
   5100	if (tb[RTA_MULTIPATH]) {
   5101		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
   5102		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
   5103
   5104		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
   5105						     cfg->fc_mp_len, extack);
   5106		if (err < 0)
   5107			goto errout;
   5108	}
   5109
   5110	if (tb[RTA_PREF]) {
   5111		pref = nla_get_u8(tb[RTA_PREF]);
   5112		if (pref != ICMPV6_ROUTER_PREF_LOW &&
   5113		    pref != ICMPV6_ROUTER_PREF_HIGH)
   5114			pref = ICMPV6_ROUTER_PREF_MEDIUM;
   5115		cfg->fc_flags |= RTF_PREF(pref);
   5116	}
   5117
   5118	if (tb[RTA_ENCAP])
   5119		cfg->fc_encap = tb[RTA_ENCAP];
   5120
   5121	if (tb[RTA_ENCAP_TYPE]) {
   5122		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
   5123
   5124		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
   5125		if (err < 0)
   5126			goto errout;
   5127	}
   5128
   5129	if (tb[RTA_EXPIRES]) {
   5130		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
   5131
   5132		if (addrconf_finite_timeout(timeout)) {
   5133			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
   5134			cfg->fc_flags |= RTF_EXPIRES;
   5135		}
   5136	}
   5137
   5138	err = 0;
   5139errout:
   5140	return err;
   5141}
   5142
   5143struct rt6_nh {
   5144	struct fib6_info *fib6_info;
   5145	struct fib6_config r_cfg;
   5146	struct list_head next;
   5147};
   5148
   5149static int ip6_route_info_append(struct net *net,
   5150				 struct list_head *rt6_nh_list,
   5151				 struct fib6_info *rt,
   5152				 struct fib6_config *r_cfg)
   5153{
   5154	struct rt6_nh *nh;
   5155	int err = -EEXIST;
   5156
   5157	list_for_each_entry(nh, rt6_nh_list, next) {
   5158		/* check if fib6_info already exists */
   5159		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
   5160			return err;
   5161	}
   5162
   5163	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
   5164	if (!nh)
   5165		return -ENOMEM;
   5166	nh->fib6_info = rt;
   5167	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
   5168	list_add_tail(&nh->next, rt6_nh_list);
   5169
   5170	return 0;
   5171}
   5172
   5173static void ip6_route_mpath_notify(struct fib6_info *rt,
   5174				   struct fib6_info *rt_last,
   5175				   struct nl_info *info,
   5176				   __u16 nlflags)
   5177{
    5178	/* If this is an APPEND route, then rt points to the first route
    5179	 * inserted and rt_last points to the last route inserted. Userspace
    5180	 * wants a consistent dump of the route which starts at the first
    5181	 * nexthop. Since sibling routes are always added at the end of
    5182	 * the list, find the first sibling of the last route appended.
    5183	 */
   5184	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
   5185		rt = list_first_entry(&rt_last->fib6_siblings,
   5186				      struct fib6_info,
   5187				      fib6_siblings);
   5188	}
   5189
   5190	if (rt)
   5191		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
   5192}
   5193
   5194static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
   5195{
   5196	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
   5197	bool should_notify = false;
   5198	struct fib6_info *leaf;
   5199	struct fib6_node *fn;
   5200
   5201	rcu_read_lock();
   5202	fn = rcu_dereference(rt->fib6_node);
   5203	if (!fn)
   5204		goto out;
   5205
   5206	leaf = rcu_dereference(fn->leaf);
   5207	if (!leaf)
   5208		goto out;
   5209
   5210	if (rt == leaf ||
   5211	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
   5212	     rt6_qualify_for_ecmp(leaf)))
   5213		should_notify = true;
   5214out:
   5215	rcu_read_unlock();
   5216
   5217	return should_notify;
   5218}
   5219
   5220static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
   5221			     struct netlink_ext_ack *extack)
   5222{
   5223	if (nla_len(nla) < sizeof(*gw)) {
   5224		NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
   5225		return -EINVAL;
   5226	}
   5227
   5228	*gw = nla_get_in6_addr(nla);
   5229
   5230	return 0;
   5231}
   5232
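        /* Build one fib6_info per rtnexthop in RTA_MULTIPATH, insert them
         * one by one, and unwind the routes already inserted if any
         * insertion fails.
         */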
   5233static int ip6_route_multipath_add(struct fib6_config *cfg,
   5234				   struct netlink_ext_ack *extack)
   5235{
   5236	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
   5237	struct nl_info *info = &cfg->fc_nlinfo;
   5238	struct fib6_config r_cfg;
   5239	struct rtnexthop *rtnh;
   5240	struct fib6_info *rt;
   5241	struct rt6_nh *err_nh;
   5242	struct rt6_nh *nh, *nh_safe;
   5243	__u16 nlflags;
   5244	int remaining;
   5245	int attrlen;
   5246	int err = 1;
   5247	int nhn = 0;
   5248	int replace = (cfg->fc_nlinfo.nlh &&
   5249		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
   5250	LIST_HEAD(rt6_nh_list);
   5251
   5252	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
   5253	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
   5254		nlflags |= NLM_F_APPEND;
   5255
   5256	remaining = cfg->fc_mp_len;
   5257	rtnh = (struct rtnexthop *)cfg->fc_mp;
   5258
   5259	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
   5260	 * fib6_info structs per nexthop
   5261	 */
   5262	while (rtnh_ok(rtnh, remaining)) {
   5263		memcpy(&r_cfg, cfg, sizeof(*cfg));
   5264		if (rtnh->rtnh_ifindex)
   5265			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
   5266
   5267		attrlen = rtnh_attrlen(rtnh);
   5268		if (attrlen > 0) {
   5269			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
   5270
   5271			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
   5272			if (nla) {
   5273				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
   5274							extack);
   5275				if (err)
   5276					goto cleanup;
   5277
   5278				r_cfg.fc_flags |= RTF_GATEWAY;
   5279			}
   5280			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
   5281
   5282			/* RTA_ENCAP_TYPE length checked in
   5283			 * lwtunnel_valid_encap_type_attr
   5284			 */
   5285			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
   5286			if (nla)
   5287				r_cfg.fc_encap_type = nla_get_u16(nla);
   5288		}
   5289
   5290		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
   5291		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
   5292		if (IS_ERR(rt)) {
   5293			err = PTR_ERR(rt);
   5294			rt = NULL;
   5295			goto cleanup;
   5296		}
   5297		if (!rt6_qualify_for_ecmp(rt)) {
   5298			err = -EINVAL;
   5299			NL_SET_ERR_MSG(extack,
   5300				       "Device only routes can not be added for IPv6 using the multipath API.");
   5301			fib6_info_release(rt);
   5302			goto cleanup;
   5303		}
   5304
   5305		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
   5306
   5307		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
   5308					    rt, &r_cfg);
   5309		if (err) {
   5310			fib6_info_release(rt);
   5311			goto cleanup;
   5312		}
   5313
   5314		rtnh = rtnh_next(rtnh, &remaining);
   5315	}
   5316
   5317	if (list_empty(&rt6_nh_list)) {
   5318		NL_SET_ERR_MSG(extack,
   5319			       "Invalid nexthop configuration - no valid nexthops");
   5320		return -EINVAL;
   5321	}
   5322
   5323	/* For add and replace, send one notification with all nexthops.
   5324	 * Skip the per-route notification in fib6_add_rt2node() and send
   5325	 * one with the full route when done.
   5326	 */
   5327	info->skip_notify = 1;
   5328
   5329	/* For add and replace, send one notification with all nexthops. For
   5330	 * append, send one notification with all appended nexthops.
   5331	 */
   5332	info->skip_notify_kernel = 1;
   5333
   5334	err_nh = NULL;
   5335	list_for_each_entry(nh, &rt6_nh_list, next) {
   5336		err = __ip6_ins_rt(nh->fib6_info, info, extack);
   5337		fib6_info_release(nh->fib6_info);
   5338
   5339		if (!err) {
   5340			/* save reference to last route successfully inserted */
   5341			rt_last = nh->fib6_info;
   5342
   5343			/* save reference to first route for notification */
   5344			if (!rt_notif)
   5345				rt_notif = nh->fib6_info;
   5346		}
   5347
   5348		/* nh->fib6_info is used or freed at this point, reset to NULL */
   5349		nh->fib6_info = NULL;
   5350		if (err) {
   5351			if (replace && nhn)
   5352				NL_SET_ERR_MSG_MOD(extack,
   5353						   "multipath route replace failed (check consistency of installed routes)");
   5354			err_nh = nh;
   5355			goto add_errout;
   5356		}
   5357
   5358		/* Because each route is added as if it were a single route, we
   5359		 * remove these flags after the first nexthop: if there is a
   5360		 * collision, we have already failed to add the first nexthop
   5361		 * (fib6_add_rt2node() has rejected it); when replacing, the old
   5362		 * nexthops have been replaced by the first new one, and the
   5363		 * rest should be appended to it.
   5364		 */
   5365		if (cfg->fc_nlinfo.nlh) {
   5366			cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
   5367							     NLM_F_REPLACE);
   5368			cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
   5369		}
   5370		nhn++;
   5371	}
   5372
   5373	/* An in-kernel notification should only be sent in case the new
   5374	 * multipath route is added as the first route in the node, or if
   5375	 * it was appended to it. We pass 'rt_notif' since it is the first
   5376	 * sibling and might allow us to skip some checks in the replace case.
   5377	 */
   5378	if (ip6_route_mpath_should_notify(rt_notif)) {
   5379		enum fib_event_type fib_event;
   5380
   5381		if (rt_notif->fib6_nsiblings != nhn - 1)
   5382			fib_event = FIB_EVENT_ENTRY_APPEND;
   5383		else
   5384			fib_event = FIB_EVENT_ENTRY_REPLACE;
   5385
   5386		err = call_fib6_multipath_entry_notifiers(info->nl_net,
   5387							  fib_event, rt_notif,
   5388							  nhn - 1, extack);
   5389		if (err) {
   5390			/* Delete all the siblings that were just added */
   5391			err_nh = NULL;
   5392			goto add_errout;
   5393		}
   5394	}
   5395
   5396	/* success ... tell user about new route */
   5397	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
   5398	goto cleanup;
   5399
   5400add_errout:
   5401	/* send notification for routes that were added so that
   5402	 * the delete notifications sent by ip6_route_del are
   5403	 * coherent
   5404	 */
   5405	if (rt_notif)
   5406		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
   5407
   5408	/* Delete routes that were already added */
   5409	list_for_each_entry(nh, &rt6_nh_list, next) {
   5410		if (err_nh == nh)
   5411			break;
   5412		ip6_route_del(&nh->r_cfg, extack);
   5413	}
   5414
   5415cleanup:
   5416	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
   5417		if (nh->fib6_info)
   5418			fib6_info_release(nh->fib6_info);
   5419		list_del(&nh->next);
   5420		kfree(nh);
   5421	}
   5422
   5423	return err;
   5424}
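
       /*
        * Usage sketch (standard iproute2 syntax): the RTA_MULTIPATH payload
        * parsed above is what a command such as
        *
        *   ip -6 route add 2001:db8::/64 \
        *           nexthop via fe80::1 dev eth0 weight 1 \
        *           nexthop via fe80::2 dev eth1 weight 2
        *
        * generates; each "nexthop" stanza becomes one struct rtnexthop.
        * iproute2 stores weight - 1 in rtnh_hops, which is why
        * fib_nh_weight is set to rtnh_hops + 1 when each entry is parsed.
        */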
   5425
   5426static int ip6_route_multipath_del(struct fib6_config *cfg,
   5427				   struct netlink_ext_ack *extack)
   5428{
   5429	struct fib6_config r_cfg;
   5430	struct rtnexthop *rtnh;
   5431	int last_err = 0;
   5432	int remaining;
   5433	int attrlen;
   5434	int err;
   5435
   5436	remaining = cfg->fc_mp_len;
   5437	rtnh = (struct rtnexthop *)cfg->fc_mp;
   5438
   5439	/* Parse a Multipath Entry */
   5440	while (rtnh_ok(rtnh, remaining)) {
   5441		memcpy(&r_cfg, cfg, sizeof(*cfg));
   5442		if (rtnh->rtnh_ifindex)
   5443			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
   5444
   5445		attrlen = rtnh_attrlen(rtnh);
   5446		if (attrlen > 0) {
   5447			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
   5448
   5449			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
   5450			if (nla) {
   5451				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
   5452							extack);
   5453				if (err) {
   5454					last_err = err;
   5455					goto next_rtnh;
   5456				}
   5457
   5458				r_cfg.fc_flags |= RTF_GATEWAY;
   5459			}
   5460		}
   5461		err = ip6_route_del(&r_cfg, extack);
   5462		if (err)
   5463			last_err = err;
   5464
   5465next_rtnh:
   5466		rtnh = rtnh_next(rtnh, &remaining);
   5467	}
   5468
   5469	return last_err;
   5470}
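
       /*
        * The matching delete is, in iproute2 terms (usage sketch):
        *
        *   ip -6 route del 2001:db8::/64 \
        *           nexthop via fe80::1 dev eth0 \
        *           nexthop via fe80::2 dev eth1
        *
        * Unlike the add path, a failure on one nexthop does not stop the
        * loop: every entry is attempted and the last error is returned.
        */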
   5471
   5472static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
   5473			      struct netlink_ext_ack *extack)
   5474{
   5475	struct fib6_config cfg;
   5476	int err;
   5477
   5478	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
   5479	if (err < 0)
   5480		return err;
   5481
   5482	if (cfg.fc_nh_id &&
   5483	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
   5484		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
   5485		return -EINVAL;
   5486	}
   5487
   5488	if (cfg.fc_mp)
   5489		return ip6_route_multipath_del(&cfg, extack);
   5490	else {
   5491		cfg.fc_delete_all_nh = 1;
   5492		return ip6_route_del(&cfg, extack);
   5493	}
   5494}
   5495
   5496static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
   5497			      struct netlink_ext_ack *extack)
   5498{
   5499	struct fib6_config cfg;
   5500	int err;
   5501
   5502	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
   5503	if (err < 0)
   5504		return err;
   5505
   5506	if (cfg.fc_metric == 0)
   5507		cfg.fc_metric = IP6_RT_PRIO_USER;
   5508
   5509	if (cfg.fc_mp)
   5510		return ip6_route_multipath_add(&cfg, extack);
   5511	else
   5512		return ip6_route_add(&cfg, GFP_KERNEL, extack);
   5513}
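
       /*
        * IP6_RT_PRIO_USER is 1024, so a route added without an explicit
        * metric, e.g. (usage sketch)
        *
        *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
        *
        * is installed with fib6_metric 1024 and shown as "metric 1024" by
        * "ip -6 route show".
        */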
   5514
   5515/* add the overhead of this fib6_nh to nexthop_len */
   5516static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
   5517{
   5518	int *nexthop_len = arg;
   5519
   5520	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
   5521		     + NLA_ALIGN(sizeof(struct rtnexthop))
   5522		     + nla_total_size(16); /* RTA_GATEWAY */
   5523
   5524	if (nh->fib_nh_lws) {
   5525		/* RTA_ENCAP */
   5526		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
   5527		/* RTA_ENCAP_TYPE (u16) */
   5528		*nexthop_len += nla_total_size(2);
   5529	}
   5530
   5531	return 0;
   5532}
   5533
   5534static size_t rt6_nlmsg_size(struct fib6_info *f6i)
   5535{
   5536	int nexthop_len;
   5537
   5538	if (f6i->nh) {
   5539		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
   5540		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
   5541					 &nexthop_len);
   5542	} else {
   5543		struct fib6_nh *nh = f6i->fib6_nh;
   5544
   5545		nexthop_len = 0;
   5546		if (f6i->fib6_nsiblings) {
   5547			nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
   5548				    + NLA_ALIGN(sizeof(struct rtnexthop))
   5549				    + nla_total_size(16) /* RTA_GATEWAY */
   5550				    + lwtunnel_get_encap_size(nh->fib_nh_lws);
   5551
   5552			nexthop_len *= f6i->fib6_nsiblings;
   5553		}
   5554		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
   5555	}
   5556
   5557	return NLMSG_ALIGN(sizeof(struct rtmsg))
   5558	       + nla_total_size(16) /* RTA_SRC */
   5559	       + nla_total_size(16) /* RTA_DST */
   5560	       + nla_total_size(16) /* RTA_GATEWAY */
   5561	       + nla_total_size(16) /* RTA_PREFSRC */
   5562	       + nla_total_size(4) /* RTA_TABLE */
   5563	       + nla_total_size(4) /* RTA_IIF */
   5564	       + nla_total_size(4) /* RTA_OIF */
   5565	       + nla_total_size(4) /* RTA_PRIORITY */
   5566	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
   5567	       + nla_total_size(sizeof(struct rta_cacheinfo))
   5568	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
   5569	       + nla_total_size(1) /* RTA_PREF */
   5570	       + nexthop_len;
   5571}
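
       /*
        * rt6_nlmsg_size() is a worst-case estimate: it sizes every
        * attribute rt6_fill_node() might emit, and callers pass the result
        * to nlmsg_new(). That is why rt6_fill_node() returning -EMSGSIZE
        * against such a buffer is treated as a bug (see the WARN_ON() in
        * inet6_rt_notify() and fib6_rt_update() below).
        */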
   5572
   5573static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
   5574				 unsigned char *flags)
   5575{
   5576	if (nexthop_is_multipath(nh)) {
   5577		struct nlattr *mp;
   5578
   5579		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
   5580		if (!mp)
   5581			goto nla_put_failure;
   5582
   5583		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
   5584			goto nla_put_failure;
   5585
   5586		nla_nest_end(skb, mp);
   5587	} else {
   5588		struct fib6_nh *fib6_nh;
   5589
   5590		fib6_nh = nexthop_fib6_nh(nh);
   5591		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
   5592				     flags, false) < 0)
   5593			goto nla_put_failure;
   5594	}
   5595
   5596	return 0;
   5597
   5598nla_put_failure:
   5599	return -EMSGSIZE;
   5600}
   5601
   5602static int rt6_fill_node(struct net *net, struct sk_buff *skb,
   5603			 struct fib6_info *rt, struct dst_entry *dst,
   5604			 struct in6_addr *dest, struct in6_addr *src,
   5605			 int iif, int type, u32 portid, u32 seq,
   5606			 unsigned int flags)
   5607{
   5608	struct rt6_info *rt6 = (struct rt6_info *)dst;
   5609	struct rt6key *rt6_dst, *rt6_src;
   5610	u32 *pmetrics, table, rt6_flags;
   5611	unsigned char nh_flags = 0;
   5612	struct nlmsghdr *nlh;
   5613	struct rtmsg *rtm;
   5614	long expires = 0;
   5615
   5616	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
   5617	if (!nlh)
   5618		return -EMSGSIZE;
   5619
   5620	if (rt6) {
   5621		rt6_dst = &rt6->rt6i_dst;
   5622		rt6_src = &rt6->rt6i_src;
   5623		rt6_flags = rt6->rt6i_flags;
   5624	} else {
   5625		rt6_dst = &rt->fib6_dst;
   5626		rt6_src = &rt->fib6_src;
   5627		rt6_flags = rt->fib6_flags;
   5628	}
   5629
   5630	rtm = nlmsg_data(nlh);
   5631	rtm->rtm_family = AF_INET6;
   5632	rtm->rtm_dst_len = rt6_dst->plen;
   5633	rtm->rtm_src_len = rt6_src->plen;
   5634	rtm->rtm_tos = 0;
   5635	if (rt->fib6_table)
   5636		table = rt->fib6_table->tb6_id;
   5637	else
   5638		table = RT6_TABLE_UNSPEC;
   5639	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
   5640	if (nla_put_u32(skb, RTA_TABLE, table))
   5641		goto nla_put_failure;
   5642
   5643	rtm->rtm_type = rt->fib6_type;
   5644	rtm->rtm_flags = 0;
   5645	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
   5646	rtm->rtm_protocol = rt->fib6_protocol;
   5647
   5648	if (rt6_flags & RTF_CACHE)
   5649		rtm->rtm_flags |= RTM_F_CLONED;
   5650
   5651	if (dest) {
   5652		if (nla_put_in6_addr(skb, RTA_DST, dest))
   5653			goto nla_put_failure;
   5654		rtm->rtm_dst_len = 128;
   5655	} else if (rtm->rtm_dst_len)
   5656		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
   5657			goto nla_put_failure;
   5658#ifdef CONFIG_IPV6_SUBTREES
   5659	if (src) {
   5660		if (nla_put_in6_addr(skb, RTA_SRC, src))
   5661			goto nla_put_failure;
   5662		rtm->rtm_src_len = 128;
   5663	} else if (rtm->rtm_src_len &&
   5664		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
   5665		goto nla_put_failure;
   5666#endif
   5667	if (iif) {
   5668#ifdef CONFIG_IPV6_MROUTE
   5669		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
   5670			int err = ip6mr_get_route(net, skb, rtm, portid);
   5671
   5672			if (err == 0)
   5673				return 0;
   5674			if (err < 0)
   5675				goto nla_put_failure;
   5676		} else
   5677#endif
   5678			if (nla_put_u32(skb, RTA_IIF, iif))
   5679				goto nla_put_failure;
   5680	} else if (dest) {
   5681		struct in6_addr saddr_buf;
   5682		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
   5683		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
   5684			goto nla_put_failure;
   5685	}
   5686
   5687	if (rt->fib6_prefsrc.plen) {
   5688		struct in6_addr saddr_buf;
   5689		saddr_buf = rt->fib6_prefsrc.addr;
   5690		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
   5691			goto nla_put_failure;
   5692	}
   5693
   5694	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
   5695	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
   5696		goto nla_put_failure;
   5697
   5698	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
   5699		goto nla_put_failure;
   5700
   5701	/* For multipath routes, walk the siblings list and add
   5702	 * each as a nexthop within RTA_MULTIPATH.
   5703	 */
   5704	if (rt6) {
   5705		if (rt6_flags & RTF_GATEWAY &&
   5706		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
   5707			goto nla_put_failure;
   5708
   5709		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
   5710			goto nla_put_failure;
   5711
   5712		if (dst->lwtstate &&
   5713		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
   5714			goto nla_put_failure;
   5715	} else if (rt->fib6_nsiblings) {
   5716		struct fib6_info *sibling, *next_sibling;
   5717		struct nlattr *mp;
   5718
   5719		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
   5720		if (!mp)
   5721			goto nla_put_failure;
   5722
   5723		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
   5724				    rt->fib6_nh->fib_nh_weight, AF_INET6,
   5725				    0) < 0)
   5726			goto nla_put_failure;
   5727
   5728		list_for_each_entry_safe(sibling, next_sibling,
   5729					 &rt->fib6_siblings, fib6_siblings) {
   5730			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
   5731					    sibling->fib6_nh->fib_nh_weight,
   5732					    AF_INET6, 0) < 0)
   5733				goto nla_put_failure;
   5734		}
   5735
   5736		nla_nest_end(skb, mp);
   5737	} else if (rt->nh) {
   5738		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
   5739			goto nla_put_failure;
   5740
   5741		if (nexthop_is_blackhole(rt->nh))
   5742			rtm->rtm_type = RTN_BLACKHOLE;
   5743
   5744		if (net->ipv4.sysctl_nexthop_compat_mode &&
   5745		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
   5746			goto nla_put_failure;
   5747
   5748		rtm->rtm_flags |= nh_flags;
   5749	} else {
   5750		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
   5751				     &nh_flags, false) < 0)
   5752			goto nla_put_failure;
   5753
   5754		rtm->rtm_flags |= nh_flags;
   5755	}
   5756
   5757	if (rt6_flags & RTF_EXPIRES) {
   5758		expires = dst ? dst->expires : rt->expires;
   5759		expires -= jiffies;
   5760	}
   5761
   5762	if (!dst) {
   5763		if (READ_ONCE(rt->offload))
   5764			rtm->rtm_flags |= RTM_F_OFFLOAD;
   5765		if (READ_ONCE(rt->trap))
   5766			rtm->rtm_flags |= RTM_F_TRAP;
   5767		if (READ_ONCE(rt->offload_failed))
   5768			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
   5769	}
   5770
   5771	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
   5772		goto nla_put_failure;
   5773
   5774	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
   5775		goto nla_put_failure;
   5776
   5777
   5778	nlmsg_end(skb, nlh);
   5779	return 0;
   5780
   5781nla_put_failure:
   5782	nlmsg_cancel(skb, nlh);
   5783	return -EMSGSIZE;
   5784}
   5785
   5786static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
   5787{
   5788	const struct net_device *dev = arg;
   5789
   5790	if (nh->fib_nh_dev == dev)
   5791		return 1;
   5792
   5793	return 0;
   5794}
   5795
   5796static bool fib6_info_uses_dev(const struct fib6_info *f6i,
   5797			       const struct net_device *dev)
   5798{
   5799	if (f6i->nh) {
   5800		struct net_device *_dev = (struct net_device *)dev;
   5801
   5802		return !!nexthop_for_each_fib6_nh(f6i->nh,
   5803						  fib6_info_nh_uses_dev,
   5804						  _dev);
   5805	}
   5806
   5807	if (f6i->fib6_nh->fib_nh_dev == dev)
   5808		return true;
   5809
   5810	if (f6i->fib6_nsiblings) {
   5811		struct fib6_info *sibling, *next_sibling;
   5812
   5813		list_for_each_entry_safe(sibling, next_sibling,
   5814					 &f6i->fib6_siblings, fib6_siblings) {
   5815			if (sibling->fib6_nh->fib_nh_dev == dev)
   5816				return true;
   5817		}
   5818	}
   5819
   5820	return false;
   5821}
   5822
   5823struct fib6_nh_exception_dump_walker {
   5824	struct rt6_rtnl_dump_arg *dump;
   5825	struct fib6_info *rt;
   5826	unsigned int flags;
   5827	unsigned int skip;
   5828	unsigned int count;
   5829};
   5830
   5831static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
   5832{
   5833	struct fib6_nh_exception_dump_walker *w = arg;
   5834	struct rt6_rtnl_dump_arg *dump = w->dump;
   5835	struct rt6_exception_bucket *bucket;
   5836	struct rt6_exception *rt6_ex;
   5837	int i, err;
   5838
   5839	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
   5840	if (!bucket)
   5841		return 0;
   5842
   5843	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
   5844		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
   5845			if (w->skip) {
   5846				w->skip--;
   5847				continue;
   5848			}
   5849
   5850			/* Expiration of entries doesn't bump sernum, insertion
   5851			 * does. Removal is triggered by insertion, so we can
   5852			 * rely on the fact that if entries change between two
   5853			 * partial dumps, this node is scanned again completely,
   5854			 * see rt6_insert_exception() and fib6_dump_table().
   5855			 *
   5856			 * Count expired entries we go through as handled
   5857			 * entries that we'll skip next time, in case of partial
   5858			 * node dump. Otherwise, if entries expire meanwhile,
   5859			 * we'll skip the wrong amount.
   5860			 */
   5861			if (rt6_check_expired(rt6_ex->rt6i)) {
   5862				w->count++;
   5863				continue;
   5864			}
   5865
   5866			err = rt6_fill_node(dump->net, dump->skb, w->rt,
   5867					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
   5868					    RTM_NEWROUTE,
   5869					    NETLINK_CB(dump->cb->skb).portid,
   5870					    dump->cb->nlh->nlmsg_seq, w->flags);
   5871			if (err)
   5872				return err;
   5873
   5874			w->count++;
   5875		}
   5876		bucket++;
   5877	}
   5878
   5879	return 0;
   5880}
   5881
   5882/* Return -1 if done with the node, or the number of handled routes on a partial dump */
   5883int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
   5884{
   5885	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
   5886	struct fib_dump_filter *filter = &arg->filter;
   5887	unsigned int flags = NLM_F_MULTI;
   5888	struct net *net = arg->net;
   5889	int count = 0;
   5890
   5891	if (rt == net->ipv6.fib6_null_entry)
   5892		return -1;
   5893
   5894	if ((filter->flags & RTM_F_PREFIX) &&
   5895	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
   5896		/* success since this is not a prefix route */
   5897		return -1;
   5898	}
   5899	if (filter->filter_set &&
   5900	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
   5901	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
   5902	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
   5903		return -1;
   5904	}
   5905
   5906	if (filter->filter_set ||
   5907	    !filter->dump_routes || !filter->dump_exceptions) {
   5908		flags |= NLM_F_DUMP_FILTERED;
   5909	}
   5910
   5911	if (filter->dump_routes) {
   5912		if (skip) {
   5913			skip--;
   5914		} else {
   5915			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
   5916					  0, RTM_NEWROUTE,
   5917					  NETLINK_CB(arg->cb->skb).portid,
   5918					  arg->cb->nlh->nlmsg_seq, flags)) {
   5919				return 0;
   5920			}
   5921			count++;
   5922		}
   5923	}
   5924
   5925	if (filter->dump_exceptions) {
   5926		struct fib6_nh_exception_dump_walker w = { .dump = arg,
   5927							   .rt = rt,
   5928							   .flags = flags,
   5929							   .skip = skip,
   5930							   .count = 0 };
   5931		int err;
   5932
   5933		rcu_read_lock();
   5934		if (rt->nh) {
   5935			err = nexthop_for_each_fib6_nh(rt->nh,
   5936						       rt6_nh_dump_exceptions,
   5937						       &w);
   5938		} else {
   5939			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
   5940		}
   5941		rcu_read_unlock();
   5942
   5943		if (err)
   5944			return count += w.count;
   5945	}
   5946
   5947	return -1;
   5948}
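
       /*
        * The return convention above lets the fib walker resume a partial
        * dump: -1 means this fib6_node is fully dumped, while a count >= 0
        * means the skb filled up after that many entries, so a later pass
        * over the same node can skip them (see fib6_dump_table()).
        */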
   5949
   5950static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
   5951					const struct nlmsghdr *nlh,
   5952					struct nlattr **tb,
   5953					struct netlink_ext_ack *extack)
   5954{
   5955	struct rtmsg *rtm;
   5956	int i, err;
   5957
   5958	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
   5959		NL_SET_ERR_MSG_MOD(extack,
   5960				   "Invalid header for get route request");
   5961		return -EINVAL;
   5962	}
   5963
   5964	if (!netlink_strict_get_check(skb))
   5965		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
   5966					      rtm_ipv6_policy, extack);
   5967
   5968	rtm = nlmsg_data(nlh);
   5969	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
   5970	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
   5971	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
   5972	    rtm->rtm_type) {
   5973		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
   5974		return -EINVAL;
   5975	}
   5976	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
   5977		NL_SET_ERR_MSG_MOD(extack,
   5978				   "Invalid flags for get route request");
   5979		return -EINVAL;
   5980	}
   5981
   5982	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
   5983					    rtm_ipv6_policy, extack);
   5984	if (err)
   5985		return err;
   5986
   5987	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
   5988	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
   5989		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
   5990		return -EINVAL;
   5991	}
   5992
   5993	for (i = 0; i <= RTA_MAX; i++) {
   5994		if (!tb[i])
   5995			continue;
   5996
   5997		switch (i) {
   5998		case RTA_SRC:
   5999		case RTA_DST:
   6000		case RTA_IIF:
   6001		case RTA_OIF:
   6002		case RTA_MARK:
   6003		case RTA_UID:
   6004		case RTA_SPORT:
   6005		case RTA_DPORT:
   6006		case RTA_IP_PROTO:
   6007			break;
   6008		default:
   6009			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
   6010			return -EINVAL;
   6011		}
   6012	}
   6013
   6014	return 0;
   6015}
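
       /*
        * Under strict checking a GETROUTE request must therefore leave all
        * unused rtmsg header fields zero and carry only the attributes
        * accepted above. As a sketch, the minimal request modern iproute2
        * builds for "ip -6 route get 2001:db8::1" is: rtm_family =
        * AF_INET6, rtm_dst_len = 128, one RTA_DST attribute, all else zero.
        */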
   6016
   6017static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
   6018			      struct netlink_ext_ack *extack)
   6019{
   6020	struct net *net = sock_net(in_skb->sk);
   6021	struct nlattr *tb[RTA_MAX+1];
   6022	int err, iif = 0, oif = 0;
   6023	struct fib6_info *from;
   6024	struct dst_entry *dst;
   6025	struct rt6_info *rt;
   6026	struct sk_buff *skb;
   6027	struct rtmsg *rtm;
   6028	struct flowi6 fl6 = {};
   6029	bool fibmatch;
   6030
   6031	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
   6032	if (err < 0)
   6033		goto errout;
   6034
   6035	err = -EINVAL;
   6036	rtm = nlmsg_data(nlh);
   6037	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
   6038	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
   6039
   6040	if (tb[RTA_SRC]) {
   6041		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
   6042			goto errout;
   6043
   6044		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
   6045	}
   6046
   6047	if (tb[RTA_DST]) {
   6048		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
   6049			goto errout;
   6050
   6051		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
   6052	}
   6053
   6054	if (tb[RTA_IIF])
   6055		iif = nla_get_u32(tb[RTA_IIF]);
   6056
   6057	if (tb[RTA_OIF])
   6058		oif = nla_get_u32(tb[RTA_OIF]);
   6059
   6060	if (tb[RTA_MARK])
   6061		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
   6062
   6063	if (tb[RTA_UID])
   6064		fl6.flowi6_uid = make_kuid(current_user_ns(),
   6065					   nla_get_u32(tb[RTA_UID]));
   6066	else
   6067		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
   6068
   6069	if (tb[RTA_SPORT])
   6070		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
   6071
   6072	if (tb[RTA_DPORT])
   6073		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
   6074
   6075	if (tb[RTA_IP_PROTO]) {
   6076		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
   6077						  &fl6.flowi6_proto, AF_INET6,
   6078						  extack);
   6079		if (err)
   6080			goto errout;
   6081	}
   6082
   6083	if (iif) {
   6084		struct net_device *dev;
   6085		int flags = 0;
   6086
   6087		rcu_read_lock();
   6088
   6089		dev = dev_get_by_index_rcu(net, iif);
   6090		if (!dev) {
   6091			rcu_read_unlock();
   6092			err = -ENODEV;
   6093			goto errout;
   6094		}
   6095
   6096		fl6.flowi6_iif = iif;
   6097
   6098		if (!ipv6_addr_any(&fl6.saddr))
   6099			flags |= RT6_LOOKUP_F_HAS_SADDR;
   6100
   6101		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
   6102
   6103		rcu_read_unlock();
   6104	} else {
   6105		fl6.flowi6_oif = oif;
   6106
   6107		dst = ip6_route_output(net, NULL, &fl6);
   6108	}
   6109
   6110
   6111	rt = container_of(dst, struct rt6_info, dst);
   6112	if (rt->dst.error) {
   6113		err = rt->dst.error;
   6114		ip6_rt_put(rt);
   6115		goto errout;
   6116	}
   6117
   6118	if (rt == net->ipv6.ip6_null_entry) {
   6119		err = rt->dst.error;
   6120		ip6_rt_put(rt);
   6121		goto errout;
   6122	}
   6123
   6124	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
   6125	if (!skb) {
   6126		ip6_rt_put(rt);
   6127		err = -ENOBUFS;
   6128		goto errout;
   6129	}
   6130
   6131	skb_dst_set(skb, &rt->dst);
   6132
   6133	rcu_read_lock();
   6134	from = rcu_dereference(rt->from);
   6135	if (from) {
   6136		if (fibmatch)
   6137			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
   6138					    iif, RTM_NEWROUTE,
   6139					    NETLINK_CB(in_skb).portid,
   6140					    nlh->nlmsg_seq, 0);
   6141		else
   6142			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
   6143					    &fl6.saddr, iif, RTM_NEWROUTE,
   6144					    NETLINK_CB(in_skb).portid,
   6145					    nlh->nlmsg_seq, 0);
   6146	} else {
   6147		err = -ENETUNREACH;
   6148	}
   6149	rcu_read_unlock();
   6150
   6151	if (err < 0) {
   6152		kfree_skb(skb);
   6153		goto errout;
   6154	}
   6155
   6156	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
   6157errout:
   6158	return err;
   6159}
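
       /*
        * Usage sketch: "ip -6 route get 2001:db8::1" reports the resolved
        * dst (including cached exceptions), while adding "fibmatch" sets
        * RTM_F_FIB_MATCH so the reply describes the matching FIB entry
        * itself, i.e. the branch above that dumps "from" without the dst.
        */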
   6160
   6161void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
   6162		     unsigned int nlm_flags)
   6163{
   6164	struct sk_buff *skb;
   6165	struct net *net = info->nl_net;
   6166	u32 seq;
   6167	int err;
   6168
   6169	err = -ENOBUFS;
   6170	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
   6171
   6172	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
   6173	if (!skb)
   6174		goto errout;
   6175
   6176	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
   6177			    event, info->portid, seq, nlm_flags);
   6178	if (err < 0) {
   6179		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
   6180		WARN_ON(err == -EMSGSIZE);
   6181		kfree_skb(skb);
   6182		goto errout;
   6183	}
   6184	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
   6185		    info->nlh, gfp_any());
   6186	return;
   6187errout:
   6188	if (err < 0)
   6189		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
   6190}
   6191
   6192void fib6_rt_update(struct net *net, struct fib6_info *rt,
   6193		    struct nl_info *info)
   6194{
   6195	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
   6196	struct sk_buff *skb;
   6197	int err = -ENOBUFS;
   6198
   6199	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
   6200	if (!skb)
   6201		goto errout;
   6202
   6203	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
   6204			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
   6205	if (err < 0) {
   6206		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
   6207		WARN_ON(err == -EMSGSIZE);
   6208		kfree_skb(skb);
   6209		goto errout;
   6210	}
   6211	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
   6212		    info->nlh, gfp_any());
   6213	return;
   6214errout:
   6215	if (err < 0)
   6216		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
   6217}
   6218
   6219void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
   6220			    bool offload, bool trap, bool offload_failed)
   6221{
   6222	struct sk_buff *skb;
   6223	int err;
   6224
   6225	if (READ_ONCE(f6i->offload) == offload &&
   6226	    READ_ONCE(f6i->trap) == trap &&
   6227	    READ_ONCE(f6i->offload_failed) == offload_failed)
   6228		return;
   6229
   6230	WRITE_ONCE(f6i->offload, offload);
   6231	WRITE_ONCE(f6i->trap, trap);
   6232
   6233	/* 2 means send notifications only if offload_failed was changed. */
   6234	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
   6235	    READ_ONCE(f6i->offload_failed) == offload_failed)
   6236		return;
   6237
   6238	WRITE_ONCE(f6i->offload_failed, offload_failed);
   6239
   6240	if (!rcu_access_pointer(f6i->fib6_node))
   6241		/* The route was removed from the tree, do not send
   6242		 * notification.
   6243		 */
   6244		return;
   6245
   6246	if (!net->ipv6.sysctl.fib_notify_on_flag_change)
   6247		return;
   6248
   6249	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
   6250	if (!skb) {
   6251		err = -ENOBUFS;
   6252		goto errout;
   6253	}
   6254
   6255	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
   6256			    0, 0);
   6257	if (err < 0) {
   6258		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
   6259		WARN_ON(err == -EMSGSIZE);
   6260		kfree_skb(skb);
   6261		goto errout;
   6262	}
   6263
   6264	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
   6265	return;
   6266
   6267errout:
   6268	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
   6269}
   6270EXPORT_SYMBOL(fib6_info_hw_flags_set);
   6271
   6272static int ip6_route_dev_notify(struct notifier_block *this,
   6273				unsigned long event, void *ptr)
   6274{
   6275	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   6276	struct net *net = dev_net(dev);
   6277
   6278	if (!(dev->flags & IFF_LOOPBACK))
   6279		return NOTIFY_OK;
   6280
   6281	if (event == NETDEV_REGISTER) {
   6282		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
   6283		net->ipv6.ip6_null_entry->dst.dev = dev;
   6284		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
   6285#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6286		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
   6287		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
   6288		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
   6289		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
   6290#endif
   6291	} else if (event == NETDEV_UNREGISTER &&
   6292		   dev->reg_state != NETREG_UNREGISTERED) {
   6293		/* NETDEV_UNREGISTER can be fired multiple times by
   6294		 * netdev_wait_allrefs(). Make sure we only call this once.
   6295		 */
   6296		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
   6297#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6298		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
   6299		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
   6300#endif
   6301	}
   6302
   6303	return NOTIFY_OK;
   6304}
   6305
   6306/*
   6307 *	/proc
   6308 */
   6309
   6310#ifdef CONFIG_PROC_FS
   6311static int rt6_stats_seq_show(struct seq_file *seq, void *v)
   6312{
   6313	struct net *net = (struct net *)seq->private;
   6314	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
   6315		   net->ipv6.rt6_stats->fib_nodes,
   6316		   net->ipv6.rt6_stats->fib_route_nodes,
   6317		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
   6318		   net->ipv6.rt6_stats->fib_rt_entries,
   6319		   net->ipv6.rt6_stats->fib_rt_cache,
   6320		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
   6321		   net->ipv6.rt6_stats->fib_discarded_routes);
   6322
   6323	return 0;
   6324}
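
       /*
        * Example output (hypothetical values), one hex field per counter in
        * the seq_printf() order above:
        *
        *   $ cat /proc/net/rt6_stats
        *   0043 0028 0000 0016 0000 0002 001c
        *
        * i.e. fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
        * fib_rt_cache, allocated dst entries, fib_discarded_routes.
        */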
   6325#endif	/* CONFIG_PROC_FS */
   6326
   6327#ifdef CONFIG_SYSCTL
   6328
   6329static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
   6330			      void *buffer, size_t *lenp, loff_t *ppos)
   6331{
   6332	struct net *net;
   6333	int delay;
   6334	int ret;
   6335	if (!write)
   6336		return -EINVAL;
   6337
   6338	net = (struct net *)ctl->extra1;
   6339	delay = net->ipv6.sysctl.flush_delay;
   6340	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
   6341	if (ret)
   6342		return ret;
   6343
   6344	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
   6345	return 0;
   6346}
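
       /*
        * Usage sketch: any write to the (write-only, mode 0200) file
        * triggers a garbage-collection run, e.g.
        *
        *   echo 1 > /proc/sys/net/ipv6/route/flush
        *
        * Reads fail with -EINVAL, as enforced above.
        */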
   6347
   6348static struct ctl_table ipv6_route_table_template[] = {
   6349	{
   6350		.procname	=	"max_size",
   6351		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
   6352		.maxlen		=	sizeof(int),
   6353		.mode		=	0644,
   6354		.proc_handler	=	proc_dointvec,
   6355	},
   6356	{
   6357		.procname	=	"gc_thresh",
   6358		.data		=	&ip6_dst_ops_template.gc_thresh,
   6359		.maxlen		=	sizeof(int),
   6360		.mode		=	0644,
   6361		.proc_handler	=	proc_dointvec,
   6362	},
   6363	{
   6364		.procname	=	"flush",
   6365		.data		=	&init_net.ipv6.sysctl.flush_delay,
   6366		.maxlen		=	sizeof(int),
   6367		.mode		=	0200,
   6368		.proc_handler	=	ipv6_sysctl_rtcache_flush
   6369	},
   6370	{
   6371		.procname	=	"gc_min_interval",
   6372		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
   6373		.maxlen		=	sizeof(int),
   6374		.mode		=	0644,
   6375		.proc_handler	=	proc_dointvec_jiffies,
   6376	},
   6377	{
   6378		.procname	=	"gc_timeout",
   6379		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
   6380		.maxlen		=	sizeof(int),
   6381		.mode		=	0644,
   6382		.proc_handler	=	proc_dointvec_jiffies,
   6383	},
   6384	{
   6385		.procname	=	"gc_interval",
   6386		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
   6387		.maxlen		=	sizeof(int),
   6388		.mode		=	0644,
   6389		.proc_handler	=	proc_dointvec_jiffies,
   6390	},
   6391	{
   6392		.procname	=	"gc_elasticity",
   6393		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
   6394		.maxlen		=	sizeof(int),
   6395		.mode		=	0644,
   6396		.proc_handler	=	proc_dointvec,
   6397	},
   6398	{
   6399		.procname	=	"mtu_expires",
   6400		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
   6401		.maxlen		=	sizeof(int),
   6402		.mode		=	0644,
   6403		.proc_handler	=	proc_dointvec_jiffies,
   6404	},
   6405	{
   6406		.procname	=	"min_adv_mss",
   6407		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
   6408		.maxlen		=	sizeof(int),
   6409		.mode		=	0644,
   6410		.proc_handler	=	proc_dointvec,
   6411	},
   6412	{
   6413		.procname	=	"gc_min_interval_ms",
   6414		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
   6415		.maxlen		=	sizeof(int),
   6416		.mode		=	0644,
   6417		.proc_handler	=	proc_dointvec_ms_jiffies,
   6418	},
   6419	{
   6420		.procname	=	"skip_notify_on_dev_down",
   6421		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
   6422		.maxlen		=	sizeof(int),
   6423		.mode		=	0644,
   6424		.proc_handler	=	proc_dointvec_minmax,
   6425		.extra1		=	SYSCTL_ZERO,
   6426		.extra2		=	SYSCTL_ONE,
   6427	},
   6428	{ }
   6429};
   6430
   6431struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
   6432{
   6433	struct ctl_table *table;
   6434
   6435	table = kmemdup(ipv6_route_table_template,
   6436			sizeof(ipv6_route_table_template),
   6437			GFP_KERNEL);
   6438
   6439	if (table) {
   6440		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
   6441		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
   6442		table[2].data = &net->ipv6.sysctl.flush_delay;
   6443		table[2].extra1 = net;
   6444		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
   6445		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
   6446		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
   6447		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
   6448		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
   6449		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
   6450		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
   6451		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
   6452
   6453		/* Don't export sysctls to unprivileged users */
   6454		if (net->user_ns != &init_user_ns)
   6455			table[1].procname = NULL;
   6456	}
   6457
   6458	return table;
   6459}
   6460#endif
   6461
   6462static int __net_init ip6_route_net_init(struct net *net)
   6463{
   6464	int ret = -ENOMEM;
   6465
   6466	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
   6467	       sizeof(net->ipv6.ip6_dst_ops));
   6468
   6469	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
   6470		goto out_ip6_dst_ops;
   6471
   6472	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
   6473	if (!net->ipv6.fib6_null_entry)
   6474		goto out_ip6_dst_entries;
   6475	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
   6476	       sizeof(*net->ipv6.fib6_null_entry));
   6477
   6478	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
   6479					   sizeof(*net->ipv6.ip6_null_entry),
   6480					   GFP_KERNEL);
   6481	if (!net->ipv6.ip6_null_entry)
   6482		goto out_fib6_null_entry;
   6483	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
   6484	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
   6485			 ip6_template_metrics, true);
   6486	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
   6487
   6488#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6489	net->ipv6.fib6_has_custom_rules = false;
   6490	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
   6491					       sizeof(*net->ipv6.ip6_prohibit_entry),
   6492					       GFP_KERNEL);
   6493	if (!net->ipv6.ip6_prohibit_entry)
   6494		goto out_ip6_null_entry;
   6495	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
   6496	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
   6497			 ip6_template_metrics, true);
   6498	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
   6499
   6500	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
   6501					       sizeof(*net->ipv6.ip6_blk_hole_entry),
   6502					       GFP_KERNEL);
   6503	if (!net->ipv6.ip6_blk_hole_entry)
   6504		goto out_ip6_prohibit_entry;
   6505	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
   6506	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
   6507			 ip6_template_metrics, true);
   6508	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
   6509#ifdef CONFIG_IPV6_SUBTREES
   6510	net->ipv6.fib6_routes_require_src = 0;
   6511#endif
   6512#endif
   6513
   6514	net->ipv6.sysctl.flush_delay = 0;
   6515	net->ipv6.sysctl.ip6_rt_max_size = 4096;
   6516	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
   6517	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
   6518	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
   6519	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
   6520	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
   6521	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
   6522	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
   6523
   6524	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
   6525
   6526	ret = 0;
   6527out:
   6528	return ret;
   6529
   6530#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6531out_ip6_prohibit_entry:
   6532	kfree(net->ipv6.ip6_prohibit_entry);
   6533out_ip6_null_entry:
   6534	kfree(net->ipv6.ip6_null_entry);
   6535#endif
   6536out_fib6_null_entry:
   6537	kfree(net->ipv6.fib6_null_entry);
   6538out_ip6_dst_entries:
   6539	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
   6540out_ip6_dst_ops:
   6541	goto out;
   6542}
   6543
   6544static void __net_exit ip6_route_net_exit(struct net *net)
   6545{
   6546	kfree(net->ipv6.fib6_null_entry);
   6547	kfree(net->ipv6.ip6_null_entry);
   6548#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6549	kfree(net->ipv6.ip6_prohibit_entry);
   6550	kfree(net->ipv6.ip6_blk_hole_entry);
   6551#endif
   6552	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
   6553}
   6554
   6555static int __net_init ip6_route_net_init_late(struct net *net)
   6556{
   6557#ifdef CONFIG_PROC_FS
   6558	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
   6559			sizeof(struct ipv6_route_iter));
   6560	proc_create_net_single("rt6_stats", 0444, net->proc_net,
   6561			rt6_stats_seq_show, NULL);
   6562#endif
   6563	return 0;
   6564}
   6565
   6566static void __net_exit ip6_route_net_exit_late(struct net *net)
   6567{
   6568#ifdef CONFIG_PROC_FS
   6569	remove_proc_entry("ipv6_route", net->proc_net);
   6570	remove_proc_entry("rt6_stats", net->proc_net);
   6571#endif
   6572}
   6573
   6574static struct pernet_operations ip6_route_net_ops = {
   6575	.init = ip6_route_net_init,
   6576	.exit = ip6_route_net_exit,
   6577};
   6578
   6579static int __net_init ipv6_inetpeer_init(struct net *net)
   6580{
   6581	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
   6582
   6583	if (!bp)
   6584		return -ENOMEM;
   6585	inet_peer_base_init(bp);
   6586	net->ipv6.peers = bp;
   6587	return 0;
   6588}
   6589
   6590static void __net_exit ipv6_inetpeer_exit(struct net *net)
   6591{
   6592	struct inet_peer_base *bp = net->ipv6.peers;
   6593
   6594	net->ipv6.peers = NULL;
   6595	inetpeer_invalidate_tree(bp);
   6596	kfree(bp);
   6597}
   6598
   6599static struct pernet_operations ipv6_inetpeer_ops = {
   6600	.init	=	ipv6_inetpeer_init,
   6601	.exit	=	ipv6_inetpeer_exit,
   6602};
   6603
   6604static struct pernet_operations ip6_route_net_late_ops = {
   6605	.init = ip6_route_net_init_late,
   6606	.exit = ip6_route_net_exit_late,
   6607};
   6608
   6609static struct notifier_block ip6_route_dev_notifier = {
   6610	.notifier_call = ip6_route_dev_notify,
   6611	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
   6612};
   6613
   6614void __init ip6_route_init_special_entries(void)
   6615{
   6616	/* The loopback device is registered before this code runs, so the
   6617	 * loopback reference in rt6_info is not taken automatically; take
   6618	 * it manually for init_net. */
   6619	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
   6620	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
   6621	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   6622#ifdef CONFIG_IPV6_MULTIPLE_TABLES
   6623	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
   6624	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   6625	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
   6626	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   6627#endif
   6628}
   6629
   6630#if IS_BUILTIN(CONFIG_IPV6)
   6631#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   6632DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
   6633
   6634BTF_ID_LIST(btf_fib6_info_id)
   6635BTF_ID(struct, fib6_info)
   6636
   6637static const struct bpf_iter_seq_info ipv6_route_seq_info = {
   6638	.seq_ops		= &ipv6_route_seq_ops,
   6639	.init_seq_private	= bpf_iter_init_seq_net,
   6640	.fini_seq_private	= bpf_iter_fini_seq_net,
   6641	.seq_priv_size		= sizeof(struct ipv6_route_iter),
   6642};
   6643
   6644static struct bpf_iter_reg ipv6_route_reg_info = {
   6645	.target			= "ipv6_route",
   6646	.ctx_arg_info_size	= 1,
   6647	.ctx_arg_info		= {
   6648		{ offsetof(struct bpf_iter__ipv6_route, rt),
   6649		  PTR_TO_BTF_ID_OR_NULL },
   6650	},
   6651	.seq_info		= &ipv6_route_seq_info,
   6652};
   6653
   6654static int __init bpf_iter_register(void)
   6655{
   6656	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
   6657	return bpf_iter_reg_target(&ipv6_route_reg_info);
   6658}
   6659
   6660static void bpf_iter_unregister(void)
   6661{
   6662	bpf_iter_unreg_target(&ipv6_route_reg_info);
   6663}
   6664#endif
   6665#endif
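
       /*
        * Usage sketch of the "ipv6_route" BPF iterator target registered
        * above (modeled on the in-tree selftest
        * tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c):
        *
        *   SEC("iter/ipv6_route")
        *   int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
        *   {
        *           struct fib6_info *rt = ctx->rt;
        *
        *           if (rt)
        *                   BPF_SEQ_PRINTF(ctx->meta->seq, "%pi6 %02x\n",
        *                                  &rt->fib6_dst.addr,
        *                                  rt->fib6_dst.plen);
        *           return 0;
        *   }
        *
        * Once loaded and pinned, reading the pinned file walks every
        * fib6_info in the current netns.
        */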
   6666
   6667int __init ip6_route_init(void)
   6668{
   6669	int ret;
   6670	int cpu;
   6671
   6672	ret = -ENOMEM;
   6673	ip6_dst_ops_template.kmem_cachep =
   6674		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
   6675				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
   6676	if (!ip6_dst_ops_template.kmem_cachep)
   6677		goto out;
   6678
   6679	ret = dst_entries_init(&ip6_dst_blackhole_ops);
   6680	if (ret)
   6681		goto out_kmem_cache;
   6682
   6683	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
   6684	if (ret)
   6685		goto out_dst_entries;
   6686
   6687	ret = register_pernet_subsys(&ip6_route_net_ops);
   6688	if (ret)
   6689		goto out_register_inetpeer;
   6690
   6691	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
   6692
   6693	ret = fib6_init();
   6694	if (ret)
   6695		goto out_register_subsys;
   6696
   6697	ret = xfrm6_init();
   6698	if (ret)
   6699		goto out_fib6_init;
   6700
   6701	ret = fib6_rules_init();
   6702	if (ret)
   6703		goto xfrm6_init;
   6704
   6705	ret = register_pernet_subsys(&ip6_route_net_late_ops);
   6706	if (ret)
   6707		goto fib6_rules_init;
   6708
   6709	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
   6710				   inet6_rtm_newroute, NULL, 0);
   6711	if (ret < 0)
   6712		goto out_register_late_subsys;
   6713
   6714	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
   6715				   inet6_rtm_delroute, NULL, 0);
   6716	if (ret < 0)
   6717		goto out_register_late_subsys;
   6718
   6719	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
   6720				   inet6_rtm_getroute, NULL,
   6721				   RTNL_FLAG_DOIT_UNLOCKED);
   6722	if (ret < 0)
   6723		goto out_register_late_subsys;
   6724
   6725	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
   6726	if (ret)
   6727		goto out_register_late_subsys;
   6728
   6729#if IS_BUILTIN(CONFIG_IPV6)
   6730#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   6731	ret = bpf_iter_register();
   6732	if (ret)
   6733		goto out_register_late_subsys;
   6734#endif
   6735#endif
   6736
   6737	for_each_possible_cpu(cpu) {
   6738		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
   6739
   6740		INIT_LIST_HEAD(&ul->head);
   6741		INIT_LIST_HEAD(&ul->quarantine);
   6742		spin_lock_init(&ul->lock);
   6743	}
   6744
   6745out:
   6746	return ret;
   6747
   6748out_register_late_subsys:
   6749	rtnl_unregister_all(PF_INET6);
   6750	unregister_pernet_subsys(&ip6_route_net_late_ops);
   6751fib6_rules_init:
   6752	fib6_rules_cleanup();
   6753xfrm6_init:
   6754	xfrm6_fini();
   6755out_fib6_init:
   6756	fib6_gc_cleanup();
   6757out_register_subsys:
   6758	unregister_pernet_subsys(&ip6_route_net_ops);
   6759out_register_inetpeer:
   6760	unregister_pernet_subsys(&ipv6_inetpeer_ops);
   6761out_dst_entries:
   6762	dst_entries_destroy(&ip6_dst_blackhole_ops);
   6763out_kmem_cache:
   6764	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
   6765	goto out;
   6766}
   6767
   6768void ip6_route_cleanup(void)
   6769{
   6770#if IS_BUILTIN(CONFIG_IPV6)
   6771#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
   6772	bpf_iter_unregister();
   6773#endif
   6774#endif
   6775	unregister_netdevice_notifier(&ip6_route_dev_notifier);
   6776	unregister_pernet_subsys(&ip6_route_net_late_ops);
   6777	fib6_rules_cleanup();
   6778	xfrm6_fini();
   6779	fib6_gc_cleanup();
   6780	unregister_pernet_subsys(&ipv6_inetpeer_ops);
   6781	unregister_pernet_subsys(&ip6_route_net_ops);
   6782	dst_entries_destroy(&ip6_dst_blackhole_ops);
   6783	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
   6784}