cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

nexthop.c (93061B)


// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */

static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo);

#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

static const struct nla_policy rtm_nh_policy_new[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_GROUP]		= { .type = NLA_BINARY },
	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GATEWAY]		= { .type = NLA_BINARY },
	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[NHA_ENCAP]		= { .type = NLA_NESTED },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_policy_get[] = {
	[NHA_ID]		= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump[] = {
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GROUPS]		= { .type = NLA_FLAG },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_FDB]		= { .type = NLA_FLAG },
};

static const struct nla_policy rtm_nh_res_policy_new[] = {
	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_get_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
};

static bool nexthop_notifiers_is_empty(struct net *net)
{
	return !net->nexthop.notifier_chain.head;
}

static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
			       const struct nh_info *nhi)
{
	nh_info->dev = nhi->fib_nhc.nhc_dev;
	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
	if (nh_info->gw_family == AF_INET)
		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
	else if (nh_info->gw_family == AF_INET6)
		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

	nh_info->is_reject = nhi->reject_nh;
	nh_info->is_fdb = nhi->fdb_nh;
	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}

static int nh_notifier_single_info_init(struct nh_notifier_info *info,
					const struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);

	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
	info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
	if (!info->nh)
		return -ENOMEM;

	__nh_notifier_single_info_init(info->nh, nhi);

	return 0;
}

static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh);
}

static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
				       struct nh_group *nhg)
{
	u16 num_nh = nhg->num_nh;
	int i;

	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
	info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
			       GFP_KERNEL);
	if (!info->nh_grp)
		return -ENOMEM;

	info->nh_grp->num_nh = num_nh;
	info->nh_grp->is_fdb = nhg->fdb_nh;

	for (i = 0; i < num_nh; i++) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		nhi = rtnl_dereference(nhge->nh->nh_info);
		info->nh_grp->nh_entries[i].id = nhge->nh->id;
		info->nh_grp->nh_entries[i].weight = nhge->weight;
		__nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
					       nhi);
	}

	return 0;
}

static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
					   struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	u16 num_nh_buckets = res_table->num_nh_buckets;
	unsigned long size;
	u16 i;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
	size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
	info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
				       __GFP_NOWARN);
	if (!info->nh_res_table)
		return -ENOMEM;

	info->nh_res_table->num_nh_buckets = num_nh_buckets;

	for (i = 0; i < num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		struct nh_grp_entry *nhge;
		struct nh_info *nhi;

		nhge = rtnl_dereference(bucket->nh_entry);
		nhi = rtnl_dereference(nhge->nh->nh_info);
		__nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
					       nhi);
	}

	return 0;
}

static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
				     const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		return nh_notifier_mpath_info_init(info, nhg);
	else if (nhg->resilient)
		return nh_notifier_res_table_info_init(info, nhg);
	return -EINVAL;
}

static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
				      const struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

	if (nhg->hash_threshold)
		kfree(info->nh_grp);
	else if (nhg->resilient)
		vfree(info->nh_res_table);
}

static int nh_notifier_info_init(struct nh_notifier_info *info,
				 const struct nexthop *nh)
{
	info->id = nh->id;

	if (nh->is_group)
		return nh_notifier_grp_info_init(info, nh);
	else
		return nh_notifier_single_info_init(info, nh);
}

static void nh_notifier_info_fini(struct nh_notifier_info *info,
				  const struct nexthop *nh)
{
	if (nh->is_group)
		nh_notifier_grp_info_fini(info, nh);
	else
		nh_notifier_single_info_fini(info);
}

static int call_nexthop_notifiers(struct net *net,
				  enum nexthop_event_type event_type,
				  struct nexthop *nh,
				  struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_info_init(&info, nh);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}

static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
				      bool force, unsigned int *p_idle_timer_ms)
{
	struct nh_res_table *res_table;
	struct nh_group *nhg;
	struct nexthop *nh;
	int err = 0;

	/* When 'force' is false, nexthop bucket replacement is performed
	 * because the bucket was deemed to be idle. In this case, capable
	 * listeners can choose to perform an atomic replacement: The bucket is
	 * only replaced if it is inactive. However, if the idle timer interval
	 * is smaller than the interval in which a listener is querying
	 * buckets' activity from the device, then atomic replacement should
	 * not be tried. Pass the idle timer value to listeners, so that they
	 * could determine which type of replacement to perform.
	 */
	if (force) {
		*p_idle_timer_ms = 0;
		return 0;
	}

	rcu_read_lock();

	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh) {
		err = -EINVAL;
		goto out;
	}

	nhg = rcu_dereference(nh->nh_grp);
	res_table = rcu_dereference(nhg->res_table);
	*p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);

out:
	rcu_read_unlock();

	return err;
}

static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
					    u16 bucket_index, bool force,
					    struct nh_info *oldi,
					    struct nh_info *newi)
{
	unsigned int idle_timer_ms;
	int err;

	err = nh_notifier_res_bucket_idle_timer_get(info, force,
						    &idle_timer_ms);
	if (err)
		return err;

	info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
	info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
				      GFP_KERNEL);
	if (!info->nh_res_bucket)
		return -ENOMEM;

	info->nh_res_bucket->bucket_index = bucket_index;
	info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
	info->nh_res_bucket->force = force;
	__nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
	__nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
	return 0;
}

static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh_res_bucket);
}

static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					       u16 bucket_index, bool force,
					       struct nh_info *oldi,
					       struct nh_info *newi,
					       struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
		.id = nhg_id,
	};
	int err;

	if (nexthop_notifiers_is_empty(net))
		return 0;

	err = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
					       oldi, newi);
	if (err)
		return err;

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_BUCKET_REPLACE, &info);
	nh_notifier_res_bucket_info_fini(&info);

	return notifier_to_errno(err);
}

/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))

static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
					     u16 bucket_index, bool force,
					     struct nexthop *old_nh,
					     struct nexthop *new_nh,
					     struct netlink_ext_ack *extack)
{
	struct nh_info *oldi = nh_res_dereference(old_nh->nh_info);
	struct nh_info *newi = nh_res_dereference(new_nh->nh_info);

	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
						   force, oldi, newi, extack);
}

static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
					    struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	struct nh_group *nhg;
	int err;

	ASSERT_RTNL();

	if (nexthop_notifiers_is_empty(net))
		return 0;

	/* At this point, the nexthop buckets are still not populated. Only
	 * emit a notification with the logical nexthops, so that a listener
	 * could potentially veto it in case of unsupported configuration.
	 */
	nhg = rtnl_dereference(nh->nh_grp);
	err = nh_notifier_mpath_info_init(&info, nhg);
	if (err) {
		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
		return err;
	}

	err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
					   NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
					   &info);
	kfree(info.nh_grp);

	return notifier_to_errno(err);
}

static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
				 enum nexthop_event_type event_type,
				 struct nexthop *nh,
				 struct netlink_ext_ack *extack)
{
	struct nh_notifier_info info = {
		.net = net,
		.extack = extack,
	};
	int err;

	err = nh_notifier_info_init(&info, nh);
	if (err)
		return err;

	err = nb->notifier_call(nb, event_type, &info);
	nh_notifier_info_fini(&info, nh);

	return notifier_to_errno(err);
}

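/* XOR-fold the ifindex down to NH_DEV_HASHBITS bits for the device hash. */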
static unsigned int nh_dev_hashfn(unsigned int val)
{
	unsigned int mask = NH_DEV_HASHSIZE - 1;

	return (val ^
		(val >> NH_DEV_HASHBITS) ^
		(val >> (NH_DEV_HASHBITS * 2))) & mask;
}

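/* Index nh_info by its device; presumably so that device events can find
 * all nexthops using that device.
 */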
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
	struct net_device *dev = nhi->fib_nhc.nhc_dev;
	struct hlist_head *head;
	unsigned int hash;

	WARN_ON(!dev);

	hash = nh_dev_hashfn(dev->ifindex);
	head = &net->nexthop.devhash[hash];
	hlist_add_head(&nhi->dev_hash, head);
}

static void nexthop_free_group(struct nexthop *nh)
{
	struct nh_group *nhg;
	int i;

	nhg = rcu_dereference_raw(nh->nh_grp);
	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		WARN_ON(!list_empty(&nhge->nh_list));
		nexthop_put(nhge->nh);
	}

	WARN_ON(nhg->spare == nhg);

	if (nhg->resilient)
		vfree(rcu_dereference_raw(nhg->res_table));

	kfree(nhg->spare);
	kfree(nhg);
}

static void nexthop_free_single(struct nexthop *nh)
{
	struct nh_info *nhi;

	nhi = rcu_dereference_raw(nh->nh_info);
	switch (nhi->family) {
	case AF_INET:
		fib_nh_release(nh->net, &nhi->fib_nh);
		break;
	case AF_INET6:
		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
		break;
	}
	kfree(nhi);
}

void nexthop_free_rcu(struct rcu_head *head)
{
	struct nexthop *nh = container_of(head, struct nexthop, rcu);

	if (nh->is_group)
		nexthop_free_group(nh);
	else
		nexthop_free_single(nh);

	kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);

static struct nexthop *nexthop_alloc(void)
{
	struct nexthop *nh;

	nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
	if (nh) {
		INIT_LIST_HEAD(&nh->fi_list);
		INIT_LIST_HEAD(&nh->f6i_list);
		INIT_LIST_HEAD(&nh->grp_list);
		INIT_LIST_HEAD(&nh->fdb_list);
	}
	return nh;
}

static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
	struct nh_group *nhg;

	nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
	if (nhg)
		nhg->num_nh = num_nh;

	return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);

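/* The bucket array scales with the configured bucket count and can be
 * large, hence __vmalloc() rather than kmalloc().
 */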
static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
	const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets;
	struct nh_res_table *res_table;
	unsigned long size;

	size = struct_size(res_table, nh_buckets, num_nh_buckets);
	res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
	if (!res_table)
		return NULL;

	res_table->net = net;
	res_table->nhg_id = nhg_id;
	INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
	INIT_LIST_HEAD(&res_table->uw_nh_entries);
	res_table->idle_timer = cfg->nh_grp_res_idle_timer;
	res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
	res_table->num_nh_buckets = num_nh_buckets;
	return res_table;
}

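/* Bump the nexthop dump sequence number, never landing on 0. */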
static void nh_base_seq_inc(struct net *net)
{
	while (++net->nexthop.seq == 0)
		;
}

/* no reference taken; rcu lock or rtnl must be held */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
	struct rb_node **pp, *parent = NULL, *next;

	pp = &net->nexthop.rb_root.rb_node;
	while (1) {
		struct nexthop *nh;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;

		nh = rb_entry(parent, struct nexthop, rb_node);
		if (id < nh->id)
			pp = &next->rb_left;
		else if (id > nh->id)
			pp = &next->rb_right;
		else
			return nh;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);

/* used for auto id allocation; called with rtnl held */
static u32 nh_find_unused_id(struct net *net)
{
	u32 id_start = net->nexthop.last_id_allocated;

	while (1) {
		net->nexthop.last_id_allocated++;
		if (net->nexthop.last_id_allocated == id_start)
			break;

		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
			return net->nexthop.last_id_allocated;
	}
	return 0;
}

static void nh_res_time_set_deadline(unsigned long next_time,
				     unsigned long *deadline)
{
	if (time_before(next_time, *deadline))
		*deadline = next_time;
}

static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
	if (list_empty(&res_table->uw_nh_entries))
		return 0;
	return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since);
}

static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
	struct nlattr *nest;

	nest = nla_nest_start(skb, NHA_RES_GROUP);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
			res_table->num_nh_buckets) ||
	    nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
			jiffies_to_clock_t(res_table->idle_timer)) ||
	    nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
			jiffies_to_clock_t(res_table->unbalanced_timer)) ||
	    nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
			      nh_res_table_unbalanced_time(res_table),
			      NHA_RES_GROUP_PAD))
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
{
	struct nexthop_grp *p;
	size_t len = nhg->num_nh * sizeof(*p);
	struct nlattr *nla;
	u16 group_type = 0;
	int i;

	if (nhg->hash_threshold)
		group_type = NEXTHOP_GRP_TYPE_MPATH;
	else if (nhg->resilient)
		group_type = NEXTHOP_GRP_TYPE_RES;

	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
		goto nla_put_failure;

	nla = nla_reserve(skb, NHA_GROUP, len);
	if (!nla)
		goto nla_put_failure;

	p = nla_data(nla);
	for (i = 0; i < nhg->num_nh; ++i) {
		p->id = nhg->nh_entries[i].nh->id;
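		/* The wire format carries weight - 1 as a u8. */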
		p->weight = nhg->nh_entries[i].weight - 1;
		p += 1;
	}

	if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
			int event, u32 portid, u32 seq, unsigned int nlflags)
{
	struct fib6_nh *fib6_nh;
	struct fib_nh *fib_nh;
	struct nlmsghdr *nlh;
	struct nh_info *nhi;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = nh->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
		if (nla_put_nh_group(skb, nhg))
			goto nla_put_failure;
		goto out;
	}

	nhi = rtnl_dereference(nh->nh_info);
	nhm->nh_family = nhi->family;
	if (nhi->reject_nh) {
		if (nla_put_flag(skb, NHA_BLACKHOLE))
			goto nla_put_failure;
		goto out;
	} else if (nhi->fdb_nh) {
		if (nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
	} else {
		const struct net_device *dev;

		dev = nhi->fib_nhc.nhc_dev;
		if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
			goto nla_put_failure;
	}

	nhm->nh_scope = nhi->fib_nhc.nhc_scope;
	switch (nhi->family) {
	case AF_INET:
		fib_nh = &nhi->fib_nh;
		if (fib_nh->fib_nh_gw_family &&
		    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
			goto nla_put_failure;
		break;

	case AF_INET6:
		fib6_nh = &nhi->fib6_nh;
		if (fib6_nh->fib_nh_gw_family &&
		    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
			goto nla_put_failure;
		break;
	}

	if (nhi->fib_nhc.nhc_lwtstate &&
	    lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
				NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
		goto nla_put_failure;

out:
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
{
	return nla_total_size(0) +	/* NHA_RES_GROUP */
		nla_total_size(2) +	/* NHA_RES_GROUP_BUCKETS */
		nla_total_size(4) +	/* NHA_RES_GROUP_IDLE_TIMER */
		nla_total_size(4) +	/* NHA_RES_GROUP_UNBALANCED_TIMER */
		nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */
}

static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
	size_t tot = nla_total_size(sz) +
		nla_total_size(2); /* NHA_GROUP_TYPE */

	if (nhg->resilient)
		tot += nh_nlmsg_size_grp_res(nhg);

	return tot;
}

static size_t nh_nlmsg_size_single(struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
	size_t sz;

	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
	 * are mutually exclusive
	 */
	sz = nla_total_size(4);  /* NHA_OIF */

	switch (nhi->family) {
	case AF_INET:
		if (nhi->fib_nh.fib_nh_gw_family)
			sz += nla_total_size(4);  /* NHA_GATEWAY */
		break;

	case AF_INET6:
		/* NHA_GATEWAY */
		if (nhi->fib6_nh.fib_nh_gw_family)
			sz += nla_total_size(sizeof(const struct in6_addr));
		break;
	}

	if (nhi->fib_nhc.nhc_lwtstate) {
		sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
		sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
	}

	return sz;
}

static size_t nh_nlmsg_size(struct nexthop *nh)
{
	size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));

	sz += nla_total_size(4); /* NHA_ID */

	if (nh->is_group)
		sz += nh_nlmsg_size_grp(nh);
	else
		sz += nh_nlmsg_size_single(nh);

	return sz;
}

static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
{
	unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
	if (!skb)
		goto errout;

	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}

static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
{
	return (unsigned long)atomic_long_read(&bucket->used_time);
}

static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
			 const struct nh_res_bucket *bucket,
			 unsigned long now)
{
	unsigned long time = nh_res_bucket_used_time(bucket);

	/* Bucket was not used since it was migrated. The idle time is now. */
	if (time == bucket->migrated_time)
		return now;

	return time + res_table->idle_timer;
}

static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
{
	return res_table->unbalanced_since + res_table->unbalanced_timer;
}

static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
				   struct nh_res_bucket *bucket)
{
	unsigned long now = jiffies;

	atomic_long_set(&bucket->used_time, (long)now);
	bucket->migrated_time = now;
}

static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
{
	atomic_long_set(&bucket->used_time, (long)jiffies);
}

static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
{
	unsigned long used_time = nh_res_bucket_used_time(bucket);

	return jiffies_delta_to_clock_t(jiffies - used_time);
}

static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
			      struct nh_res_bucket *bucket, u16 bucket_index,
			      int event, u32 portid, u32 seq,
			      unsigned int nlflags,
			      struct netlink_ext_ack *extack)
{
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nlmsghdr *nlh;
	struct nlattr *nest;
	struct nhmsg *nhm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
	if (!nlh)
		return -EMSGSIZE;

	nhm = nlmsg_data(nlh);
	nhm->nh_family = AF_UNSPEC;
	nhm->nh_flags = bucket->nh_flags;
	nhm->nh_protocol = nh->protocol;
	nhm->nh_scope = 0;
	nhm->resvd = 0;

	if (nla_put_u32(skb, NHA_ID, nh->id))
		goto nla_put_failure;

	nest = nla_nest_start(skb, NHA_RES_BUCKET);
	if (!nest)
		goto nla_put_failure;

	if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
	    nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
	    nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
			      nh_res_bucket_idle_time(bucket),
			      NHA_RES_BUCKET_PAD))
		goto nla_put_failure_nest;

	nla_nest_end(skb, nest);
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure_nest:
	nla_nest_cancel(skb, nest);
nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static void nexthop_bucket_notify(struct nh_res_table *res_table,
				  u16 bucket_index)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
	struct nexthop *nh = nhge->nh_parent;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		goto errout;

	err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
				 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
				 NULL);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}

static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
			   bool *is_fdb, struct netlink_ext_ack *extack)
{
	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

		/* Nesting groups within groups is not supported. */
		if (nhg->hash_threshold) {
			NL_SET_ERR_MSG(extack,
				       "Hash-threshold group can not be a nexthop within a group");
			return false;
		}
		if (nhg->resilient) {
			NL_SET_ERR_MSG(extack,
				       "Resilient group can not be a nexthop within a group");
			return false;
		}
		*is_fdb = nhg->fdb_nh;
	} else {
		struct nh_info *nhi = rtnl_dereference(nh->nh_info);

		if (nhi->reject_nh && npaths > 1) {
			NL_SET_ERR_MSG(extack,
				       "Blackhole nexthop can not be used in a group with more than 1 path");
			return false;
		}
		*is_fdb = nhi->fdb_nh;
	}

	return true;
}

static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
				   struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;

	nhi = rtnl_dereference(nh->nh_info);

	if (!nhi->fdb_nh) {
		NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
		return -EINVAL;
	}

	if (*nh_family == AF_UNSPEC) {
		*nh_family = nhi->family;
	} else if (*nh_family != nhi->family) {
		NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
		return -EINVAL;
	}

	return 0;
}

static int nh_check_attr_group(struct net *net,
			       struct nlattr *tb[], size_t tb_size,
			       u16 nh_grp_type, struct netlink_ext_ack *extack)
{
	unsigned int len = nla_len(tb[NHA_GROUP]);
	u8 nh_family = AF_UNSPEC;
	struct nexthop_grp *nhg;
	unsigned int i, j;
	u8 nhg_fdb = 0;

	if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid length for nexthop group attribute");
		return -EINVAL;
	}

	/* convert len to number of nexthop ids */
	len /= sizeof(*nhg);

	nhg = nla_data(tb[NHA_GROUP]);
	for (i = 0; i < len; ++i) {
		if (nhg[i].resvd1 || nhg[i].resvd2) {
			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
			return -EINVAL;
		}
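		/* Weight is a u8 on the wire, presumably biased by one
		 * internally (the dump path above emits weight - 1), hence
		 * the cap at 254.
		 */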
		if (nhg[i].weight > 254) {
			NL_SET_ERR_MSG(extack, "Invalid value for weight");
			return -EINVAL;
		}
		for (j = i + 1; j < len; ++j) {
			if (nhg[i].id == nhg[j].id) {
				NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
				return -EINVAL;
			}
		}
	}

	if (tb[NHA_FDB])
		nhg_fdb = 1;
	nhg = nla_data(tb[NHA_GROUP]);
	for (i = 0; i < len; ++i) {
		struct nexthop *nh;
		bool is_fdb_nh;

		nh = nexthop_find_by_id(net, nhg[i].id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
			return -EINVAL;
		}
		if (!valid_group_nh(nh, len, &is_fdb_nh, extack))
			return -EINVAL;

		if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
			return -EINVAL;

		if (!nhg_fdb && is_fdb_nh) {
			NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
			return -EINVAL;
		}
	}
	for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
		if (!tb[i])
			continue;
		switch (i) {
		case NHA_FDB:
			continue;
		case NHA_RES_GROUP:
			if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
				continue;
			break;
		}
		NL_SET_ERR_MSG(extack,
			       "No other attributes can be set in nexthop groups");
		return -EINVAL;
	}

	return 0;
}

static bool ipv6_good_nh(const struct fib6_nh *nh)
{
	int state = NUD_REACHABLE;
	struct neighbour *n;

	rcu_read_lock_bh();

	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
	if (n)
		state = n->nud_state;

	rcu_read_unlock_bh();

	return !!(state & NUD_VALID);
}

static bool ipv4_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;
	struct neighbour *n;

	rcu_read_lock_bh();

	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
				      (__force u32)nh->fib_nh_gw4);
	if (n)
		state = n->nud_state;

	rcu_read_unlock_bh();

	return !!(state & NUD_VALID);
}

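/* Hash-threshold (RFC 2992) selection: walk the entries in order and pick
 * the first one whose upper bound covers @hash, preferring a nexthop whose
 * neighbour is in a valid state.
 */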
static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
{
	struct nexthop *rc = NULL;
	int i;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		struct nh_info *nhi;

		if (hash > atomic_read(&nhge->hthr.upper_bound))
			continue;

		nhi = rcu_dereference(nhge->nh->nh_info);
		if (nhi->fdb_nh)
			return nhge->nh;

		/* Nexthop groups always check whether a nexthop is good and
		 * do not rely on a sysctl for this behavior.
		 */
		switch (nhi->family) {
		case AF_INET:
			if (ipv4_good_nh(&nhi->fib_nh))
				return nhge->nh;
			break;
		case AF_INET6:
			if (ipv6_good_nh(&nhi->fib6_nh))
				return nhge->nh;
			break;
		}

		if (!rc)
			rc = nhge->nh;
	}

	return rc;
}

static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
{
	struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
	u16 bucket_index = hash % res_table->num_nh_buckets;
	struct nh_res_bucket *bucket;
	struct nh_grp_entry *nhge;

	/* nexthop_select_path() is expected to return a non-NULL value, so
	 * skip protocol validation and just hand out whatever there is.
	 */
	bucket = &res_table->nh_buckets[bucket_index];
	nh_res_bucket_set_busy(bucket);
	nhge = rcu_dereference(bucket->nh_entry);
	return nhge->nh;
}

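/* Resolve a nexthop to the concrete path to use for @hash; for groups this
 * dispatches on the group type. Runs in the data path under RCU.
 */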
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
	struct nh_group *nhg;

	if (!nh->is_group)
		return nh;

	nhg = rcu_dereference(nh->nh_grp);
	if (nhg->hash_threshold)
		return nexthop_select_path_hthr(nhg, hash);
	else if (nhg->resilient)
		return nexthop_select_path_res(nhg, hash);

	/* Unreachable. */
	return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_select_path);

int nexthop_for_each_fib6_nh(struct nexthop *nh,
			     int (*cb)(struct fib6_nh *nh, void *arg),
			     void *arg)
{
	struct nh_info *nhi;
	int err;

	if (nh->is_group) {
		struct nh_group *nhg;
		int i;

		nhg = rcu_dereference_rtnl(nh->nh_grp);
		for (i = 0; i < nhg->num_nh; i++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[i];

			nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
			err = cb(&nhi->fib6_nh, arg);
			if (err)
				return err;
		}
	} else {
		nhi = rcu_dereference_rtnl(nh->nh_info);
		err = cb(&nhi->fib6_nh, arg);
		if (err)
			return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);

static int check_src_addr(const struct in6_addr *saddr,
			  struct netlink_ext_ack *extack)
{
	if (!ipv6_addr_any(saddr)) {
		NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
		return -EINVAL;
	}
	return 0;
}

int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	bool is_fdb_nh;

	/* fib6_src is unique to a fib6_info and limits the ability to cache
	 * routes in fib6_nh within a nexthop that is potentially shared
	 * across multiple fib entries. If the config wants to use source
	 * routing it can not use nexthop objects. mlxsw also does not allow
	 * fib6_src on routes.
	 */
	if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
		return -EINVAL;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		if (nhg->has_v4)
			goto no_v4_nh;
		is_fdb_nh = nhg->fdb_nh;
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->family == AF_INET)
			goto no_v4_nh;
		is_fdb_nh = nhi->fdb_nh;
	}

	if (is_fdb_nh) {
		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
		return -EINVAL;
	}

	return 0;
no_v4_nh:
	NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);

/* if existing nexthop has ipv6 routes linked to it, need
 * to verify this new spec works with ipv6
 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
			      struct netlink_ext_ack *extack)
{
	struct fib6_info *f6i;

	if (list_empty(&old->f6i_list))
		return 0;

	list_for_each_entry(f6i, &old->f6i_list, nh_list) {
		if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
			return -EINVAL;
	}

	return fib6_check_nexthop(new, NULL, extack);
}

static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
			       struct netlink_ext_ack *extack)
{
	if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
		NL_SET_ERR_MSG(extack,
			       "Route with host scope can not have a gateway");
		return -EINVAL;
	}

	if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
		return -EINVAL;
	}

	return 0;
}

/* Invoked by fib add code to verify nexthop by id is ok with
 * config for prefix; parts of fib_check_nh not done when nexthop
 * object is used.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
		      struct netlink_ext_ack *extack)
{
	struct nh_info *nhi;
	int err = 0;

	if (nh->is_group) {
		struct nh_group *nhg;

		nhg = rtnl_dereference(nh->nh_grp);
		if (nhg->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}

		if (scope == RT_SCOPE_HOST) {
			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
			err = -EINVAL;
			goto out;
		}

		/* all nexthops in a group have the same scope */
		nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
		err = nexthop_check_scope(nhi, scope, extack);
	} else {
		nhi = rtnl_dereference(nh->nh_info);
		if (nhi->fdb_nh) {
			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
			err = -EINVAL;
			goto out;
		}
		err = nexthop_check_scope(nhi, scope, extack);
	}

out:
	return err;
}

static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
			     struct netlink_ext_ack *extack)
{
	struct fib_info *fi;

	list_for_each_entry(fi, &old->fi_list, nh_list) {
		int err;

		err = fib_check_nexthop(new, fi->fib_scope, extack);
		if (err)
			return err;
	}
	return 0;
}

static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets == nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets > nhge->res.wants_buckets;
}

static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
{
	return nhge->res.count_buckets < nhge->res.wants_buckets;
}

static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
{
	return list_empty(&res_table->uw_nh_entries);
}

static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
{
	struct nh_grp_entry *nhge;

	if (bucket->occupied) {
		nhge = nh_res_dereference(bucket->nh_entry);
		nhge->res.count_buckets--;
		bucket->occupied = false;
	}
}

static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
				 struct nh_grp_entry *nhge)
{
	nh_res_bucket_unset_nh(bucket);

	bucket->occupied = true;
	rcu_assign_pointer(bucket->nh_entry, nhge);
	nhge->res.count_buckets++;
}

static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
					 struct nh_res_bucket *bucket,
					 unsigned long *deadline, bool *force)
{
	unsigned long now = jiffies;
	struct nh_grp_entry *nhge;
	unsigned long idle_point;

	if (!bucket->occupied) {
		/* The bucket is not occupied, its NHGE pointer is either
		 * NULL or obsolete. We _have to_ migrate: set force.
		 */
		*force = true;
		return true;
	}

	nhge = nh_res_dereference(bucket->nh_entry);

	/* If the bucket is populated by an underweight or balanced
	 * nexthop, do not migrate.
	 */
	if (!nh_res_nhge_is_ow(nhge))
		return false;

	/* At this point we know that the bucket is populated with an
	 * overweight nexthop. It needs to be migrated to a new nexthop if
	 * the idle timer or the unbalanced timer expired.
	 */

	idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
	if (time_after_eq(now, idle_point)) {
		/* The bucket is idle. We _can_ migrate: unset force. */
		*force = false;
		return true;
	}

	/* Unbalanced timer of 0 means "never force". */
	if (res_table->unbalanced_timer) {
		unsigned long unb_point;

		unb_point = nh_res_table_unb_point(res_table);
		if (time_after(now, unb_point)) {
			/* The bucket is not idle, but the unbalanced timer
			 * expired. We _can_ migrate, but set force anyway,
			 * so that drivers know to ignore activity reports
			 * from the HW.
			 */
			*force = true;
			return true;
		}

		nh_res_time_set_deadline(unb_point, deadline);
	}

	nh_res_time_set_deadline(idle_point, deadline);
	return false;
}

static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
				  u16 bucket_index, bool notify,
				  bool notify_nl, bool force)
{
	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
	struct nh_grp_entry *new_nhge;
	struct netlink_ext_ack extack;
	int err;

	new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
					    struct nh_grp_entry,
					    res.uw_nh_entry);
	if (WARN_ON_ONCE(!new_nhge))
		/* If this function is called, "bucket" is either not
		 * occupied, or it belongs to a next hop that is
		 * overweight. In either case, there ought to be a
		 * corresponding underweight next hop.
		 */
		return false;

	if (notify) {
		struct nh_grp_entry *old_nhge;

		old_nhge = nh_res_dereference(bucket->nh_entry);
		err = call_nexthop_res_bucket_notifiers(res_table->net,
							res_table->nhg_id,
							bucket_index, force,
							old_nhge->nh,
							new_nhge->nh, &extack);
		if (err) {
			pr_err_ratelimited("%s\n", extack._msg);
			if (!force)
				return false;
			/* It is not possible to veto a forced replacement, so
			 * just clear the hardware flags from the nexthop
			 * bucket to indicate to user space that this bucket is
			 * not correctly populated in hardware.
			 */
			bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
		}
	}

	nh_res_bucket_set_nh(bucket, new_nhge);
	nh_res_bucket_set_idle(res_table, bucket);

	if (notify_nl)
		nexthop_bucket_notify(res_table, bucket_index);

	if (nh_res_nhge_is_balanced(new_nhge))
		list_del(&new_nhge->res.uw_nh_entry);
	return true;
}

#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)

static void nh_res_table_upkeep(struct nh_res_table *res_table,
				bool notify, bool notify_nl)
{
	unsigned long now = jiffies;
	unsigned long deadline;
	u16 i;

	/* Deadline is the next time that upkeep should be run. It is the
	 * earliest time at which one of the buckets might be migrated.
	 * Start at the most pessimistic estimate: either unbalanced_timer
	 * from now, or if there is none, idle_timer from now. For each
	 * encountered time point, call nh_res_time_set_deadline() to
	 * refine the estimate.
	 */
	if (res_table->unbalanced_timer)
		deadline = now + res_table->unbalanced_timer;
	else
		deadline = now + res_table->idle_timer;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		bool force;

		if (nh_res_bucket_should_migrate(res_table, bucket,
						 &deadline, &force)) {
			if (!nh_res_bucket_migrate(res_table, i, notify,
						   notify_nl, force)) {
				unsigned long idle_point;

				/* A driver can override the migration
				 * decision if the HW reports that the
				 * bucket is actually not idle. Therefore
				 * remark the bucket as busy again and
				 * update the deadline.
				 */
				nh_res_bucket_set_busy(bucket);
				idle_point = nh_res_bucket_idle_point(res_table,
								      bucket,
								      now);
				nh_res_time_set_deadline(idle_point, &deadline);
			}
		}
	}

	/* If the group is still unbalanced, schedule the next upkeep to
	 * either the deadline computed above, or the minimum deadline,
	 * whichever comes later.
	 */
	if (!nh_res_table_is_balanced(res_table)) {
		unsigned long now = jiffies;
		unsigned long min_deadline;

		min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
		if (time_before(deadline, min_deadline))
			deadline = min_deadline;

		queue_delayed_work(system_power_efficient_wq,
				   &res_table->upkeep_dw, deadline - now);
	}
}

static void nh_res_table_upkeep_dw(struct work_struct *work)
{
	struct delayed_work *dw = to_delayed_work(work);
	struct nh_res_table *res_table;

	res_table = container_of(dw, struct nh_res_table, upkeep_dw);
	nh_res_table_upkeep(res_table, true, true);
}

static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
{
	cancel_delayed_work_sync(&res_table->upkeep_dw);
}

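/* Recompute each entry's bucket quota (wants_buckets) from its relative
 * weight and collect underweight entries on uw_nh_entries so that upkeep
 * can migrate buckets to them.
 */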
static void nh_res_group_rebalance(struct nh_group *nhg,
				   struct nh_res_table *res_table)
{
	int prev_upper_bound = 0;
	int total = 0;
	int w = 0;
	int i;

	INIT_LIST_HEAD(&res_table->uw_nh_entries);

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
						total);
		nhge->res.wants_buckets = upper_bound - prev_upper_bound;
		prev_upper_bound = upper_bound;

		if (nh_res_nhge_is_uw(nhge)) {
			if (list_empty(&res_table->uw_nh_entries))
				res_table->unbalanced_since = jiffies;
			list_add(&nhge->res.uw_nh_entry,
				 &res_table->uw_nh_entries);
		}
	}
}

/* Migrate buckets in res_table so that they reference NHGE's from NHG with
 * the right NH ID. Set those buckets that do not have a corresponding NHGE
 * entry in NHG as not occupied.
 */
static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
					 struct nh_group *nhg)
{
	u16 i;

	for (i = 0; i < res_table->num_nh_buckets; i++) {
		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
		u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
		bool found = false;
		int j;

		for (j = 0; j < nhg->num_nh; j++) {
			struct nh_grp_entry *nhge = &nhg->nh_entries[j];

			if (nhge->nh->id == id) {
				nh_res_bucket_set_nh(bucket, nhge);
				found = true;
				break;
			}
		}

		if (!found)
			nh_res_bucket_unset_nh(bucket);
	}
}

static void replace_nexthop_grp_res(struct nh_group *oldg,
				    struct nh_group *newg)
{
	/* For NH group replacement, the new NHG might only have a stub
	 * hash table with 0 buckets, because the number of buckets was not
	 * specified. For NH removal, oldg and newg both reference the same
	 * res_table. So in any case, in the following, we want to work
	 * with oldg->res_table.
	 */
	struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
	unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
	bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);

	nh_res_table_cancel_upkeep(old_res_table);
	nh_res_table_migrate_buckets(old_res_table, newg);
	nh_res_group_rebalance(newg, old_res_table);
	if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
		old_res_table->unbalanced_since = prev_unbalanced_since;
	nh_res_table_upkeep(old_res_table, true, false);
}

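/* Partition the hash space [0, 1 << 31) into weighted sub-intervals; an
 * entry owns the hashes up to its upper bound (see
 * nexthop_select_path_hthr()).
 */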
static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
	int total = 0;
	int w = 0;
	int i;

	for (i = 0; i < nhg->num_nh; ++i)
		total += nhg->nh_entries[i].weight;

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
		atomic_set(&nhge->hthr.upper_bound, upper_bound);
	}
}

static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
				struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhges, *new_nhges;
	struct nexthop *nhp = nhge->nh_parent;
	struct netlink_ext_ack extack;
	struct nexthop *nh = nhge->nh;
	struct nh_group *nhg, *newg;
	int i, j, err;

	WARN_ON(!nh);

	nhg = rtnl_dereference(nhp->nh_grp);
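	/* Build the reduced group in the preallocated spare and swap it in
	 * with rcu_assign_pointer() so readers always see a consistent group.
	 */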
	newg = nhg->spare;

	/* last entry, keep it visible and remove the parent */
	if (nhg->num_nh == 1) {
		remove_nexthop(net, nhp, nlinfo);
		return;
	}

	newg->has_v4 = false;
	newg->is_multipath = nhg->is_multipath;
	newg->hash_threshold = nhg->hash_threshold;
	newg->resilient = nhg->resilient;
	newg->fdb_nh = nhg->fdb_nh;
	newg->num_nh = nhg->num_nh;

	/* copy old entries to new except the one getting removed */
	nhges = nhg->nh_entries;
	new_nhges = newg->nh_entries;
	for (i = 0, j = 0; i < nhg->num_nh; ++i) {
		struct nh_info *nhi;

		/* current nexthop getting removed */
		if (nhg->nh_entries[i].nh == nh) {
			newg->num_nh--;
			continue;
		}

		nhi = rtnl_dereference(nhges[i].nh->nh_info);
		if (nhi->family == AF_INET)
			newg->has_v4 = true;

		list_del(&nhges[i].nh_list);
		new_nhges[j].nh_parent = nhges[i].nh_parent;
		new_nhges[j].nh = nhges[i].nh;
		new_nhges[j].weight = nhges[i].weight;
		list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
		j++;
	}

	if (newg->hash_threshold)
		nh_hthr_group_rebalance(newg);
	else if (newg->resilient)
		replace_nexthop_grp_res(nhg, newg);

	rcu_assign_pointer(nhp->nh_grp, newg);

	list_del(&nhge->nh_list);
	nexthop_put(nhge->nh);

	/* Removal of a NH from a resilient group is notified through
	 * bucket notifications.
	 */
	if (newg->hash_threshold) {
		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
					     &extack);
		if (err)
			pr_err("%s\n", extack._msg);
	}

	if (nlinfo)
		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}

static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
				       struct nl_info *nlinfo)
{
	struct nh_grp_entry *nhge, *tmp;

	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
		remove_nh_grp_entry(net, nhge, nlinfo);

	/* make sure all see the newly published array before releasing rtnl */
	synchronize_net();
}

static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
{
	struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
	struct nh_res_table *res_table;
	int i, num_nh = nhg->num_nh;

	for (i = 0; i < num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];

		if (WARN_ON(!nhge->nh))
			continue;

		list_del_init(&nhge->nh_list);
	}

	if (nhg->resilient) {
		res_table = rtnl_dereference(nhg->res_table);
		nh_res_table_cancel_upkeep(res_table);
	}
}

/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
	struct fib6_info *f6i, *tmp;
	bool do_flush = false;
	struct fib_info *fi;

	list_for_each_entry(fi, &nh->fi_list, nh_list) {
		fi->fib_flags |= RTNH_F_DEAD;
		do_flush = true;
	}
	if (do_flush)
		fib_flush(net);

	/* ip6_del_rt removes the entry from this list hence the _safe */
	list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
		/* __ip6_del_rt does a release, so do a hold here */
		fib6_info_hold(f6i);
   1860		ipv6_stub->ip6_del_rt(net, f6i,
   1861				      !net->ipv4.sysctl_nexthop_compat_mode);
   1862	}
   1863}
   1864
   1865static void __remove_nexthop(struct net *net, struct nexthop *nh,
   1866			     struct nl_info *nlinfo)
   1867{
   1868	__remove_nexthop_fib(net, nh);
   1869
   1870	if (nh->is_group) {
   1871		remove_nexthop_group(nh, nlinfo);
   1872	} else {
   1873		struct nh_info *nhi;
   1874
   1875		nhi = rtnl_dereference(nh->nh_info);
   1876		if (nhi->fib_nhc.nhc_dev)
   1877			hlist_del(&nhi->dev_hash);
   1878
   1879		remove_nexthop_from_groups(net, nh, nlinfo);
   1880	}
   1881}
   1882
   1883static void remove_nexthop(struct net *net, struct nexthop *nh,
   1884			   struct nl_info *nlinfo)
   1885{
   1886	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
   1887
   1888	/* remove from the tree */
   1889	rb_erase(&nh->rb_node, &net->nexthop.rb_root);
   1890
   1891	if (nlinfo)
   1892		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
   1893
   1894	__remove_nexthop(net, nh, nlinfo);
   1895	nh_base_seq_inc(net);
   1896
   1897	nexthop_put(nh);
   1898}
   1899
   1900/* if any FIB entries reference this nexthop, any dst entries
   1901 * need to be regenerated
   1902 */
   1903static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
   1904			      struct nexthop *replaced_nh)
   1905{
   1906	struct fib6_info *f6i;
   1907	struct nh_group *nhg;
   1908	int i;
   1909
   1910	if (!list_empty(&nh->fi_list))
   1911		rt_cache_flush(net);
   1912
   1913	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
   1914		ipv6_stub->fib6_update_sernum(net, f6i);
   1915
   1916	/* if an IPv6 group was replaced, we have to release all old
   1917	 * dsts to make sure all refcounts are released
   1918	 */
   1919	if (!replaced_nh->is_group)
   1920		return;
   1921
   1922	nhg = rtnl_dereference(replaced_nh->nh_grp);
   1923	for (i = 0; i < nhg->num_nh; i++) {
   1924		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
   1925		struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);
   1926
   1927		if (nhi->family == AF_INET6)
   1928			ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh);
   1929	}
   1930}
   1931
   1932static int replace_nexthop_grp(struct net *net, struct nexthop *old,
   1933			       struct nexthop *new, const struct nh_config *cfg,
   1934			       struct netlink_ext_ack *extack)
   1935{
   1936	struct nh_res_table *tmp_table = NULL;
   1937	struct nh_res_table *new_res_table;
   1938	struct nh_res_table *old_res_table;
   1939	struct nh_group *oldg, *newg;
   1940	int i, err;
   1941
   1942	if (!new->is_group) {
   1943		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
   1944		return -EINVAL;
   1945	}
   1946
   1947	oldg = rtnl_dereference(old->nh_grp);
   1948	newg = rtnl_dereference(new->nh_grp);
   1949
   1950	if (newg->hash_threshold != oldg->hash_threshold) {
   1951		NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
   1952		return -EINVAL;
   1953	}
   1954
   1955	if (newg->hash_threshold) {
   1956		err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
   1957					     extack);
   1958		if (err)
   1959			return err;
   1960	} else if (newg->resilient) {
   1961		new_res_table = rtnl_dereference(newg->res_table);
   1962		old_res_table = rtnl_dereference(oldg->res_table);
   1963
   1964		/* Accept if num_nh_buckets was not given, but if it was
   1965		 * given, demand that the value be correct.
   1966		 */
   1967		if (cfg->nh_grp_res_has_num_buckets &&
   1968		    cfg->nh_grp_res_num_buckets !=
   1969		    old_res_table->num_nh_buckets) {
   1970			NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
   1971			return -EINVAL;
   1972		}
   1973
   1974		/* Emit a pre-replace notification so that listeners could veto
   1975		 * a potentially unsupported configuration. Otherwise,
   1976		 * individual bucket replacement notifications would need to be
   1977		 * vetoed, which is something that should only happen if the
   1978		 * bucket is currently active.
   1979		 */
   1980		err = call_nexthop_res_table_notifiers(net, new, extack);
   1981		if (err)
   1982			return err;
   1983
   1984		if (cfg->nh_grp_res_has_idle_timer)
   1985			old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
   1986		if (cfg->nh_grp_res_has_unbalanced_timer)
   1987			old_res_table->unbalanced_timer =
   1988				cfg->nh_grp_res_unbalanced_timer;
   1989
   1990		replace_nexthop_grp_res(oldg, newg);
   1991
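        		/* Keep the old, fully sized table; the new group's
        		 * (possibly stub) table is parked on oldg below, which
        		 * ends up on 'new' and is released by the caller.
        		 */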
   1992		tmp_table = new_res_table;
   1993		rcu_assign_pointer(newg->res_table, old_res_table);
   1994		rcu_assign_pointer(newg->spare->res_table, old_res_table);
   1995	}
   1996
   1997	/* update parents - used by nexthop code for cleanup */
   1998	for (i = 0; i < newg->num_nh; i++)
   1999		newg->nh_entries[i].nh_parent = old;
   2000
   2001	rcu_assign_pointer(old->nh_grp, newg);
   2002
   2003	/* Make sure concurrent readers are not using 'oldg' anymore. */
   2004	synchronize_net();
   2005
   2006	if (newg->resilient) {
   2007		rcu_assign_pointer(oldg->res_table, tmp_table);
   2008		rcu_assign_pointer(oldg->spare->res_table, tmp_table);
   2009	}
   2010
   2011	for (i = 0; i < oldg->num_nh; i++)
   2012		oldg->nh_entries[i].nh_parent = new;
   2013
   2014	rcu_assign_pointer(new->nh_grp, oldg);
   2015
   2016	return 0;
   2017}
   2018
   2019static void nh_group_v4_update(struct nh_group *nhg)
   2020{
   2021	struct nh_grp_entry *nhges;
   2022	bool has_v4 = false;
   2023	int i;
   2024
   2025	nhges = nhg->nh_entries;
   2026	for (i = 0; i < nhg->num_nh; i++) {
   2027		struct nh_info *nhi;
   2028
   2029		nhi = rtnl_dereference(nhges[i].nh->nh_info);
   2030		if (nhi->family == AF_INET)
   2031			has_v4 = true;
   2032	}
   2033	nhg->has_v4 = has_v4;
   2034}
   2035
   2036static int replace_nexthop_single_notify_res(struct net *net,
   2037					     struct nh_res_table *res_table,
   2038					     struct nexthop *old,
   2039					     struct nh_info *oldi,
   2040					     struct nh_info *newi,
   2041					     struct netlink_ext_ack *extack)
   2042{
   2043	u32 nhg_id = res_table->nhg_id;
   2044	int err;
   2045	u16 i;
   2046
   2047	for (i = 0; i < res_table->num_nh_buckets; i++) {
   2048		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
   2049		struct nh_grp_entry *nhge;
   2050
   2051		nhge = rtnl_dereference(bucket->nh_entry);
   2052		if (nhge->nh == old) {
   2053			err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
   2054								  i, true,
   2055								  oldi, newi,
   2056								  extack);
   2057			if (err)
   2058				goto err_notify;
   2059		}
   2060	}
   2061
   2062	return 0;
   2063
   2064err_notify:
   2065	while (i-- > 0) {
   2066		struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
   2067		struct nh_grp_entry *nhge;
   2068
   2069		nhge = rtnl_dereference(bucket->nh_entry);
   2070		if (nhge->nh == old)
   2071			__call_nexthop_res_bucket_notifiers(net, nhg_id, i,
   2072							    true, newi, oldi,
   2073							    extack);
   2074	}
   2075	return err;
   2076}
   2077
   2078static int replace_nexthop_single_notify(struct net *net,
   2079					 struct nexthop *group_nh,
   2080					 struct nexthop *old,
   2081					 struct nh_info *oldi,
   2082					 struct nh_info *newi,
   2083					 struct netlink_ext_ack *extack)
   2084{
   2085	struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);
   2086	struct nh_res_table *res_table;
   2087
   2088	if (nhg->hash_threshold) {
   2089		return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
   2090					      group_nh, extack);
   2091	} else if (nhg->resilient) {
   2092		res_table = rtnl_dereference(nhg->res_table);
   2093		return replace_nexthop_single_notify_res(net, res_table,
   2094							 old, oldi, newi,
   2095							 extack);
   2096	}
   2097
   2098	return -EINVAL;
   2099}
   2100
   2101static int replace_nexthop_single(struct net *net, struct nexthop *old,
   2102				  struct nexthop *new,
   2103				  struct netlink_ext_ack *extack)
   2104{
   2105	u8 old_protocol, old_nh_flags;
   2106	struct nh_info *oldi, *newi;
   2107	struct nh_grp_entry *nhge;
   2108	int err;
   2109
   2110	if (new->is_group) {
   2111		NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
   2112		return -EINVAL;
   2113	}
   2114
   2115	err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
   2116	if (err)
   2117		return err;
   2118
   2119	/* Hardware flags were set on 'old' as 'new' is not in the red-black
   2120	 * tree. Therefore, inherit the flags from 'old' to 'new'.
   2121	 */
   2122	new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
   2123
   2124	oldi = rtnl_dereference(old->nh_info);
   2125	newi = rtnl_dereference(new->nh_info);
   2126
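        	/* Replace by swapping nh_info (and the identity fields below)
        	 * between the two nexthops; 'new' ends up holding the old
        	 * state and is released by the caller on success.
        	 */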
   2127	newi->nh_parent = old;
   2128	oldi->nh_parent = new;
   2129
   2130	old_protocol = old->protocol;
   2131	old_nh_flags = old->nh_flags;
   2132
   2133	old->protocol = new->protocol;
   2134	old->nh_flags = new->nh_flags;
   2135
   2136	rcu_assign_pointer(old->nh_info, newi);
   2137	rcu_assign_pointer(new->nh_info, oldi);
   2138
   2139	/* Send a replace notification for all the groups using the nexthop. */
   2140	list_for_each_entry(nhge, &old->grp_list, nh_list) {
   2141		struct nexthop *nhp = nhge->nh_parent;
   2142
   2143		err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
   2144						    extack);
   2145		if (err)
   2146			goto err_notify;
   2147	}
   2148
   2149	/* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
   2150	 * update IPv4 indication in all the groups using the nexthop.
   2151	 */
   2152	if (oldi->family == AF_INET && newi->family == AF_INET6) {
   2153		list_for_each_entry(nhge, &old->grp_list, nh_list) {
   2154			struct nexthop *nhp = nhge->nh_parent;
   2155			struct nh_group *nhg;
   2156
   2157			nhg = rtnl_dereference(nhp->nh_grp);
   2158			nh_group_v4_update(nhg);
   2159		}
   2160	}
   2161
   2162	return 0;
   2163
   2164err_notify:
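        	/* Undo the swap in reverse order and send the opposite
        	 * (newi -> oldi) notification to every group that already saw
        	 * the replacement.
        	 */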
   2165	rcu_assign_pointer(new->nh_info, newi);
   2166	rcu_assign_pointer(old->nh_info, oldi);
   2167	old->nh_flags = old_nh_flags;
   2168	old->protocol = old_protocol;
   2169	oldi->nh_parent = old;
   2170	newi->nh_parent = new;
   2171	list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
   2172		struct nexthop *nhp = nhge->nh_parent;
   2173
   2174		replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
   2175	}
   2176	call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
   2177	return err;
   2178}
   2179
   2180static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
   2181				     struct nl_info *info)
   2182{
   2183	struct fib6_info *f6i;
   2184
   2185	if (!list_empty(&nh->fi_list)) {
   2186		struct fib_info *fi;
   2187
   2188		/* expectation is a few fib_info per nexthop and then
   2189		 * a lot of routes per fib_info. So mark the fib_info
   2190		 * and then walk the fib tables once
   2191		 */
   2192		list_for_each_entry(fi, &nh->fi_list, nh_list)
   2193			fi->nh_updated = true;
   2194
   2195		fib_info_notify_update(net, info);
   2196
   2197		list_for_each_entry(fi, &nh->fi_list, nh_list)
   2198			fi->nh_updated = false;
   2199	}
   2200
   2201	list_for_each_entry(f6i, &nh->f6i_list, nh_list)
   2202		ipv6_stub->fib6_rt_update(net, f6i, info);
   2203}
   2204
   2205/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
   2206 * linked to this nexthop and for all groups that the nexthop
   2207 * is a member of
   2208 */
   2209static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
   2210				   struct nl_info *info)
   2211{
   2212	struct nh_grp_entry *nhge;
   2213
   2214	__nexthop_replace_notify(net, nh, info);
   2215
   2216	list_for_each_entry(nhge, &nh->grp_list, nh_list)
   2217		__nexthop_replace_notify(net, nhge->nh_parent, info);
   2218}
   2219
   2220static int replace_nexthop(struct net *net, struct nexthop *old,
   2221			   struct nexthop *new, const struct nh_config *cfg,
   2222			   struct netlink_ext_ack *extack)
   2223{
   2224	bool new_is_reject = false;
   2225	struct nh_grp_entry *nhge;
   2226	int err;
   2227
   2228	/* check that existing FIB entries are ok with the
   2229	 * new nexthop definition
   2230	 */
   2231	err = fib_check_nh_list(old, new, extack);
   2232	if (err)
   2233		return err;
   2234
   2235	err = fib6_check_nh_list(old, new, extack);
   2236	if (err)
   2237		return err;
   2238
   2239	if (!new->is_group) {
   2240		struct nh_info *nhi = rtnl_dereference(new->nh_info);
   2241
   2242		new_is_reject = nhi->reject_nh;
   2243	}
   2244
   2245	list_for_each_entry(nhge, &old->grp_list, nh_list) {
   2246		/* if new nexthop is a blackhole, any groups using this
   2247		 * nexthop cannot have more than 1 path
   2248		 */
   2249		if (new_is_reject &&
   2250		    nexthop_num_path(nhge->nh_parent) > 1) {
   2251			NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
   2252			return -EINVAL;
   2253		}
   2254
   2255		err = fib_check_nh_list(nhge->nh_parent, new, extack);
   2256		if (err)
   2257			return err;
   2258
   2259		err = fib6_check_nh_list(nhge->nh_parent, new, extack);
   2260		if (err)
   2261			return err;
   2262	}
   2263
   2264	if (old->is_group)
   2265		err = replace_nexthop_grp(net, old, new, cfg, extack);
   2266	else
   2267		err = replace_nexthop_single(net, old, new, extack);
   2268
   2269	if (!err) {
   2270		nh_rt_cache_flush(net, old, new);
   2271
   2272		__remove_nexthop(net, new, NULL);
   2273		nexthop_put(new);
   2274	}
   2275
   2276	return err;
   2277}
   2278
   2279/* called with rtnl_lock held */
   2280static int insert_nexthop(struct net *net, struct nexthop *new_nh,
   2281			  struct nh_config *cfg, struct netlink_ext_ack *extack)
   2282{
   2283	struct rb_node **pp, *parent = NULL, *next;
   2284	struct rb_root *root = &net->nexthop.rb_root;
   2285	bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
   2286	bool create = !!(cfg->nlflags & NLM_F_CREATE);
   2287	u32 new_id = new_nh->id;
   2288	int replace_notify = 0;
   2289	int rc = -EEXIST;
   2290
   2291	pp = &root->rb_node;
   2292	while (1) {
   2293		struct nexthop *nh;
   2294
   2295		next = *pp;
   2296		if (!next)
   2297			break;
   2298
   2299		parent = next;
   2300
   2301		nh = rb_entry(parent, struct nexthop, rb_node);
   2302		if (new_id < nh->id) {
   2303			pp = &next->rb_left;
   2304		} else if (new_id > nh->id) {
   2305			pp = &next->rb_right;
   2306		} else if (replace) {
   2307			rc = replace_nexthop(net, nh, new_nh, cfg, extack);
   2308			if (!rc) {
   2309				new_nh = nh; /* send notification with old nh */
   2310				replace_notify = 1;
   2311			}
   2312			goto out;
   2313		} else {
   2314			/* id already exists and not a replace */
   2315			goto out;
   2316		}
   2317	}
   2318
   2319	if (replace && !create) {
   2320		NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
   2321		rc = -ENOENT;
   2322		goto out;
   2323	}
   2324
   2325	if (new_nh->is_group) {
   2326		struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
   2327		struct nh_res_table *res_table;
   2328
   2329		if (nhg->resilient) {
   2330			res_table = rtnl_dereference(nhg->res_table);
   2331
   2332			/* Not passing the number of buckets is OK when
   2333			 * replacing, but not when creating a new group.
   2334			 */
   2335			if (!cfg->nh_grp_res_has_num_buckets) {
   2336				NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
   2337				rc = -EINVAL;
   2338				goto out;
   2339			}
   2340
   2341			nh_res_group_rebalance(nhg, res_table);
   2342
   2343			/* Do not send bucket notifications, we do full
   2344			 * notification below.
   2345			 */
   2346			nh_res_table_upkeep(res_table, false, false);
   2347		}
   2348	}
   2349
   2350	rb_link_node_rcu(&new_nh->rb_node, parent, pp);
   2351	rb_insert_color(&new_nh->rb_node, root);
   2352
   2353	/* The initial insertion is a full notification for hash-threshold as
   2354	 * well as resilient groups.
   2355	 */
   2356	rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
   2357	if (rc)
   2358		rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);
   2359
   2360out:
   2361	if (!rc) {
   2362		nh_base_seq_inc(net);
   2363		nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
   2364		if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
   2365			nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
   2366	}
   2367
   2368	return rc;
   2369}
   2370
   2371/* rtnl */
   2372/* remove all nexthops tied to a device being deleted */
   2373static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
   2374{
   2375	unsigned int hash = nh_dev_hashfn(dev->ifindex);
   2376	struct net *net = dev_net(dev);
   2377	struct hlist_head *head = &net->nexthop.devhash[hash];
   2378	struct hlist_node *n;
   2379	struct nh_info *nhi;
   2380
   2381	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
   2382		if (nhi->fib_nhc.nhc_dev != dev)
   2383			continue;
   2384
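        		/* Blackhole nexthops are tied to the loopback device;
        		 * they survive admin down and carrier changes and are
        		 * only flushed on NETDEV_UNREGISTER.
        		 */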
   2385		if (nhi->reject_nh &&
   2386		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
   2387			continue;
   2388
   2389		remove_nexthop(net, nhi->nh_parent, NULL);
   2390	}
   2391}
   2392
   2393/* rtnl; called when net namespace is deleted */
   2394static void flush_all_nexthops(struct net *net)
   2395{
   2396	struct rb_root *root = &net->nexthop.rb_root;
   2397	struct rb_node *node;
   2398	struct nexthop *nh;
   2399
   2400	while ((node = rb_first(root))) {
   2401		nh = rb_entry(node, struct nexthop, rb_node);
   2402		remove_nexthop(net, nh, NULL);
   2403		cond_resched();
   2404	}
   2405}
   2406
   2407static struct nexthop *nexthop_create_group(struct net *net,
   2408					    struct nh_config *cfg)
   2409{
   2410	struct nlattr *grps_attr = cfg->nh_grp;
   2411	struct nexthop_grp *entry = nla_data(grps_attr);
   2412	u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
   2413	struct nh_group *nhg;
   2414	struct nexthop *nh;
   2415	int err;
   2416	int i;
   2417
   2418	if (WARN_ON(!num_nh))
   2419		return ERR_PTR(-EINVAL);
   2420
   2421	nh = nexthop_alloc();
   2422	if (!nh)
   2423		return ERR_PTR(-ENOMEM);
   2424
   2425	nh->is_group = 1;
   2426
   2427	nhg = nexthop_grp_alloc(num_nh);
   2428	if (!nhg) {
   2429		kfree(nh);
   2430		return ERR_PTR(-ENOMEM);
   2431	}
   2432
   2433	/* spare group used for removals */
   2434	nhg->spare = nexthop_grp_alloc(num_nh);
   2435	if (!nhg->spare) {
   2436		kfree(nhg);
   2437		kfree(nh);
   2438		return ERR_PTR(-ENOMEM);
   2439	}
   2440	nhg->spare->spare = nhg;
   2441
   2442	for (i = 0; i < nhg->num_nh; ++i) {
   2443		struct nexthop *nhe;
   2444		struct nh_info *nhi;
   2445
   2446		nhe = nexthop_find_by_id(net, entry[i].id);
   2447		if (!nexthop_get(nhe)) {
   2448			err = -ENOENT;
   2449			goto out_no_nh;
   2450		}
   2451
   2452		nhi = rtnl_dereference(nhe->nh_info);
   2453		if (nhi->family == AF_INET)
   2454			nhg->has_v4 = true;
   2455
   2456		nhg->nh_entries[i].nh = nhe;
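        		/* The netlink encoding carries weight - 1 (so 0 means
        		 * 1), hence the +1 here.
        		 */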
   2457		nhg->nh_entries[i].weight = entry[i].weight + 1;
   2458		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
   2459		nhg->nh_entries[i].nh_parent = nh;
   2460	}
   2461
   2462	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
   2463		nhg->hash_threshold = 1;
   2464		nhg->is_multipath = true;
   2465	} else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
   2466		struct nh_res_table *res_table;
   2467
   2468		res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
   2469		if (!res_table) {
   2470			err = -ENOMEM;
   2471			goto out_no_nh;
   2472		}
   2473
   2474		rcu_assign_pointer(nhg->spare->res_table, res_table);
   2475		rcu_assign_pointer(nhg->res_table, res_table);
   2476		nhg->resilient = true;
   2477		nhg->is_multipath = true;
   2478	}
   2479
   2480	WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);
   2481
   2482	if (nhg->hash_threshold)
   2483		nh_hthr_group_rebalance(nhg);
   2484
   2485	if (cfg->nh_fdb)
   2486		nhg->fdb_nh = 1;
   2487
   2488	rcu_assign_pointer(nh->nh_grp, nhg);
   2489
   2490	return nh;
   2491
   2492out_no_nh:
   2493	for (i--; i >= 0; --i) {
   2494		list_del(&nhg->nh_entries[i].nh_list);
   2495		nexthop_put(nhg->nh_entries[i].nh);
   2496	}
   2497
   2498	kfree(nhg->spare);
   2499	kfree(nhg);
   2500	kfree(nh);
   2501
   2502	return ERR_PTR(err);
   2503}
   2504
   2505static int nh_create_ipv4(struct net *net, struct nexthop *nh,
   2506			  struct nh_info *nhi, struct nh_config *cfg,
   2507			  struct netlink_ext_ack *extack)
   2508{
   2509	struct fib_nh *fib_nh = &nhi->fib_nh;
   2510	struct fib_config fib_cfg = {
   2511		.fc_oif   = cfg->nh_ifindex,
   2512		.fc_gw4   = cfg->gw.ipv4,
   2513		.fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
   2514		.fc_flags = cfg->nh_flags,
   2515		.fc_nlinfo = cfg->nlinfo,
   2516		.fc_encap = cfg->nh_encap,
   2517		.fc_encap_type = cfg->nh_encap_type,
   2518	};
   2519	u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
   2520	int err;
   2521
   2522	err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
   2523	if (err) {
   2524		fib_nh_release(net, fib_nh);
   2525		goto out;
   2526	}
   2527
   2528	if (nhi->fdb_nh)
   2529		goto out;
   2530
   2531	/* sets nh_dev if successful */
   2532	err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
   2533	if (!err) {
   2534		nh->nh_flags = fib_nh->fib_nh_flags;
   2535		fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
   2536					  fib_nh->fib_nh_scope);
   2537	} else {
   2538		fib_nh_release(net, fib_nh);
   2539	}
   2540out:
   2541	return err;
   2542}
   2543
    2544	static int nh_create_ipv6(struct net *net, struct nexthop *nh,
   2545			  struct nh_info *nhi, struct nh_config *cfg,
   2546			  struct netlink_ext_ack *extack)
   2547{
   2548	struct fib6_nh *fib6_nh = &nhi->fib6_nh;
   2549	struct fib6_config fib6_cfg = {
   2550		.fc_table = l3mdev_fib_table(cfg->dev),
   2551		.fc_ifindex = cfg->nh_ifindex,
   2552		.fc_gateway = cfg->gw.ipv6,
   2553		.fc_flags = cfg->nh_flags,
   2554		.fc_nlinfo = cfg->nlinfo,
   2555		.fc_encap = cfg->nh_encap,
   2556		.fc_encap_type = cfg->nh_encap_type,
   2557		.fc_is_fdb = cfg->nh_fdb,
   2558	};
   2559	int err;
   2560
   2561	if (!ipv6_addr_any(&cfg->gw.ipv6))
   2562		fib6_cfg.fc_flags |= RTF_GATEWAY;
   2563
   2564	/* sets nh_dev if successful */
   2565	err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
   2566				      extack);
   2567	if (err) {
   2568		/* IPv6 is not enabled, don't call fib6_nh_release */
   2569		if (err == -EAFNOSUPPORT)
   2570			goto out;
   2571		ipv6_stub->fib6_nh_release(fib6_nh);
   2572	} else {
   2573		nh->nh_flags = fib6_nh->fib_nh_flags;
   2574	}
   2575out:
   2576	return err;
   2577}
   2578
   2579static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
   2580				      struct netlink_ext_ack *extack)
   2581{
   2582	struct nh_info *nhi;
   2583	struct nexthop *nh;
   2584	int err = 0;
   2585
   2586	nh = nexthop_alloc();
   2587	if (!nh)
   2588		return ERR_PTR(-ENOMEM);
   2589
   2590	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
   2591	if (!nhi) {
   2592		kfree(nh);
   2593		return ERR_PTR(-ENOMEM);
   2594	}
   2595
   2596	nh->nh_flags = cfg->nh_flags;
   2597	nh->net = net;
   2598
   2599	nhi->nh_parent = nh;
   2600	nhi->family = cfg->nh_family;
   2601	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
   2602
   2603	if (cfg->nh_fdb)
   2604		nhi->fdb_nh = 1;
   2605
   2606	if (cfg->nh_blackhole) {
   2607		nhi->reject_nh = 1;
   2608		cfg->nh_ifindex = net->loopback_dev->ifindex;
   2609	}
   2610
   2611	switch (cfg->nh_family) {
   2612	case AF_INET:
   2613		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
   2614		break;
   2615	case AF_INET6:
   2616		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
   2617		break;
   2618	}
   2619
   2620	if (err) {
   2621		kfree(nhi);
   2622		kfree(nh);
   2623		return ERR_PTR(err);
   2624	}
   2625
   2626	/* add the entry to the device based hash */
   2627	if (!nhi->fdb_nh)
   2628		nexthop_devhash_add(net, nhi);
   2629
   2630	rcu_assign_pointer(nh->nh_info, nhi);
   2631
   2632	return nh;
   2633}
   2634
   2635/* called with rtnl lock held */
   2636static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
   2637				   struct netlink_ext_ack *extack)
   2638{
   2639	struct nexthop *nh;
   2640	int err;
   2641
   2642	if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
   2643		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
   2644		return ERR_PTR(-EINVAL);
   2645	}
   2646
   2647	if (!cfg->nh_id) {
   2648		cfg->nh_id = nh_find_unused_id(net);
   2649		if (!cfg->nh_id) {
   2650			NL_SET_ERR_MSG(extack, "No unused id");
   2651			return ERR_PTR(-EINVAL);
   2652		}
   2653	}
   2654
   2655	if (cfg->nh_grp)
   2656		nh = nexthop_create_group(net, cfg);
   2657	else
   2658		nh = nexthop_create(net, cfg, extack);
   2659
   2660	if (IS_ERR(nh))
   2661		return nh;
   2662
   2663	refcount_set(&nh->refcnt, 1);
   2664	nh->id = cfg->nh_id;
   2665	nh->protocol = cfg->nh_protocol;
   2666	nh->net = net;
   2667
   2668	err = insert_nexthop(net, nh, cfg, extack);
   2669	if (err) {
   2670		__remove_nexthop(net, nh, NULL);
   2671		nexthop_put(nh);
   2672		nh = ERR_PTR(err);
   2673	}
   2674
   2675	return nh;
   2676}
   2677
   2678static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
   2679			    unsigned long *timer_p, bool *has_p,
   2680			    struct netlink_ext_ack *extack)
   2681{
   2682	unsigned long timer;
   2683	u32 value;
   2684
   2685	if (!attr) {
   2686		*timer_p = fallback;
   2687		*has_p = false;
   2688		return 0;
   2689	}
   2690
   2691	value = nla_get_u32(attr);
   2692	timer = clock_t_to_jiffies(value);
   2693	if (timer == ~0UL) {
   2694		NL_SET_ERR_MSG(extack, "Timer value too large");
   2695		return -EINVAL;
   2696	}
   2697
   2698	*timer_p = timer;
   2699	*has_p = true;
   2700	return 0;
   2701}
   2702
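        	/* Example (iproute2 syntax, for illustration):
        	 *   ip nexthop add id 10 group 1/2 type resilient buckets 32
        	 * NHA_RES_GROUP then carries the nested bucket count and timer
        	 * attributes parsed below.
        	 */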
   2703static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
   2704				    struct netlink_ext_ack *extack)
   2705{
   2706	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
   2707	int err;
   2708
   2709	if (res) {
   2710		err = nla_parse_nested(tb,
   2711				       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
   2712				       res, rtm_nh_res_policy_new, extack);
   2713		if (err < 0)
   2714			return err;
   2715	}
   2716
   2717	if (tb[NHA_RES_GROUP_BUCKETS]) {
   2718		cfg->nh_grp_res_num_buckets =
   2719			nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]);
   2720		cfg->nh_grp_res_has_num_buckets = true;
   2721		if (!cfg->nh_grp_res_num_buckets) {
   2722			NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
   2723			return -EINVAL;
   2724		}
   2725	}
   2726
   2727	err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
   2728			       NH_RES_DEFAULT_IDLE_TIMER,
   2729			       &cfg->nh_grp_res_idle_timer,
   2730			       &cfg->nh_grp_res_has_idle_timer,
   2731			       extack);
   2732	if (err)
   2733		return err;
   2734
   2735	return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
   2736				NH_RES_DEFAULT_UNBALANCED_TIMER,
   2737				&cfg->nh_grp_res_unbalanced_timer,
   2738				&cfg->nh_grp_res_has_unbalanced_timer,
   2739				extack);
   2740}
   2741
   2742static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
   2743			    struct nlmsghdr *nlh, struct nh_config *cfg,
   2744			    struct netlink_ext_ack *extack)
   2745{
   2746	struct nhmsg *nhm = nlmsg_data(nlh);
   2747	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
   2748	int err;
   2749
   2750	err = nlmsg_parse(nlh, sizeof(*nhm), tb,
   2751			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
   2752			  rtm_nh_policy_new, extack);
   2753	if (err < 0)
   2754		return err;
   2755
   2756	err = -EINVAL;
   2757	if (nhm->resvd || nhm->nh_scope) {
   2758		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
   2759		goto out;
   2760	}
   2761	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
   2762		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
   2763		goto out;
   2764	}
   2765
   2766	switch (nhm->nh_family) {
   2767	case AF_INET:
   2768	case AF_INET6:
   2769		break;
   2770	case AF_UNSPEC:
   2771		if (tb[NHA_GROUP])
   2772			break;
   2773		fallthrough;
   2774	default:
   2775		NL_SET_ERR_MSG(extack, "Invalid address family");
   2776		goto out;
   2777	}
   2778
   2779	memset(cfg, 0, sizeof(*cfg));
   2780	cfg->nlflags = nlh->nlmsg_flags;
   2781	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
   2782	cfg->nlinfo.nlh = nlh;
   2783	cfg->nlinfo.nl_net = net;
   2784
   2785	cfg->nh_family = nhm->nh_family;
   2786	cfg->nh_protocol = nhm->nh_protocol;
   2787	cfg->nh_flags = nhm->nh_flags;
   2788
   2789	if (tb[NHA_ID])
   2790		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
   2791
   2792	if (tb[NHA_FDB]) {
   2793		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
   2794		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
   2795			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
   2796			goto out;
   2797		}
   2798		if (nhm->nh_flags) {
   2799			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
   2800			goto out;
   2801		}
   2802		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
   2803	}
   2804
   2805	if (tb[NHA_GROUP]) {
   2806		if (nhm->nh_family != AF_UNSPEC) {
   2807			NL_SET_ERR_MSG(extack, "Invalid family for group");
   2808			goto out;
   2809		}
   2810		cfg->nh_grp = tb[NHA_GROUP];
   2811
   2812		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
   2813		if (tb[NHA_GROUP_TYPE])
   2814			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
   2815
   2816		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
   2817			NL_SET_ERR_MSG(extack, "Invalid group type");
   2818			goto out;
   2819		}
   2820		err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb),
   2821					  cfg->nh_grp_type, extack);
   2822		if (err)
   2823			goto out;
   2824
   2825		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
   2826			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
   2827						       cfg, extack);
   2828
   2829		/* no other attributes should be set */
   2830		goto out;
   2831	}
   2832
   2833	if (tb[NHA_BLACKHOLE]) {
   2834		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
   2835		    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
   2836			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
   2837			goto out;
   2838		}
   2839
   2840		cfg->nh_blackhole = 1;
   2841		err = 0;
   2842		goto out;
   2843	}
   2844
   2845	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
   2846		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
   2847		goto out;
   2848	}
   2849
   2850	if (!cfg->nh_fdb && tb[NHA_OIF]) {
   2851		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
   2852		if (cfg->nh_ifindex)
   2853			cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
   2854
   2855		if (!cfg->dev) {
   2856			NL_SET_ERR_MSG(extack, "Invalid device index");
   2857			goto out;
   2858		} else if (!(cfg->dev->flags & IFF_UP)) {
   2859			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
   2860			err = -ENETDOWN;
   2861			goto out;
   2862		} else if (!netif_carrier_ok(cfg->dev)) {
   2863			NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
   2864			err = -ENETDOWN;
   2865			goto out;
   2866		}
   2867	}
   2868
   2869	err = -EINVAL;
   2870	if (tb[NHA_GATEWAY]) {
   2871		struct nlattr *gwa = tb[NHA_GATEWAY];
   2872
   2873		switch (cfg->nh_family) {
   2874		case AF_INET:
   2875			if (nla_len(gwa) != sizeof(u32)) {
   2876				NL_SET_ERR_MSG(extack, "Invalid gateway");
   2877				goto out;
   2878			}
   2879			cfg->gw.ipv4 = nla_get_be32(gwa);
   2880			break;
   2881		case AF_INET6:
   2882			if (nla_len(gwa) != sizeof(struct in6_addr)) {
   2883				NL_SET_ERR_MSG(extack, "Invalid gateway");
   2884				goto out;
   2885			}
   2886			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
   2887			break;
   2888		default:
   2889			NL_SET_ERR_MSG(extack,
   2890				       "Unknown address family for gateway");
   2891			goto out;
   2892		}
   2893	} else {
   2894		/* device only nexthop (no gateway) */
   2895		if (cfg->nh_flags & RTNH_F_ONLINK) {
   2896			NL_SET_ERR_MSG(extack,
   2897				       "ONLINK flag can not be set for nexthop without a gateway");
   2898			goto out;
   2899		}
   2900	}
   2901
   2902	if (tb[NHA_ENCAP]) {
   2903		cfg->nh_encap = tb[NHA_ENCAP];
   2904
   2905		if (!tb[NHA_ENCAP_TYPE]) {
   2906			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
   2907			goto out;
   2908		}
   2909
   2910		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
   2911		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
   2912		if (err < 0)
   2913			goto out;
   2914
   2915	} else if (tb[NHA_ENCAP_TYPE]) {
   2916		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
   2917		goto out;
   2918	}
   2919
   2920
   2921	err = 0;
   2922out:
   2923	return err;
   2924}
   2925
   2926/* rtnl */
   2927static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
   2928			   struct netlink_ext_ack *extack)
   2929{
   2930	struct net *net = sock_net(skb->sk);
   2931	struct nh_config cfg;
   2932	struct nexthop *nh;
   2933	int err;
   2934
   2935	err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
   2936	if (!err) {
   2937		nh = nexthop_add(net, &cfg, extack);
   2938		if (IS_ERR(nh))
   2939			err = PTR_ERR(nh);
   2940	}
   2941
   2942	return err;
   2943}
   2944
   2945static int __nh_valid_get_del_req(const struct nlmsghdr *nlh,
   2946				  struct nlattr **tb, u32 *id,
   2947				  struct netlink_ext_ack *extack)
   2948{
   2949	struct nhmsg *nhm = nlmsg_data(nlh);
   2950
   2951	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
   2952		NL_SET_ERR_MSG(extack, "Invalid values in header");
   2953		return -EINVAL;
   2954	}
   2955
   2956	if (!tb[NHA_ID]) {
   2957		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
   2958		return -EINVAL;
   2959	}
   2960
   2961	*id = nla_get_u32(tb[NHA_ID]);
   2962	if (!(*id)) {
   2963		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
   2964		return -EINVAL;
   2965	}
   2966
   2967	return 0;
   2968}
   2969
   2970static int nh_valid_get_del_req(const struct nlmsghdr *nlh, u32 *id,
   2971				struct netlink_ext_ack *extack)
   2972{
   2973	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
   2974	int err;
   2975
   2976	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
   2977			  ARRAY_SIZE(rtm_nh_policy_get) - 1,
   2978			  rtm_nh_policy_get, extack);
   2979	if (err < 0)
   2980		return err;
   2981
   2982	return __nh_valid_get_del_req(nlh, tb, id, extack);
   2983}
   2984
   2985/* rtnl */
   2986static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
   2987			   struct netlink_ext_ack *extack)
   2988{
   2989	struct net *net = sock_net(skb->sk);
   2990	struct nl_info nlinfo = {
   2991		.nlh = nlh,
   2992		.nl_net = net,
   2993		.portid = NETLINK_CB(skb).portid,
   2994	};
   2995	struct nexthop *nh;
   2996	int err;
   2997	u32 id;
   2998
   2999	err = nh_valid_get_del_req(nlh, &id, extack);
   3000	if (err)
   3001		return err;
   3002
   3003	nh = nexthop_find_by_id(net, id);
   3004	if (!nh)
   3005		return -ENOENT;
   3006
   3007	remove_nexthop(net, nh, &nlinfo);
   3008
   3009	return 0;
   3010}
   3011
   3012/* rtnl */
   3013static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
   3014			   struct netlink_ext_ack *extack)
   3015{
   3016	struct net *net = sock_net(in_skb->sk);
   3017	struct sk_buff *skb = NULL;
   3018	struct nexthop *nh;
   3019	int err;
   3020	u32 id;
   3021
   3022	err = nh_valid_get_del_req(nlh, &id, extack);
   3023	if (err)
   3024		return err;
   3025
   3026	err = -ENOBUFS;
   3027	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
   3028	if (!skb)
   3029		goto out;
   3030
   3031	err = -ENOENT;
   3032	nh = nexthop_find_by_id(net, id);
   3033	if (!nh)
   3034		goto errout_free;
   3035
   3036	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
   3037			   nlh->nlmsg_seq, 0);
   3038	if (err < 0) {
   3039		WARN_ON(err == -EMSGSIZE);
   3040		goto errout_free;
   3041	}
   3042
   3043	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
   3044out:
   3045	return err;
   3046errout_free:
   3047	kfree_skb(skb);
   3048	goto out;
   3049}
   3050
   3051struct nh_dump_filter {
   3052	u32 nh_id;
   3053	int dev_idx;
   3054	int master_idx;
   3055	bool group_filter;
   3056	bool fdb_filter;
   3057	u32 res_bucket_nh_id;
   3058};
   3059
   3060static bool nh_dump_filtered(struct nexthop *nh,
   3061			     struct nh_dump_filter *filter, u8 family)
   3062{
   3063	const struct net_device *dev;
   3064	const struct nh_info *nhi;
   3065
   3066	if (filter->group_filter && !nh->is_group)
   3067		return true;
   3068
   3069	if (!filter->dev_idx && !filter->master_idx && !family)
   3070		return false;
   3071
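        	/* The filters below need nh_info; a group has none, so any
        	 * device, master or family filter excludes group nexthops.
        	 */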
   3072	if (nh->is_group)
   3073		return true;
   3074
   3075	nhi = rtnl_dereference(nh->nh_info);
   3076	if (family && nhi->family != family)
   3077		return true;
   3078
   3079	dev = nhi->fib_nhc.nhc_dev;
   3080	if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx))
   3081		return true;
   3082
   3083	if (filter->master_idx) {
   3084		struct net_device *master;
   3085
   3086		if (!dev)
   3087			return true;
   3088
   3089		master = netdev_master_upper_dev_get((struct net_device *)dev);
   3090		if (!master || master->ifindex != filter->master_idx)
   3091			return true;
   3092	}
   3093
   3094	return false;
   3095}
   3096
   3097static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
   3098			       struct nh_dump_filter *filter,
   3099			       struct netlink_ext_ack *extack)
   3100{
   3101	struct nhmsg *nhm;
   3102	u32 idx;
   3103
   3104	if (tb[NHA_OIF]) {
   3105		idx = nla_get_u32(tb[NHA_OIF]);
   3106		if (idx > INT_MAX) {
   3107			NL_SET_ERR_MSG(extack, "Invalid device index");
   3108			return -EINVAL;
   3109		}
   3110		filter->dev_idx = idx;
   3111	}
   3112	if (tb[NHA_MASTER]) {
   3113		idx = nla_get_u32(tb[NHA_MASTER]);
   3114		if (idx > INT_MAX) {
   3115			NL_SET_ERR_MSG(extack, "Invalid master device index");
   3116			return -EINVAL;
   3117		}
   3118		filter->master_idx = idx;
   3119	}
   3120	filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
   3121	filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
   3122
   3123	nhm = nlmsg_data(nlh);
   3124	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
   3125		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
   3126		return -EINVAL;
   3127	}
   3128
   3129	return 0;
   3130}
   3131
   3132static int nh_valid_dump_req(const struct nlmsghdr *nlh,
   3133			     struct nh_dump_filter *filter,
   3134			     struct netlink_callback *cb)
   3135{
   3136	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
   3137	int err;
   3138
   3139	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
   3140			  ARRAY_SIZE(rtm_nh_policy_dump) - 1,
   3141			  rtm_nh_policy_dump, cb->extack);
   3142	if (err < 0)
   3143		return err;
   3144
   3145	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
   3146}
   3147
   3148struct rtm_dump_nh_ctx {
   3149	u32 idx;
   3150};
   3151
   3152static struct rtm_dump_nh_ctx *
   3153rtm_dump_nh_ctx(struct netlink_callback *cb)
   3154{
   3155	struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx;
   3156
   3157	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
   3158	return ctx;
   3159}
   3160
   3161static int rtm_dump_walk_nexthops(struct sk_buff *skb,
   3162				  struct netlink_callback *cb,
   3163				  struct rb_root *root,
   3164				  struct rtm_dump_nh_ctx *ctx,
   3165				  int (*nh_cb)(struct sk_buff *skb,
   3166					       struct netlink_callback *cb,
   3167					       struct nexthop *nh, void *data),
   3168				  void *data)
   3169{
   3170	struct rb_node *node;
   3171	int s_idx;
   3172	int err;
   3173
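        	/* ctx->idx survives across dump rounds: resume at the first ID
        	 * not below the last one handled, and step past it once the
        	 * walk completes so a finished dump is not repeated.
        	 */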
   3174	s_idx = ctx->idx;
   3175	for (node = rb_first(root); node; node = rb_next(node)) {
   3176		struct nexthop *nh;
   3177
   3178		nh = rb_entry(node, struct nexthop, rb_node);
   3179		if (nh->id < s_idx)
   3180			continue;
   3181
   3182		ctx->idx = nh->id;
   3183		err = nh_cb(skb, cb, nh, data);
   3184		if (err)
   3185			return err;
   3186	}
   3187
   3188	ctx->idx++;
   3189	return 0;
   3190}
   3191
   3192static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
   3193			       struct nexthop *nh, void *data)
   3194{
   3195	struct nhmsg *nhm = nlmsg_data(cb->nlh);
   3196	struct nh_dump_filter *filter = data;
   3197
   3198	if (nh_dump_filtered(nh, filter, nhm->nh_family))
   3199		return 0;
   3200
   3201	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
   3202			    NETLINK_CB(cb->skb).portid,
   3203			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
   3204}
   3205
   3206/* rtnl */
   3207static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
   3208{
   3209	struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb);
   3210	struct net *net = sock_net(skb->sk);
   3211	struct rb_root *root = &net->nexthop.rb_root;
   3212	struct nh_dump_filter filter = {};
   3213	int err;
   3214
   3215	err = nh_valid_dump_req(cb->nlh, &filter, cb);
   3216	if (err < 0)
   3217		return err;
   3218
   3219	err = rtm_dump_walk_nexthops(skb, cb, root, ctx,
   3220				     &rtm_dump_nexthop_cb, &filter);
   3221	if (err < 0) {
   3222		if (likely(skb->len))
   3223			goto out;
   3224		goto out_err;
   3225	}
   3226
   3227out:
   3228	err = skb->len;
   3229out_err:
   3230	cb->seq = net->nexthop.seq;
   3231	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
   3232	return err;
   3233}
   3234
   3235static struct nexthop *
   3236nexthop_find_group_resilient(struct net *net, u32 id,
   3237			     struct netlink_ext_ack *extack)
   3238{
   3239	struct nh_group *nhg;
   3240	struct nexthop *nh;
   3241
   3242	nh = nexthop_find_by_id(net, id);
   3243	if (!nh)
   3244		return ERR_PTR(-ENOENT);
   3245
   3246	if (!nh->is_group) {
   3247		NL_SET_ERR_MSG(extack, "Not a nexthop group");
   3248		return ERR_PTR(-EINVAL);
   3249	}
   3250
   3251	nhg = rtnl_dereference(nh->nh_grp);
   3252	if (!nhg->resilient) {
   3253		NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
   3254		return ERR_PTR(-EINVAL);
   3255	}
   3256
   3257	return nh;
   3258}
   3259
   3260static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
   3261			      struct netlink_ext_ack *extack)
   3262{
   3263	u32 idx;
   3264
   3265	if (attr) {
   3266		idx = nla_get_u32(attr);
   3267		if (!idx) {
   3268			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
   3269			return -EINVAL;
   3270		}
   3271		*nh_id_p = idx;
   3272	} else {
   3273		*nh_id_p = 0;
   3274	}
   3275
   3276	return 0;
   3277}
   3278
   3279static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
   3280				    struct nh_dump_filter *filter,
   3281				    struct netlink_callback *cb)
   3282{
   3283	struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
   3284	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
   3285	int err;
   3286
   3287	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
   3288			  ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
   3289			  rtm_nh_policy_dump_bucket, NULL);
   3290	if (err < 0)
   3291		return err;
   3292
   3293	err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
   3294	if (err)
   3295		return err;
   3296
   3297	if (tb[NHA_RES_BUCKET]) {
   3298		size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
   3299
   3300		err = nla_parse_nested(res_tb, max,
   3301				       tb[NHA_RES_BUCKET],
   3302				       rtm_nh_res_bucket_policy_dump,
   3303				       cb->extack);
   3304		if (err < 0)
   3305			return err;
   3306
   3307		err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
   3308					 &filter->res_bucket_nh_id,
   3309					 cb->extack);
   3310		if (err)
   3311			return err;
   3312	}
   3313
   3314	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
   3315}
   3316
   3317struct rtm_dump_res_bucket_ctx {
   3318	struct rtm_dump_nh_ctx nh;
   3319	u16 bucket_index;
   3320	u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */
   3321};
   3322
   3323static struct rtm_dump_res_bucket_ctx *
   3324rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
   3325{
   3326	struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx;
   3327
   3328	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
   3329	return ctx;
   3330}
   3331
   3332struct rtm_dump_nexthop_bucket_data {
   3333	struct rtm_dump_res_bucket_ctx *ctx;
   3334	struct nh_dump_filter filter;
   3335};
   3336
   3337static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
   3338				      struct netlink_callback *cb,
   3339				      struct nexthop *nh,
   3340				      struct rtm_dump_nexthop_bucket_data *dd)
   3341{
   3342	u32 portid = NETLINK_CB(cb->skb).portid;
   3343	struct nhmsg *nhm = nlmsg_data(cb->nlh);
   3344	struct nh_res_table *res_table;
   3345	struct nh_group *nhg;
   3346	u16 bucket_index;
   3347	int err;
   3348
   3349	if (dd->ctx->nh.idx < dd->ctx->done_nh_idx)
   3350		return 0;
   3351
   3352	nhg = rtnl_dereference(nh->nh_grp);
   3353	res_table = rtnl_dereference(nhg->res_table);
   3354	for (bucket_index = dd->ctx->bucket_index;
   3355	     bucket_index < res_table->num_nh_buckets;
   3356	     bucket_index++) {
   3357		struct nh_res_bucket *bucket;
   3358		struct nh_grp_entry *nhge;
   3359
   3360		bucket = &res_table->nh_buckets[bucket_index];
   3361		nhge = rtnl_dereference(bucket->nh_entry);
   3362		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
   3363			continue;
   3364
   3365		if (dd->filter.res_bucket_nh_id &&
   3366		    dd->filter.res_bucket_nh_id != nhge->nh->id)
   3367			continue;
   3368
   3369		err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
   3370					 RTM_NEWNEXTHOPBUCKET, portid,
   3371					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
   3372					 cb->extack);
   3373		if (err < 0) {
   3374			if (likely(skb->len))
   3375				goto out;
   3376			goto out_err;
   3377		}
   3378	}
   3379
   3380	dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1;
   3381	bucket_index = 0;
   3382
   3383out:
   3384	err = skb->len;
   3385out_err:
   3386	dd->ctx->bucket_index = bucket_index;
   3387	return err;
   3388}
   3389
   3390static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
   3391				      struct netlink_callback *cb,
   3392				      struct nexthop *nh, void *data)
   3393{
   3394	struct rtm_dump_nexthop_bucket_data *dd = data;
   3395	struct nh_group *nhg;
   3396
   3397	if (!nh->is_group)
   3398		return 0;
   3399
   3400	nhg = rtnl_dereference(nh->nh_grp);
   3401	if (!nhg->resilient)
   3402		return 0;
   3403
   3404	return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd);
   3405}
   3406
   3407/* rtnl */
   3408static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
   3409				   struct netlink_callback *cb)
   3410{
   3411	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
   3412	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
   3413	struct net *net = sock_net(skb->sk);
   3414	struct nexthop *nh;
   3415	int err;
   3416
   3417	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
   3418	if (err)
   3419		return err;
   3420
   3421	if (dd.filter.nh_id) {
   3422		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
   3423						  cb->extack);
   3424		if (IS_ERR(nh))
   3425			return PTR_ERR(nh);
   3426		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
   3427	} else {
   3428		struct rb_root *root = &net->nexthop.rb_root;
   3429
   3430		err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh,
   3431					     &rtm_dump_nexthop_bucket_cb, &dd);
   3432	}
   3433
   3434	if (err < 0) {
   3435		if (likely(skb->len))
   3436			goto out;
   3437		goto out_err;
   3438	}
   3439
   3440out:
   3441	err = skb->len;
   3442out_err:
   3443	cb->seq = net->nexthop.seq;
   3444	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
   3445	return err;
   3446}
   3447
   3448static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
   3449					      u16 *bucket_index,
   3450					      struct netlink_ext_ack *extack)
   3451{
   3452	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
   3453	int err;
   3454
   3455	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
   3456			       res, rtm_nh_res_bucket_policy_get, extack);
   3457	if (err < 0)
   3458		return err;
   3459
   3460	if (!tb[NHA_RES_BUCKET_INDEX]) {
   3461		NL_SET_ERR_MSG(extack, "Bucket index is missing");
   3462		return -EINVAL;
   3463	}
   3464
   3465	*bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]);
   3466	return 0;
   3467}
   3468
   3469static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
   3470				   u32 *id, u16 *bucket_index,
   3471				   struct netlink_ext_ack *extack)
   3472{
   3473	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
   3474	int err;
   3475
   3476	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
   3477			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
   3478			  rtm_nh_policy_get_bucket, extack);
   3479	if (err < 0)
   3480		return err;
   3481
   3482	err = __nh_valid_get_del_req(nlh, tb, id, extack);
   3483	if (err)
   3484		return err;
   3485
   3486	if (!tb[NHA_RES_BUCKET]) {
   3487		NL_SET_ERR_MSG(extack, "Bucket information is missing");
   3488		return -EINVAL;
   3489	}
   3490
   3491	err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
   3492						 bucket_index, extack);
   3493	if (err)
   3494		return err;
   3495
   3496	return 0;
   3497}
   3498
   3499/* rtnl */
   3500static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
   3501				  struct netlink_ext_ack *extack)
   3502{
   3503	struct net *net = sock_net(in_skb->sk);
   3504	struct nh_res_table *res_table;
   3505	struct sk_buff *skb = NULL;
   3506	struct nh_group *nhg;
   3507	struct nexthop *nh;
   3508	u16 bucket_index;
   3509	int err;
   3510	u32 id;
   3511
   3512	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
   3513	if (err)
   3514		return err;
   3515
   3516	nh = nexthop_find_group_resilient(net, id, extack);
   3517	if (IS_ERR(nh))
   3518		return PTR_ERR(nh);
   3519
   3520	nhg = rtnl_dereference(nh->nh_grp);
   3521	res_table = rtnl_dereference(nhg->res_table);
   3522	if (bucket_index >= res_table->num_nh_buckets) {
   3523		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
   3524		return -ENOENT;
   3525	}
   3526
   3527	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
   3528	if (!skb)
   3529		return -ENOBUFS;
   3530
   3531	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
   3532				 bucket_index, RTM_NEWNEXTHOPBUCKET,
   3533				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
   3534				 0, extack);
   3535	if (err < 0) {
   3536		WARN_ON(err == -EMSGSIZE);
   3537		goto errout_free;
   3538	}
   3539
   3540	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
   3541
   3542errout_free:
   3543	kfree_skb(skb);
   3544	return err;
   3545}
   3546
   3547static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
   3548{
   3549	unsigned int hash = nh_dev_hashfn(dev->ifindex);
   3550	struct net *net = dev_net(dev);
   3551	struct hlist_head *head = &net->nexthop.devhash[hash];
   3552	struct hlist_node *n;
   3553	struct nh_info *nhi;
   3554
   3555	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
   3556		if (nhi->fib_nhc.nhc_dev == dev) {
   3557			if (nhi->family == AF_INET)
   3558				fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
   3559						   orig_mtu);
   3560		}
   3561	}
   3562}
   3563
   3564/* rtnl */
   3565static int nh_netdev_event(struct notifier_block *this,
   3566			   unsigned long event, void *ptr)
   3567{
   3568	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
   3569	struct netdev_notifier_info_ext *info_ext;
   3570
   3571	switch (event) {
   3572	case NETDEV_DOWN:
   3573	case NETDEV_UNREGISTER:
   3574		nexthop_flush_dev(dev, event);
   3575		break;
   3576	case NETDEV_CHANGE:
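        		/* NETDEV_CHANGE fires for flag changes as well; only
        		 * flush when the link actually went down.
        		 */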
   3577		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
   3578			nexthop_flush_dev(dev, event);
   3579		break;
   3580	case NETDEV_CHANGEMTU:
   3581		info_ext = ptr;
   3582		nexthop_sync_mtu(dev, info_ext->ext.mtu);
   3583		rt_cache_flush(dev_net(dev));
   3584		break;
   3585	}
   3586	return NOTIFY_DONE;
   3587}
   3588
   3589static struct notifier_block nh_netdev_notifier = {
   3590	.notifier_call = nh_netdev_event,
   3591};
   3592
   3593static int nexthops_dump(struct net *net, struct notifier_block *nb,
   3594			 enum nexthop_event_type event_type,
   3595			 struct netlink_ext_ack *extack)
   3596{
   3597	struct rb_root *root = &net->nexthop.rb_root;
   3598	struct rb_node *node;
   3599	int err = 0;
   3600
   3601	for (node = rb_first(root); node; node = rb_next(node)) {
   3602		struct nexthop *nh;
   3603
   3604		nh = rb_entry(node, struct nexthop, rb_node);
   3605		err = call_nexthop_notifier(nb, net, event_type, nh, extack);
   3606		if (err)
   3607			break;
   3608	}
   3609
   3610	return err;
   3611}
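
/* Replays the current nexthop set to a single listener.  The rb_root is
 * keyed by nexthop ID, so the walk is in ascending-ID order, which is
 * numeric rather than creation order; a group may therefore be replayed
 * before its members.  That is harmless: the REPLACE payload for a
 * group already carries full per-member information.
 */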
   3612
   3613int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
   3614			      struct netlink_ext_ack *extack)
   3615{
   3616	int err;
   3617
   3618	rtnl_lock();
   3619	err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
   3620	if (err)
   3621		goto unlock;
   3622	err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
   3623					       nb);
   3624unlock:
   3625	rtnl_unlock();
   3626	return err;
   3627}
   3628EXPORT_SYMBOL(register_nexthop_notifier);
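
/* A hedged driver-side sketch (every my_* name below is hypothetical):
 * supply a notifier block, then consume both the replayed and the live
 * events.
 *
 *	static int my_nh_event(struct notifier_block *nb,
 *			       unsigned long event, void *ptr)
 *	{
 *		struct nh_notifier_info *info = ptr;
 *
 *		switch (event) {
 *		case NEXTHOP_EVENT_REPLACE:
 *			return notifier_from_errno(my_program_hw(info));
 *		case NEXTHOP_EVENT_DEL:
 *			my_flush_hw(info->id);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nh_nb = {
 *		.notifier_call = my_nh_event,
 *	};
 *
 *	err = register_nexthop_notifier(net, &my_nh_nb, extack);
 *
 * Because the dump runs before the chain registration, and both happen
 * under rtnl, a listener that registers late still sees every existing
 * nexthop exactly once.
 */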
   3629
   3630int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
   3631{
   3632	int err;
   3633
   3634	rtnl_lock();
   3635	err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
   3636						 nb);
   3637	if (err)
   3638		goto unlock;
   3639	nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
   3640unlock:
   3641	rtnl_unlock();
   3642	return err;
   3643}
   3644EXPORT_SYMBOL(unregister_nexthop_notifier);
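
/* Unregistration is the mirror image: leave the chain first, then replay
 * NEXTHOP_EVENT_DEL for everything still in the tree so the departing
 * listener can tear down whatever it programmed.  extack is NULL because
 * there is no userspace request to attach errors to.
 */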
   3645
   3646void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
   3647{
   3648	struct nexthop *nexthop;
   3649
   3650	rcu_read_lock();
   3651
   3652	nexthop = nexthop_find_by_id(net, id);
   3653	if (!nexthop)
   3654		goto out;
   3655
   3656	nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
   3657	if (offload)
   3658		nexthop->nh_flags |= RTNH_F_OFFLOAD;
   3659	if (trap)
   3660		nexthop->nh_flags |= RTNH_F_TRAP;
   3661
   3662out:
   3663	rcu_read_unlock();
   3664}
   3665EXPORT_SYMBOL(nexthop_set_hw_flags);
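
/* Drivers call this to report hardware status for a nexthop, typically
 * after servicing a REPLACE event (illustrative call, names assumed):
 *
 *	nexthop_set_hw_flags(net, info->id, true, false);
 *
 * Both bits are cleared and then set from the arguments, so each call
 * reflects the driver's latest state.  The flags appear as "offload"
 * and "trap" in RTM_NEWNEXTHOP dumps.
 */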
   3666
   3667void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
   3668				 bool offload, bool trap)
   3669{
   3670	struct nh_res_table *res_table;
   3671	struct nh_res_bucket *bucket;
   3672	struct nexthop *nexthop;
   3673	struct nh_group *nhg;
   3674
   3675	rcu_read_lock();
   3676
   3677	nexthop = nexthop_find_by_id(net, id);
   3678	if (!nexthop || !nexthop->is_group)
   3679		goto out;
   3680
   3681	nhg = rcu_dereference(nexthop->nh_grp);
   3682	if (!nhg->resilient)
   3683		goto out;
   3684
   3685	if (bucket_index >= nhg->res_table->num_nh_buckets)
   3686		goto out;
   3687
   3688	res_table = rcu_dereference(nhg->res_table);
   3689	bucket = &res_table->nh_buckets[bucket_index];
   3690	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
   3691	if (offload)
   3692		bucket->nh_flags |= RTNH_F_OFFLOAD;
   3693	if (trap)
   3694		bucket->nh_flags |= RTNH_F_TRAP;
   3695
   3696out:
   3697	rcu_read_unlock();
   3698}
   3699EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
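
/* Per-bucket analogue of nexthop_set_hw_flags() for resilient groups.
 * A missing group, a non-resilient group, or an out-of-range index is
 * silently ignored, which is acceptable since the caller is a driver
 * echoing state for buckets it was previously notified about.
 */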
   3700
   3701void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
   3702				     unsigned long *activity)
   3703{
   3704	struct nh_res_table *res_table;
   3705	struct nexthop *nexthop;
   3706	struct nh_group *nhg;
   3707	u16 i;
   3708
   3709	rcu_read_lock();
   3710
   3711	nexthop = nexthop_find_by_id(net, id);
   3712	if (!nexthop || !nexthop->is_group)
   3713		goto out;
   3714
   3715	nhg = rcu_dereference(nexthop->nh_grp);
   3716	if (!nhg->resilient)
   3717		goto out;
   3718
   3719	/* Instead of silently ignoring some buckets, demand that the sizes
   3720	 * be the same.
   3721	 */
   3722	res_table = rcu_dereference(nhg->res_table);
   3723	if (num_buckets != res_table->num_nh_buckets)
   3724		goto out;
   3725
   3726	for (i = 0; i < num_buckets; i++) {
   3727		if (test_bit(i, activity))
   3728			nh_res_bucket_set_busy(&res_table->nh_buckets[i]);
   3729	}
   3730
   3731out:
   3732	rcu_read_unlock();
   3733}
   3734EXPORT_SYMBOL(nexthop_res_grp_activity_update);
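
/* Drivers periodically report which buckets carried traffic so that only
 * genuinely idle buckets get reassigned during rebalancing.  A hedged
 * caller-side sketch (my_hw_bucket_was_hit() is hypothetical):
 *
 *	unsigned long *activity = bitmap_zalloc(num_buckets, GFP_KERNEL);
 *	u16 i;
 *
 *	if (!activity)
 *		return;
 *	for (i = 0; i < num_buckets; i++)
 *		if (my_hw_bucket_was_hit(i))
 *			set_bit(i, activity);
 *	nexthop_res_grp_activity_update(net, group_id, num_buckets,
 *					activity);
 *	bitmap_free(activity);
 *
 * The size check above rejects a mismatched snapshot outright rather
 * than applying it partially.
 */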
   3735
   3736static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
   3737{
   3738	struct net *net;
   3739
   3740	rtnl_lock();
   3741	list_for_each_entry(net, net_list, exit_list) {
   3742		flush_all_nexthops(net);
   3743		kfree(net->nexthop.devhash);
   3744	}
   3745	rtnl_unlock();
   3746}
   3747
   3748static int __net_init nexthop_net_init(struct net *net)
   3749{
   3750	size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
   3751
   3752	net->nexthop.rb_root = RB_ROOT;
   3753	net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
   3754	if (!net->nexthop.devhash)
   3755		return -ENOMEM;
   3756	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
   3757
   3758	return 0;
   3759}
   3760
   3761static struct pernet_operations nexthop_net_ops = {
   3762	.init = nexthop_net_init,
   3763	.exit_batch = nexthop_net_exit_batch,
   3764};
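
/* Per-netns lifecycle: init allocates the 256-bucket device hash
 * (NH_DEV_HASHSIZE) and an empty rb_root; exit_batch flushes all
 * nexthops and frees the hash for a whole batch of dying namespaces
 * under one rtnl_lock, which is cheaper than relocking per namespace.
 */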
   3765
   3766static int __init nexthop_init(void)
   3767{
   3768	register_pernet_subsys(&nexthop_net_ops);
   3769
   3770	register_netdevice_notifier(&nh_netdev_notifier);
   3771
   3772	rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
   3773	rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
   3774	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
   3775		      rtm_dump_nexthop, 0);
   3776
   3777	rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
   3778	rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
   3779
   3780	rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
   3781	rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
   3782
   3783	rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket,
   3784		      rtm_dump_nexthop_bucket, 0);
   3785
   3786	return 0;
   3787}
   3788subsys_initcall(nexthop_init);
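
/* subsys_initcall() runs this before device-driver initcalls, so the
 * rtnetlink handlers and the netdev notifier are in place before any
 * driver or early userspace can exercise the nexthop API.
 */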