cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xfrm_policy.c (108996B)


// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_policy.c
 *
 * Changes:
 *	Mitsuru KANDA @USAGI
 * 	Kazunori MIYAZAWA @USAGI
 * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 * 		IPv6 support
 * 	Kazunori MIYAZAWA @USAGI
 * 	YOSHIFUJI Hideaki
 * 		Split up af-specific portion
 *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <linux/rhashtable.h>
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
#ifdef CONFIG_XFRM_ESPINTCP
#include <net/espintcp.h>
#endif

#include "xfrm_hash.h"

#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
#define XFRM_MAX_QUEUE_LEN	100

struct xfrm_flo {
	struct dst_entry *dst_orig;
	u8 flags;
};

/* prefixes smaller than this are stored in lists, not trees. */
#define INEXACT_PREFIXLEN_IPV4	16
#define INEXACT_PREFIXLEN_IPV6	48
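
/* Illustration (the rationale here is an inference, not upstream text):
 * a selector this coarse, e.g. 10.0.0.0/8 for IPv4, covers so many
 * addresses that a linear list walk is no worse than a tree descent,
 * so such policies stay on the per-bin/per-node lists.
 */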

struct xfrm_pol_inexact_node {
	struct rb_node node;
	union {
		xfrm_address_t addr;
		struct rcu_head rcu;
	};
	u8 prefixlen;

	struct rb_root root;

	/* the policies matching this node; the list can be empty */
	struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with the
 * lowest priority. If two results have the same priority, the youngest
 * one wins.
 */
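
/* For illustration (addresses arbitrary): a lookup with saddr 10.1.2.3
 * and daddr 192.0.2.1 gathers (1) the bin's own any:any list, (2) the
 * hhead of the root_d node covering 192.0.2.1, (3) the hhead of the
 * saddr subtree node under that daddr node covering 10.1.2.3, and
 * (4) the hhead of the root_s node covering 10.1.2.3.
 */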

struct xfrm_pol_inexact_key {
	possible_net_t net;
	u32 if_id;
	u16 family;
	u8 dir, type;
};

struct xfrm_pol_inexact_bin {
	struct xfrm_pol_inexact_key k;
	struct rhash_head head;
	/* list containing '*:*' policies */
	struct hlist_head hhead;

	seqcount_spinlock_t count;
	/* tree sorted by daddr/prefix */
	struct rb_root root_d;

	/* tree sorted by saddr/prefix */
	struct rb_root root_s;

	/* slow path below */
	struct list_head inexact_bins;
	struct rcu_head rcu;
};

enum xfrm_pol_inexact_candidate_type {
	XFRM_POL_CAND_BOTH,
	XFRM_POL_CAND_SADDR,
	XFRM_POL_CAND_DADDR,
	XFRM_POL_CAND_ANY,

	XFRM_POL_CAND_MAX,
};

struct xfrm_pol_inexact_candidates {
	struct hlist_head *res[XFRM_POL_CAND_MAX];
};

static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;

static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
						__read_mostly;

static struct kmem_cache *xfrm_dst_cache __ro_after_init;

static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;

static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
static void xfrm_policy_queue_process(struct timer_list *t);

static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
						int dir);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
			   u32 if_id);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net,
			       u8 type, u16 family, u8 dir, u32 if_id);
static struct xfrm_policy *
xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
			bool excl);
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
					    struct xfrm_policy *policy);

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
				    struct xfrm_pol_inexact_bin *b,
				    const xfrm_address_t *saddr,
				    const xfrm_address_t *daddr);

static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
	return refcount_inc_not_zero(&policy->refcnt);
}

static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi4 *fl4 = &fl->u.ip4;

	return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
		addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl4->flowi4_proto == sel->proto || !sel->proto) &&
		(fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}

static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
	const struct flowi6 *fl6 = &fl->u.ip6;

	return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
		addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
		!((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
		!((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
		(fl6->flowi6_proto == sel->proto || !sel->proto) &&
		(fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}

bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
			 unsigned short family)
{
	switch (family) {
	case AF_INET:
		return __xfrm4_selector_match(sel, fl);
	case AF_INET6:
		return __xfrm6_selector_match(sel, fl);
	}
	return false;
}
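
/* A zero field in the selector acts as a wildcard in the checks above:
 * e.g. a selector with proto == 0 and dport_mask == 0 matches any
 * transport protocol and any destination port within its address
 * prefixes.
 */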

static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
	const struct xfrm_policy_afinfo *afinfo;

	if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
		return NULL;
	rcu_read_lock();
	afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
	if (unlikely(!afinfo))
		rcu_read_unlock();
	return afinfo;
}

/* Called with rcu_read_lock(). */
static const struct xfrm_if_cb *xfrm_if_get_cb(void)
{
	return rcu_dereference(xfrm_if_cb);
}

struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
				    const xfrm_address_t *saddr,
				    const xfrm_address_t *daddr,
				    int family, u32 mark)
{
	const struct xfrm_policy_afinfo *afinfo;
	struct dst_entry *dst;

	afinfo = xfrm_policy_get_afinfo(family);
	if (unlikely(afinfo == NULL))
		return ERR_PTR(-EAFNOSUPPORT);

	dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);

	rcu_read_unlock();

	return dst;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);

static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
						int tos, int oif,
						xfrm_address_t *prev_saddr,
						xfrm_address_t *prev_daddr,
						int family, u32 mark)
{
	struct net *net = xs_net(x);
	xfrm_address_t *saddr = &x->props.saddr;
	xfrm_address_t *daddr = &x->id.daddr;
	struct dst_entry *dst;

	if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
		saddr = x->coaddr;
		daddr = prev_daddr;
	}
	if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
		saddr = prev_saddr;
		daddr = x->coaddr;
	}

	dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark);

	if (!IS_ERR(dst)) {
		if (prev_saddr != saddr)
			memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
		if (prev_daddr != daddr)
			memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
	}

	return dst;
}

static inline unsigned long make_jiffies(long secs)
{
	if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
		return MAX_SCHEDULE_TIMEOUT-1;
	else
		return secs*HZ;
}
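
/* The clamp guards the secs*HZ multiplication against overflow: e.g.
 * on a 32-bit box with HZ == 100, any secs value above roughly 21.4
 * million would overflow a long, so such timeouts saturate at
 * MAX_SCHEDULE_TIMEOUT-1 instead.
 */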

static void xfrm_policy_timer(struct timer_list *t)
{
	struct xfrm_policy *xp = from_timer(xp, t, timer);
	time64_t now = ktime_get_real_seconds();
	time64_t next = TIME64_MAX;
	int warn = 0;
	int dir;

	read_lock(&xp->lock);

	if (unlikely(xp->walk.dead))
		goto out;

	dir = xfrm_policy_id2dir(xp->index);

	if (xp->lft.hard_add_expires_seconds) {
		time64_t tmo = xp->lft.hard_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.hard_use_expires_seconds) {
		time64_t tmo = xp->lft.hard_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0)
			goto expired;
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_add_expires_seconds) {
		time64_t tmo = xp->lft.soft_add_expires_seconds +
			xp->curlft.add_time - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}
	if (xp->lft.soft_use_expires_seconds) {
		time64_t tmo = xp->lft.soft_use_expires_seconds +
			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
		if (tmo <= 0) {
			warn = 1;
			tmo = XFRM_KM_TIMEOUT;
		}
		if (tmo < next)
			next = tmo;
	}

	if (warn)
		km_policy_expired(xp, dir, 0, 0);
	if (next != TIME64_MAX &&
	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
		xfrm_pol_hold(xp);

out:
	read_unlock(&xp->lock);
	xfrm_pol_put(xp);
	return;

expired:
	read_unlock(&xp->lock);
	if (!xfrm_policy_delete(xp, dir))
		km_policy_expired(xp, dir, 1, 0);
	xfrm_pol_put(xp);
}
/* Allocate an xfrm_policy. Not used here; it is supposed to be used by
 * pfkeyv2 SPD calls.
 */

struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
	struct xfrm_policy *policy;

	policy = kzalloc(sizeof(struct xfrm_policy), gfp);

	if (policy) {
		write_pnet(&policy->xp_net, net);
		INIT_LIST_HEAD(&policy->walk.all);
		INIT_HLIST_NODE(&policy->bydst_inexact_list);
		INIT_HLIST_NODE(&policy->bydst);
		INIT_HLIST_NODE(&policy->byidx);
		rwlock_init(&policy->lock);
		refcount_set(&policy->refcnt, 1);
		skb_queue_head_init(&policy->polq.hold_queue);
		timer_setup(&policy->timer, xfrm_policy_timer, 0);
		timer_setup(&policy->polq.hold_timer,
			    xfrm_policy_queue_process, 0);
	}
	return policy;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

static void xfrm_policy_destroy_rcu(struct rcu_head *head)
{
	struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);

	security_xfrm_policy_free(policy->security);
	kfree(policy);
}
/* Destroy xfrm_policy: descendant resources must have been released by this point. */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
	BUG_ON(!policy->walk.dead);

	if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
		BUG();

	call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

/* Rule must be locked. Release descendant resources, announce
 * entry dead. The rule must already be unlinked from all lists
 * by this point.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
	write_lock_bh(&policy->lock);
	policy->walk.dead = 1;
	write_unlock_bh(&policy->lock);

	atomic_inc(&policy->genid);

	if (del_timer(&policy->polq.hold_timer))
		xfrm_pol_put(policy);
	skb_queue_purge(&policy->polq.hold_queue);

	if (del_timer(&policy->timer))
		xfrm_pol_put(policy);

	xfrm_pol_put(policy);
}

static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

static inline unsigned int idx_hash(struct net *net, u32 index)
{
	return __idx_hash(index, net->xfrm.policy_idx_hmask);
}

/* calculate policy hash thresholds */
static void __get_hash_thresh(struct net *net,
			      unsigned short family, int dir,
			      u8 *dbits, u8 *sbits)
{
	switch (family) {
	case AF_INET:
		*dbits = net->xfrm.policy_bydst[dir].dbits4;
		*sbits = net->xfrm.policy_bydst[dir].sbits4;
		break;

	case AF_INET6:
		*dbits = net->xfrm.policy_bydst[dir].dbits6;
		*sbits = net->xfrm.policy_bydst[dir].sbits6;
		break;

	default:
		*dbits = 0;
		*sbits = 0;
	}
}

static struct hlist_head *policy_hash_bysel(struct net *net,
					    const struct xfrm_selector *sel,
					    unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __sel_hash(sel, family, hmask, dbits, sbits);

	if (hash == hmask + 1)
		return NULL;

	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

static struct hlist_head *policy_hash_direct(struct net *net,
					     const xfrm_address_t *daddr,
					     const xfrm_address_t *saddr,
					     unsigned short family, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int hash;
	u8 dbits;
	u8 sbits;

	__get_hash_thresh(net, family, dir, &dbits, &sbits);
	hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

static void xfrm_dst_hash_transfer(struct net *net,
				   struct hlist_head *list,
				   struct hlist_head *ndsttable,
				   unsigned int nhashmask,
				   int dir)
{
	struct hlist_node *tmp, *entry0 = NULL;
	struct xfrm_policy *pol;
	unsigned int h0 = 0;
	u8 dbits;
	u8 sbits;

redo:
	hlist_for_each_entry_safe(pol, tmp, list, bydst) {
		unsigned int h;

		__get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
				pol->family, nhashmask, dbits, sbits);
		if (!entry0) {
			hlist_del_rcu(&pol->bydst);
			hlist_add_head_rcu(&pol->bydst, ndsttable + h);
			h0 = h;
		} else {
			if (h != h0)
				continue;
			hlist_del_rcu(&pol->bydst);
			hlist_add_behind_rcu(&pol->bydst, entry0);
		}
		entry0 = &pol->bydst;
	}
	if (!hlist_empty(list)) {
		entry0 = NULL;
		goto redo;
	}
}

static void xfrm_idx_hash_transfer(struct hlist_head *list,
				   struct hlist_head *nidxtable,
				   unsigned int nhashmask)
{
	struct hlist_node *tmp;
	struct xfrm_policy *pol;

	hlist_for_each_entry_safe(pol, tmp, list, byidx) {
		unsigned int h;

		h = __idx_hash(pol->index, nhashmask);
		hlist_add_head(&pol->byidx, nidxtable+h);
	}
}

static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
	return ((old_hmask + 1) << 1) - 1;
}

static void xfrm_bydst_resize(struct net *net, int dir)
{
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
	struct hlist_head *odst;
	int i;

	if (!ndst)
		return;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
				lockdep_is_held(&net->xfrm.xfrm_policy_lock));

	for (i = hmask; i >= 0; i--)
		xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

	rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
	net->xfrm.policy_bydst[dir].hmask = nhashmask;

	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	synchronize_rcu();

	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

static void xfrm_byidx_resize(struct net *net, int total)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;
	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
	struct hlist_head *oidx = net->xfrm.policy_byidx;
	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
	int i;

	if (!nidx)
		return;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);

	for (i = hmask; i >= 0; i--)
		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);

	net->xfrm.policy_byidx = nidx;
	net->xfrm.policy_idx_hmask = nhashmask;

	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
}

static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
	unsigned int cnt = net->xfrm.policy_count[dir];
	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;

	if (total)
		*total += cnt;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    cnt > hmask)
		return 1;

	return 0;
}

static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
	unsigned int hmask = net->xfrm.policy_idx_hmask;

	if ((hmask + 1) < xfrm_policy_hashmax &&
	    total > hmask)
		return 1;

	return 0;
}

void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
	si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
	si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
	si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
	si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
	si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
	si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
	si->spdhcnt = net->xfrm.policy_idx_hmask;
	si->spdhmcnt = xfrm_policy_hashmax;
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

static DEFINE_MUTEX(hash_resize_mutex);
static void xfrm_hash_resize(struct work_struct *work)
{
	struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
	int dir, total;

	mutex_lock(&hash_resize_mutex);

	total = 0;
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		if (xfrm_bydst_should_resize(net, dir, &total))
			xfrm_bydst_resize(net, dir);
	}
	if (xfrm_byidx_should_resize(net, total))
		xfrm_byidx_resize(net, total);

	mutex_unlock(&hash_resize_mutex);
}
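
/* hash_resize_mutex also serializes this worker against
 * xfrm_hash_rebuild() further down, which takes the same mutex before
 * rebuilding the bydst and inexact tables.
 */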

/* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
	struct xfrm_pol_inexact_bin *bin, *prev;
	struct xfrm_pol_inexact_key k = {
		.family = pol->family,
		.type = pol->type,
		.dir = dir,
		.if_id = pol->if_id,
	};
	struct net *net = xp_net(pol);

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	write_pnet(&k.net, net);
	bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k,
				     xfrm_pol_inexact_params);
	if (bin)
		return bin;

	bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
	if (!bin)
		return NULL;

	bin->k = k;
	INIT_HLIST_HEAD(&bin->hhead);
	bin->root_d = RB_ROOT;
	bin->root_s = RB_ROOT;
	seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

	prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
						&bin->k, &bin->head,
						xfrm_pol_inexact_params);
	if (!prev) {
		list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
		return bin;
	}

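	/* Another insert won the race: rhashtable_lookup_get_insert_key()
	 * handed back the existing bin, so drop our copy and use that one.
	 */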
	kfree(bin);

	return IS_ERR(prev) ? NULL : prev;
}

static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
					       int family, u8 prefixlen)
{
	if (xfrm_addr_any(addr, family))
		return true;

	if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6)
		return true;

	if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4)
		return true;

	return false;
}

static bool
xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
{
	const xfrm_address_t *addr;
	bool saddr_any, daddr_any;
	u8 prefixlen;

	addr = &policy->selector.saddr;
	prefixlen = policy->selector.prefixlen_s;

	saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
						       policy->family,
						       prefixlen);
	addr = &policy->selector.daddr;
	prefixlen = policy->selector.prefixlen_d;
	daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
						       policy->family,
						       prefixlen);
	return saddr_any && daddr_any;
}

static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
				       const xfrm_address_t *addr, u8 prefixlen)
{
	node->addr = *addr;
	node->prefixlen = prefixlen;
}

static struct xfrm_pol_inexact_node *
xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
{
	struct xfrm_pol_inexact_node *node;

	node = kzalloc(sizeof(*node), GFP_ATOMIC);
	if (node)
		xfrm_pol_inexact_node_init(node, addr, prefixlen);

	return node;
}

static int xfrm_policy_addr_delta(const xfrm_address_t *a,
				  const xfrm_address_t *b,
				  u8 prefixlen, u16 family)
{
	u32 ma, mb, mask;
	unsigned int pdw, pbi;
	int delta = 0;

	switch (family) {
	case AF_INET:
		if (prefixlen == 0)
			return 0;
		mask = ~0U << (32 - prefixlen);
		ma = ntohl(a->a4) & mask;
		mb = ntohl(b->a4) & mask;
		if (ma < mb)
			delta = -1;
		else if (ma > mb)
			delta = 1;
		break;
	case AF_INET6:
		pdw = prefixlen >> 5;
		pbi = prefixlen & 0x1f;

		if (pdw) {
			delta = memcmp(a->a6, b->a6, pdw << 2);
			if (delta)
				return delta;
		}
		if (pbi) {
			mask = ~0U << (32 - pbi);
			ma = ntohl(a->a6[pdw]) & mask;
			mb = ntohl(b->a6[pdw]) & mask;
			if (ma < mb)
				delta = -1;
			else if (ma > mb)
				delta = 1;
		}
		break;
	default:
		break;
	}

	return delta;
}
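
/* For instance, with family == AF_INET and prefixlen == 24, the
 * addresses 10.0.1.5 and 10.0.1.200 compare equal (delta == 0), while
 * 10.0.1.5 sorts before 10.0.2.5 (delta < 0): only the bits covered by
 * the prefix take part in the comparison.
 */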

static void xfrm_policy_inexact_list_reinsert(struct net *net,
					      struct xfrm_pol_inexact_node *n,
					      u16 family)
{
	unsigned int matched_s, matched_d;
	struct xfrm_policy *policy, *p;

	matched_s = 0;
	matched_d = 0;

	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
		struct hlist_node *newpos = NULL;
		bool matches_s, matches_d;

		if (!policy->bydst_reinsert)
			continue;

		WARN_ON_ONCE(policy->family != family);

		policy->bydst_reinsert = false;
		hlist_for_each_entry(p, &n->hhead, bydst) {
			if (policy->priority > p->priority)
				newpos = &p->bydst;
			else if (policy->priority == p->priority &&
				 policy->pos > p->pos)
				newpos = &p->bydst;
			else
				break;
		}

		if (newpos)
			hlist_add_behind_rcu(&policy->bydst, newpos);
		else
			hlist_add_head_rcu(&policy->bydst, &n->hhead);

		/* paranoia checks follow.
		 * Check that the reinserted policy matches at least
		 * saddr or daddr for current node prefix.
		 *
		 * Matching both is fine, matching saddr in one policy
		 * (but not daddr) and then matching only daddr in another
		 * is a bug.
		 */
		matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
						   &n->addr,
						   n->prefixlen,
						   family) == 0;
		matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
						   &n->addr,
						   n->prefixlen,
						   family) == 0;
		if (matches_s && matches_d)
			continue;

		WARN_ON_ONCE(!matches_s && !matches_d);
		if (matches_s)
			matched_s++;
		if (matches_d)
			matched_d++;
		WARN_ON_ONCE(matched_s && matched_d);
	}
}

static void xfrm_policy_inexact_node_reinsert(struct net *net,
					      struct xfrm_pol_inexact_node *n,
					      struct rb_root *new,
					      u16 family)
{
	struct xfrm_pol_inexact_node *node;
	struct rb_node **p, *parent;

	/* we should not have another subtree here */
	WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
	parent = NULL;
	p = &new->rb_node;
	while (*p) {
		u8 prefixlen;
		int delta;

		parent = *p;
		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

		prefixlen = min(node->prefixlen, n->prefixlen);

		delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
					       prefixlen, family);
		if (delta < 0) {
			p = &parent->rb_left;
		} else if (delta > 0) {
			p = &parent->rb_right;
		} else {
			bool same_prefixlen = node->prefixlen == n->prefixlen;
			struct xfrm_policy *tmp;

			hlist_for_each_entry(tmp, &n->hhead, bydst) {
				tmp->bydst_reinsert = true;
				hlist_del_rcu(&tmp->bydst);
			}

			node->prefixlen = prefixlen;

			xfrm_policy_inexact_list_reinsert(net, node, family);

			if (same_prefixlen) {
				kfree_rcu(n, rcu);
				return;
			}

			rb_erase(*p, new);
			kfree_rcu(n, rcu);
			n = node;
			goto restart;
		}
	}

	rb_link_node_rcu(&n->node, parent, p);
	rb_insert_color(&n->node, new);
}

/* merge nodes v and n */
static void xfrm_policy_inexact_node_merge(struct net *net,
					   struct xfrm_pol_inexact_node *v,
					   struct xfrm_pol_inexact_node *n,
					   u16 family)
{
	struct xfrm_pol_inexact_node *node;
	struct xfrm_policy *tmp;
	struct rb_node *rnode;

	/* To-be-merged node v has a subtree.
	 *
	 * Dismantle it and insert its nodes to n->root.
	 */
	while ((rnode = rb_first(&v->root)) != NULL) {
		node = rb_entry(rnode, struct xfrm_pol_inexact_node, node);
		rb_erase(&node->node, &v->root);
		xfrm_policy_inexact_node_reinsert(net, node, &n->root,
						  family);
	}

	hlist_for_each_entry(tmp, &v->hhead, bydst) {
		tmp->bydst_reinsert = true;
		hlist_del_rcu(&tmp->bydst);
	}

	xfrm_policy_inexact_list_reinsert(net, n, family);
}

static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
				struct rb_root *root,
				xfrm_address_t *addr,
				u16 family, u8 prefixlen, u8 dir)
{
	struct xfrm_pol_inexact_node *cached = NULL;
	struct rb_node **p, *parent = NULL;
	struct xfrm_pol_inexact_node *node;

	p = &root->rb_node;
	while (*p) {
		int delta;

		parent = *p;
		node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

		delta = xfrm_policy_addr_delta(addr, &node->addr,
					       node->prefixlen,
					       family);
		if (delta == 0 && prefixlen >= node->prefixlen) {
			WARN_ON_ONCE(cached); /* ipsec policies got lost */
			return node;
		}

		if (delta < 0)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;

		if (prefixlen < node->prefixlen) {
			delta = xfrm_policy_addr_delta(addr, &node->addr,
						       prefixlen,
						       family);
			if (delta)
				continue;

			/* This node is a subnet of the new prefix. It needs
			 * to be removed and re-inserted with the smaller
			 * prefix and all nodes that are now also covered
			 * by the reduced prefixlen.
			 */
			rb_erase(&node->node, root);

			if (!cached) {
				xfrm_pol_inexact_node_init(node, addr,
							   prefixlen);
				cached = node;
			} else {
				/* This node also falls within the new
				 * prefixlen. Merge the to-be-reinserted
				 * node and this one.
				 */
				xfrm_policy_inexact_node_merge(net, node,
							       cached, family);
				kfree_rcu(node, rcu);
			}

			/* restart */
			p = &root->rb_node;
			parent = NULL;
		}
	}

	node = cached;
	if (!node) {
		node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
		if (!node)
			return NULL;
	}

	rb_link_node_rcu(&node->node, parent, p);
	rb_insert_color(&node->node, root);

	return node;
}

static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
{
	struct xfrm_pol_inexact_node *node;
	struct rb_node *rn = rb_first(r);

	while (rn) {
		node = rb_entry(rn, struct xfrm_pol_inexact_node, node);

		xfrm_policy_inexact_gc_tree(&node->root, rm);
		rn = rb_next(rn);

		if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) {
			WARN_ON_ONCE(rm);
			continue;
		}

		rb_erase(&node->node, r);
		kfree_rcu(node, rcu);
	}
}

static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
	write_seqcount_begin(&b->count);
	xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
	xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
	write_seqcount_end(&b->count);

	if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) ||
	    !hlist_empty(&b->hhead)) {
		WARN_ON_ONCE(net_exit);
		return;
	}

	if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
				   xfrm_pol_inexact_params) == 0) {
		list_del(&b->inexact_bins);
		kfree_rcu(b, rcu);
	}
}

static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
	struct net *net = read_pnet(&b->k.net);

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	__xfrm_policy_inexact_prune_bin(b, false);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

static void __xfrm_policy_inexact_flush(struct net *net)
{
	struct xfrm_pol_inexact_bin *bin, *t;

	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
		__xfrm_policy_inexact_prune_bin(bin, false);
}

static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
				struct xfrm_policy *policy, u8 dir)
{
	struct xfrm_pol_inexact_node *n;
	struct net *net;

	net = xp_net(policy);
	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	if (xfrm_policy_inexact_insert_use_any_list(policy))
		return &bin->hhead;

	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr,
					       policy->family,
					       policy->selector.prefixlen_d)) {
		write_seqcount_begin(&bin->count);
		n = xfrm_policy_inexact_insert_node(net,
						    &bin->root_s,
						    &policy->selector.saddr,
						    policy->family,
						    policy->selector.prefixlen_s,
						    dir);
		write_seqcount_end(&bin->count);
		if (!n)
			return NULL;

		return &n->hhead;
	}

	/* daddr is fixed */
	write_seqcount_begin(&bin->count);
	n = xfrm_policy_inexact_insert_node(net,
					    &bin->root_d,
					    &policy->selector.daddr,
					    policy->family,
					    policy->selector.prefixlen_d, dir);
	write_seqcount_end(&bin->count);
	if (!n)
		return NULL;

	/* saddr is wildcard */
	if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr,
					       policy->family,
					       policy->selector.prefixlen_s))
		return &n->hhead;

	write_seqcount_begin(&bin->count);
	n = xfrm_policy_inexact_insert_node(net,
					    &n->root,
					    &policy->selector.saddr,
					    policy->family,
					    policy->selector.prefixlen_s, dir);
	write_seqcount_end(&bin->count);
	if (!n)
		return NULL;

	return &n->hhead;
}

static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
	struct xfrm_pol_inexact_bin *bin;
	struct xfrm_policy *delpol;
	struct hlist_head *chain;
	struct net *net;

	bin = xfrm_policy_inexact_alloc_bin(policy, dir);
	if (!bin)
		return ERR_PTR(-ENOMEM);

	net = xp_net(policy);
	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

	chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
	if (!chain) {
		__xfrm_policy_inexact_prune_bin(bin, false);
		return ERR_PTR(-ENOMEM);
	}

	delpol = xfrm_policy_insert_list(chain, policy, excl);
	if (delpol && excl) {
		__xfrm_policy_inexact_prune_bin(bin, false);
		return ERR_PTR(-EEXIST);
	}

	chain = &net->xfrm.policy_inexact[dir];
	xfrm_policy_insert_inexact_list(chain, policy);

	if (delpol)
		__xfrm_policy_inexact_prune_bin(bin, false);

	return delpol;
}

static void xfrm_hash_rebuild(struct work_struct *work)
{
	struct net *net = container_of(work, struct net,
				       xfrm.policy_hthresh.work);
	unsigned int hmask;
	struct xfrm_policy *pol;
	struct xfrm_policy *policy;
	struct hlist_head *chain;
	struct hlist_head *odst;
	struct hlist_node *newpos;
	int i;
	int dir;
	unsigned seq;
	u8 lbits4, rbits4, lbits6, rbits6;

	mutex_lock(&hash_resize_mutex);

	/* read selector prefixlen thresholds */
	do {
		seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

		lbits4 = net->xfrm.policy_hthresh.lbits4;
		rbits4 = net->xfrm.policy_hthresh.rbits4;
		lbits6 = net->xfrm.policy_hthresh.lbits6;
		rbits6 = net->xfrm.policy_hthresh.rbits6;
	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

	/* Make sure that we can insert the indirect policies again before
	 * we start taking destructive action.
	 */
	list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
		struct xfrm_pol_inexact_bin *bin;
		u8 dbits, sbits;

		dir = xfrm_policy_id2dir(policy->index);
		if (policy->walk.dead || dir >= XFRM_POLICY_MAX)
			continue;

		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
			if (policy->family == AF_INET) {
				dbits = rbits4;
				sbits = lbits4;
			} else {
				dbits = rbits6;
				sbits = lbits6;
			}
		} else {
			if (policy->family == AF_INET) {
				dbits = lbits4;
				sbits = rbits4;
			} else {
				dbits = lbits6;
				sbits = rbits6;
			}
		}

		if (policy->selector.prefixlen_d < dbits ||
		    policy->selector.prefixlen_s < sbits)
			continue;

		bin = xfrm_policy_inexact_alloc_bin(policy, dir);
		if (!bin)
			goto out_unlock;

		if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
			goto out_unlock;
	}

	/* reset the bydst and inexact table in all directions */
	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
		struct hlist_node *n;

		hlist_for_each_entry_safe(policy, n,
					  &net->xfrm.policy_inexact[dir],
					  bydst_inexact_list) {
			hlist_del_rcu(&policy->bydst);
			hlist_del_init(&policy->bydst_inexact_list);
		}

		hmask = net->xfrm.policy_bydst[dir].hmask;
		odst = net->xfrm.policy_bydst[dir].table;
		for (i = hmask; i >= 0; i--) {
			hlist_for_each_entry_safe(policy, n, odst + i, bydst)
				hlist_del_rcu(&policy->bydst);
		}
		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
			/* dir out => dst = remote, src = local */
			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
			net->xfrm.policy_bydst[dir].sbits4 = lbits4;
			net->xfrm.policy_bydst[dir].dbits6 = rbits6;
			net->xfrm.policy_bydst[dir].sbits6 = lbits6;
		} else {
			/* dir in/fwd => dst = local, src = remote */
			net->xfrm.policy_bydst[dir].dbits4 = lbits4;
			net->xfrm.policy_bydst[dir].sbits4 = rbits4;
			net->xfrm.policy_bydst[dir].dbits6 = lbits6;
			net->xfrm.policy_bydst[dir].sbits6 = rbits6;
		}
	}

	/* re-insert all policies by order of creation */
	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
		if (policy->walk.dead)
			continue;
		dir = xfrm_policy_id2dir(policy->index);
		if (dir >= XFRM_POLICY_MAX) {
			/* skip socket policies */
			continue;
		}
		newpos = NULL;
		chain = policy_hash_bysel(net, &policy->selector,
					  policy->family, dir);

		if (!chain) {
			void *p = xfrm_policy_inexact_insert(policy, dir, 0);

			WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
			continue;
		}

		hlist_for_each_entry(pol, chain, bydst) {
			if (policy->priority >= pol->priority)
				newpos = &pol->bydst;
			else
				break;
		}
		if (newpos)
			hlist_add_behind_rcu(&policy->bydst, newpos);
		else
			hlist_add_head_rcu(&policy->bydst, chain);
	}

out_unlock:
	__xfrm_policy_inexact_flush(net);
	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

	mutex_unlock(&hash_resize_mutex);
}

void xfrm_policy_hash_rebuild(struct net *net)
{
	schedule_work(&net->xfrm.policy_hthresh.work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate a new index... KAME seems to generate them ordered by cost,
 * at the price of absolute unpredictability of rule ordering. This will
 * not pass. */
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
	static u32 idx_generator;

	for (;;) {
		struct hlist_head *list;
		struct xfrm_policy *p;
		u32 idx;
		int found;

		if (!index) {
			idx = (idx_generator | dir);
			idx_generator += 8;
		} else {
			idx = index;
			index = 0;
		}

		if (idx == 0)
			idx = 8;
		list = net->xfrm.policy_byidx + idx_hash(net, idx);
		found = 0;
		hlist_for_each_entry(p, list, byidx) {
			if (p->index == idx) {
				found = 1;
				break;
			}
		}
		if (!found)
			return idx;
	}
}
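
/* The direction lives in the low three bits of the index (idx_generator
 * advances in steps of 8), which is what lets xfrm_policy_id2dir()
 * recover it later; e.g. an XFRM_POLICY_OUT policy receives indices
 * 1, 9, 17, ...
 */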

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
	u32 *p1 = (u32 *) s1;
	u32 *p2 = (u32 *) s2;
	int len = sizeof(struct xfrm_selector) / sizeof(u32);
	int i;

	for (i = 0; i < len; i++) {
		if (p1[i] != p2[i])
			return 1;
	}

	return 0;
}
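
/* Note: unlike memcmp(), this word-wise compare only reports equality
 * (0) or inequality (1); it does not define an ordering.
 */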

static void xfrm_policy_requeue(struct xfrm_policy *old,
				struct xfrm_policy *new)
{
	struct xfrm_policy_queue *pq = &old->polq;
	struct sk_buff_head list;

	if (skb_queue_empty(&pq->hold_queue))
		return;

	__skb_queue_head_init(&list);

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice_init(&pq->hold_queue, &list);
	if (del_timer(&pq->hold_timer))
		xfrm_pol_put(old);
	spin_unlock_bh(&pq->hold_queue.lock);

	pq = &new->polq;

	spin_lock_bh(&pq->hold_queue.lock);
	skb_queue_splice(&list, &pq->hold_queue);
	pq->timeout = XFRM_QUEUE_TMO_MIN;
	if (!mod_timer(&pq->hold_timer, jiffies))
		xfrm_pol_hold(new);
	spin_unlock_bh(&pq->hold_queue.lock);
}

static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
					  struct xfrm_policy *pol)
{
	return mark->v == pol->mark.v && mark->m == pol->mark.m;
}

static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
	const struct xfrm_pol_inexact_key *k = data;
	u32 a = k->type << 24 | k->dir << 16 | k->family;

	return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)),
			    seed);
}

static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
	const struct xfrm_pol_inexact_bin *b = data;

	return xfrm_pol_bin_key(&b->k, 0, seed);
}

static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
			    const void *ptr)
{
	const struct xfrm_pol_inexact_key *key = arg->key;
	const struct xfrm_pol_inexact_bin *b = ptr;
	int ret;

	if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net)))
		return -1;

	ret = b->k.dir ^ key->dir;
	if (ret)
		return ret;

	ret = b->k.type ^ key->type;
	if (ret)
		return ret;

	ret = b->k.family ^ key->family;
	if (ret)
		return ret;

	return b->k.if_id ^ key->if_id;
}

static const struct rhashtable_params xfrm_pol_inexact_params = {
	.head_offset		= offsetof(struct xfrm_pol_inexact_bin, head),
	.hashfn			= xfrm_pol_bin_key,
	.obj_hashfn		= xfrm_pol_bin_obj,
	.obj_cmpfn		= xfrm_pol_bin_cmp,
	.automatic_shrinking	= true,
};

static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
					    struct xfrm_policy *policy)
{
	struct xfrm_policy *pol, *delpol = NULL;
	struct hlist_node *newpos = NULL;
	int i = 0;

	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
		if (pol->type == policy->type &&
		    pol->if_id == policy->if_id &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_policy_mark_match(&policy->mark, pol) &&
		    xfrm_sec_ctx_match(pol->security, policy->security) &&
		    !WARN_ON(delpol)) {
			delpol = pol;
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			newpos = &pol->bydst_inexact_list;
			continue;
		}
		if (delpol)
			break;
	}

	if (newpos)
		hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
	else
		hlist_add_head_rcu(&policy->bydst_inexact_list, chain);

	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
		pol->pos = i;
		i++;
	}
}
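
/* The pos values assigned above give each policy on the chain a stable
 * insertion rank; lookups use it as the tie-breaker when two candidates
 * share the same priority.
 */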

static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
						   struct xfrm_policy *policy,
						   bool excl)
{
	struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

	hlist_for_each_entry(pol, chain, bydst) {
		if (pol->type == policy->type &&
		    pol->if_id == policy->if_id &&
		    !selector_cmp(&pol->selector, &policy->selector) &&
		    xfrm_policy_mark_match(&policy->mark, pol) &&
		    xfrm_sec_ctx_match(pol->security, policy->security) &&
		    !WARN_ON(delpol)) {
			if (excl)
				return ERR_PTR(-EEXIST);
			delpol = pol;
			if (policy->priority > pol->priority)
				continue;
		} else if (policy->priority >= pol->priority) {
			newpos = pol;
			continue;
		}
		if (delpol)
			break;
	}

	if (newpos)
		hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
	else
		hlist_add_head_rcu(&policy->bydst, chain);

	return delpol;
}

int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
	struct net *net = xp_net(policy);
	struct xfrm_policy *delpol;
	struct hlist_head *chain;

	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
	if (chain)
		delpol = xfrm_policy_insert_list(chain, policy, excl);
	else
		delpol = xfrm_policy_inexact_insert(policy, dir, excl);

	if (IS_ERR(delpol)) {
		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
		return PTR_ERR(delpol);
	}

	__xfrm_policy_link(policy, dir);

	/* After the previous checks, family can only be AF_INET or AF_INET6 */
   1593	if (policy->family == AF_INET)
   1594		rt_genid_bump_ipv4(net);
   1595	else
   1596		rt_genid_bump_ipv6(net);
   1597
   1598	if (delpol) {
   1599		xfrm_policy_requeue(delpol, policy);
   1600		__xfrm_policy_unlink(delpol, dir);
   1601	}
   1602	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
   1603	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
   1604	policy->curlft.add_time = ktime_get_real_seconds();
   1605	policy->curlft.use_time = 0;
   1606	if (!mod_timer(&policy->timer, jiffies + HZ))
   1607		xfrm_pol_hold(policy);
   1608	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1609
   1610	if (delpol)
   1611		xfrm_policy_kill(delpol);
   1612	else if (xfrm_bydst_should_resize(net, dir, NULL))
   1613		schedule_work(&net->xfrm.policy_hash_work);
   1614
   1615	return 0;
   1616}
   1617EXPORT_SYMBOL(xfrm_policy_insert);
   1618
   1619static struct xfrm_policy *
   1620__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
   1621			u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
   1622			struct xfrm_sec_ctx *ctx)
   1623{
   1624	struct xfrm_policy *pol;
   1625
   1626	if (!chain)
   1627		return NULL;
   1628
   1629	hlist_for_each_entry(pol, chain, bydst) {
   1630		if (pol->type == type &&
   1631		    pol->if_id == if_id &&
   1632		    xfrm_policy_mark_match(mark, pol) &&
   1633		    !selector_cmp(sel, &pol->selector) &&
   1634		    xfrm_sec_ctx_match(ctx, pol->security))
   1635			return pol;
   1636	}
   1637
   1638	return NULL;
   1639}
   1640
   1641struct xfrm_policy *
   1642xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
   1643		      u8 type, int dir, struct xfrm_selector *sel,
   1644		      struct xfrm_sec_ctx *ctx, int delete, int *err)
   1645{
   1646	struct xfrm_pol_inexact_bin *bin = NULL;
   1647	struct xfrm_policy *pol, *ret = NULL;
   1648	struct hlist_head *chain;
   1649
   1650	*err = 0;
   1651	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1652	chain = policy_hash_bysel(net, sel, sel->family, dir);
   1653	if (!chain) {
   1654		struct xfrm_pol_inexact_candidates cand;
   1655		int i;
   1656
   1657		bin = xfrm_policy_inexact_lookup(net, type,
   1658						 sel->family, dir, if_id);
   1659		if (!bin) {
   1660			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1661			return NULL;
   1662		}
   1663
   1664		if (!xfrm_policy_find_inexact_candidates(&cand, bin,
   1665							 &sel->saddr,
   1666							 &sel->daddr)) {
   1667			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1668			return NULL;
   1669		}
   1670
   1671		pol = NULL;
   1672		for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
   1673			struct xfrm_policy *tmp;
   1674
   1675			tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
   1676						      if_id, type, dir,
   1677						      sel, ctx);
   1678			if (!tmp)
   1679				continue;
   1680
   1681			if (!pol || tmp->pos < pol->pos)
   1682				pol = tmp;
   1683		}
   1684	} else {
   1685		pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
   1686					      sel, ctx);
   1687	}
   1688
   1689	if (pol) {
   1690		xfrm_pol_hold(pol);
   1691		if (delete) {
   1692			*err = security_xfrm_policy_delete(pol->security);
   1693			if (*err) {
   1694				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1695				return pol;
   1696			}
   1697			__xfrm_policy_unlink(pol, dir);
   1698		}
   1699		ret = pol;
   1700	}
   1701	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1702
   1703	if (ret && delete)
   1704		xfrm_policy_kill(ret);
   1705	if (bin && delete)
   1706		xfrm_policy_inexact_prune_bin(bin);
   1707	return ret;
   1708}
   1709EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
   1710
   1711struct xfrm_policy *
   1712xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
   1713		 u8 type, int dir, u32 id, int delete, int *err)
   1714{
   1715	struct xfrm_policy *pol, *ret;
   1716	struct hlist_head *chain;
   1717
   1718	*err = -ENOENT;
   1719	if (xfrm_policy_id2dir(id) != dir)
   1720		return NULL;
   1721
   1722	*err = 0;
   1723	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1724	chain = net->xfrm.policy_byidx + idx_hash(net, id);
   1725	ret = NULL;
   1726	hlist_for_each_entry(pol, chain, byidx) {
   1727		if (pol->type == type && pol->index == id &&
   1728		    pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
   1729			xfrm_pol_hold(pol);
   1730			if (delete) {
   1731				*err = security_xfrm_policy_delete(
   1732								pol->security);
   1733				if (*err) {
   1734					spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1735					return pol;
   1736				}
   1737				__xfrm_policy_unlink(pol, dir);
   1738			}
   1739			ret = pol;
   1740			break;
   1741		}
   1742	}
   1743	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1744
   1745	if (ret && delete)
   1746		xfrm_policy_kill(ret);
   1747	return ret;
   1748}
   1749EXPORT_SYMBOL(xfrm_policy_byid);
   1750
   1751#ifdef CONFIG_SECURITY_NETWORK_XFRM
   1752static inline int
   1753xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
   1754{
   1755	struct xfrm_policy *pol;
   1756	int err = 0;
   1757
   1758	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
   1759		if (pol->walk.dead ||
   1760		    xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX ||
   1761		    pol->type != type)
   1762			continue;
   1763
   1764		err = security_xfrm_policy_delete(pol->security);
   1765		if (err) {
   1766			xfrm_audit_policy_delete(pol, 0, task_valid);
   1767			return err;
   1768		}
   1769	}
   1770	return err;
   1771}
   1772#else
   1773static inline int
   1774xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
   1775{
   1776	return 0;
   1777}
   1778#endif
   1779
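/* Delete all policies of the given type.  The walk restarts from the
 * list head after every deletion, because the policy lock is dropped
 * while each policy is killed and audited.
 */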
   1780int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
   1781{
   1782	int dir, err = 0, cnt = 0;
   1783	struct xfrm_policy *pol;
   1784
   1785	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1786
   1787	err = xfrm_policy_flush_secctx_check(net, type, task_valid);
   1788	if (err)
   1789		goto out;
   1790
   1791again:
   1792	list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
   1793		dir = xfrm_policy_id2dir(pol->index);
   1794		if (pol->walk.dead ||
   1795		    dir >= XFRM_POLICY_MAX ||
   1796		    pol->type != type)
   1797			continue;
   1798
   1799		__xfrm_policy_unlink(pol, dir);
   1800		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1801		cnt++;
   1802		xfrm_audit_policy_delete(pol, 1, task_valid);
   1803		xfrm_policy_kill(pol);
   1804		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1805		goto again;
   1806	}
   1807	if (cnt)
   1808		__xfrm_policy_inexact_flush(net);
   1809	else
   1810		err = -ESRCH;
   1811out:
   1812	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1813	return err;
   1814}
   1815EXPORT_SYMBOL(xfrm_policy_flush);
   1816
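/* Walk all policies, calling @func on each one.  The walk is resumable:
 * if @func returns an error, the walker entry stays in the list so a
 * later call continues from the same position.
 */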
   1817int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
   1818		     int (*func)(struct xfrm_policy *, int, int, void*),
   1819		     void *data)
   1820{
   1821	struct xfrm_policy *pol;
   1822	struct xfrm_policy_walk_entry *x;
   1823	int error = 0;
   1824
   1825	if (walk->type >= XFRM_POLICY_TYPE_MAX &&
   1826	    walk->type != XFRM_POLICY_TYPE_ANY)
   1827		return -EINVAL;
   1828
   1829	if (list_empty(&walk->walk.all) && walk->seq != 0)
   1830		return 0;
   1831
   1832	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1833	if (list_empty(&walk->walk.all))
   1834		x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
   1835	else
   1836		x = list_first_entry(&walk->walk.all,
   1837				     struct xfrm_policy_walk_entry, all);
   1838
   1839	list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
   1840		if (x->dead)
   1841			continue;
   1842		pol = container_of(x, struct xfrm_policy, walk);
   1843		if (walk->type != XFRM_POLICY_TYPE_ANY &&
   1844		    walk->type != pol->type)
   1845			continue;
   1846		error = func(pol, xfrm_policy_id2dir(pol->index),
   1847			     walk->seq, data);
   1848		if (error) {
   1849			list_move_tail(&walk->walk.all, &x->all);
   1850			goto out;
   1851		}
   1852		walk->seq++;
   1853	}
   1854	if (walk->seq == 0) {
   1855		error = -ENOENT;
   1856		goto out;
   1857	}
   1858	list_del_init(&walk->walk.all);
   1859out:
   1860	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1861	return error;
   1862}
   1863EXPORT_SYMBOL(xfrm_policy_walk);
   1864
   1865void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
   1866{
   1867	INIT_LIST_HEAD(&walk->walk.all);
   1868	walk->walk.dead = 1;
   1869	walk->type = type;
   1870	walk->seq = 0;
   1871}
   1872EXPORT_SYMBOL(xfrm_policy_walk_init);
   1873
   1874void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
   1875{
   1876	if (list_empty(&walk->walk.all))
   1877		return;
   1878
    1879	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   1880	list_del(&walk->walk.all);
   1881	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   1882}
   1883EXPORT_SYMBOL(xfrm_policy_walk_done);
   1884
   1885/*
   1886 * Find policy to apply to this flow.
   1887 *
   1888 * Returns 0 if policy found, else an -errno.
   1889 */
   1890static int xfrm_policy_match(const struct xfrm_policy *pol,
   1891			     const struct flowi *fl,
   1892			     u8 type, u16 family, int dir, u32 if_id)
   1893{
   1894	const struct xfrm_selector *sel = &pol->selector;
   1895	int ret = -ESRCH;
   1896	bool match;
   1897
   1898	if (pol->family != family ||
   1899	    pol->if_id != if_id ||
   1900	    (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
   1901	    pol->type != type)
   1902		return ret;
   1903
   1904	match = xfrm_selector_match(sel, fl, family);
   1905	if (match)
   1906		ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
   1907	return ret;
   1908}
   1909
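/* Find the tree node whose prefix covers @addr.  Runs locklessly under
 * RCU; the seqcount read/retry pair catches concurrent rebalancing of
 * the tree.
 */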
   1910static struct xfrm_pol_inexact_node *
   1911xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
   1912				seqcount_spinlock_t *count,
   1913				const xfrm_address_t *addr, u16 family)
   1914{
   1915	const struct rb_node *parent;
   1916	int seq;
   1917
   1918again:
   1919	seq = read_seqcount_begin(count);
   1920
   1921	parent = rcu_dereference_raw(r->rb_node);
   1922	while (parent) {
   1923		struct xfrm_pol_inexact_node *node;
   1924		int delta;
   1925
   1926		node = rb_entry(parent, struct xfrm_pol_inexact_node, node);
   1927
   1928		delta = xfrm_policy_addr_delta(addr, &node->addr,
   1929					       node->prefixlen, family);
   1930		if (delta < 0) {
   1931			parent = rcu_dereference_raw(parent->rb_left);
   1932			continue;
   1933		} else if (delta > 0) {
   1934			parent = rcu_dereference_raw(parent->rb_right);
   1935			continue;
   1936		}
   1937
   1938		return node;
   1939	}
   1940
   1941	if (read_seqcount_retry(count, seq))
   1942		goto again;
   1943
   1944	return NULL;
   1945}
   1946
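/* Collect the four candidate chains for an inexact lookup: any:any,
 * daddr-only, saddr-only and saddr:daddr (see the search tree diagram
 * at the top of this file).
 */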
   1947static bool
   1948xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
   1949				    struct xfrm_pol_inexact_bin *b,
   1950				    const xfrm_address_t *saddr,
   1951				    const xfrm_address_t *daddr)
   1952{
   1953	struct xfrm_pol_inexact_node *n;
   1954	u16 family;
   1955
   1956	if (!b)
   1957		return false;
   1958
   1959	family = b->k.family;
   1960	memset(cand, 0, sizeof(*cand));
   1961	cand->res[XFRM_POL_CAND_ANY] = &b->hhead;
   1962
   1963	n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
   1964					    family);
   1965	if (n) {
   1966		cand->res[XFRM_POL_CAND_DADDR] = &n->hhead;
   1967		n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr,
   1968						    family);
   1969		if (n)
   1970			cand->res[XFRM_POL_CAND_BOTH] = &n->hhead;
   1971	}
   1972
   1973	n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
   1974					    family);
   1975	if (n)
   1976		cand->res[XFRM_POL_CAND_SADDR] = &n->hhead;
   1977
   1978	return true;
   1979}
   1980
   1981static struct xfrm_pol_inexact_bin *
   1982xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
   1983			       u8 dir, u32 if_id)
   1984{
   1985	struct xfrm_pol_inexact_key k = {
   1986		.family = family,
   1987		.type = type,
   1988		.dir = dir,
   1989		.if_id = if_id,
   1990	};
   1991
   1992	write_pnet(&k.net, net);
   1993
   1994	return rhashtable_lookup(&xfrm_policy_inexact_table, &k,
   1995				 xfrm_pol_inexact_params);
   1996}
   1997
   1998static struct xfrm_pol_inexact_bin *
   1999xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
   2000			   u8 dir, u32 if_id)
   2001{
   2002	struct xfrm_pol_inexact_bin *bin;
   2003
   2004	lockdep_assert_held(&net->xfrm.xfrm_policy_lock);
   2005
   2006	rcu_read_lock();
   2007	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
   2008	rcu_read_unlock();
   2009
   2010	return bin;
   2011}
   2012
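/* Scan a single candidate chain for the best match.  The chain is
 * sorted by priority, so the scan stops as soon as an entry's priority
 * exceeds that of the currently preferred policy.
 */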
   2013static struct xfrm_policy *
   2014__xfrm_policy_eval_candidates(struct hlist_head *chain,
   2015			      struct xfrm_policy *prefer,
   2016			      const struct flowi *fl,
   2017			      u8 type, u16 family, int dir, u32 if_id)
   2018{
   2019	u32 priority = prefer ? prefer->priority : ~0u;
   2020	struct xfrm_policy *pol;
   2021
   2022	if (!chain)
   2023		return NULL;
   2024
   2025	hlist_for_each_entry_rcu(pol, chain, bydst) {
   2026		int err;
   2027
   2028		if (pol->priority > priority)
   2029			break;
   2030
   2031		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
   2032		if (err) {
   2033			if (err != -ESRCH)
   2034				return ERR_PTR(err);
   2035
   2036			continue;
   2037		}
   2038
   2039		if (prefer) {
    2040			/* Matches; is it older (lower pos) than *prefer? */
   2041			if (pol->priority == priority &&
   2042			    prefer->pos < pol->pos)
   2043				return prefer;
   2044		}
   2045
   2046		return pol;
   2047	}
   2048
   2049	return NULL;
   2050}
   2051
   2052static struct xfrm_policy *
   2053xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
   2054			    struct xfrm_policy *prefer,
   2055			    const struct flowi *fl,
   2056			    u8 type, u16 family, int dir, u32 if_id)
   2057{
   2058	struct xfrm_policy *tmp;
   2059	int i;
   2060
   2061	for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
   2062		tmp = __xfrm_policy_eval_candidates(cand->res[i],
   2063						    prefer,
   2064						    fl, type, family, dir,
   2065						    if_id);
   2066		if (!tmp)
   2067			continue;
   2068
   2069		if (IS_ERR(tmp))
   2070			return tmp;
   2071		prefer = tmp;
   2072	}
   2073
   2074	return prefer;
   2075}
   2076
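/* Core policy lookup: check the exact (hashed) policies first, then
 * merge in the inexact candidates, retrying the whole lookup if the
 * hash tables were resized underneath us.
 */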
   2077static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
   2078						     const struct flowi *fl,
   2079						     u16 family, u8 dir,
   2080						     u32 if_id)
   2081{
   2082	struct xfrm_pol_inexact_candidates cand;
   2083	const xfrm_address_t *daddr, *saddr;
   2084	struct xfrm_pol_inexact_bin *bin;
   2085	struct xfrm_policy *pol, *ret;
   2086	struct hlist_head *chain;
   2087	unsigned int sequence;
   2088	int err;
   2089
   2090	daddr = xfrm_flowi_daddr(fl, family);
   2091	saddr = xfrm_flowi_saddr(fl, family);
   2092	if (unlikely(!daddr || !saddr))
   2093		return NULL;
   2094
   2095	rcu_read_lock();
   2096 retry:
   2097	do {
   2098		sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
   2099		chain = policy_hash_direct(net, daddr, saddr, family, dir);
   2100	} while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));
   2101
   2102	ret = NULL;
   2103	hlist_for_each_entry_rcu(pol, chain, bydst) {
   2104		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
   2105		if (err) {
   2106			if (err == -ESRCH)
   2107				continue;
   2108			else {
   2109				ret = ERR_PTR(err);
   2110				goto fail;
   2111			}
   2112		} else {
   2113			ret = pol;
   2114			break;
   2115		}
   2116	}
   2117	bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
   2118	if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
   2119							 daddr))
   2120		goto skip_inexact;
   2121
   2122	pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
   2123					  family, dir, if_id);
   2124	if (pol) {
   2125		ret = pol;
   2126		if (IS_ERR(pol))
   2127			goto fail;
   2128	}
   2129
   2130skip_inexact:
   2131	if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
   2132		goto retry;
   2133
   2134	if (ret && !xfrm_pol_hold_rcu(ret))
   2135		goto retry;
   2136fail:
   2137	rcu_read_unlock();
   2138
   2139	return ret;
   2140}
   2141
   2142static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
   2143					      const struct flowi *fl,
   2144					      u16 family, u8 dir, u32 if_id)
   2145{
   2146#ifdef CONFIG_XFRM_SUB_POLICY
   2147	struct xfrm_policy *pol;
   2148
   2149	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
   2150					dir, if_id);
   2151	if (pol != NULL)
   2152		return pol;
   2153#endif
   2154	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
   2155					 dir, if_id);
   2156}
   2157
   2158static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
   2159						 const struct flowi *fl,
   2160						 u16 family, u32 if_id)
   2161{
   2162	struct xfrm_policy *pol;
   2163
   2164	rcu_read_lock();
   2165 again:
   2166	pol = rcu_dereference(sk->sk_policy[dir]);
   2167	if (pol != NULL) {
   2168		bool match;
   2169		int err = 0;
   2170
   2171		if (pol->family != family) {
   2172			pol = NULL;
   2173			goto out;
   2174		}
   2175
   2176		match = xfrm_selector_match(&pol->selector, fl, family);
   2177		if (match) {
   2178			if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
   2179			    pol->if_id != if_id) {
   2180				pol = NULL;
   2181				goto out;
   2182			}
   2183			err = security_xfrm_policy_lookup(pol->security,
   2184						      fl->flowi_secid);
   2185			if (!err) {
   2186				if (!xfrm_pol_hold_rcu(pol))
   2187					goto again;
   2188			} else if (err == -ESRCH) {
   2189				pol = NULL;
   2190			} else {
   2191				pol = ERR_PTR(err);
   2192			}
   2193		} else
   2194			pol = NULL;
   2195	}
   2196out:
   2197	rcu_read_unlock();
   2198	return pol;
   2199}
   2200
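/* Link/unlink helpers; the caller must hold net->xfrm.xfrm_policy_lock. */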
   2201static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
   2202{
   2203	struct net *net = xp_net(pol);
   2204
   2205	list_add(&pol->walk.all, &net->xfrm.policy_all);
   2206	net->xfrm.policy_count[dir]++;
   2207	xfrm_pol_hold(pol);
   2208}
   2209
   2210static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
   2211						int dir)
   2212{
   2213	struct net *net = xp_net(pol);
   2214
   2215	if (list_empty(&pol->walk.all))
   2216		return NULL;
   2217
   2218	/* Socket policies are not hashed. */
   2219	if (!hlist_unhashed(&pol->bydst)) {
   2220		hlist_del_rcu(&pol->bydst);
   2221		hlist_del_init(&pol->bydst_inexact_list);
   2222		hlist_del(&pol->byidx);
   2223	}
   2224
   2225	list_del_init(&pol->walk.all);
   2226	net->xfrm.policy_count[dir]--;
   2227
   2228	return pol;
   2229}
   2230
   2231static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
   2232{
   2233	__xfrm_policy_link(pol, XFRM_POLICY_MAX + dir);
   2234}
   2235
   2236static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
   2237{
   2238	__xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
   2239}
   2240
   2241int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
   2242{
   2243	struct net *net = xp_net(pol);
   2244
   2245	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   2246	pol = __xfrm_policy_unlink(pol, dir);
   2247	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   2248	if (pol) {
   2249		xfrm_policy_kill(pol);
   2250		return 0;
   2251	}
   2252	return -ENOENT;
   2253}
   2254EXPORT_SYMBOL(xfrm_policy_delete);
   2255
   2256int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
   2257{
   2258	struct net *net = sock_net(sk);
   2259	struct xfrm_policy *old_pol;
   2260
   2261#ifdef CONFIG_XFRM_SUB_POLICY
   2262	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
   2263		return -EINVAL;
   2264#endif
   2265
   2266	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   2267	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
   2268				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
   2269	if (pol) {
   2270		pol->curlft.add_time = ktime_get_real_seconds();
   2271		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
   2272		xfrm_sk_policy_link(pol, dir);
   2273	}
   2274	rcu_assign_pointer(sk->sk_policy[dir], pol);
   2275	if (old_pol) {
   2276		if (pol)
   2277			xfrm_policy_requeue(old_pol, pol);
   2278
    2279		/* Unlinking always succeeds. This is the only function
    2280		 * allowed to delete or replace a socket policy.
    2281		 */
   2282		xfrm_sk_policy_unlink(old_pol, dir);
   2283	}
   2284	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   2285
   2286	if (old_pol) {
   2287		xfrm_policy_kill(old_pol);
   2288	}
   2289	return 0;
   2290}
   2291
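/* Duplicate a socket policy for a newly cloned socket.  GFP_ATOMIC is
 * used because we are called under rcu_read_lock().
 */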
   2292static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
   2293{
   2294	struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
   2295	struct net *net = xp_net(old);
   2296
   2297	if (newp) {
   2298		newp->selector = old->selector;
   2299		if (security_xfrm_policy_clone(old->security,
   2300					       &newp->security)) {
   2301			kfree(newp);
   2302			return NULL;  /* ENOMEM */
   2303		}
   2304		newp->lft = old->lft;
   2305		newp->curlft = old->curlft;
   2306		newp->mark = old->mark;
   2307		newp->if_id = old->if_id;
   2308		newp->action = old->action;
   2309		newp->flags = old->flags;
   2310		newp->xfrm_nr = old->xfrm_nr;
   2311		newp->index = old->index;
   2312		newp->type = old->type;
   2313		newp->family = old->family;
   2314		memcpy(newp->xfrm_vec, old->xfrm_vec,
   2315		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
   2316		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   2317		xfrm_sk_policy_link(newp, dir);
   2318		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   2319		xfrm_pol_put(newp);
   2320	}
   2321	return newp;
   2322}
   2323
   2324int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
   2325{
   2326	const struct xfrm_policy *p;
   2327	struct xfrm_policy *np;
   2328	int i, ret = 0;
   2329
   2330	rcu_read_lock();
   2331	for (i = 0; i < 2; i++) {
   2332		p = rcu_dereference(osk->sk_policy[i]);
   2333		if (p) {
   2334			np = clone_policy(p, i);
   2335			if (unlikely(!np)) {
   2336				ret = -ENOMEM;
   2337				break;
   2338			}
   2339			rcu_assign_pointer(sk->sk_policy[i], np);
   2340		}
   2341	}
   2342	rcu_read_unlock();
   2343	return ret;
   2344}
   2345
   2346static int
   2347xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
   2348	       xfrm_address_t *remote, unsigned short family, u32 mark)
   2349{
   2350	int err;
   2351	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
   2352
   2353	if (unlikely(afinfo == NULL))
   2354		return -EINVAL;
   2355	err = afinfo->get_saddr(net, oif, local, remote, mark);
   2356	rcu_read_unlock();
   2357	return err;
   2358}
   2359
    2360	/* Resolve the list of templates for the flow, given the policy. */
   2361
   2362static int
   2363xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
   2364		      struct xfrm_state **xfrm, unsigned short family)
   2365{
   2366	struct net *net = xp_net(policy);
   2367	int nx;
   2368	int i, error;
   2369	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
   2370	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
   2371	xfrm_address_t tmp;
   2372
   2373	for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
   2374		struct xfrm_state *x;
   2375		xfrm_address_t *remote = daddr;
   2376		xfrm_address_t *local  = saddr;
   2377		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
   2378
   2379		if (tmpl->mode == XFRM_MODE_TUNNEL ||
   2380		    tmpl->mode == XFRM_MODE_BEET) {
   2381			remote = &tmpl->id.daddr;
   2382			local = &tmpl->saddr;
   2383			if (xfrm_addr_any(local, tmpl->encap_family)) {
   2384				error = xfrm_get_saddr(net, fl->flowi_oif,
   2385						       &tmp, remote,
   2386						       tmpl->encap_family, 0);
   2387				if (error)
   2388					goto fail;
   2389				local = &tmp;
   2390			}
   2391		}
   2392
   2393		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
   2394				    family, policy->if_id);
   2395
   2396		if (x && x->km.state == XFRM_STATE_VALID) {
   2397			xfrm[nx++] = x;
   2398			daddr = remote;
   2399			saddr = local;
   2400			continue;
   2401		}
   2402		if (x) {
   2403			error = (x->km.state == XFRM_STATE_ERROR ?
   2404				 -EINVAL : -EAGAIN);
   2405			xfrm_state_put(x);
   2406		} else if (error == -ESRCH) {
   2407			error = -EAGAIN;
   2408		}
   2409
   2410		if (!tmpl->optional)
   2411			goto fail;
   2412	}
   2413	return nx;
   2414
   2415fail:
   2416	for (nx--; nx >= 0; nx--)
   2417		xfrm_state_put(xfrm[nx]);
   2418	return error;
   2419}
   2420
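/* Resolve the templates of all policies, concatenating the states and,
 * when more than one policy is involved, sorting them for outbound
 * processing.
 */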
   2421static int
   2422xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
   2423		  struct xfrm_state **xfrm, unsigned short family)
   2424{
   2425	struct xfrm_state *tp[XFRM_MAX_DEPTH];
   2426	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
   2427	int cnx = 0;
   2428	int error;
   2429	int ret;
   2430	int i;
   2431
   2432	for (i = 0; i < npols; i++) {
   2433		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
   2434			error = -ENOBUFS;
   2435			goto fail;
   2436		}
   2437
   2438		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
   2439		if (ret < 0) {
   2440			error = ret;
   2441			goto fail;
   2442		} else
   2443			cnx += ret;
   2444	}
   2445
   2446	/* found states are sorted for outbound processing */
   2447	if (npols > 1)
   2448		xfrm_state_sort(xfrm, tpp, cnx, family);
   2449
   2450	return cnx;
   2451
   2452 fail:
   2453	for (cnx--; cnx >= 0; cnx--)
   2454		xfrm_state_put(tpp[cnx]);
   2455	return error;
   2456
   2457}
   2458
   2459static int xfrm_get_tos(const struct flowi *fl, int family)
   2460{
   2461	if (family == AF_INET)
   2462		return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
   2463
   2464	return 0;
   2465}
   2466
   2467static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
   2468{
   2469	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
   2470	struct dst_ops *dst_ops;
   2471	struct xfrm_dst *xdst;
   2472
   2473	if (!afinfo)
   2474		return ERR_PTR(-EINVAL);
   2475
   2476	switch (family) {
   2477	case AF_INET:
   2478		dst_ops = &net->xfrm.xfrm4_dst_ops;
   2479		break;
   2480#if IS_ENABLED(CONFIG_IPV6)
   2481	case AF_INET6:
   2482		dst_ops = &net->xfrm.xfrm6_dst_ops;
   2483		break;
   2484#endif
   2485	default:
   2486		BUG();
   2487	}
   2488	xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
   2489
   2490	if (likely(xdst)) {
   2491		memset_after(xdst, 0, u.dst);
   2492	} else
   2493		xdst = ERR_PTR(-ENOBUFS);
   2494
   2495	rcu_read_unlock();
   2496
   2497	return xdst;
   2498}
   2499
   2500static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
   2501			   int nfheader_len)
   2502{
   2503	if (dst->ops->family == AF_INET6) {
   2504		struct rt6_info *rt = (struct rt6_info *)dst;
   2505		path->path_cookie = rt6_get_cookie(rt);
   2506		path->u.rt6.rt6i_nfheader_len = nfheader_len;
   2507	}
   2508}
   2509
   2510static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
   2511				const struct flowi *fl)
   2512{
   2513	const struct xfrm_policy_afinfo *afinfo =
   2514		xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
   2515	int err;
   2516
   2517	if (!afinfo)
   2518		return -EINVAL;
   2519
   2520	err = afinfo->fill_dst(xdst, dev, fl);
   2521
   2522	rcu_read_unlock();
   2523
   2524	return err;
   2525}
   2526
   2527
    2528	/* Allocate a chain of dst_entry's, attach known xfrm's, and calculate
    2529	 * all the metrics... In short, bundle a bundle.
    2530	 */
   2531
   2532static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
   2533					    struct xfrm_state **xfrm,
   2534					    struct xfrm_dst **bundle,
   2535					    int nx,
   2536					    const struct flowi *fl,
   2537					    struct dst_entry *dst)
   2538{
   2539	const struct xfrm_state_afinfo *afinfo;
   2540	const struct xfrm_mode *inner_mode;
   2541	struct net *net = xp_net(policy);
   2542	unsigned long now = jiffies;
   2543	struct net_device *dev;
   2544	struct xfrm_dst *xdst_prev = NULL;
   2545	struct xfrm_dst *xdst0 = NULL;
   2546	int i = 0;
   2547	int err;
   2548	int header_len = 0;
   2549	int nfheader_len = 0;
   2550	int trailer_len = 0;
   2551	int tos;
   2552	int family = policy->selector.family;
   2553	xfrm_address_t saddr, daddr;
   2554
   2555	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
   2556
   2557	tos = xfrm_get_tos(fl, family);
   2558
   2559	dst_hold(dst);
   2560
   2561	for (; i < nx; i++) {
   2562		struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
   2563		struct dst_entry *dst1 = &xdst->u.dst;
   2564
   2565		err = PTR_ERR(xdst);
   2566		if (IS_ERR(xdst)) {
   2567			dst_release(dst);
   2568			goto put_states;
   2569		}
   2570
   2571		bundle[i] = xdst;
   2572		if (!xdst_prev)
   2573			xdst0 = xdst;
   2574		else
   2575			/* Ref count is taken during xfrm_alloc_dst()
   2576			 * No need to do dst_clone() on dst1
   2577			 */
   2578			xfrm_dst_set_child(xdst_prev, &xdst->u.dst);
   2579
   2580		if (xfrm[i]->sel.family == AF_UNSPEC) {
   2581			inner_mode = xfrm_ip2inner_mode(xfrm[i],
   2582							xfrm_af2proto(family));
   2583			if (!inner_mode) {
   2584				err = -EAFNOSUPPORT;
   2585				dst_release(dst);
   2586				goto put_states;
   2587			}
   2588		} else
   2589			inner_mode = &xfrm[i]->inner_mode;
   2590
   2591		xdst->route = dst;
   2592		dst_copy_metrics(dst1, dst);
   2593
   2594		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
   2595			__u32 mark = 0;
   2596			int oif;
   2597
   2598			if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
   2599				mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
   2600
   2601			family = xfrm[i]->props.family;
   2602			oif = fl->flowi_oif ? : fl->flowi_l3mdev;
   2603			dst = xfrm_dst_lookup(xfrm[i], tos, oif,
   2604					      &saddr, &daddr, family, mark);
   2605			err = PTR_ERR(dst);
   2606			if (IS_ERR(dst))
   2607				goto put_states;
   2608		} else
   2609			dst_hold(dst);
   2610
   2611		dst1->xfrm = xfrm[i];
   2612		xdst->xfrm_genid = xfrm[i]->genid;
   2613
   2614		dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
   2615		dst1->lastuse = now;
   2616
   2617		dst1->input = dst_discard;
   2618
   2619		rcu_read_lock();
   2620		afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
   2621		if (likely(afinfo))
   2622			dst1->output = afinfo->output;
   2623		else
   2624			dst1->output = dst_discard_out;
   2625		rcu_read_unlock();
   2626
   2627		xdst_prev = xdst;
   2628
   2629		header_len += xfrm[i]->props.header_len;
   2630		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
   2631			nfheader_len += xfrm[i]->props.header_len;
   2632		trailer_len += xfrm[i]->props.trailer_len;
   2633	}
   2634
   2635	xfrm_dst_set_child(xdst_prev, dst);
   2636	xdst0->path = dst;
   2637
   2638	err = -ENODEV;
   2639	dev = dst->dev;
   2640	if (!dev)
   2641		goto free_dst;
   2642
   2643	xfrm_init_path(xdst0, dst, nfheader_len);
   2644	xfrm_init_pmtu(bundle, nx);
   2645
   2646	for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
   2647	     xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
   2648		err = xfrm_fill_dst(xdst_prev, dev, fl);
   2649		if (err)
   2650			goto free_dst;
   2651
   2652		xdst_prev->u.dst.header_len = header_len;
   2653		xdst_prev->u.dst.trailer_len = trailer_len;
   2654		header_len -= xdst_prev->u.dst.xfrm->props.header_len;
   2655		trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
   2656	}
   2657
   2658	return &xdst0->u.dst;
   2659
   2660put_states:
   2661	for (; i < nx; i++)
   2662		xfrm_state_put(xfrm[i]);
   2663free_dst:
   2664	if (xdst0)
   2665		dst_release_immediate(&xdst0->u.dst);
   2666
   2667	return ERR_PTR(err);
   2668}
   2669
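/* Expand pols[0] with the matching main-type policy (sub-policy case)
 * and compute the total number of transforms; *num_xfrms is set to -1
 * when one of the policies blocks the flow.
 */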
   2670static int xfrm_expand_policies(const struct flowi *fl, u16 family,
   2671				struct xfrm_policy **pols,
   2672				int *num_pols, int *num_xfrms)
   2673{
   2674	int i;
   2675
   2676	if (*num_pols == 0 || !pols[0]) {
   2677		*num_pols = 0;
   2678		*num_xfrms = 0;
   2679		return 0;
   2680	}
   2681	if (IS_ERR(pols[0]))
   2682		return PTR_ERR(pols[0]);
   2683
   2684	*num_xfrms = pols[0]->xfrm_nr;
   2685
   2686#ifdef CONFIG_XFRM_SUB_POLICY
   2687	if (pols[0]->action == XFRM_POLICY_ALLOW &&
   2688	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
   2689		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
   2690						    XFRM_POLICY_TYPE_MAIN,
   2691						    fl, family,
   2692						    XFRM_POLICY_OUT,
   2693						    pols[0]->if_id);
   2694		if (pols[1]) {
   2695			if (IS_ERR(pols[1])) {
   2696				xfrm_pols_put(pols, *num_pols);
   2697				return PTR_ERR(pols[1]);
   2698			}
   2699			(*num_pols)++;
   2700			(*num_xfrms) += pols[1]->xfrm_nr;
   2701		}
   2702	}
   2703#endif
   2704	for (i = 0; i < *num_pols; i++) {
   2705		if (pols[i]->action != XFRM_POLICY_ALLOW) {
   2706			*num_xfrms = -1;
   2707			break;
   2708		}
   2709	}
   2710
   2711	return 0;
   2712
   2713}
   2714
   2715static struct xfrm_dst *
   2716xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
   2717			       const struct flowi *fl, u16 family,
   2718			       struct dst_entry *dst_orig)
   2719{
   2720	struct net *net = xp_net(pols[0]);
   2721	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
   2722	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
   2723	struct xfrm_dst *xdst;
   2724	struct dst_entry *dst;
   2725	int err;
   2726
   2727	/* Try to instantiate a bundle */
   2728	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
   2729	if (err <= 0) {
   2730		if (err == 0)
   2731			return NULL;
   2732
   2733		if (err != -EAGAIN)
   2734			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
   2735		return ERR_PTR(err);
   2736	}
   2737
   2738	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
   2739	if (IS_ERR(dst)) {
   2740		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
   2741		return ERR_CAST(dst);
   2742	}
   2743
   2744	xdst = (struct xfrm_dst *)dst;
   2745	xdst->num_xfrms = err;
   2746	xdst->num_pols = num_pols;
   2747	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
   2748	xdst->policy_genid = atomic_read(&pols[0]->genid);
   2749
   2750	return xdst;
   2751}
   2752
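/* Timer callback for dummy (DST_XFRM_QUEUE) bundles: re-resolve the
 * flow and, once a real bundle exists, flush the held packets through
 * dst_output().  While the bundle is still a dummy, the timeout backs
 * off exponentially up to XFRM_QUEUE_TMO_MAX.
 */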
   2753static void xfrm_policy_queue_process(struct timer_list *t)
   2754{
   2755	struct sk_buff *skb;
   2756	struct sock *sk;
   2757	struct dst_entry *dst;
   2758	struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer);
   2759	struct net *net = xp_net(pol);
   2760	struct xfrm_policy_queue *pq = &pol->polq;
   2761	struct flowi fl;
   2762	struct sk_buff_head list;
   2763	__u32 skb_mark;
   2764
   2765	spin_lock(&pq->hold_queue.lock);
   2766	skb = skb_peek(&pq->hold_queue);
   2767	if (!skb) {
   2768		spin_unlock(&pq->hold_queue.lock);
   2769		goto out;
   2770	}
   2771	dst = skb_dst(skb);
   2772	sk = skb->sk;
   2773
   2774	/* Fixup the mark to support VTI. */
   2775	skb_mark = skb->mark;
   2776	skb->mark = pol->mark.v;
   2777	xfrm_decode_session(skb, &fl, dst->ops->family);
   2778	skb->mark = skb_mark;
   2779	spin_unlock(&pq->hold_queue.lock);
   2780
   2781	dst_hold(xfrm_dst_path(dst));
   2782	dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
   2783	if (IS_ERR(dst))
   2784		goto purge_queue;
   2785
   2786	if (dst->flags & DST_XFRM_QUEUE) {
   2787		dst_release(dst);
   2788
   2789		if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
   2790			goto purge_queue;
   2791
   2792		pq->timeout = pq->timeout << 1;
   2793		if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
   2794			xfrm_pol_hold(pol);
   2795		goto out;
   2796	}
   2797
   2798	dst_release(dst);
   2799
   2800	__skb_queue_head_init(&list);
   2801
   2802	spin_lock(&pq->hold_queue.lock);
   2803	pq->timeout = 0;
   2804	skb_queue_splice_init(&pq->hold_queue, &list);
   2805	spin_unlock(&pq->hold_queue.lock);
   2806
   2807	while (!skb_queue_empty(&list)) {
   2808		skb = __skb_dequeue(&list);
   2809
   2810		/* Fixup the mark to support VTI. */
   2811		skb_mark = skb->mark;
   2812		skb->mark = pol->mark.v;
   2813		xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
   2814		skb->mark = skb_mark;
   2815
   2816		dst_hold(xfrm_dst_path(skb_dst(skb)));
   2817		dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
   2818		if (IS_ERR(dst)) {
   2819			kfree_skb(skb);
   2820			continue;
   2821		}
   2822
   2823		nf_reset_ct(skb);
   2824		skb_dst_drop(skb);
   2825		skb_dst_set(skb, dst);
   2826
   2827		dst_output(net, skb->sk, skb);
   2828	}
   2829
   2830out:
   2831	xfrm_pol_put(pol);
   2832	return;
   2833
   2834purge_queue:
   2835	pq->timeout = 0;
   2836	skb_queue_purge(&pq->hold_queue);
   2837	xfrm_pol_put(pol);
   2838}
   2839
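/* dst output hook for dummy bundles: park the skb on the policy's hold
 * queue and (re)arm the flush timer.
 */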
   2840static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
   2841{
   2842	unsigned long sched_next;
   2843	struct dst_entry *dst = skb_dst(skb);
   2844	struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
   2845	struct xfrm_policy *pol = xdst->pols[0];
   2846	struct xfrm_policy_queue *pq = &pol->polq;
   2847
   2848	if (unlikely(skb_fclone_busy(sk, skb))) {
   2849		kfree_skb(skb);
   2850		return 0;
   2851	}
   2852
   2853	if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
   2854		kfree_skb(skb);
   2855		return -EAGAIN;
   2856	}
   2857
   2858	skb_dst_force(skb);
   2859
   2860	spin_lock_bh(&pq->hold_queue.lock);
   2861
   2862	if (!pq->timeout)
   2863		pq->timeout = XFRM_QUEUE_TMO_MIN;
   2864
   2865	sched_next = jiffies + pq->timeout;
   2866
   2867	if (del_timer(&pq->hold_timer)) {
   2868		if (time_before(pq->hold_timer.expires, sched_next))
   2869			sched_next = pq->hold_timer.expires;
   2870		xfrm_pol_put(pol);
   2871	}
   2872
   2873	__skb_queue_tail(&pq->hold_queue, skb);
   2874	if (!mod_timer(&pq->hold_timer, sched_next))
   2875		xfrm_pol_hold(pol);
   2876
   2877	spin_unlock_bh(&pq->hold_queue.lock);
   2878
   2879	return 0;
   2880}
   2881
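/* Build a placeholder bundle that queues packets via xdst_queue_output()
 * while the key manager negotiates the missing states.
 */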
   2882static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
   2883						 struct xfrm_flo *xflo,
   2884						 const struct flowi *fl,
   2885						 int num_xfrms,
   2886						 u16 family)
   2887{
   2888	int err;
   2889	struct net_device *dev;
   2890	struct dst_entry *dst;
   2891	struct dst_entry *dst1;
   2892	struct xfrm_dst *xdst;
   2893
   2894	xdst = xfrm_alloc_dst(net, family);
   2895	if (IS_ERR(xdst))
   2896		return xdst;
   2897
   2898	if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
   2899	    net->xfrm.sysctl_larval_drop ||
   2900	    num_xfrms <= 0)
   2901		return xdst;
   2902
   2903	dst = xflo->dst_orig;
   2904	dst1 = &xdst->u.dst;
   2905	dst_hold(dst);
   2906	xdst->route = dst;
   2907
   2908	dst_copy_metrics(dst1, dst);
   2909
   2910	dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
   2911	dst1->flags |= DST_XFRM_QUEUE;
   2912	dst1->lastuse = jiffies;
   2913
   2914	dst1->input = dst_discard;
   2915	dst1->output = xdst_queue_output;
   2916
   2917	dst_hold(dst);
   2918	xfrm_dst_set_child(xdst, dst);
   2919	xdst->path = dst;
   2920
   2921	xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
   2922
   2923	err = -ENODEV;
   2924	dev = dst->dev;
   2925	if (!dev)
   2926		goto free_dst;
   2927
   2928	err = xfrm_fill_dst(xdst, dev, fl);
   2929	if (err)
   2930		goto free_dst;
   2931
   2932out:
   2933	return xdst;
   2934
   2935free_dst:
   2936	dst_release(dst1);
   2937	xdst = ERR_PTR(err);
   2938	goto out;
   2939}
   2940
   2941static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
   2942					   const struct flowi *fl,
   2943					   u16 family, u8 dir,
   2944					   struct xfrm_flo *xflo, u32 if_id)
   2945{
   2946	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
   2947	int num_pols = 0, num_xfrms = 0, err;
   2948	struct xfrm_dst *xdst;
   2949
    2950	/* Resolve the policies to use if we couldn't get them from the
    2951	 * previous cache entry. */
   2952	num_pols = 1;
   2953	pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
   2954	err = xfrm_expand_policies(fl, family, pols,
   2955					   &num_pols, &num_xfrms);
   2956	if (err < 0)
   2957		goto inc_error;
   2958	if (num_pols == 0)
   2959		return NULL;
   2960	if (num_xfrms <= 0)
   2961		goto make_dummy_bundle;
   2962
   2963	xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
   2964					      xflo->dst_orig);
   2965	if (IS_ERR(xdst)) {
   2966		err = PTR_ERR(xdst);
   2967		if (err == -EREMOTE) {
   2968			xfrm_pols_put(pols, num_pols);
   2969			return NULL;
   2970		}
   2971
   2972		if (err != -EAGAIN)
   2973			goto error;
   2974		goto make_dummy_bundle;
   2975	} else if (xdst == NULL) {
   2976		num_xfrms = 0;
   2977		goto make_dummy_bundle;
   2978	}
   2979
   2980	return xdst;
   2981
   2982make_dummy_bundle:
    2983	/* We found policies, but there are no bundles to instantiate:
    2984	 * either because the policy blocks, has no transformations, or
    2985	 * we could not build a template (no xfrm_states). */
   2986	xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
   2987	if (IS_ERR(xdst)) {
   2988		xfrm_pols_put(pols, num_pols);
   2989		return ERR_CAST(xdst);
   2990	}
   2991	xdst->num_pols = num_pols;
   2992	xdst->num_xfrms = num_xfrms;
   2993	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
   2994
   2995	return xdst;
   2996
   2997inc_error:
   2998	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
   2999error:
   3000	xfrm_pols_put(pols, num_pols);
   3001	return ERR_PTR(err);
   3002}
   3003
   3004static struct dst_entry *make_blackhole(struct net *net, u16 family,
   3005					struct dst_entry *dst_orig)
   3006{
   3007	const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
   3008	struct dst_entry *ret;
   3009
   3010	if (!afinfo) {
   3011		dst_release(dst_orig);
   3012		return ERR_PTR(-EINVAL);
   3013	} else {
   3014		ret = afinfo->blackhole_route(net, dst_orig);
   3015	}
   3016	rcu_read_unlock();
   3017
   3018	return ret;
   3019}
   3020
    3021	/* Finds/creates a bundle for the given flow and if_id.
    3022	 *
    3023	 * At the moment we consume a raw IP route, mostly to speed up
    3024	 * lookups on interfaces with IPsec disabled.
    3025	 *
    3026	 * xfrm_lookup() uses an if_id of 0 by default and is provided
    3027	 * for compatibility.
    3028	 */
   3029struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
   3030					struct dst_entry *dst_orig,
   3031					const struct flowi *fl,
   3032					const struct sock *sk,
   3033					int flags, u32 if_id)
   3034{
   3035	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
   3036	struct xfrm_dst *xdst;
   3037	struct dst_entry *dst, *route;
   3038	u16 family = dst_orig->ops->family;
   3039	u8 dir = XFRM_POLICY_OUT;
   3040	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
   3041
   3042	dst = NULL;
   3043	xdst = NULL;
   3044	route = NULL;
   3045
   3046	sk = sk_const_to_full_sk(sk);
   3047	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
   3048		num_pols = 1;
   3049		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
   3050						if_id);
   3051		err = xfrm_expand_policies(fl, family, pols,
   3052					   &num_pols, &num_xfrms);
   3053		if (err < 0)
   3054			goto dropdst;
   3055
   3056		if (num_pols) {
   3057			if (num_xfrms <= 0) {
   3058				drop_pols = num_pols;
   3059				goto no_transform;
   3060			}
   3061
   3062			xdst = xfrm_resolve_and_create_bundle(
   3063					pols, num_pols, fl,
   3064					family, dst_orig);
   3065
   3066			if (IS_ERR(xdst)) {
   3067				xfrm_pols_put(pols, num_pols);
   3068				err = PTR_ERR(xdst);
   3069				if (err == -EREMOTE)
   3070					goto nopol;
   3071
   3072				goto dropdst;
   3073			} else if (xdst == NULL) {
   3074				num_xfrms = 0;
   3075				drop_pols = num_pols;
   3076				goto no_transform;
   3077			}
   3078
   3079			route = xdst->route;
   3080		}
   3081	}
   3082
   3083	if (xdst == NULL) {
   3084		struct xfrm_flo xflo;
   3085
   3086		xflo.dst_orig = dst_orig;
   3087		xflo.flags = flags;
   3088
    3089		/* Fast path: no if_id and no chance of a policy match, skip the lookup. */
   3090		if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
   3091			       !net->xfrm.policy_count[XFRM_POLICY_OUT]))
   3092			goto nopol;
   3093
   3094		xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
   3095		if (xdst == NULL)
   3096			goto nopol;
   3097		if (IS_ERR(xdst)) {
   3098			err = PTR_ERR(xdst);
   3099			goto dropdst;
   3100		}
   3101
   3102		num_pols = xdst->num_pols;
   3103		num_xfrms = xdst->num_xfrms;
   3104		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
   3105		route = xdst->route;
   3106	}
   3107
   3108	dst = &xdst->u.dst;
   3109	if (route == NULL && num_xfrms > 0) {
    3110		/* The only case in which xfrm_bundle_lookup() returns a
    3111		 * bundle with a null route is when the template could not
    3112		 * be resolved: the policies are there, but the bundle could
    3113		 * not be created since we don't yet have the xfrm_states.
    3114		 * We need to wait for the KM to negotiate new SAs or bail
    3115		 * out with an error. */
   3116		if (net->xfrm.sysctl_larval_drop) {
   3117			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
   3118			err = -EREMOTE;
   3119			goto error;
   3120		}
   3121
   3122		err = -EAGAIN;
   3123
   3124		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
   3125		goto error;
   3126	}
   3127
   3128no_transform:
   3129	if (num_pols == 0)
   3130		goto nopol;
   3131
   3132	if ((flags & XFRM_LOOKUP_ICMP) &&
   3133	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
   3134		err = -ENOENT;
   3135		goto error;
   3136	}
   3137
   3138	for (i = 0; i < num_pols; i++)
   3139		pols[i]->curlft.use_time = ktime_get_real_seconds();
   3140
   3141	if (num_xfrms < 0) {
   3142		/* Prohibit the flow */
   3143		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
   3144		err = -EPERM;
   3145		goto error;
   3146	} else if (num_xfrms > 0) {
   3147		/* Flow transformed */
   3148		dst_release(dst_orig);
   3149	} else {
   3150		/* Flow passes untransformed */
   3151		dst_release(dst);
   3152		dst = dst_orig;
   3153	}
   3154ok:
   3155	xfrm_pols_put(pols, drop_pols);
   3156	if (dst && dst->xfrm &&
   3157	    dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
   3158		dst->flags |= DST_XFRM_TUNNEL;
   3159	return dst;
   3160
   3161nopol:
   3162	if (!(dst_orig->dev->flags & IFF_LOOPBACK) &&
   3163	    net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
   3164		err = -EPERM;
   3165		goto error;
   3166	}
   3167	if (!(flags & XFRM_LOOKUP_ICMP)) {
   3168		dst = dst_orig;
   3169		goto ok;
   3170	}
   3171	err = -ENOENT;
   3172error:
   3173	dst_release(dst);
   3174dropdst:
   3175	if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
   3176		dst_release(dst_orig);
   3177	xfrm_pols_put(pols, drop_pols);
   3178	return ERR_PTR(err);
   3179}
   3180EXPORT_SYMBOL(xfrm_lookup_with_ifid);
   3181
    3182	/* Main function: finds/creates a bundle for the given flow.
    3183	 *
    3184	 * At the moment we consume a raw IP route, mostly to speed up
    3185	 * lookups on interfaces with IPsec disabled.
    3186	 */
   3187struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
   3188			      const struct flowi *fl, const struct sock *sk,
   3189			      int flags)
   3190{
   3191	return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
   3192}
   3193EXPORT_SYMBOL(xfrm_lookup);
   3194
   3195/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
   3196 * Otherwise we may send out blackholed packets.
   3197 */
   3198struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
   3199				    const struct flowi *fl,
   3200				    const struct sock *sk, int flags)
   3201{
   3202	struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
   3203					    flags | XFRM_LOOKUP_QUEUE |
   3204					    XFRM_LOOKUP_KEEP_DST_REF);
   3205
   3206	if (PTR_ERR(dst) == -EREMOTE)
   3207		return make_blackhole(net, dst_orig->ops->family, dst_orig);
   3208
   3209	if (IS_ERR(dst))
   3210		dst_release(dst_orig);
   3211
   3212	return dst;
   3213}
   3214EXPORT_SYMBOL(xfrm_lookup_route);
   3215
   3216static inline int
   3217xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
   3218{
   3219	struct sec_path *sp = skb_sec_path(skb);
   3220	struct xfrm_state *x;
   3221
   3222	if (!sp || idx < 0 || idx >= sp->len)
   3223		return 0;
   3224	x = sp->xvec[idx];
   3225	if (!x->type->reject)
   3226		return 0;
   3227	return x->type->reject(x, skb, fl);
   3228}
   3229
    3230	/* When the skb is transformed back to its "native" form, we have to
    3231	 * check policy restrictions. At the moment we do this in a maximally
    3232	 * stupid way. Shame on me. :-) Of course, connected sockets must
    3233	 * have the policy cached on them.
    3234	 */
   3235
   3236static inline int
   3237xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
   3238	      unsigned short family)
   3239{
   3240	if (xfrm_state_kern(x))
   3241		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
   3242	return	x->id.proto == tmpl->id.proto &&
   3243		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
   3244		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
   3245		x->props.mode == tmpl->mode &&
   3246		(tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
   3247		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
   3248		!(x->props.mode != XFRM_MODE_TRANSPORT &&
   3249		  xfrm_state_addr_cmp(tmpl, x, family));
   3250}
   3251
   3252/*
   3253 * 0 or more than 0 is returned when validation is succeeded (either bypass
   3254 * because of optional transport mode, or next index of the matched secpath
   3255 * state with the template.
   3256 * -1 is returned when no matching template is found.
   3257 * Otherwise "-2 - errored_index" is returned.
   3258 */
   3259static inline int
   3260xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
   3261	       unsigned short family)
   3262{
   3263	int idx = start;
   3264
   3265	if (tmpl->optional) {
   3266		if (tmpl->mode == XFRM_MODE_TRANSPORT)
   3267			return start;
   3268	} else
   3269		start = -1;
   3270	for (; idx < sp->len; idx++) {
   3271		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
   3272			return ++idx;
   3273		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
   3274			if (start == -1)
   3275				start = -2-idx;
   3276			break;
   3277		}
   3278	}
   3279	return start;
   3280}
   3281
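/* Populate a flowi4 from the packet headers; with @reverse, addresses
 * and ports are swapped so the flow describes the reply direction.
 * decode_session6() below is the IPv6 counterpart.
 */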
   3282static void
   3283decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
   3284{
   3285	const struct iphdr *iph = ip_hdr(skb);
   3286	int ihl = iph->ihl;
   3287	u8 *xprth = skb_network_header(skb) + ihl * 4;
   3288	struct flowi4 *fl4 = &fl->u.ip4;
   3289	int oif = 0;
   3290
   3291	if (skb_dst(skb) && skb_dst(skb)->dev)
   3292		oif = skb_dst(skb)->dev->ifindex;
   3293
   3294	memset(fl4, 0, sizeof(struct flowi4));
   3295	fl4->flowi4_mark = skb->mark;
   3296	fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
   3297
   3298	fl4->flowi4_proto = iph->protocol;
   3299	fl4->daddr = reverse ? iph->saddr : iph->daddr;
   3300	fl4->saddr = reverse ? iph->daddr : iph->saddr;
   3301	fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;
   3302
   3303	if (!ip_is_fragment(iph)) {
   3304		switch (iph->protocol) {
   3305		case IPPROTO_UDP:
   3306		case IPPROTO_UDPLITE:
   3307		case IPPROTO_TCP:
   3308		case IPPROTO_SCTP:
   3309		case IPPROTO_DCCP:
   3310			if (xprth + 4 < skb->data ||
   3311			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
   3312				__be16 *ports;
   3313
   3314				xprth = skb_network_header(skb) + ihl * 4;
   3315				ports = (__be16 *)xprth;
   3316
   3317				fl4->fl4_sport = ports[!!reverse];
   3318				fl4->fl4_dport = ports[!reverse];
   3319			}
   3320			break;
   3321		case IPPROTO_ICMP:
   3322			if (xprth + 2 < skb->data ||
   3323			    pskb_may_pull(skb, xprth + 2 - skb->data)) {
   3324				u8 *icmp;
   3325
   3326				xprth = skb_network_header(skb) + ihl * 4;
   3327				icmp = xprth;
   3328
   3329				fl4->fl4_icmp_type = icmp[0];
   3330				fl4->fl4_icmp_code = icmp[1];
   3331			}
   3332			break;
   3333		case IPPROTO_GRE:
   3334			if (xprth + 12 < skb->data ||
   3335			    pskb_may_pull(skb, xprth + 12 - skb->data)) {
   3336				__be16 *greflags;
   3337				__be32 *gre_hdr;
   3338
   3339				xprth = skb_network_header(skb) + ihl * 4;
   3340				greflags = (__be16 *)xprth;
   3341				gre_hdr = (__be32 *)xprth;
   3342
   3343				if (greflags[0] & GRE_KEY) {
   3344					if (greflags[0] & GRE_CSUM)
   3345						gre_hdr++;
   3346					fl4->fl4_gre_key = gre_hdr[1];
   3347				}
   3348			}
   3349			break;
   3350		default:
   3351			break;
   3352		}
   3353	}
   3354}
   3355
   3356#if IS_ENABLED(CONFIG_IPV6)
   3357static void
   3358decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
   3359{
   3360	struct flowi6 *fl6 = &fl->u.ip6;
   3361	int onlyproto = 0;
   3362	const struct ipv6hdr *hdr = ipv6_hdr(skb);
   3363	u32 offset = sizeof(*hdr);
   3364	struct ipv6_opt_hdr *exthdr;
   3365	const unsigned char *nh = skb_network_header(skb);
   3366	u16 nhoff = IP6CB(skb)->nhoff;
   3367	int oif = 0;
   3368	u8 nexthdr;
   3369
   3370	if (!nhoff)
   3371		nhoff = offsetof(struct ipv6hdr, nexthdr);
   3372
   3373	nexthdr = nh[nhoff];
   3374
   3375	if (skb_dst(skb) && skb_dst(skb)->dev)
   3376		oif = skb_dst(skb)->dev->ifindex;
   3377
   3378	memset(fl6, 0, sizeof(struct flowi6));
   3379	fl6->flowi6_mark = skb->mark;
   3380	fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
   3381
   3382	fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
   3383	fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
   3384
   3385	while (nh + offset + sizeof(*exthdr) < skb->data ||
   3386	       pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
   3387		nh = skb_network_header(skb);
   3388		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
   3389
   3390		switch (nexthdr) {
   3391		case NEXTHDR_FRAGMENT:
   3392			onlyproto = 1;
   3393			fallthrough;
   3394		case NEXTHDR_ROUTING:
   3395		case NEXTHDR_HOP:
   3396		case NEXTHDR_DEST:
   3397			offset += ipv6_optlen(exthdr);
   3398			nexthdr = exthdr->nexthdr;
   3399			break;
   3400		case IPPROTO_UDP:
   3401		case IPPROTO_UDPLITE:
   3402		case IPPROTO_TCP:
   3403		case IPPROTO_SCTP:
   3404		case IPPROTO_DCCP:
   3405			if (!onlyproto && (nh + offset + 4 < skb->data ||
   3406			     pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
   3407				__be16 *ports;
   3408
   3409				nh = skb_network_header(skb);
   3410				ports = (__be16 *)(nh + offset);
   3411				fl6->fl6_sport = ports[!!reverse];
   3412				fl6->fl6_dport = ports[!reverse];
   3413			}
   3414			fl6->flowi6_proto = nexthdr;
   3415			return;
   3416		case IPPROTO_ICMPV6:
   3417			if (!onlyproto && (nh + offset + 2 < skb->data ||
   3418			    pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
   3419				u8 *icmp;
   3420
   3421				nh = skb_network_header(skb);
   3422				icmp = (u8 *)(nh + offset);
   3423				fl6->fl6_icmp_type = icmp[0];
   3424				fl6->fl6_icmp_code = icmp[1];
   3425			}
   3426			fl6->flowi6_proto = nexthdr;
   3427			return;
   3428		case IPPROTO_GRE:
   3429			if (!onlyproto &&
   3430			    (nh + offset + 12 < skb->data ||
   3431			     pskb_may_pull(skb, nh + offset + 12 - skb->data))) {
   3432				struct gre_base_hdr *gre_hdr;
   3433				__be32 *gre_key;
   3434
   3435				nh = skb_network_header(skb);
   3436				gre_hdr = (struct gre_base_hdr *)(nh + offset);
   3437				gre_key = (__be32 *)(gre_hdr + 1);
   3438
   3439				if (gre_hdr->flags & GRE_KEY) {
   3440					if (gre_hdr->flags & GRE_CSUM)
   3441						gre_key++;
   3442					fl6->fl6_gre_key = *gre_key;
   3443				}
   3444			}
   3445			fl6->flowi6_proto = nexthdr;
   3446			return;
   3447
   3448#if IS_ENABLED(CONFIG_IPV6_MIP6)
   3449		case IPPROTO_MH:
   3450			offset += ipv6_optlen(exthdr);
   3451			if (!onlyproto && (nh + offset + 3 < skb->data ||
   3452			    pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
   3453				struct ip6_mh *mh;
   3454
   3455				nh = skb_network_header(skb);
   3456				mh = (struct ip6_mh *)(nh + offset);
   3457				fl6->fl6_mh_type = mh->ip6mh_type;
   3458			}
   3459			fl6->flowi6_proto = nexthdr;
   3460			return;
   3461#endif
   3462		default:
   3463			fl6->flowi6_proto = nexthdr;
   3464			return;
   3465		}
   3466	}
   3467}
   3468#endif
   3469
   3470int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
   3471			  unsigned int family, int reverse)
   3472{
   3473	switch (family) {
   3474	case AF_INET:
   3475		decode_session4(skb, fl, reverse);
   3476		break;
   3477#if IS_ENABLED(CONFIG_IPV6)
   3478	case AF_INET6:
   3479		decode_session6(skb, fl, reverse);
   3480		break;
   3481#endif
   3482	default:
   3483		return -EAFNOSUPPORT;
   3484	}
   3485
   3486	return security_xfrm_decode_session(skb, &fl->flowi_secid);
   3487}
   3488EXPORT_SYMBOL(__xfrm_decode_session);
   3489
   3490static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
   3491{
   3492	for (; k < sp->len; k++) {
   3493		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
   3494			*idxp = k;
   3495			return 1;
   3496		}
   3497	}
   3498
   3499	return 0;
   3500}
   3501
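/* Inbound policy check: decode the flow, verify the SAs that were
 * actually used against their selectors, then match the policies'
 * templates against the secpath in order.
 */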
   3502int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
   3503			unsigned short family)
   3504{
   3505	struct net *net = dev_net(skb->dev);
   3506	struct xfrm_policy *pol;
   3507	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
   3508	int npols = 0;
   3509	int xfrm_nr;
   3510	int pi;
   3511	int reverse;
   3512	struct flowi fl;
   3513	int xerr_idx = -1;
   3514	const struct xfrm_if_cb *ifcb;
   3515	struct sec_path *sp;
   3516	struct xfrm_if *xi;
   3517	u32 if_id = 0;
   3518
   3519	rcu_read_lock();
   3520	ifcb = xfrm_if_get_cb();
   3521
   3522	if (ifcb) {
   3523		xi = ifcb->decode_session(skb, family);
   3524		if (xi) {
   3525			if_id = xi->p.if_id;
   3526			net = xi->net;
   3527		}
   3528	}
   3529	rcu_read_unlock();
   3530
   3531	reverse = dir & ~XFRM_POLICY_MASK;
   3532	dir &= XFRM_POLICY_MASK;
   3533
   3534	if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
   3535		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
   3536		return 0;
   3537	}
   3538
   3539	nf_nat_decode_session(skb, &fl, family);
   3540
    3541	/* First, check the used SAs against their selectors. */
   3542	sp = skb_sec_path(skb);
   3543	if (sp) {
   3544		int i;
   3545
   3546		for (i = sp->len - 1; i >= 0; i--) {
   3547			struct xfrm_state *x = sp->xvec[i];
   3548			if (!xfrm_selector_match(&x->sel, &fl, family)) {
   3549				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
   3550				return 0;
   3551			}
   3552		}
   3553	}
   3554
   3555	pol = NULL;
   3556	sk = sk_to_full_sk(sk);
   3557	if (sk && sk->sk_policy[dir]) {
   3558		pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
   3559		if (IS_ERR(pol)) {
   3560			XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
   3561			return 0;
   3562		}
   3563	}
   3564
   3565	if (!pol)
   3566		pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);
   3567
   3568	if (IS_ERR(pol)) {
   3569		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
   3570		return 0;
   3571	}
   3572
   3573	if (!pol) {
   3574		if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
   3575			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
   3576			return 0;
   3577		}
   3578
   3579		if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
   3580			xfrm_secpath_reject(xerr_idx, skb, &fl);
   3581			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
   3582			return 0;
   3583		}
   3584		return 1;
   3585	}
   3586
   3587	pol->curlft.use_time = ktime_get_real_seconds();
   3588
   3589	pols[0] = pol;
   3590	npols++;
   3591#ifdef CONFIG_XFRM_SUB_POLICY
   3592	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
   3593		pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
   3594						    &fl, family,
   3595						    XFRM_POLICY_IN, if_id);
   3596		if (pols[1]) {
   3597			if (IS_ERR(pols[1])) {
   3598				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
   3599				return 0;
   3600			}
   3601			pols[1]->curlft.use_time = ktime_get_real_seconds();
   3602			npols++;
   3603		}
   3604	}
   3605#endif
   3606
   3607	if (pol->action == XFRM_POLICY_ALLOW) {
   3608		static struct sec_path dummy;
   3609		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
   3610		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
   3611		struct xfrm_tmpl **tpp = tp;
   3612		int ti = 0;
   3613		int i, k;
   3614
   3615		sp = skb_sec_path(skb);
   3616		if (!sp)
   3617			sp = &dummy;
   3618
   3619		for (pi = 0; pi < npols; pi++) {
   3620			if (pols[pi] != pol &&
   3621			    pols[pi]->action != XFRM_POLICY_ALLOW) {
   3622				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
   3623				goto reject;
   3624			}
   3625			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
   3626				XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
   3627				goto reject_error;
   3628			}
   3629			for (i = 0; i < pols[pi]->xfrm_nr; i++)
   3630				tpp[ti++] = &pols[pi]->xfrm_vec[i];
   3631		}
   3632		xfrm_nr = ti;
   3633
   3634		if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK &&
   3635		    !xfrm_nr) {
   3636			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
   3637			goto reject;
   3638		}
   3639
   3640		if (npols > 1) {
   3641			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
   3642			tpp = stp;
   3643		}
   3644
   3645		/* For each tunnel xfrm, find the first matching tmpl.
   3646		 * For each tmpl before that, find corresponding xfrm.
    3647		 * Order is _important_. Later we will implement
    3648		 * explicit barriers, but at the moment a barrier
    3649		 * is implied between every pair of transformations.
   3650		 */
   3651		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
   3652			k = xfrm_policy_ok(tpp[i], sp, k, family);
   3653			if (k < 0) {
   3654				if (k < -1)
   3655					/* "-2 - errored_index" returned */
   3656					xerr_idx = -(2+k);
   3657				XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
   3658				goto reject;
   3659			}
   3660		}
   3661
   3662		if (secpath_has_nontransport(sp, k, &xerr_idx)) {
   3663			XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
   3664			goto reject;
   3665		}
   3666
   3667		xfrm_pols_put(pols, npols);
   3668		return 1;
   3669	}
   3670	XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
   3671
   3672reject:
   3673	xfrm_secpath_reject(xerr_idx, skb, &fl);
   3674reject_error:
   3675	xfrm_pols_put(pols, npols);
   3676	return 0;
   3677}
   3678EXPORT_SYMBOL(__xfrm_policy_check);
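/* Editorial note (not part of the original source): the template-matching
 * loop above relies on xfrm_policy_ok()'s error encoding.  A mismatch at
 * secpath index i is reported as "-2 - i"; e.g. a failure at i == 3 returns
 * k == -5, and xerr_idx = -(2 + k) == 3 recovers the index that is then
 * handed to xfrm_secpath_reject() on the reject path.
 */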
   3679
   3680int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
   3681{
   3682	struct net *net = dev_net(skb->dev);
   3683	struct flowi fl;
   3684	struct dst_entry *dst;
   3685	int res = 1;
   3686
   3687	if (xfrm_decode_session(skb, &fl, family) < 0) {
   3688		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
   3689		return 0;
   3690	}
   3691
   3692	skb_dst_force(skb);
   3693	if (!skb_dst(skb)) {
   3694		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
   3695		return 0;
   3696	}
   3697
   3698	dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
   3699	if (IS_ERR(dst)) {
   3700		res = 0;
   3701		dst = NULL;
   3702	}
   3703	skb_dst_set(skb, dst);
   3704	return res;
   3705}
   3706EXPORT_SYMBOL(__xfrm_route_forward);
   3707
   3708/* Optimize later using cookies and generation ids. */
   3709
   3710static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
   3711{
   3712	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
   3713	 * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
   3714	 * get validated by dst_ops->check on every use.  We do this
    3715	 * because when a normal route referenced by an XFRM dst is
    3716	 * obsoleted, we do not go looking for all the parent XFRM
    3717	 * dsts that reference it in order to invalidate them.  That
    3718	 * is just too much work.  Instead we make the checks here on
   3719	 * every use.  For example:
   3720	 *
   3721	 *	XFRM dst A --> IPv4 dst X
   3722	 *
   3723	 * X is the "xdst->route" of A (X is also the "dst->path" of A
   3724	 * in this example).  If X is marked obsolete, "A" will not
   3725	 * notice.  That's what we are validating here via the
   3726	 * stale_bundle() check.
   3727	 *
    3728	 * When a dst is removed from the fib tree, it is marked
    3729	 * DST_OBSOLETE_DEAD.
    3730	 * This forces stale_bundle() to fail on any xdst bundle
    3731	 * that has this dst linked into it.
   3732	 */
   3733	if (dst->obsolete < 0 && !stale_bundle(dst))
   3734		return dst;
   3735
   3736	return NULL;
   3737}
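/* Editorial sketch (illustrative, not part of the original source): cached
 * xfrm dsts are revalidated through the generic dst_check() helper, which
 * dispatches to this ->check() hook because xfrm dsts carry a negative
 * dst->obsolete (DST_OBSOLETE_FORCE_CHK).  A caller pattern might look like:
 *
 *	dst = dst_check(cached_dst, xdst->path_cookie);
 *	if (!dst)
 *		relookup();	// hypothetical helper: bundle went stale
 */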
   3738
   3739static int stale_bundle(struct dst_entry *dst)
   3740{
   3741	return !xfrm_bundle_ok((struct xfrm_dst *)dst);
   3742}
   3743
   3744void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
   3745{
   3746	while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) {
   3747		dst->dev = blackhole_netdev;
   3748		dev_hold(dst->dev);
   3749		dev_put(dev);
   3750	}
   3751}
   3752EXPORT_SYMBOL(xfrm_dst_ifdown);
   3753
   3754static void xfrm_link_failure(struct sk_buff *skb)
   3755{
    3756	/* Impossible. Such a dst must be popped before it reaches the point of failure. */
   3757}
   3758
   3759static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
   3760{
   3761	if (dst) {
   3762		if (dst->obsolete) {
   3763			dst_release(dst);
   3764			dst = NULL;
   3765		}
   3766	}
   3767	return dst;
   3768}
   3769
   3770static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
   3771{
   3772	while (nr--) {
   3773		struct xfrm_dst *xdst = bundle[nr];
   3774		u32 pmtu, route_mtu_cached;
   3775		struct dst_entry *dst;
   3776
   3777		dst = &xdst->u.dst;
   3778		pmtu = dst_mtu(xfrm_dst_child(dst));
   3779		xdst->child_mtu_cached = pmtu;
   3780
   3781		pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
   3782
   3783		route_mtu_cached = dst_mtu(xdst->route);
   3784		xdst->route_mtu_cached = route_mtu_cached;
   3785
   3786		if (pmtu > route_mtu_cached)
   3787			pmtu = route_mtu_cached;
   3788
   3789		dst_metric_set(dst, RTAX_MTU, pmtu);
   3790	}
   3791}
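/* Editorial worked example (illustrative figures, not from the source): with
 * a child path MTU of 1500, xfrm_state_mtu() might shrink it to roughly 1438
 * once ESP header and trailer overhead is subtracted; if the underlying
 * route caches an MTU of 1400, the comparison above clamps RTAX_MTU to 1400
 * for this bundle entry.
 */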
   3792
   3793/* Check that the bundle accepts the flow and its components are
   3794 * still valid.
   3795 */
   3796
   3797static int xfrm_bundle_ok(struct xfrm_dst *first)
   3798{
   3799	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
   3800	struct dst_entry *dst = &first->u.dst;
   3801	struct xfrm_dst *xdst;
   3802	int start_from, nr;
   3803	u32 mtu;
   3804
   3805	if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
   3806	    (dst->dev && !netif_running(dst->dev)))
   3807		return 0;
   3808
   3809	if (dst->flags & DST_XFRM_QUEUE)
   3810		return 1;
   3811
   3812	start_from = nr = 0;
   3813	do {
   3814		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
   3815
   3816		if (dst->xfrm->km.state != XFRM_STATE_VALID)
   3817			return 0;
   3818		if (xdst->xfrm_genid != dst->xfrm->genid)
   3819			return 0;
   3820		if (xdst->num_pols > 0 &&
   3821		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
   3822			return 0;
   3823
   3824		bundle[nr++] = xdst;
   3825
   3826		mtu = dst_mtu(xfrm_dst_child(dst));
   3827		if (xdst->child_mtu_cached != mtu) {
   3828			start_from = nr;
   3829			xdst->child_mtu_cached = mtu;
   3830		}
   3831
   3832		if (!dst_check(xdst->route, xdst->route_cookie))
   3833			return 0;
   3834		mtu = dst_mtu(xdst->route);
   3835		if (xdst->route_mtu_cached != mtu) {
   3836			start_from = nr;
   3837			xdst->route_mtu_cached = mtu;
   3838		}
   3839
   3840		dst = xfrm_dst_child(dst);
   3841	} while (dst->xfrm);
   3842
   3843	if (likely(!start_from))
   3844		return 1;
   3845
   3846	xdst = bundle[start_from - 1];
   3847	mtu = xdst->child_mtu_cached;
   3848	while (start_from--) {
   3849		dst = &xdst->u.dst;
   3850
   3851		mtu = xfrm_state_mtu(dst->xfrm, mtu);
   3852		if (mtu > xdst->route_mtu_cached)
   3853			mtu = xdst->route_mtu_cached;
   3854		dst_metric_set(dst, RTAX_MTU, mtu);
   3855		if (!start_from)
   3856			break;
   3857
   3858		xdst = bundle[start_from - 1];
   3859		xdst->child_mtu_cached = mtu;
   3860	}
   3861
   3862	return 1;
   3863}
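/* Editorial trace (illustrative, not part of the original source): for a
 * two-level bundle A -> B -> path, a change in B's child MTU sets
 * start_from = 2 during the forward walk.  The backward walk then
 * recomputes B's RTAX_MTU from the new child MTU, clamps it by B's cached
 * route MTU, stores the result as A's child_mtu_cached, and repeats for A,
 * so the change propagates outward through the bundle.
 */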
   3864
   3865static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
   3866{
   3867	return dst_metric_advmss(xfrm_dst_path(dst));
   3868}
   3869
   3870static unsigned int xfrm_mtu(const struct dst_entry *dst)
   3871{
   3872	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
   3873
   3874	return mtu ? : dst_mtu(xfrm_dst_path(dst));
   3875}
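/* Editorial note (not part of the original source): "mtu ? : ..." is the
 * GNU ?: extension -- it returns the raw RTAX_MTU metric when it is nonzero
 * and otherwise falls back to the MTU of the underlying path dst.
 */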
   3876
   3877static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
   3878					const void *daddr)
   3879{
   3880	while (dst->xfrm) {
   3881		const struct xfrm_state *xfrm = dst->xfrm;
   3882
   3883		dst = xfrm_dst_child(dst);
   3884
   3885		if (xfrm->props.mode == XFRM_MODE_TRANSPORT)
   3886			continue;
   3887		if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR)
   3888			daddr = xfrm->coaddr;
   3889		else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR))
   3890			daddr = &xfrm->id.daddr;
   3891	}
   3892	return daddr;
   3893}
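/* Editorial note (not part of the original source): for a transport +
 * tunnel stack, the transport-mode state is skipped and the neighbour
 * daddr becomes the tunnel state's outer id.daddr; a type flagged
 * XFRM_TYPE_REMOTE_COADDR (e.g. MIPv6 route optimization) substitutes its
 * care-of address instead.
 */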
   3894
   3895static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
   3896					   struct sk_buff *skb,
   3897					   const void *daddr)
   3898{
   3899	const struct dst_entry *path = xfrm_dst_path(dst);
   3900
   3901	if (!skb)
   3902		daddr = xfrm_get_dst_nexthop(dst, daddr);
   3903	return path->ops->neigh_lookup(path, skb, daddr);
   3904}
   3905
   3906static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
   3907{
   3908	const struct dst_entry *path = xfrm_dst_path(dst);
   3909
   3910	daddr = xfrm_get_dst_nexthop(dst, daddr);
   3911	path->ops->confirm_neigh(path, daddr);
   3912}
   3913
   3914int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
   3915{
   3916	int err = 0;
   3917
   3918	if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
   3919		return -EAFNOSUPPORT;
   3920
   3921	spin_lock(&xfrm_policy_afinfo_lock);
   3922	if (unlikely(xfrm_policy_afinfo[family] != NULL))
   3923		err = -EEXIST;
   3924	else {
   3925		struct dst_ops *dst_ops = afinfo->dst_ops;
   3926		if (likely(dst_ops->kmem_cachep == NULL))
   3927			dst_ops->kmem_cachep = xfrm_dst_cache;
   3928		if (likely(dst_ops->check == NULL))
   3929			dst_ops->check = xfrm_dst_check;
   3930		if (likely(dst_ops->default_advmss == NULL))
   3931			dst_ops->default_advmss = xfrm_default_advmss;
   3932		if (likely(dst_ops->mtu == NULL))
   3933			dst_ops->mtu = xfrm_mtu;
   3934		if (likely(dst_ops->negative_advice == NULL))
   3935			dst_ops->negative_advice = xfrm_negative_advice;
   3936		if (likely(dst_ops->link_failure == NULL))
   3937			dst_ops->link_failure = xfrm_link_failure;
   3938		if (likely(dst_ops->neigh_lookup == NULL))
   3939			dst_ops->neigh_lookup = xfrm_neigh_lookup;
   3940		if (likely(!dst_ops->confirm_neigh))
   3941			dst_ops->confirm_neigh = xfrm_confirm_neigh;
   3942		rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);
   3943	}
   3944	spin_unlock(&xfrm_policy_afinfo_lock);
   3945
   3946	return err;
   3947}
   3948EXPORT_SYMBOL(xfrm_policy_register_afinfo);
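/* Editorial usage sketch (paraphrased from net/ipv4/xfrm4_policy.c; see that
 * file for the full structure): a per-family module supplies its dst_ops and
 * lets the defaults above fill in the shared hooks:
 *
 *	static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 *		.dst_ops	= &xfrm4_dst_ops,
 *		...
 *	};
 *
 *	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo, AF_INET);
 */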
   3949
   3950void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
   3951{
   3952	struct dst_ops *dst_ops = afinfo->dst_ops;
   3953	int i;
   3954
   3955	for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) {
   3956		if (xfrm_policy_afinfo[i] != afinfo)
   3957			continue;
   3958		RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL);
   3959		break;
   3960	}
   3961
   3962	synchronize_rcu();
   3963
   3964	dst_ops->kmem_cachep = NULL;
   3965	dst_ops->check = NULL;
   3966	dst_ops->negative_advice = NULL;
   3967	dst_ops->link_failure = NULL;
   3968}
   3969EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
   3970
   3971void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
   3972{
   3973	spin_lock(&xfrm_if_cb_lock);
   3974	rcu_assign_pointer(xfrm_if_cb, ifcb);
   3975	spin_unlock(&xfrm_if_cb_lock);
   3976}
   3977EXPORT_SYMBOL(xfrm_if_register_cb);
   3978
   3979void xfrm_if_unregister_cb(void)
   3980{
   3981	RCU_INIT_POINTER(xfrm_if_cb, NULL);
   3982	synchronize_rcu();
   3983}
   3984EXPORT_SYMBOL(xfrm_if_unregister_cb);
   3985
   3986#ifdef CONFIG_XFRM_STATISTICS
   3987static int __net_init xfrm_statistics_init(struct net *net)
   3988{
   3989	int rv;
   3990	net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
   3991	if (!net->mib.xfrm_statistics)
   3992		return -ENOMEM;
   3993	rv = xfrm_proc_init(net);
   3994	if (rv < 0)
   3995		free_percpu(net->mib.xfrm_statistics);
   3996	return rv;
   3997}
   3998
   3999static void xfrm_statistics_fini(struct net *net)
   4000{
   4001	xfrm_proc_fini(net);
   4002	free_percpu(net->mib.xfrm_statistics);
   4003}
   4004#else
   4005static int __net_init xfrm_statistics_init(struct net *net)
   4006{
   4007	return 0;
   4008}
   4009
   4010static void xfrm_statistics_fini(struct net *net)
   4011{
   4012}
   4013#endif
   4014
   4015static int __net_init xfrm_policy_init(struct net *net)
   4016{
   4017	unsigned int hmask, sz;
   4018	int dir, err;
   4019
   4020	if (net_eq(net, &init_net)) {
   4021		xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
   4022					   sizeof(struct xfrm_dst),
   4023					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
   4024					   NULL);
   4025		err = rhashtable_init(&xfrm_policy_inexact_table,
   4026				      &xfrm_pol_inexact_params);
   4027		BUG_ON(err);
   4028	}
   4029
   4030	hmask = 8 - 1;
   4031	sz = (hmask+1) * sizeof(struct hlist_head);
   4032
   4033	net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
   4034	if (!net->xfrm.policy_byidx)
   4035		goto out_byidx;
   4036	net->xfrm.policy_idx_hmask = hmask;
   4037
   4038	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
   4039		struct xfrm_policy_hash *htab;
   4040
   4041		net->xfrm.policy_count[dir] = 0;
   4042		net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
   4043		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
   4044
   4045		htab = &net->xfrm.policy_bydst[dir];
   4046		htab->table = xfrm_hash_alloc(sz);
   4047		if (!htab->table)
   4048			goto out_bydst;
   4049		htab->hmask = hmask;
   4050		htab->dbits4 = 32;
   4051		htab->sbits4 = 32;
   4052		htab->dbits6 = 128;
   4053		htab->sbits6 = 128;
   4054	}
   4055	net->xfrm.policy_hthresh.lbits4 = 32;
   4056	net->xfrm.policy_hthresh.rbits4 = 32;
   4057	net->xfrm.policy_hthresh.lbits6 = 128;
   4058	net->xfrm.policy_hthresh.rbits6 = 128;
   4059
   4060	seqlock_init(&net->xfrm.policy_hthresh.lock);
   4061
   4062	INIT_LIST_HEAD(&net->xfrm.policy_all);
   4063	INIT_LIST_HEAD(&net->xfrm.inexact_bins);
   4064	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
   4065	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
   4066	return 0;
   4067
   4068out_bydst:
   4069	for (dir--; dir >= 0; dir--) {
   4070		struct xfrm_policy_hash *htab;
   4071
   4072		htab = &net->xfrm.policy_bydst[dir];
   4073		xfrm_hash_free(htab->table, sz);
   4074	}
   4075	xfrm_hash_free(net->xfrm.policy_byidx, sz);
   4076out_byidx:
   4077	return -ENOMEM;
   4078}
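/* Editorial sizing note (not part of the original source): hmask = 8 - 1
 * yields an initial 8-bucket table, so sz = 8 * sizeof(struct hlist_head),
 * i.e. 64 bytes with 8-byte pointers, per direction; the tables are grown
 * on demand by the policy_hash_work (xfrm_hash_resize) initialized above.
 */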
   4079
   4080static void xfrm_policy_fini(struct net *net)
   4081{
   4082	struct xfrm_pol_inexact_bin *b, *t;
   4083	unsigned int sz;
   4084	int dir;
   4085
   4086	flush_work(&net->xfrm.policy_hash_work);
   4087#ifdef CONFIG_XFRM_SUB_POLICY
   4088	xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
   4089#endif
   4090	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);
   4091
   4092	WARN_ON(!list_empty(&net->xfrm.policy_all));
   4093
   4094	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
   4095		struct xfrm_policy_hash *htab;
   4096
   4097		WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));
   4098
   4099		htab = &net->xfrm.policy_bydst[dir];
   4100		sz = (htab->hmask + 1) * sizeof(struct hlist_head);
   4101		WARN_ON(!hlist_empty(htab->table));
   4102		xfrm_hash_free(htab->table, sz);
   4103	}
   4104
   4105	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
   4106	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
   4107	xfrm_hash_free(net->xfrm.policy_byidx, sz);
   4108
   4109	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   4110	list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
   4111		__xfrm_policy_inexact_prune_bin(b, true);
   4112	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   4113}
   4114
   4115static int __net_init xfrm_net_init(struct net *net)
   4116{
   4117	int rv;
   4118
   4119	/* Initialize the per-net locks here */
   4120	spin_lock_init(&net->xfrm.xfrm_state_lock);
   4121	spin_lock_init(&net->xfrm.xfrm_policy_lock);
   4122	seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
   4123	mutex_init(&net->xfrm.xfrm_cfg_mutex);
   4124	net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
   4125	net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
   4126	net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;
   4127
   4128	rv = xfrm_statistics_init(net);
   4129	if (rv < 0)
   4130		goto out_statistics;
   4131	rv = xfrm_state_init(net);
   4132	if (rv < 0)
   4133		goto out_state;
   4134	rv = xfrm_policy_init(net);
   4135	if (rv < 0)
   4136		goto out_policy;
   4137	rv = xfrm_sysctl_init(net);
   4138	if (rv < 0)
   4139		goto out_sysctl;
   4140
   4141	return 0;
   4142
   4143out_sysctl:
   4144	xfrm_policy_fini(net);
   4145out_policy:
   4146	xfrm_state_fini(net);
   4147out_state:
   4148	xfrm_statistics_fini(net);
   4149out_statistics:
   4150	return rv;
   4151}
   4152
   4153static void __net_exit xfrm_net_exit(struct net *net)
   4154{
   4155	xfrm_sysctl_fini(net);
   4156	xfrm_policy_fini(net);
   4157	xfrm_state_fini(net);
   4158	xfrm_statistics_fini(net);
   4159}
   4160
   4161static struct pernet_operations __net_initdata xfrm_net_ops = {
   4162	.init = xfrm_net_init,
   4163	.exit = xfrm_net_exit,
   4164};
   4165
   4166void __init xfrm_init(void)
   4167{
   4168	register_pernet_subsys(&xfrm_net_ops);
   4169	xfrm_dev_init();
   4170	xfrm_input_init();
   4171
   4172#ifdef CONFIG_XFRM_ESPINTCP
   4173	espintcp_init();
   4174#endif
   4175}
   4176
   4177#ifdef CONFIG_AUDITSYSCALL
   4178static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
   4179					 struct audit_buffer *audit_buf)
   4180{
   4181	struct xfrm_sec_ctx *ctx = xp->security;
   4182	struct xfrm_selector *sel = &xp->selector;
   4183
   4184	if (ctx)
   4185		audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
   4186				 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
   4187
   4188	switch (sel->family) {
   4189	case AF_INET:
   4190		audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
   4191		if (sel->prefixlen_s != 32)
   4192			audit_log_format(audit_buf, " src_prefixlen=%d",
   4193					 sel->prefixlen_s);
   4194		audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
   4195		if (sel->prefixlen_d != 32)
   4196			audit_log_format(audit_buf, " dst_prefixlen=%d",
   4197					 sel->prefixlen_d);
   4198		break;
   4199	case AF_INET6:
   4200		audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
   4201		if (sel->prefixlen_s != 128)
   4202			audit_log_format(audit_buf, " src_prefixlen=%d",
   4203					 sel->prefixlen_s);
   4204		audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
   4205		if (sel->prefixlen_d != 128)
   4206			audit_log_format(audit_buf, " dst_prefixlen=%d",
   4207					 sel->prefixlen_d);
   4208		break;
   4209	}
   4210}
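/* Editorial example (hypothetical values, derived from the format strings
 * above): an AF_INET selector 10.0.0.0/8 -> 10.1.0.0/16 contributes
 * " src=10.0.0.0 src_prefixlen=8 dst=10.1.0.0 dst_prefixlen=16" to the
 * audit record; host-length (/32) prefixes suppress the *_prefixlen fields.
 */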
   4211
   4212void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
   4213{
   4214	struct audit_buffer *audit_buf;
   4215
   4216	audit_buf = xfrm_audit_start("SPD-add");
   4217	if (audit_buf == NULL)
   4218		return;
   4219	xfrm_audit_helper_usrinfo(task_valid, audit_buf);
   4220	audit_log_format(audit_buf, " res=%u", result);
   4221	xfrm_audit_common_policyinfo(xp, audit_buf);
   4222	audit_log_end(audit_buf);
   4223}
   4224EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
   4225
   4226void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
   4227			      bool task_valid)
   4228{
   4229	struct audit_buffer *audit_buf;
   4230
   4231	audit_buf = xfrm_audit_start("SPD-delete");
   4232	if (audit_buf == NULL)
   4233		return;
   4234	xfrm_audit_helper_usrinfo(task_valid, audit_buf);
   4235	audit_log_format(audit_buf, " res=%u", result);
   4236	xfrm_audit_common_policyinfo(xp, audit_buf);
   4237	audit_log_end(audit_buf);
   4238}
   4239EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
   4240#endif
   4241
   4242#ifdef CONFIG_XFRM_MIGRATE
   4243static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
   4244					const struct xfrm_selector *sel_tgt)
   4245{
   4246	if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
   4247		if (sel_tgt->family == sel_cmp->family &&
   4248		    xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
   4249				    sel_cmp->family) &&
   4250		    xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
   4251				    sel_cmp->family) &&
   4252		    sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
   4253		    sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
   4254			return true;
   4255		}
   4256	} else {
   4257		if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
   4258			return true;
   4259		}
   4260	}
   4261	return false;
   4262}
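/* Editorial note (not part of the original source): with sel_cmp->proto ==
 * IPSEC_ULPROTO_ANY, a comparison selector of 10.0.0.0/8 -> 192.168.0.0/16
 * matches any target with the same family, addresses and prefix lengths,
 * regardless of ports or upper-layer protocol; any other proto value
 * requires a byte-exact memcmp() of the two selectors.
 */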
   4263
   4264static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
   4265						    u8 dir, u8 type, struct net *net, u32 if_id)
   4266{
   4267	struct xfrm_policy *pol, *ret = NULL;
   4268	struct hlist_head *chain;
   4269	u32 priority = ~0U;
   4270
   4271	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
   4272	chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
   4273	hlist_for_each_entry(pol, chain, bydst) {
   4274		if ((if_id == 0 || pol->if_id == if_id) &&
   4275		    xfrm_migrate_selector_match(sel, &pol->selector) &&
   4276		    pol->type == type) {
   4277			ret = pol;
   4278			priority = ret->priority;
   4279			break;
   4280		}
   4281	}
   4282	chain = &net->xfrm.policy_inexact[dir];
   4283	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
   4284		if ((pol->priority >= priority) && ret)
   4285			break;
   4286
   4287		if ((if_id == 0 || pol->if_id == if_id) &&
   4288		    xfrm_migrate_selector_match(sel, &pol->selector) &&
   4289		    pol->type == type) {
   4290			ret = pol;
   4291			break;
   4292		}
   4293	}
   4294
   4295	xfrm_pol_hold(ret);
   4296
   4297	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
   4298
   4299	return ret;
   4300}
   4301
   4302static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
   4303{
   4304	int match = 0;
   4305
   4306	if (t->mode == m->mode && t->id.proto == m->proto &&
   4307	    (m->reqid == 0 || t->reqid == m->reqid)) {
   4308		switch (t->mode) {
   4309		case XFRM_MODE_TUNNEL:
   4310		case XFRM_MODE_BEET:
   4311			if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
   4312					    m->old_family) &&
   4313			    xfrm_addr_equal(&t->saddr, &m->old_saddr,
   4314					    m->old_family)) {
   4315				match = 1;
   4316			}
   4317			break;
   4318		case XFRM_MODE_TRANSPORT:
    4319			/* In transport mode the template does not store
    4320			 * any IP addresses, so we compare only mode and
    4321			 * protocol. */
   4322			match = 1;
   4323			break;
   4324		default:
   4325			break;
   4326		}
   4327	}
   4328	return match;
   4329}
   4330
   4331/* update endpoint address(es) of template(s) */
   4332static int xfrm_policy_migrate(struct xfrm_policy *pol,
   4333			       struct xfrm_migrate *m, int num_migrate)
   4334{
   4335	struct xfrm_migrate *mp;
   4336	int i, j, n = 0;
   4337
   4338	write_lock_bh(&pol->lock);
   4339	if (unlikely(pol->walk.dead)) {
   4340		/* target policy has been deleted */
   4341		write_unlock_bh(&pol->lock);
   4342		return -ENOENT;
   4343	}
   4344
   4345	for (i = 0; i < pol->xfrm_nr; i++) {
   4346		for (j = 0, mp = m; j < num_migrate; j++, mp++) {
   4347			if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
   4348				continue;
   4349			n++;
   4350			if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
   4351			    pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
   4352				continue;
   4353			/* update endpoints */
   4354			memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
   4355			       sizeof(pol->xfrm_vec[i].id.daddr));
   4356			memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
   4357			       sizeof(pol->xfrm_vec[i].saddr));
   4358			pol->xfrm_vec[i].encap_family = mp->new_family;
   4359			/* flush bundles */
   4360			atomic_inc(&pol->genid);
   4361		}
   4362	}
   4363
   4364	write_unlock_bh(&pol->lock);
   4365
   4366	if (!n)
   4367		return -ENODATA;
   4368
   4369	return 0;
   4370}
   4371
   4372static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
   4373{
   4374	int i, j;
   4375
   4376	if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
   4377		return -EINVAL;
   4378
   4379	for (i = 0; i < num_migrate; i++) {
   4380		if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
   4381		    xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
   4382			return -EINVAL;
   4383
    4384			/* check for duplicated entries */
   4385		for (j = i + 1; j < num_migrate; j++) {
   4386			if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
   4387				    sizeof(m[i].old_daddr)) &&
   4388			    !memcmp(&m[i].old_saddr, &m[j].old_saddr,
   4389				    sizeof(m[i].old_saddr)) &&
   4390			    m[i].proto == m[j].proto &&
   4391			    m[i].mode == m[j].mode &&
   4392			    m[i].reqid == m[j].reqid &&
   4393			    m[i].old_family == m[j].old_family)
   4394				return -EINVAL;
   4395		}
   4396	}
   4397
   4398	return 0;
   4399}
   4400
   4401int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
   4402		 struct xfrm_migrate *m, int num_migrate,
   4403		 struct xfrm_kmaddress *k, struct net *net,
   4404		 struct xfrm_encap_tmpl *encap, u32 if_id)
   4405{
   4406	int i, err, nx_cur = 0, nx_new = 0;
   4407	struct xfrm_policy *pol = NULL;
   4408	struct xfrm_state *x, *xc;
   4409	struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
   4410	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
   4411	struct xfrm_migrate *mp;
   4412
   4413	/* Stage 0 - sanity checks */
   4414	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
   4415		goto out;
   4416
   4417	if (dir >= XFRM_POLICY_MAX) {
   4418		err = -EINVAL;
   4419		goto out;
   4420	}
   4421
   4422	/* Stage 1 - find policy */
   4423	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) {
   4424		err = -ENOENT;
   4425		goto out;
   4426	}
   4427
   4428	/* Stage 2 - find and update state(s) */
   4429	for (i = 0, mp = m; i < num_migrate; i++, mp++) {
   4430		if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
   4431			x_cur[nx_cur] = x;
   4432			nx_cur++;
   4433			xc = xfrm_state_migrate(x, mp, encap);
   4434			if (xc) {
   4435				x_new[nx_new] = xc;
   4436				nx_new++;
   4437			} else {
   4438				err = -ENODATA;
   4439				goto restore_state;
   4440			}
   4441		}
   4442	}
   4443
   4444	/* Stage 3 - update policy */
   4445	if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
   4446		goto restore_state;
   4447
   4448	/* Stage 4 - delete old state(s) */
   4449	if (nx_cur) {
   4450		xfrm_states_put(x_cur, nx_cur);
   4451		xfrm_states_delete(x_cur, nx_cur);
   4452	}
   4453
   4454	/* Stage 5 - announce */
   4455	km_migrate(sel, dir, type, m, num_migrate, k, encap);
   4456
   4457	xfrm_pol_put(pol);
   4458
   4459	return 0;
   4460out:
   4461	return err;
   4462
   4463restore_state:
   4464	if (pol)
   4465		xfrm_pol_put(pol);
   4466	if (nx_cur)
   4467		xfrm_states_put(x_cur, nx_cur);
   4468	if (nx_new)
   4469		xfrm_states_delete(x_new, nx_new);
   4470
   4471	return err;
   4472}
   4473EXPORT_SYMBOL(xfrm_migrate);
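/* Editorial note (not part of the original source): the staged structure
 * above keeps rollback simple -- if state cloning (stage 2) or the policy
 * update (stage 3) fails, restore_state drops the policy reference,
 * releases the nx_cur old states and deletes the nx_new clones, leaving
 * the original states installed and in use.
 */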
   4474#endif