cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ip6_output.c (53271B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

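/* Final transmit step: resolve (or create) the neighbour entry for the
 * route's next hop and hand the skb to neigh_output(). Multicast packets
 * that must also be heard locally are cloned and looped back through the
 * NF_INET_POST_ROUTING hook first.
 */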
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock_bh();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock_bh();
	return ret;
}

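/* A GSO skb that would exceed the egress MTU cannot be sent as-is:
 * software-segment it here and run each resulting segment through
 * ip6_fragment(), returning the first error encountered (if any).
 */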
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) &&
	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

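/* Output entry point reached via dst_output() for locally generated
 * packets: runs the NF_INET_POST_ROUTING hook (unless the packet was
 * already rerouted through it) and then finishes via ip6_finish_output().
 */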
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

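/* Deliver a Router Alert packet to every raw socket that registered
 * interest in this alert value via the IPV6_ROUTER_ALERT socket option.
 * Returns 1 if the skb was consumed by a socket, 0 otherwise.
 */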
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

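/* Decide what to do with a packet whose destination is a proxied
 * neighbour: return 1 to deliver it to the local input path (NDISC
 * messages), -1 to drop it (link-local destination), 0 to forward it.
 */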
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbor discovery
			 * messages destined to the proxied address, pass them
			 * to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			hdr->hop_limit--;
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

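/* Fast-path fragmentation helpers: when the skb already carries a
 * frag_list whose geometry matches the MTU, each list member becomes a
 * fragment of its own and only the headers need to be rebuilt.
 */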
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

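/* Prepare the next frag_list member for transmission: prepend a
 * fragment header plus a copy of the saved network headers, and set
 * the fragment offset (with IP6_MF on every fragment but the last).
 */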
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

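/* Slow path: carve the next fragment out of the original skb into a
 * freshly allocated one. Every fragment except the last carries a
 * multiple of 8 bytes of payload, as the fragment-offset encoding
 * requires.
 */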
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

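/* Fragment an IPv6 datagram that exceeds the path MTU. The payload
 * budget per fragment is mtu - hlen - sizeof(struct frag_hdr), rounded
 * down to a multiple of 8. Illustrative numbers: with a 1500-byte MTU
 * and a 40-byte unfragmentable part, each fragment may carry
 * 1500 - 40 - 8 = 1452 -> 1448 bytes of fragmentable payload.
 */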
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

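/* Returns nonzero when the cached route can no longer be trusted for
 * this flow: the route key is not a matching host route and the flow
 * address differs from the last destination cached on the socket.
 */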
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 *	ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *	@skb: Packet for which lookup is done
 *	@dev: Tunnel device
 *	@net: Network namespace of tunnel device
 *	@sock: Socket which provides route info
 *	@saddr: Memory to store the src ip address
 *	@info: Tunnel information
 *	@protocol: IP protocol
 *	@use_cache: Flag to enable cache usage
 *
 *	This function performs a route lookup on a tunnel.
 *
 *	It returns a valid dst pointer and stores the src address to be
 *	used in the tunnel in @saddr on success, else a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					  info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

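/* Recompute the per-fragment MTU and maxfraglen while appending data.
 * Outside of an XFRM tunnel dst, only the first fragment has to leave
 * room for dst->header_len; later fragments may use the full MTU.
 */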
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

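/* Core of the ip6_append_data() machinery: append user data to the
 * socket's cork queue, growing the tail skb or allocating new ones so
 * that each queued skb already fits the fragment geometry (mtu and
 * maxfraglen) computed below.
 */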
   1446static int __ip6_append_data(struct sock *sk,
   1447			     struct sk_buff_head *queue,
   1448			     struct inet_cork_full *cork_full,
   1449			     struct inet6_cork *v6_cork,
   1450			     struct page_frag *pfrag,
   1451			     int getfrag(void *from, char *to, int offset,
   1452					 int len, int odd, struct sk_buff *skb),
   1453			     void *from, size_t length, int transhdrlen,
   1454			     unsigned int flags, struct ipcm6_cookie *ipc6)
   1455{
   1456	struct sk_buff *skb, *skb_prev = NULL;
   1457	struct inet_cork *cork = &cork_full->base;
   1458	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
   1459	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
   1460	struct ubuf_info *uarg = NULL;
   1461	int exthdrlen = 0;
   1462	int dst_exthdrlen = 0;
   1463	int hh_len;
   1464	int copy;
   1465	int err;
   1466	int offset = 0;
   1467	u32 tskey = 0;
   1468	struct rt6_info *rt = (struct rt6_info *)cork->dst;
   1469	struct ipv6_txoptions *opt = v6_cork->opt;
   1470	int csummode = CHECKSUM_NONE;
   1471	unsigned int maxnonfragsize, headersize;
   1472	unsigned int wmem_alloc_delta = 0;
   1473	bool paged, extra_uref = false;
   1474
   1475	skb = skb_peek_tail(queue);
   1476	if (!skb) {
   1477		exthdrlen = opt ? opt->opt_flen : 0;
   1478		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
   1479	}
   1480
   1481	paged = !!cork->gso_size;
   1482	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
   1483	orig_mtu = mtu;
   1484
   1485	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
   1486	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
   1487		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
   1488
   1489	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
   1490
   1491	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
   1492			(opt ? opt->opt_nflen : 0);
   1493
   1494	headersize = sizeof(struct ipv6hdr) +
   1495		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
   1496		     (dst_allfrag(&rt->dst) ?
   1497		      sizeof(struct frag_hdr) : 0) +
   1498		     rt->rt6i_nfheader_len;
   1499
   1500	if (mtu <= fragheaderlen ||
   1501	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
   1502		goto emsgsize;
   1503
   1504	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
   1505		     sizeof(struct frag_hdr);
   1506
   1507	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
   1508	 * the first fragment
   1509	 */
   1510	if (headersize + transhdrlen > mtu)
   1511		goto emsgsize;
   1512
   1513	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
   1514	    (sk->sk_protocol == IPPROTO_UDP ||
   1515	     sk->sk_protocol == IPPROTO_ICMPV6 ||
   1516	     sk->sk_protocol == IPPROTO_RAW)) {
   1517		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
   1518				sizeof(struct ipv6hdr));
   1519		goto emsgsize;
   1520	}
   1521
   1522	if (ip6_sk_ignore_df(sk))
   1523		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
   1524	else
   1525		maxnonfragsize = mtu;
   1526
   1527	if (cork->length + length > maxnonfragsize - headersize) {
   1528emsgsize:
   1529		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
   1530		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
   1531		return -EMSGSIZE;
   1532	}
   1533
   1534	/* CHECKSUM_PARTIAL only with no extension headers and when
   1535	 * we are not going to fragment
   1536	 */
   1537	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
   1538	    headersize == sizeof(struct ipv6hdr) &&
   1539	    length <= mtu - headersize &&
   1540	    (!(flags & MSG_MORE) || cork->gso_size) &&
   1541	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
   1542		csummode = CHECKSUM_PARTIAL;
   1543
   1544	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
   1545		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
   1546		if (!uarg)
   1547			return -ENOBUFS;
   1548		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
   1549		if (rt->dst.dev->features & NETIF_F_SG &&
   1550		    csummode == CHECKSUM_PARTIAL) {
   1551			paged = true;
   1552		} else {
   1553			uarg->zerocopy = 0;
   1554			skb_zcopy_set(skb, uarg, &extra_uref);
   1555		}
   1556	}
   1557
   1558	/*
   1559	 * Let's try using as much space as possible.
   1560	 * Use MTU if total length of the message fits into the MTU.
   1561	 * Otherwise, we need to reserve fragment header and
   1562	 * fragment alignment (= 8-15 octects, in total).
   1563	 *
   1564	 * Note that we may need to "move" the data from the tail
   1565	 * of the buffer to the new fragment when we split
   1566	 * the message.
   1567	 *
   1568	 * FIXME: It may be fragmented into multiple chunks
   1569	 *        at once if non-fragmentable extension headers
   1570	 *        are too large.
   1571	 * --yoshfuji
   1572	 */
   1573
   1574	cork->length += length;
   1575	if (!skb)
   1576		goto alloc_new_skb;
   1577
   1578	while (length > 0) {
   1579		/* Check if the remaining data fits into current packet. */
   1580		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
   1581		if (copy < length)
   1582			copy = maxfraglen - skb->len;
   1583
   1584		if (copy <= 0) {
   1585			char *data;
   1586			unsigned int datalen;
   1587			unsigned int fraglen;
   1588			unsigned int fraggap;
   1589			unsigned int alloclen, alloc_extra;
   1590			unsigned int pagedlen;
   1591alloc_new_skb:
   1592			/* There's no room in the current skb */
   1593			if (skb)
   1594				fraggap = skb->len - maxfraglen;
   1595			else
   1596				fraggap = 0;
   1597			/* update mtu and maxfraglen if necessary */
   1598			if (!skb || !skb_prev)
   1599				ip6_append_data_mtu(&mtu, &maxfraglen,
   1600						    fragheaderlen, skb, rt,
   1601						    orig_mtu);
   1602
   1603			skb_prev = skb;
   1604
   1605			/*
   1606			 * If remaining data exceeds the mtu,
   1607			 * we know we need more fragment(s).
   1608			 */
   1609			datalen = length + fraggap;
   1610
   1611			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
   1612				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
   1613			fraglen = datalen + fragheaderlen;
   1614			pagedlen = 0;
   1615
   1616			alloc_extra = hh_len;
   1617			alloc_extra += dst_exthdrlen;
   1618			alloc_extra += rt->dst.trailer_len;
   1619
   1620			/* We just reserve space for fragment header.
   1621			 * Note: this may be overallocation if the message
   1622			 * (without MSG_MORE) fits into the MTU.
   1623			 */
   1624			alloc_extra += sizeof(struct frag_hdr);
   1625
   1626			if ((flags & MSG_MORE) &&
   1627			    !(rt->dst.dev->features&NETIF_F_SG))
   1628				alloclen = mtu;
   1629			else if (!paged &&
   1630				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
   1631				  !(rt->dst.dev->features & NETIF_F_SG)))
   1632				alloclen = fraglen;
   1633			else {
   1634				alloclen = min_t(int, fraglen, MAX_HEADER);
   1635				pagedlen = fraglen - alloclen;
   1636			}
   1637			alloclen += alloc_extra;
   1638
   1639			if (datalen != length + fraggap) {
   1640				/*
   1641				 * this is not the last fragment, the trailer
   1642				 * space is regarded as data space.
   1643				 */
   1644				datalen += rt->dst.trailer_len;
   1645			}
   1646
   1647			fraglen = datalen + fragheaderlen;
   1648
   1649			copy = datalen - transhdrlen - fraggap - pagedlen;
   1650			if (copy < 0) {
   1651				err = -EINVAL;
   1652				goto error;
   1653			}
   1654			if (transhdrlen) {
   1655				skb = sock_alloc_send_skb(sk, alloclen,
   1656						(flags & MSG_DONTWAIT), &err);
   1657			} else {
   1658				skb = NULL;
   1659				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
   1660				    2 * sk->sk_sndbuf)
   1661					skb = alloc_skb(alloclen,
   1662							sk->sk_allocation);
   1663				if (unlikely(!skb))
   1664					err = -ENOBUFS;
   1665			}
   1666			if (!skb)
   1667				goto error;
   1668			/*
   1669			 *	Fill in the control structures
   1670			 */
   1671			skb->protocol = htons(ETH_P_IPV6);
   1672			skb->ip_summed = csummode;
   1673			skb->csum = 0;
   1674			/* reserve for fragmentation and ipsec header */
   1675			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
   1676				    dst_exthdrlen);
   1677
   1678			/*
   1679			 *	Find where to start putting bytes
   1680			 */
   1681			data = skb_put(skb, fraglen - pagedlen);
   1682			skb_set_network_header(skb, exthdrlen);
   1683			data += fragheaderlen;
   1684			skb->transport_header = (skb->network_header +
   1685						 fragheaderlen);
   1686			if (fraggap) {
   1687				skb->csum = skb_copy_and_csum_bits(
   1688					skb_prev, maxfraglen,
   1689					data + transhdrlen, fraggap);
   1690				skb_prev->csum = csum_sub(skb_prev->csum,
   1691							  skb->csum);
   1692				data += fraggap;
   1693				pskb_trim_unique(skb_prev, maxfraglen);
   1694			}
   1695			if (copy > 0 &&
   1696			    getfrag(from, data + transhdrlen, offset,
   1697				    copy, fraggap, skb) < 0) {
   1698				err = -EFAULT;
   1699				kfree_skb(skb);
   1700				goto error;
   1701			}
   1702
   1703			offset += copy;
   1704			length -= copy + transhdrlen;
   1705			transhdrlen = 0;
   1706			exthdrlen = 0;
   1707			dst_exthdrlen = 0;
   1708
   1709			/* Only the initial fragment is timestamped */
   1710			skb_shinfo(skb)->tx_flags = cork->tx_flags;
   1711			cork->tx_flags = 0;
   1712			skb_shinfo(skb)->tskey = tskey;
   1713			tskey = 0;
   1714			skb_zcopy_set(skb, uarg, &extra_uref);
   1715
   1716			if ((flags & MSG_CONFIRM) && !skb_prev)
   1717				skb_set_dst_pending_confirm(skb, 1);
   1718
   1719			/*
   1720			 * Put the packet on the pending queue
   1721			 */
   1722			if (!skb->destructor) {
   1723				skb->destructor = sock_wfree;
   1724				skb->sk = sk;
   1725				wmem_alloc_delta += skb->truesize;
   1726			}
   1727			__skb_queue_tail(queue, skb);
   1728			continue;
   1729		}
   1730
   1731		if (copy > length)
   1732			copy = length;
   1733
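       		/* The current skb still has room: append the chunk into
       		 * its tailroom (devices without scatter-gather), coalesce
       		 * it into a socket page fragment, or take the zerocopy
       		 * path.
       		 */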
   1734		if (!(rt->dst.dev->features & NETIF_F_SG) &&
   1735		    skb_tailroom(skb) >= copy) {
   1736			unsigned int off;
   1737
   1738			off = skb->len;
   1739			if (getfrag(from, skb_put(skb, copy),
   1740						offset, copy, off, skb) < 0) {
   1741				__skb_trim(skb, off);
   1742				err = -EFAULT;
   1743				goto error;
   1744			}
   1745		} else if (!uarg || !uarg->zerocopy) {
   1746			int i = skb_shinfo(skb)->nr_frags;
   1747
   1748			err = -ENOMEM;
   1749			if (!sk_page_frag_refill(sk, pfrag))
   1750				goto error;
   1751
   1752			if (!skb_can_coalesce(skb, i, pfrag->page,
   1753					      pfrag->offset)) {
   1754				err = -EMSGSIZE;
   1755				if (i == MAX_SKB_FRAGS)
   1756					goto error;
   1757
   1758				__skb_fill_page_desc(skb, i, pfrag->page,
   1759						     pfrag->offset, 0);
   1760				skb_shinfo(skb)->nr_frags = ++i;
   1761				get_page(pfrag->page);
   1762			}
   1763			copy = min_t(int, copy, pfrag->size - pfrag->offset);
   1764			if (getfrag(from,
   1765				    page_address(pfrag->page) + pfrag->offset,
   1766				    offset, copy, skb->len, skb) < 0)
   1767				goto error_efault;
   1768
   1769			pfrag->offset += copy;
   1770			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
   1771			skb->len += copy;
   1772			skb->data_len += copy;
   1773			skb->truesize += copy;
   1774			wmem_alloc_delta += copy;
   1775		} else {
   1776			err = skb_zerocopy_iter_dgram(skb, from, copy);
   1777			if (err < 0)
   1778				goto error;
   1779		}
   1780		offset += copy;
   1781		length -= copy;
   1782	}
   1783
   1784	if (wmem_alloc_delta)
   1785		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
   1786	return 0;
   1787
   1788error_efault:
   1789	err = -EFAULT;
   1790error:
   1791	net_zcopy_put_abort(uarg, extra_uref);
   1792	cork->length -= length;
   1793	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
   1794	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
   1795	return err;
   1796}
   1797
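       /*
        * ip6_append_data() queues data on the socket's write queue for a
        * corked socket.  The first call sets up the cork (dst, options,
        * flow) from @ipc6, @fl6 and @rt; later calls only append.  The
        * queued data becomes a packet when ip6_push_pending_frames() is
        * called, or is dropped by ip6_flush_pending_frames().
        */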
   1798int ip6_append_data(struct sock *sk,
   1799		    int getfrag(void *from, char *to, int offset, int len,
   1800				int odd, struct sk_buff *skb),
   1801		    void *from, size_t length, int transhdrlen,
   1802		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
   1803		    struct rt6_info *rt, unsigned int flags)
   1804{
   1805	struct inet_sock *inet = inet_sk(sk);
   1806	struct ipv6_pinfo *np = inet6_sk(sk);
   1807	int exthdrlen;
   1808	int err;
   1809
   1810	if (flags & MSG_PROBE)
   1811		return 0;
   1812	if (skb_queue_empty(&sk->sk_write_queue)) {
   1813		/*
   1814		 * Set up for corking.
   1815		 */
   1816		dst_hold(&rt->dst);
   1817		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
   1818				     ipc6, rt);
   1819		if (err)
   1820			return err;
   1821
   1822		inet->cork.fl.u.ip6 = *fl6;
   1823		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
   1824		length += exthdrlen;
   1825		transhdrlen += exthdrlen;
   1826	} else {
   1827		transhdrlen = 0;
   1828	}
   1829
   1830	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
   1831				 &np->cork, sk_page_frag(sk), getfrag,
   1832				 from, length, transhdrlen, flags, ipc6);
   1833}
   1834EXPORT_SYMBOL_GPL(ip6_append_data);
   1835
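       /* Hand the cork's dst reference straight to @skb; the cork keeps
        * neither the reference nor the IPCORK_ALLFRAG flag afterwards.
        */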
   1836static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
   1837{
   1838	struct dst_entry *dst = cork->base.dst;
   1839
   1840	cork->base.dst = NULL;
   1841	cork->base.flags &= ~IPCORK_ALLFRAG;
   1842	skb_dst_set(skb, dst);
   1843}
   1844
   1845static void ip6_cork_release(struct inet_cork_full *cork,
   1846			     struct inet6_cork *v6_cork)
   1847{
   1848	if (v6_cork->opt) {
   1849		struct ipv6_txoptions *opt = v6_cork->opt;
   1850
   1851		kfree(opt->dst0opt);
   1852		kfree(opt->dst1opt);
   1853		kfree(opt->hopopt);
   1854		kfree(opt->srcrt);
   1855		kfree(opt);
   1856		v6_cork->opt = NULL;
   1857	}
   1858
   1859	if (cork->base.dst) {
   1860		dst_release(cork->base.dst);
   1861		cork->base.dst = NULL;
   1862		cork->base.flags &= ~IPCORK_ALLFRAG;
   1863	}
   1864}
   1865
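       /*
        * Collapse the queued fragments into a single skb (chained via
        * frag_list), prepend the extension headers and the IPv6 header
        * from the cork state, update the output stats and release the
        * cork.  Returns NULL if the queue is empty.
        */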
   1866struct sk_buff *__ip6_make_skb(struct sock *sk,
   1867			       struct sk_buff_head *queue,
   1868			       struct inet_cork_full *cork,
   1869			       struct inet6_cork *v6_cork)
   1870{
   1871	struct sk_buff *skb, *tmp_skb;
   1872	struct sk_buff **tail_skb;
   1873	struct in6_addr *final_dst;
   1874	struct ipv6_pinfo *np = inet6_sk(sk);
   1875	struct net *net = sock_net(sk);
   1876	struct ipv6hdr *hdr;
   1877	struct ipv6_txoptions *opt = v6_cork->opt;
   1878	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
   1879	struct flowi6 *fl6 = &cork->fl.u.ip6;
   1880	unsigned char proto = fl6->flowi6_proto;
   1881
   1882	skb = __skb_dequeue(queue);
   1883	if (!skb)
   1884		goto out;
   1885	tail_skb = &(skb_shinfo(skb)->frag_list);
   1886
   1887	/* Move skb->data from the ext header area to the IP header. */
   1888	if (skb->data < skb_network_header(skb))
   1889		__skb_pull(skb, skb_network_offset(skb));
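       	/* Chain the remaining queued skbs onto the first skb's
       	 * frag_list, pulling the network header space from each and
       	 * detaching them from the socket's write-memory accounting.
       	 */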
   1890	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
   1891		__skb_pull(tmp_skb, skb_network_header_len(skb));
   1892		*tail_skb = tmp_skb;
   1893		tail_skb = &(tmp_skb->next);
   1894		skb->len += tmp_skb->len;
   1895		skb->data_len += tmp_skb->len;
   1896		skb->truesize += tmp_skb->truesize;
   1897		tmp_skb->destructor = NULL;
   1898		tmp_skb->sk = NULL;
   1899	}
   1900
   1901	/* Allow local fragmentation. */
   1902	skb->ignore_df = ip6_sk_ignore_df(sk);
   1903	__skb_pull(skb, skb_network_header_len(skb));
   1904
   1905	final_dst = &fl6->daddr;
   1906	if (opt && opt->opt_flen)
   1907		ipv6_push_frag_opts(skb, opt, &proto);
   1908	if (opt && opt->opt_nflen)
   1909		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
   1910
   1911	skb_push(skb, sizeof(struct ipv6hdr));
   1912	skb_reset_network_header(skb);
   1913	hdr = ipv6_hdr(skb);
   1914
   1915	ip6_flow_hdr(hdr, v6_cork->tclass,
   1916		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
   1917					ip6_autoflowlabel(net, np), fl6));
   1918	hdr->hop_limit = v6_cork->hop_limit;
   1919	hdr->nexthdr = proto;
   1920	hdr->saddr = fl6->saddr;
   1921	hdr->daddr = *final_dst;
   1922
   1923	skb->priority = sk->sk_priority;
   1924	skb->mark = cork->base.mark;
   1925	skb->tstamp = cork->base.transmit_time;
   1926
   1927	ip6_cork_steal_dst(skb, cork);
   1928	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
   1929	if (proto == IPPROTO_ICMPV6) {
   1930		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
   1931
   1932		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
   1933		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
   1934	}
   1935
   1936	ip6_cork_release(cork, v6_cork);
   1937out:
   1938	return skb;
   1939}
   1940
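       /* Pass a finished skb to ip6_local_out(), converting positive
        * NET_XMIT_* codes with net_xmit_errno() and counting real
        * failures as OUTDISCARDS.
        */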
   1941int ip6_send_skb(struct sk_buff *skb)
   1942{
   1943	struct net *net = sock_net(skb->sk);
   1944	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
   1945	int err;
   1946
   1947	err = ip6_local_out(net, skb->sk, skb);
   1948	if (err) {
   1949		if (err > 0)
   1950			err = net_xmit_errno(err);
   1951		if (err)
   1952			IP6_INC_STATS(net, rt->rt6i_idev,
   1953				      IPSTATS_MIB_OUTDISCARDS);
   1954	}
   1955
   1956	return err;
   1957}
   1958
   1959int ip6_push_pending_frames(struct sock *sk)
   1960{
   1961	struct sk_buff *skb;
   1962
   1963	skb = ip6_finish_skb(sk);
   1964	if (!skb)
   1965		return 0;
   1966
   1967	return ip6_send_skb(skb);
   1968}
   1969EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
   1970
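       /* Drop everything still queued, counting OUTDISCARDS for skbs
        * that already carry a dst, then release the cork state.
        */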
   1971static void __ip6_flush_pending_frames(struct sock *sk,
   1972				       struct sk_buff_head *queue,
   1973				       struct inet_cork_full *cork,
   1974				       struct inet6_cork *v6_cork)
   1975{
   1976	struct sk_buff *skb;
   1977
   1978	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
   1979		if (skb_dst(skb))
   1980			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
   1981				      IPSTATS_MIB_OUTDISCARDS);
   1982		kfree_skb(skb);
   1983	}
   1984
   1985	ip6_cork_release(cork, v6_cork);
   1986}
   1987
   1988void ip6_flush_pending_frames(struct sock *sk)
   1989{
   1990	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
   1991				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
   1992}
   1993EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
   1994
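       /*
        * ip6_make_skb() is the uncorked one-shot path: it builds the
        * whole datagram on a private queue in a single call, using the
        * caller-provided cork and current->task_frag, and flushes the
        * queue on error.
        */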
   1995struct sk_buff *ip6_make_skb(struct sock *sk,
   1996			     int getfrag(void *from, char *to, int offset,
   1997					 int len, int odd, struct sk_buff *skb),
   1998			     void *from, size_t length, int transhdrlen,
   1999			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
   2000			     unsigned int flags, struct inet_cork_full *cork)
   2001{
   2002	struct inet6_cork v6_cork;
   2003	struct sk_buff_head queue;
   2004	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
   2005	int err;
   2006
   2007	if (flags & MSG_PROBE) {
   2008		dst_release(&rt->dst);
   2009		return NULL;
   2010	}
   2011
   2012	__skb_queue_head_init(&queue);
   2013
   2014	cork->base.flags = 0;
   2015	cork->base.addr = 0;
   2016	cork->base.opt = NULL;
   2017	v6_cork.opt = NULL;
   2018	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
   2019	if (err) {
   2020		ip6_cork_release(cork, &v6_cork);
   2021		return ERR_PTR(err);
   2022	}
   2023	if (ipc6->dontfrag < 0)
   2024		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
   2025
   2026	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
   2027				&current->task_frag, getfrag, from,
   2028				length + exthdrlen, transhdrlen + exthdrlen,
   2029				flags, ipc6);
   2030	if (err) {
   2031		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
   2032		return ERR_PTR(err);
   2033	}
   2034
   2035	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
   2036}