cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ip_output.c (44596B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
      4 *		operating system.  INET is implemented using the  BSD Socket
      5 *		interface as the means of communication with the user level.
      6 *
      7 *		The Internet Protocol (IP) output module.
      8 *
      9 * Authors:	Ross Biro
     10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
     11 *		Donald Becker, <becker@super.org>
     12 *		Alan Cox, <Alan.Cox@linux.org>
     13 *		Richard Underwood
     14 *		Stefan Becker, <stefanb@yello.ping.de>
     15 *		Jorge Cwik, <jorge@laser.satlink.net>
     16 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
     17 *		Hirokazu Takahashi, <taka@valinux.co.jp>
     18 *
     19 *	See ip_input.c for original log
     20 *
     21 *	Fixes:
     22 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
     23 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
     24 *		Bradford Johnson:	Fix faulty handling of some frames when
     25 *					no route is found.
     26 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
     27 *					(in case if packet not accepted by
     28 *					output firewall rules)
     29 *		Mike McLagan	:	Routing by source
     30 *		Alexey Kuznetsov:	use new route cache
     31 *		Andi Kleen:		Fix broken PMTU recovery and remove
     32 *					some redundant tests.
     33 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
     34 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
     35 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
     36 *					for decreased register pressure on x86
     37 *					and more readability.
     38 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
     39 *					silently drop skb instead of failing with -EPERM.
     40 *		Detlev Wengorz	:	Copy protocol for fragments.
     41 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
     42 *					datagrams.
     43 *		Hirokazu Takahashi:	sendfile() on UDP works now.
     44 */
     45
     46#include <linux/uaccess.h>
     47#include <linux/module.h>
     48#include <linux/types.h>
     49#include <linux/kernel.h>
     50#include <linux/mm.h>
     51#include <linux/string.h>
     52#include <linux/errno.h>
     53#include <linux/highmem.h>
     54#include <linux/slab.h>
     55
     56#include <linux/socket.h>
     57#include <linux/sockios.h>
     58#include <linux/in.h>
     59#include <linux/inet.h>
     60#include <linux/netdevice.h>
     61#include <linux/etherdevice.h>
     62#include <linux/proc_fs.h>
     63#include <linux/stat.h>
     64#include <linux/init.h>
     65
     66#include <net/snmp.h>
     67#include <net/ip.h>
     68#include <net/protocol.h>
     69#include <net/route.h>
     70#include <net/xfrm.h>
     71#include <linux/skbuff.h>
     72#include <net/sock.h>
     73#include <net/arp.h>
     74#include <net/icmp.h>
     75#include <net/checksum.h>
     76#include <net/inetpeer.h>
     77#include <net/inet_ecn.h>
     78#include <net/lwtunnel.h>
     79#include <linux/bpf-cgroup.h>
     80#include <linux/igmp.h>
     81#include <linux/netfilter_ipv4.h>
     82#include <linux/netfilter_bridge.h>
     83#include <linux/netlink.h>
     84#include <linux/tcp.h>
     85
     86static int
     87ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
     88	    unsigned int mtu,
     89	    int (*output)(struct net *, struct sock *, struct sk_buff *));
     90
     91/* Generate a checksum for an outgoing IP datagram. */
     92void ip_send_check(struct iphdr *iph)
     93{
     94	iph->check = 0;
     95	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
     96}
     97EXPORT_SYMBOL(ip_send_check);
     98
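        /*
         * Finalise the IP header (total length, checksum) and run the
         * NF_INET_LOCAL_OUT hook; ip_local_out() below additionally calls
         * dst_output() when the hook verdict (return value 1) allows it.
         */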
     99int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
    100{
    101	struct iphdr *iph = ip_hdr(skb);
    102
    103	iph->tot_len = htons(skb->len);
    104	ip_send_check(iph);
    105
    106	/* if the egress device is enslaved to an L3 master device, pass the
    107	 * skb to its handler for processing
    108	 */
    109	skb = l3mdev_ip_out(sk, skb);
    110	if (unlikely(!skb))
    111		return 0;
    112
    113	skb->protocol = htons(ETH_P_IP);
    114
    115	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
    116		       net, sk, skb, NULL, skb_dst(skb)->dev,
    117		       dst_output);
    118}
    119
    120int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
    121{
    122	int err;
    123
    124	err = __ip_local_out(net, sk, skb);
    125	if (likely(err == 1))
    126		err = dst_output(net, sk, skb);
    127
    128	return err;
    129}
    130EXPORT_SYMBOL_GPL(ip_local_out);
    131
    132static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
    133{
    134	int ttl = inet->uc_ttl;
    135
    136	if (ttl < 0)
    137		ttl = ip4_dst_hoplimit(dst);
    138	return ttl;
    139}
    140
    141/*
    142 *		Add an ip header to a skbuff and send it out.
    143 *
    144 */
    145int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
    146			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
    147			  u8 tos)
    148{
    149	struct inet_sock *inet = inet_sk(sk);
    150	struct rtable *rt = skb_rtable(skb);
    151	struct net *net = sock_net(sk);
    152	struct iphdr *iph;
    153
    154	/* Build the IP header. */
    155	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
    156	skb_reset_network_header(skb);
    157	iph = ip_hdr(skb);
    158	iph->version  = 4;
    159	iph->ihl      = 5;
    160	iph->tos      = tos;
    161	iph->ttl      = ip_select_ttl(inet, &rt->dst);
    162	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
    163	iph->saddr    = saddr;
    164	iph->protocol = sk->sk_protocol;
    165	/* Do not bother generating IPID for small packets (eg SYNACK) */
    166	if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
    167		iph->frag_off = htons(IP_DF);
    168		iph->id = 0;
    169	} else {
    170		iph->frag_off = 0;
    171		/* TCP packets here are SYNACK with fat IPv4/TCP options.
    172		 * Avoid using the hashed IP ident generator.
    173		 */
    174		if (sk->sk_protocol == IPPROTO_TCP)
    175			iph->id = (__force __be16)prandom_u32();
    176		else
    177			__ip_select_ident(net, iph, 1);
    178	}
    179
    180	if (opt && opt->opt.optlen) {
    181		iph->ihl += opt->opt.optlen>>2;
    182		ip_options_build(skb, &opt->opt, daddr, rt);
    183	}
    184
    185	skb->priority = sk->sk_priority;
    186	if (!skb->mark)
    187		skb->mark = sk->sk_mark;
    188
    189	/* Send it out. */
    190	return ip_local_out(net, skb->sk, skb);
    191}
    192EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
    193
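        /*
         * Last step of the output path: account multicast/broadcast output,
         * make sure there is enough headroom for the link-layer header,
         * honour lightweight-tunnel transmit redirects, then resolve the
         * next-hop neighbour and hand the skb to neigh_output().
         */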
    194static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
    195{
    196	struct dst_entry *dst = skb_dst(skb);
    197	struct rtable *rt = (struct rtable *)dst;
    198	struct net_device *dev = dst->dev;
    199	unsigned int hh_len = LL_RESERVED_SPACE(dev);
    200	struct neighbour *neigh;
    201	bool is_v6gw = false;
    202
    203	if (rt->rt_type == RTN_MULTICAST) {
    204		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
    205	} else if (rt->rt_type == RTN_BROADCAST)
    206		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
    207
    208	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
    209		skb = skb_expand_head(skb, hh_len);
    210		if (!skb)
    211			return -ENOMEM;
    212	}
    213
    214	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
    215		int res = lwtunnel_xmit(skb);
    216
    217		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
    218			return res;
    219	}
    220
    221	rcu_read_lock_bh();
    222	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
    223	if (!IS_ERR(neigh)) {
    224		int res;
    225
    226		sock_confirm_neigh(skb, neigh);
    227		/* if crossing protocols, can not use the cached header */
    228		res = neigh_output(neigh, skb, is_v6gw);
    229		rcu_read_unlock_bh();
    230		return res;
    231	}
    232	rcu_read_unlock_bh();
    233
    234	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
    235			    __func__);
    236	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
    237	return -EINVAL;
    238}
    239
    240static int ip_finish_output_gso(struct net *net, struct sock *sk,
    241				struct sk_buff *skb, unsigned int mtu)
    242{
    243	struct sk_buff *segs, *nskb;
    244	netdev_features_t features;
    245	int ret = 0;
    246
    247	/* common case: seglen is <= mtu
    248	 */
    249	if (skb_gso_validate_network_len(skb, mtu))
    250		return ip_finish_output2(net, sk, skb);
    251
    252	/* Slowpath -  GSO segment length exceeds the egress MTU.
    253	 *
    254	 * This can happen in several cases:
    255	 *  - Forwarding of a TCP GRO skb, when DF flag is not set.
    256	 *  - Forwarding of an skb that arrived on a virtualization interface
    257	 *    (virtio-net/vhost/tap) with TSO/GSO size set by other network
    258	 *    stack.
    259	 *  - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
    260	 *    interface with a smaller MTU.
    261	 *  - Arriving GRO skb (or GSO skb in a virtualized environment) that is
    262	 *    bridged to a NETIF_F_TSO tunnel stacked over an interface with an
    263	 *    insufficient MTU.
    264	 */
    265	features = netif_skb_features(skb);
    266	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
    267	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
    268	if (IS_ERR_OR_NULL(segs)) {
    269		kfree_skb(skb);
    270		return -ENOMEM;
    271	}
    272
    273	consume_skb(skb);
    274
    275	skb_list_walk_safe(segs, segs, nskb) {
    276		int err;
    277
    278		skb_mark_not_on_list(segs);
    279		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
    280
    281		if (err && ret == 0)
    282			ret = err;
    283	}
    284
    285	return ret;
    286}
    287
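        /*
         * Called after NF_INET_POST_ROUTING: re-enter dst_output() if an
         * xfrm policy was attached after SNAT, segment GSO packets, and
         * fragment packets larger than the MTU before handing them to
         * ip_finish_output2(). ip_finish_output() below first runs the
         * cgroup BPF egress program.
         */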
    288static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    289{
    290	unsigned int mtu;
    291
    292#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
    293	/* Policy lookup after SNAT yielded a new policy */
    294	if (skb_dst(skb)->xfrm) {
    295		IPCB(skb)->flags |= IPSKB_REROUTED;
    296		return dst_output(net, sk, skb);
    297	}
    298#endif
    299	mtu = ip_skb_dst_mtu(sk, skb);
    300	if (skb_is_gso(skb))
    301		return ip_finish_output_gso(net, sk, skb, mtu);
    302
    303	if (skb->len > mtu || IPCB(skb)->frag_max_size)
    304		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
    305
    306	return ip_finish_output2(net, sk, skb);
    307}
    308
    309static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    310{
    311	int ret;
    312
    313	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
    314	switch (ret) {
    315	case NET_XMIT_SUCCESS:
    316		return __ip_finish_output(net, sk, skb);
    317	case NET_XMIT_CN:
    318		return __ip_finish_output(net, sk, skb) ? : ret;
    319	default:
    320		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
    321		return ret;
    322	}
    323}
    324
    325static int ip_mc_finish_output(struct net *net, struct sock *sk,
    326			       struct sk_buff *skb)
    327{
    328	struct rtable *new_rt;
    329	bool do_cn = false;
    330	int ret, err;
    331
    332	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
    333	switch (ret) {
    334	case NET_XMIT_CN:
    335		do_cn = true;
    336		fallthrough;
    337	case NET_XMIT_SUCCESS:
    338		break;
    339	default:
    340		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
    341		return ret;
    342	}
    343
    344	/* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
    345	 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
    346	 * see ipv4_pktinfo_prepare().
    347	 */
    348	new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
    349	if (new_rt) {
    350		new_rt->rt_iif = 0;
    351		skb_dst_drop(skb);
    352		skb_dst_set(skb, &new_rt->dst);
    353	}
    354
    355	err = dev_loopback_xmit(net, sk, skb);
    356	return (do_cn && err) ? ret : err;
    357}
    358
    359int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    360{
    361	struct rtable *rt = skb_rtable(skb);
    362	struct net_device *dev = rt->dst.dev;
    363
    364	/*
    365	 *	If the indicated interface is up and running, send the packet.
    366	 */
    367	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
    368
    369	skb->dev = dev;
    370	skb->protocol = htons(ETH_P_IP);
    371
    372	/*
    373	 *	Multicasts are looped back for other local users
    374	 */
    375
    376	if (rt->rt_flags&RTCF_MULTICAST) {
    377		if (sk_mc_loop(sk)
    378#ifdef CONFIG_IP_MROUTE
    379		/* Small optimization: do not loop back non-local frames that
    380		   came back after forwarding; they will be dropped by
    381		   ip_mr_input in any case.
    382		   Note that local frames are looped back so they are delivered
    383		   to local recipients.
    384
    385		   This check is duplicated in ip_mr_input at the moment.
    386		 */
    387		    &&
    388		    ((rt->rt_flags & RTCF_LOCAL) ||
    389		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
    390#endif
    391		   ) {
    392			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
    393			if (newskb)
    394				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
    395					net, sk, newskb, NULL, newskb->dev,
    396					ip_mc_finish_output);
    397		}
    398
    399		/* Multicasts with ttl 0 must not go beyond the host */
    400
    401		if (ip_hdr(skb)->ttl == 0) {
    402			kfree_skb(skb);
    403			return 0;
    404		}
    405	}
    406
    407	if (rt->rt_flags&RTCF_BROADCAST) {
    408		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
    409		if (newskb)
    410			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
    411				net, sk, newskb, NULL, newskb->dev,
    412				ip_mc_finish_output);
    413	}
    414
    415	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
    416			    net, sk, skb, NULL, skb->dev,
    417			    ip_finish_output,
    418			    !(IPCB(skb)->flags & IPSKB_REROUTED));
    419}
    420
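        /*
         * Standard output routine for unicast packets (typically installed
         * as dst->output): update the OUT counters, set the outgoing device
         * and protocol, and run NF_INET_POST_ROUTING before
         * ip_finish_output(), unless the packet was already rerouted
         * (IPSKB_REROUTED).
         */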
    421int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    422{
    423	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
    424
    425	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
    426
    427	skb->dev = dev;
    428	skb->protocol = htons(ETH_P_IP);
    429
    430	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
    431			    net, sk, skb, indev, dev,
    432			    ip_finish_output,
    433			    !(IPCB(skb)->flags & IPSKB_REROUTED));
    434}
    435EXPORT_SYMBOL(ip_output);
    436
    437/*
    438 * copy saddr and daddr, possibly using 64bit load/stores
    439 * Equivalent to :
    440 *   iph->saddr = fl4->saddr;
    441 *   iph->daddr = fl4->daddr;
    442 */
    443static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
    444{
    445	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
    446		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
    447
    448	iph->saddr = fl4->saddr;
    449	iph->daddr = fl4->daddr;
    450}
    451
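        /*
         * Transmit path for connection-oriented sockets (e.g. TCP): reuse
         * the route already attached to the skb or the socket's cached
         * route (looking one up if necessary), build the IP header from
         * socket and flow state, and send via ip_local_out().
         */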
    452/* Note: skb->sk can be different from sk, in case of tunnels */
    453int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
    454		    __u8 tos)
    455{
    456	struct inet_sock *inet = inet_sk(sk);
    457	struct net *net = sock_net(sk);
    458	struct ip_options_rcu *inet_opt;
    459	struct flowi4 *fl4;
    460	struct rtable *rt;
    461	struct iphdr *iph;
    462	int res;
    463
    464	/* Skip all of this if the packet is already routed,
    465	 * e.g. by something like SCTP.
    466	 */
    467	rcu_read_lock();
    468	inet_opt = rcu_dereference(inet->inet_opt);
    469	fl4 = &fl->u.ip4;
    470	rt = skb_rtable(skb);
    471	if (rt)
    472		goto packet_routed;
    473
    474	/* Make sure we can route this packet. */
    475	rt = (struct rtable *)__sk_dst_check(sk, 0);
    476	if (!rt) {
    477		__be32 daddr;
    478
    479		/* Use correct destination address if we have options. */
    480		daddr = inet->inet_daddr;
    481		if (inet_opt && inet_opt->opt.srr)
    482			daddr = inet_opt->opt.faddr;
    483
    484		/* If this fails, the transport layer's retransmit mechanism will
    485		 * keep trying until a route appears or the connection times
    486		 * out.
    487		 */
    488		rt = ip_route_output_ports(net, fl4, sk,
    489					   daddr, inet->inet_saddr,
    490					   inet->inet_dport,
    491					   inet->inet_sport,
    492					   sk->sk_protocol,
    493					   RT_CONN_FLAGS_TOS(sk, tos),
    494					   sk->sk_bound_dev_if);
    495		if (IS_ERR(rt))
    496			goto no_route;
    497		sk_setup_caps(sk, &rt->dst);
    498	}
    499	skb_dst_set_noref(skb, &rt->dst);
    500
    501packet_routed:
    502	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
    503		goto no_route;
    504
    505	/* OK, we know where to send it, allocate and build IP header. */
    506	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
    507	skb_reset_network_header(skb);
    508	iph = ip_hdr(skb);
    509	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
    510	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
    511		iph->frag_off = htons(IP_DF);
    512	else
    513		iph->frag_off = 0;
    514	iph->ttl      = ip_select_ttl(inet, &rt->dst);
    515	iph->protocol = sk->sk_protocol;
    516	ip_copy_addrs(iph, fl4);
    517
    518	/* Transport layer set skb->h.foo itself. */
    519
    520	if (inet_opt && inet_opt->opt.optlen) {
    521		iph->ihl += inet_opt->opt.optlen >> 2;
    522		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
    523	}
    524
    525	ip_select_ident_segs(net, skb, sk,
    526			     skb_shinfo(skb)->gso_segs ?: 1);
    527
    528	/* TODO : should we use skb->sk here instead of sk ? */
    529	skb->priority = sk->sk_priority;
    530	skb->mark = sk->sk_mark;
    531
    532	res = ip_local_out(net, sk, skb);
    533	rcu_read_unlock();
    534	return res;
    535
    536no_route:
    537	rcu_read_unlock();
    538	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
    539	kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
    540	return -EHOSTUNREACH;
    541}
    542EXPORT_SYMBOL(__ip_queue_xmit);
    543
    544int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
    545{
    546	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
    547}
    548EXPORT_SYMBOL(ip_queue_xmit);
    549
    550static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
    551{
    552	to->pkt_type = from->pkt_type;
    553	to->priority = from->priority;
    554	to->protocol = from->protocol;
    555	to->skb_iif = from->skb_iif;
    556	skb_dst_drop(to);
    557	skb_dst_copy(to, from);
    558	to->dev = from->dev;
    559	to->mark = from->mark;
    560
    561	skb_copy_hash(to, from);
    562
    563#ifdef CONFIG_NET_SCHED
    564	to->tc_index = from->tc_index;
    565#endif
    566	nf_copy(to, from);
    567	skb_ext_copy(to, from);
    568#if IS_ENABLED(CONFIG_IP_VS)
    569	to->ipvs_property = from->ipvs_property;
    570#endif
    571	skb_copy_secmark(to, from);
    572}
    573
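        /*
         * Fragment, or reject with ICMP_FRAG_NEEDED: when DF is set and
         * local fragmentation is not allowed (ignore_df clear, or
         * frag_max_size exceeding the mtu), send the ICMP error and drop;
         * otherwise defer to ip_do_fragment().
         */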
    574static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
    575		       unsigned int mtu,
    576		       int (*output)(struct net *, struct sock *, struct sk_buff *))
    577{
    578	struct iphdr *iph = ip_hdr(skb);
    579
    580	if ((iph->frag_off & htons(IP_DF)) == 0)
    581		return ip_do_fragment(net, sk, skb, output);
    582
    583	if (unlikely(!skb->ignore_df ||
    584		     (IPCB(skb)->frag_max_size &&
    585		      IPCB(skb)->frag_max_size > mtu))) {
    586		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
    587		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
    588			  htonl(mtu));
    589		kfree_skb(skb);
    590		return -EMSGSIZE;
    591	}
    592
    593	return ip_do_fragment(net, sk, skb, output);
    594}
    595
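        /*
         * Set up fast-path fragmentation over an existing frag_list:
         * detach the frag list into the iterator and trim the head skb
         * down to the first fragment, marking it with IP_MF.
         */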
    596void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
    597		      unsigned int hlen, struct ip_fraglist_iter *iter)
    598{
    599	unsigned int first_len = skb_pagelen(skb);
    600
    601	iter->frag = skb_shinfo(skb)->frag_list;
    602	skb_frag_list_init(skb);
    603
    604	iter->offset = 0;
    605	iter->iph = iph;
    606	iter->hlen = hlen;
    607
    608	skb->data_len = first_len - skb_headlen(skb);
    609	skb->len = first_len;
    610	iph->tot_len = htons(first_len);
    611	iph->frag_off = htons(IP_MF);
    612	ip_send_check(iph);
    613}
    614EXPORT_SYMBOL(ip_fraglist_init);
    615
    616void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
    617{
    618	unsigned int hlen = iter->hlen;
    619	struct iphdr *iph = iter->iph;
    620	struct sk_buff *frag;
    621
    622	frag = iter->frag;
    623	frag->ip_summed = CHECKSUM_NONE;
    624	skb_reset_transport_header(frag);
    625	__skb_push(frag, hlen);
    626	skb_reset_network_header(frag);
    627	memcpy(skb_network_header(frag), iph, hlen);
    628	iter->iph = ip_hdr(frag);
    629	iph = iter->iph;
    630	iph->tot_len = htons(frag->len);
    631	ip_copy_metadata(frag, skb);
    632	iter->offset += skb->len - hlen;
    633	iph->frag_off = htons(iter->offset >> 3);
    634	if (frag->next)
    635		iph->frag_off |= htons(IP_MF);
    636	/* Ready, complete checksum */
    637	ip_send_check(iph);
    638}
    639EXPORT_SYMBOL(ip_fraglist_prepare);
    640
    641void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
    642		  unsigned int ll_rs, unsigned int mtu, bool DF,
    643		  struct ip_frag_state *state)
    644{
    645	struct iphdr *iph = ip_hdr(skb);
    646
    647	state->DF = DF;
    648	state->hlen = hlen;
    649	state->ll_rs = ll_rs;
    650	state->mtu = mtu;
    651
    652	state->left = skb->len - hlen;	/* Space per frame */
    653	state->ptr = hlen;		/* Where to start from */
    654
    655	state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
    656	state->not_last_frag = iph->frag_off & htons(IP_MF);
    657}
    658EXPORT_SYMBOL(ip_frag_init);
    659
    660static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
    661			 bool first_frag)
    662{
    663	/* Copy the flags to each fragment. */
    664	IPCB(to)->flags = IPCB(from)->flags;
    665
    666	/* ANK: dirty, but effective trick. Upgrade options only if
    667	 * the segment to be fragmented was THE FIRST (otherwise,
    668	 * options are already fixed) and make it ONCE
    669	 * on the initial skb, so that all the following fragments
    670	 * will inherit fixed options.
    671	 */
    672	if (first_frag)
    673		ip_options_fragment(from);
    674}
    675
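        /*
         * Slow-path fragmentation: allocate the next fragment, copy the IP
         * header plus a block of payload from the original skb, and fill
         * in frag_off, DF/MF and tot_len before checksumming.
         */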
    676struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
    677{
    678	unsigned int len = state->left;
    679	struct sk_buff *skb2;
    680	struct iphdr *iph;
    681
    682	/* IF: it doesn't fit, use 'mtu' - the data space left */
    683	if (len > state->mtu)
    684		len = state->mtu;
    685	/* IF: we are not sending up to and including the packet end
    686	   then align the next start on an eight byte boundary */
    687	if (len < state->left)	{
    688		len &= ~7;
    689	}
    690
    691	/* Allocate buffer */
    692	skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
    693	if (!skb2)
    694		return ERR_PTR(-ENOMEM);
    695
    696	/*
    697	 *	Set up data on packet
    698	 */
    699
    700	ip_copy_metadata(skb2, skb);
    701	skb_reserve(skb2, state->ll_rs);
    702	skb_put(skb2, len + state->hlen);
    703	skb_reset_network_header(skb2);
    704	skb2->transport_header = skb2->network_header + state->hlen;
    705
    706	/*
    707	 *	Charge the memory for the fragment to any owner
    708	 *	it might possess
    709	 */
    710
    711	if (skb->sk)
    712		skb_set_owner_w(skb2, skb->sk);
    713
    714	/*
    715	 *	Copy the packet header into the new buffer.
    716	 */
    717
    718	skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
    719
    720	/*
    721	 *	Copy a block of the IP datagram.
    722	 */
    723	if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
    724		BUG();
    725	state->left -= len;
    726
    727	/*
    728	 *	Fill in the new header fields.
    729	 */
    730	iph = ip_hdr(skb2);
    731	iph->frag_off = htons((state->offset >> 3));
    732	if (state->DF)
    733		iph->frag_off |= htons(IP_DF);
    734
    735	/*
    736	 *	Added AC : If we are fragmenting a fragment that's not the
    737	 *		   last fragment then keep the MF bit set on each fragment
    738	 */
    739	if (state->left > 0 || state->not_last_frag)
    740		iph->frag_off |= htons(IP_MF);
    741	state->ptr += len;
    742	state->offset += len;
    743
    744	iph->tot_len = htons(len + state->hlen);
    745
    746	ip_send_check(iph);
    747
    748	return skb2;
    749}
    750EXPORT_SYMBOL(ip_frag_next);
    751
    752/*
    753 *	This IP datagram is too large to be sent in one piece.  Break it up
    754 *	into smaller pieces (each consisting of the IP header plus a block
    755 *	of the data of the original datagram) that still fit in a single
    756 *	device frame, and queue each such frame for sending.
    757 */
    758
    759int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
    760		   int (*output)(struct net *, struct sock *, struct sk_buff *))
    761{
    762	struct iphdr *iph;
    763	struct sk_buff *skb2;
    764	bool mono_delivery_time = skb->mono_delivery_time;
    765	struct rtable *rt = skb_rtable(skb);
    766	unsigned int mtu, hlen, ll_rs;
    767	struct ip_fraglist_iter iter;
    768	ktime_t tstamp = skb->tstamp;
    769	struct ip_frag_state state;
    770	int err = 0;
    771
    772	/* for offloaded checksums cleanup checksum before fragmentation */
    773	if (skb->ip_summed == CHECKSUM_PARTIAL &&
    774	    (err = skb_checksum_help(skb)))
    775		goto fail;
    776
    777	/*
    778	 *	Point into the IP datagram header.
    779	 */
    780
    781	iph = ip_hdr(skb);
    782
    783	mtu = ip_skb_dst_mtu(sk, skb);
    784	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
    785		mtu = IPCB(skb)->frag_max_size;
    786
    787	/*
    788	 *	Setup starting values.
    789	 */
    790
    791	hlen = iph->ihl * 4;
    792	mtu = mtu - hlen;	/* Size of data space */
    793	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
    794	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
    795
    796	/* When a frag_list is given, use it. First, check its validity:
    797	 * some transformers could create a wrong frag_list or break an
    798	 * existing one, which is not prohibited. In that case fall back to copying.
    799	 *
    800	 * LATER: this step can be merged into the real generation of fragments;
    801	 * we can switch to copying when we see the first bad fragment.
    802	 */
    803	if (skb_has_frag_list(skb)) {
    804		struct sk_buff *frag, *frag2;
    805		unsigned int first_len = skb_pagelen(skb);
    806
    807		if (first_len - hlen > mtu ||
    808		    ((first_len - hlen) & 7) ||
    809		    ip_is_fragment(iph) ||
    810		    skb_cloned(skb) ||
    811		    skb_headroom(skb) < ll_rs)
    812			goto slow_path;
    813
    814		skb_walk_frags(skb, frag) {
    815			/* Correct geometry. */
    816			if (frag->len > mtu ||
    817			    ((frag->len & 7) && frag->next) ||
    818			    skb_headroom(frag) < hlen + ll_rs)
    819				goto slow_path_clean;
    820
    821			/* Partially cloned skb? */
    822			if (skb_shared(frag))
    823				goto slow_path_clean;
    824
    825			BUG_ON(frag->sk);
    826			if (skb->sk) {
    827				frag->sk = skb->sk;
    828				frag->destructor = sock_wfree;
    829			}
    830			skb->truesize -= frag->truesize;
    831		}
    832
    833		/* Everything is OK. Generate! */
    834		ip_fraglist_init(skb, iph, hlen, &iter);
    835
    836		for (;;) {
    837			/* Prepare the header of the next frame
    838			 * before the previous one goes down. */
    839			if (iter.frag) {
    840				bool first_frag = (iter.offset == 0);
    841
    842				IPCB(iter.frag)->flags = IPCB(skb)->flags;
    843				ip_fraglist_prepare(skb, &iter);
    844				if (first_frag && IPCB(skb)->opt.optlen) {
    845					/* ipcb->opt is not populated for frags
    846					 * coming from __ip_make_skb(),
    847					 * ip_options_fragment() needs optlen
    848					 */
    849					IPCB(iter.frag)->opt.optlen =
    850						IPCB(skb)->opt.optlen;
    851					ip_options_fragment(iter.frag);
    852					ip_send_check(iter.iph);
    853				}
    854			}
    855
    856			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
    857			err = output(net, sk, skb);
    858
    859			if (!err)
    860				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
    861			if (err || !iter.frag)
    862				break;
    863
    864			skb = ip_fraglist_next(&iter);
    865		}
    866
    867		if (err == 0) {
    868			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
    869			return 0;
    870		}
    871
    872		kfree_skb_list(iter.frag);
    873
    874		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
    875		return err;
    876
    877slow_path_clean:
    878		skb_walk_frags(skb, frag2) {
    879			if (frag2 == frag)
    880				break;
    881			frag2->sk = NULL;
    882			frag2->destructor = NULL;
    883			skb->truesize += frag2->truesize;
    884		}
    885	}
    886
    887slow_path:
    888	/*
    889	 *	Fragment the datagram.
    890	 */
    891
    892	ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
    893		     &state);
    894
    895	/*
    896	 *	Keep copying data until we run out.
    897	 */
    898
    899	while (state.left > 0) {
    900		bool first_frag = (state.offset == 0);
    901
    902		skb2 = ip_frag_next(skb, &state);
    903		if (IS_ERR(skb2)) {
    904			err = PTR_ERR(skb2);
    905			goto fail;
    906		}
    907		ip_frag_ipcb(skb, skb2, first_frag);
    908
    909		/*
    910		 *	Put this fragment into the sending queue.
    911		 */
    912		skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
    913		err = output(net, sk, skb2);
    914		if (err)
    915			goto fail;
    916
    917		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
    918	}
    919	consume_skb(skb);
    920	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
    921	return err;
    922
    923fail:
    924	kfree_skb(skb);
    925	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
    926	return err;
    927}
    928EXPORT_SYMBOL(ip_do_fragment);
    929
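        /*
         * getfrag callback for ip_append_data(): copy user data from the
         * msghdr iterator into the skb, folding a checksum into skb->csum
         * when the hardware cannot do it (ip_summed != CHECKSUM_PARTIAL).
         */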
    930int
    931ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
    932{
    933	struct msghdr *msg = from;
    934
    935	if (skb->ip_summed == CHECKSUM_PARTIAL) {
    936		if (!copy_from_iter_full(to, len, &msg->msg_iter))
    937			return -EFAULT;
    938	} else {
    939		__wsum csum = 0;
    940		if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
    941			return -EFAULT;
    942		skb->csum = csum_block_add(skb->csum, csum, odd);
    943	}
    944	return 0;
    945}
    946EXPORT_SYMBOL(ip_generic_getfrag);
    947
    948static inline __wsum
    949csum_page(struct page *page, int offset, int copy)
    950{
    951	char *kaddr;
    952	__wsum csum;
    953	kaddr = kmap(page);
    954	csum = csum_partial(kaddr + offset, copy, 0);
    955	kunmap(page);
    956	return csum;
    957}
    958
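        /*
         * Core of ip_append_data()/ip_make_skb(): append user data to the
         * given queue as a chain of MTU-sized (or GSO-sized) skbs, handling
         * zerocopy, timestamping and page-frag coalescing; the queued skbs
         * are later merged into a single datagram by __ip_make_skb().
         */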
    959static int __ip_append_data(struct sock *sk,
    960			    struct flowi4 *fl4,
    961			    struct sk_buff_head *queue,
    962			    struct inet_cork *cork,
    963			    struct page_frag *pfrag,
    964			    int getfrag(void *from, char *to, int offset,
    965					int len, int odd, struct sk_buff *skb),
    966			    void *from, int length, int transhdrlen,
    967			    unsigned int flags)
    968{
    969	struct inet_sock *inet = inet_sk(sk);
    970	struct ubuf_info *uarg = NULL;
    971	struct sk_buff *skb;
    972
    973	struct ip_options *opt = cork->opt;
    974	int hh_len;
    975	int exthdrlen;
    976	int mtu;
    977	int copy;
    978	int err;
    979	int offset = 0;
    980	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
    981	int csummode = CHECKSUM_NONE;
    982	struct rtable *rt = (struct rtable *)cork->dst;
    983	unsigned int wmem_alloc_delta = 0;
    984	bool paged, extra_uref = false;
    985	u32 tskey = 0;
    986
    987	skb = skb_peek_tail(queue);
    988
    989	exthdrlen = !skb ? rt->dst.header_len : 0;
    990	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
    991	paged = !!cork->gso_size;
    992
    993	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
    994	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
    995		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
    996
    997	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
    998
    999	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
   1000	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
   1001	maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
   1002
   1003	if (cork->length + length > maxnonfragsize - fragheaderlen) {
   1004		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
   1005			       mtu - (opt ? opt->optlen : 0));
   1006		return -EMSGSIZE;
   1007	}
   1008
   1009	/*
   1010	 * transhdrlen > 0 means that this is the first fragment and we would
   1011	 * prefer that it not be fragmented later.
   1012	 */
   1013	if (transhdrlen &&
   1014	    length + fragheaderlen <= mtu &&
   1015	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
   1016	    (!(flags & MSG_MORE) || cork->gso_size) &&
   1017	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
   1018		csummode = CHECKSUM_PARTIAL;
   1019
   1020	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
   1021		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
   1022		if (!uarg)
   1023			return -ENOBUFS;
   1024		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
   1025		if (rt->dst.dev->features & NETIF_F_SG &&
   1026		    csummode == CHECKSUM_PARTIAL) {
   1027			paged = true;
   1028		} else {
   1029			uarg->zerocopy = 0;
   1030			skb_zcopy_set(skb, uarg, &extra_uref);
   1031		}
   1032	}
   1033
   1034	cork->length += length;
   1035
   1036	/* So, what's going on in the loop below?
   1037	 *
   1038	 * We use the calculated fragment length to generate a chain of skbs;
   1039	 * each segment is an IP fragment ready to be sent to the network once
   1040	 * the appropriate IP header has been added.
   1041	 */
   1042
   1043	if (!skb)
   1044		goto alloc_new_skb;
   1045
   1046	while (length > 0) {
   1047		/* Check if the remaining data fits into current packet. */
   1048		copy = mtu - skb->len;
   1049		if (copy < length)
   1050			copy = maxfraglen - skb->len;
   1051		if (copy <= 0) {
   1052			char *data;
   1053			unsigned int datalen;
   1054			unsigned int fraglen;
   1055			unsigned int fraggap;
   1056			unsigned int alloclen, alloc_extra;
   1057			unsigned int pagedlen;
   1058			struct sk_buff *skb_prev;
   1059alloc_new_skb:
   1060			skb_prev = skb;
   1061			if (skb_prev)
   1062				fraggap = skb_prev->len - maxfraglen;
   1063			else
   1064				fraggap = 0;
   1065
   1066			/*
   1067			 * If remaining data exceeds the mtu,
   1068			 * we know we need more fragment(s).
   1069			 */
   1070			datalen = length + fraggap;
   1071			if (datalen > mtu - fragheaderlen)
   1072				datalen = maxfraglen - fragheaderlen;
   1073			fraglen = datalen + fragheaderlen;
   1074			pagedlen = 0;
   1075
   1076			alloc_extra = hh_len + 15;
   1077			alloc_extra += exthdrlen;
   1078
   1079			/* The last fragment gets additional space at tail.
   1080			 * Note, with MSG_MORE we overallocate on fragments,
   1081			 * because we have no idea what fragment will be
   1082			 * the last.
   1083			 */
   1084			if (datalen == length + fraggap)
   1085				alloc_extra += rt->dst.trailer_len;
   1086
   1087			if ((flags & MSG_MORE) &&
   1088			    !(rt->dst.dev->features&NETIF_F_SG))
   1089				alloclen = mtu;
   1090			else if (!paged &&
   1091				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
   1092				  !(rt->dst.dev->features & NETIF_F_SG)))
   1093				alloclen = fraglen;
   1094			else {
   1095				alloclen = min_t(int, fraglen, MAX_HEADER);
   1096				pagedlen = fraglen - alloclen;
   1097			}
   1098
   1099			alloclen += alloc_extra;
   1100
   1101			if (transhdrlen) {
   1102				skb = sock_alloc_send_skb(sk, alloclen,
   1103						(flags & MSG_DONTWAIT), &err);
   1104			} else {
   1105				skb = NULL;
   1106				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
   1107				    2 * sk->sk_sndbuf)
   1108					skb = alloc_skb(alloclen,
   1109							sk->sk_allocation);
   1110				if (unlikely(!skb))
   1111					err = -ENOBUFS;
   1112			}
   1113			if (!skb)
   1114				goto error;
   1115
   1116			/*
   1117			 *	Fill in the control structures
   1118			 */
   1119			skb->ip_summed = csummode;
   1120			skb->csum = 0;
   1121			skb_reserve(skb, hh_len);
   1122
   1123			/*
   1124			 *	Find where to start putting bytes.
   1125			 */
   1126			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
   1127			skb_set_network_header(skb, exthdrlen);
   1128			skb->transport_header = (skb->network_header +
   1129						 fragheaderlen);
   1130			data += fragheaderlen + exthdrlen;
   1131
   1132			if (fraggap) {
   1133				skb->csum = skb_copy_and_csum_bits(
   1134					skb_prev, maxfraglen,
   1135					data + transhdrlen, fraggap);
   1136				skb_prev->csum = csum_sub(skb_prev->csum,
   1137							  skb->csum);
   1138				data += fraggap;
   1139				pskb_trim_unique(skb_prev, maxfraglen);
   1140			}
   1141
   1142			copy = datalen - transhdrlen - fraggap - pagedlen;
   1143			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
   1144				err = -EFAULT;
   1145				kfree_skb(skb);
   1146				goto error;
   1147			}
   1148
   1149			offset += copy;
   1150			length -= copy + transhdrlen;
   1151			transhdrlen = 0;
   1152			exthdrlen = 0;
   1153			csummode = CHECKSUM_NONE;
   1154
   1155			/* only the initial fragment is time stamped */
   1156			skb_shinfo(skb)->tx_flags = cork->tx_flags;
   1157			cork->tx_flags = 0;
   1158			skb_shinfo(skb)->tskey = tskey;
   1159			tskey = 0;
   1160			skb_zcopy_set(skb, uarg, &extra_uref);
   1161
   1162			if ((flags & MSG_CONFIRM) && !skb_prev)
   1163				skb_set_dst_pending_confirm(skb, 1);
   1164
   1165			/*
   1166			 * Put the packet on the pending queue.
   1167			 */
   1168			if (!skb->destructor) {
   1169				skb->destructor = sock_wfree;
   1170				skb->sk = sk;
   1171				wmem_alloc_delta += skb->truesize;
   1172			}
   1173			__skb_queue_tail(queue, skb);
   1174			continue;
   1175		}
   1176
   1177		if (copy > length)
   1178			copy = length;
   1179
   1180		if (!(rt->dst.dev->features&NETIF_F_SG) &&
   1181		    skb_tailroom(skb) >= copy) {
   1182			unsigned int off;
   1183
   1184			off = skb->len;
   1185			if (getfrag(from, skb_put(skb, copy),
   1186					offset, copy, off, skb) < 0) {
   1187				__skb_trim(skb, off);
   1188				err = -EFAULT;
   1189				goto error;
   1190			}
   1191		} else if (!uarg || !uarg->zerocopy) {
   1192			int i = skb_shinfo(skb)->nr_frags;
   1193
   1194			err = -ENOMEM;
   1195			if (!sk_page_frag_refill(sk, pfrag))
   1196				goto error;
   1197
   1198			if (!skb_can_coalesce(skb, i, pfrag->page,
   1199					      pfrag->offset)) {
   1200				err = -EMSGSIZE;
   1201				if (i == MAX_SKB_FRAGS)
   1202					goto error;
   1203
   1204				__skb_fill_page_desc(skb, i, pfrag->page,
   1205						     pfrag->offset, 0);
   1206				skb_shinfo(skb)->nr_frags = ++i;
   1207				get_page(pfrag->page);
   1208			}
   1209			copy = min_t(int, copy, pfrag->size - pfrag->offset);
   1210			if (getfrag(from,
   1211				    page_address(pfrag->page) + pfrag->offset,
   1212				    offset, copy, skb->len, skb) < 0)
   1213				goto error_efault;
   1214
   1215			pfrag->offset += copy;
   1216			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
   1217			skb->len += copy;
   1218			skb->data_len += copy;
   1219			skb->truesize += copy;
   1220			wmem_alloc_delta += copy;
   1221		} else {
   1222			err = skb_zerocopy_iter_dgram(skb, from, copy);
   1223			if (err < 0)
   1224				goto error;
   1225		}
   1226		offset += copy;
   1227		length -= copy;
   1228	}
   1229
   1230	if (wmem_alloc_delta)
   1231		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
   1232	return 0;
   1233
   1234error_efault:
   1235	err = -EFAULT;
   1236error:
   1237	net_zcopy_put_abort(uarg, extra_uref);
   1238	cork->length -= length;
   1239	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
   1240	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
   1241	return err;
   1242}
   1243
   1244static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
   1245			 struct ipcm_cookie *ipc, struct rtable **rtp)
   1246{
   1247	struct ip_options_rcu *opt;
   1248	struct rtable *rt;
   1249
   1250	rt = *rtp;
   1251	if (unlikely(!rt))
   1252		return -EFAULT;
   1253
   1254	/*
   1255	 * setup for corking.
   1256	 */
   1257	opt = ipc->opt;
   1258	if (opt) {
   1259		if (!cork->opt) {
   1260			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
   1261					    sk->sk_allocation);
   1262			if (unlikely(!cork->opt))
   1263				return -ENOBUFS;
   1264		}
   1265		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
   1266		cork->flags |= IPCORK_OPT;
   1267		cork->addr = ipc->addr;
   1268	}
   1269
   1270	cork->fragsize = ip_sk_use_pmtu(sk) ?
   1271			 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
   1272
   1273	if (!inetdev_valid_mtu(cork->fragsize))
   1274		return -ENETUNREACH;
   1275
   1276	cork->gso_size = ipc->gso_size;
   1277
   1278	cork->dst = &rt->dst;
   1279	/* We stole this route, caller should not release it. */
   1280	*rtp = NULL;
   1281
   1282	cork->length = 0;
   1283	cork->ttl = ipc->ttl;
   1284	cork->tos = ipc->tos;
   1285	cork->mark = ipc->sockc.mark;
   1286	cork->priority = ipc->priority;
   1287	cork->transmit_time = ipc->sockc.transmit_time;
   1288	cork->tx_flags = 0;
   1289	sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
   1290
   1291	return 0;
   1292}
   1293
   1294/*
   1295 *	ip_append_data() and ip_append_page() can make one large IP datagram
   1296 *	from many pieces of data. Each piece will be held on the socket
   1297 *	until ip_push_pending_frames() is called. Each piece can be a page
   1298 *	or non-page data.
   1299 *
   1300 *	Not only UDP but also other transport protocols - e.g. raw sockets -
   1301 *	can potentially use this interface.
   1302 *
   1303 *	LATER: length must be adjusted by pad at tail, when it is required.
   1304 */
   1305int ip_append_data(struct sock *sk, struct flowi4 *fl4,
   1306		   int getfrag(void *from, char *to, int offset, int len,
   1307			       int odd, struct sk_buff *skb),
   1308		   void *from, int length, int transhdrlen,
   1309		   struct ipcm_cookie *ipc, struct rtable **rtp,
   1310		   unsigned int flags)
   1311{
   1312	struct inet_sock *inet = inet_sk(sk);
   1313	int err;
   1314
   1315	if (flags&MSG_PROBE)
   1316		return 0;
   1317
   1318	if (skb_queue_empty(&sk->sk_write_queue)) {
   1319		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
   1320		if (err)
   1321			return err;
   1322	} else {
   1323		transhdrlen = 0;
   1324	}
   1325
   1326	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
   1327				sk_page_frag(sk), getfrag,
   1328				from, length, transhdrlen, flags);
   1329}
   1330
   1331ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
   1332		       int offset, size_t size, int flags)
   1333{
   1334	struct inet_sock *inet = inet_sk(sk);
   1335	struct sk_buff *skb;
   1336	struct rtable *rt;
   1337	struct ip_options *opt = NULL;
   1338	struct inet_cork *cork;
   1339	int hh_len;
   1340	int mtu;
   1341	int len;
   1342	int err;
   1343	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
   1344
   1345	if (inet->hdrincl)
   1346		return -EPERM;
   1347
   1348	if (flags&MSG_PROBE)
   1349		return 0;
   1350
   1351	if (skb_queue_empty(&sk->sk_write_queue))
   1352		return -EINVAL;
   1353
   1354	cork = &inet->cork.base;
   1355	rt = (struct rtable *)cork->dst;
   1356	if (cork->flags & IPCORK_OPT)
   1357		opt = cork->opt;
   1358
   1359	if (!(rt->dst.dev->features & NETIF_F_SG))
   1360		return -EOPNOTSUPP;
   1361
   1362	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
   1363	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
   1364
   1365	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
   1366	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
   1367	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
   1368
   1369	if (cork->length + size > maxnonfragsize - fragheaderlen) {
   1370		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
   1371			       mtu - (opt ? opt->optlen : 0));
   1372		return -EMSGSIZE;
   1373	}
   1374
   1375	skb = skb_peek_tail(&sk->sk_write_queue);
   1376	if (!skb)
   1377		return -EINVAL;
   1378
   1379	cork->length += size;
   1380
   1381	while (size > 0) {
   1382		/* Check if the remaining data fits into current packet. */
   1383		len = mtu - skb->len;
   1384		if (len < size)
   1385			len = maxfraglen - skb->len;
   1386
   1387		if (len <= 0) {
   1388			struct sk_buff *skb_prev;
   1389			int alloclen;
   1390
   1391			skb_prev = skb;
   1392			fraggap = skb_prev->len - maxfraglen;
   1393
   1394			alloclen = fragheaderlen + hh_len + fraggap + 15;
   1395			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
   1396			if (unlikely(!skb)) {
   1397				err = -ENOBUFS;
   1398				goto error;
   1399			}
   1400
   1401			/*
   1402			 *	Fill in the control structures
   1403			 */
   1404			skb->ip_summed = CHECKSUM_NONE;
   1405			skb->csum = 0;
   1406			skb_reserve(skb, hh_len);
   1407
   1408			/*
   1409			 *	Find where to start putting bytes.
   1410			 */
   1411			skb_put(skb, fragheaderlen + fraggap);
   1412			skb_reset_network_header(skb);
   1413			skb->transport_header = (skb->network_header +
   1414						 fragheaderlen);
   1415			if (fraggap) {
   1416				skb->csum = skb_copy_and_csum_bits(skb_prev,
   1417								   maxfraglen,
   1418						    skb_transport_header(skb),
   1419								   fraggap);
   1420				skb_prev->csum = csum_sub(skb_prev->csum,
   1421							  skb->csum);
   1422				pskb_trim_unique(skb_prev, maxfraglen);
   1423			}
   1424
   1425			/*
   1426			 * Put the packet on the pending queue.
   1427			 */
   1428			__skb_queue_tail(&sk->sk_write_queue, skb);
   1429			continue;
   1430		}
   1431
   1432		if (len > size)
   1433			len = size;
   1434
   1435		if (skb_append_pagefrags(skb, page, offset, len)) {
   1436			err = -EMSGSIZE;
   1437			goto error;
   1438		}
   1439
   1440		if (skb->ip_summed == CHECKSUM_NONE) {
   1441			__wsum csum;
   1442			csum = csum_page(page, offset, len);
   1443			skb->csum = csum_block_add(skb->csum, csum, skb->len);
   1444		}
   1445
   1446		skb->len += len;
   1447		skb->data_len += len;
   1448		skb->truesize += len;
   1449		refcount_add(len, &sk->sk_wmem_alloc);
   1450		offset += len;
   1451		size -= len;
   1452	}
   1453	return 0;
   1454
   1455error:
   1456	cork->length -= size;
   1457	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
   1458	return err;
   1459}
   1460
   1461static void ip_cork_release(struct inet_cork *cork)
   1462{
   1463	cork->flags &= ~IPCORK_OPT;
   1464	kfree(cork->opt);
   1465	cork->opt = NULL;
   1466	dst_release(cork->dst);
   1467	cork->dst = NULL;
   1468}
   1469
   1470/*
   1471 *	Combine all pending IP fragments on the socket into one IP datagram
   1472 *	and push them out.
   1473 */
   1474struct sk_buff *__ip_make_skb(struct sock *sk,
   1475			      struct flowi4 *fl4,
   1476			      struct sk_buff_head *queue,
   1477			      struct inet_cork *cork)
   1478{
   1479	struct sk_buff *skb, *tmp_skb;
   1480	struct sk_buff **tail_skb;
   1481	struct inet_sock *inet = inet_sk(sk);
   1482	struct net *net = sock_net(sk);
   1483	struct ip_options *opt = NULL;
   1484	struct rtable *rt = (struct rtable *)cork->dst;
   1485	struct iphdr *iph;
   1486	__be16 df = 0;
   1487	__u8 ttl;
   1488
   1489	skb = __skb_dequeue(queue);
   1490	if (!skb)
   1491		goto out;
   1492	tail_skb = &(skb_shinfo(skb)->frag_list);
   1493
   1494	/* move skb->data to ip header from ext header */
   1495	if (skb->data < skb_network_header(skb))
   1496		__skb_pull(skb, skb_network_offset(skb));
   1497	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
   1498		__skb_pull(tmp_skb, skb_network_header_len(skb));
   1499		*tail_skb = tmp_skb;
   1500		tail_skb = &(tmp_skb->next);
   1501		skb->len += tmp_skb->len;
   1502		skb->data_len += tmp_skb->len;
   1503		skb->truesize += tmp_skb->truesize;
   1504		tmp_skb->destructor = NULL;
   1505		tmp_skb->sk = NULL;
   1506	}
   1507
   1508	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
   1509	 * allow the frame generated here to be fragmented locally. No matter
   1510	 * how transforms change the size of the packet, it will come out.
   1511	 */
   1512	skb->ignore_df = ip_sk_ignore_df(sk);
   1513
   1514	/* DF bit is set when we want to see DF on outgoing frames.
   1515	 * If ignore_df is set too, we still allow this frame to be fragmented
   1516	 * locally. */
   1517	if (inet->pmtudisc == IP_PMTUDISC_DO ||
   1518	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
   1519	    (skb->len <= dst_mtu(&rt->dst) &&
   1520	     ip_dont_fragment(sk, &rt->dst)))
   1521		df = htons(IP_DF);
   1522
   1523	if (cork->flags & IPCORK_OPT)
   1524		opt = cork->opt;
   1525
   1526	if (cork->ttl != 0)
   1527		ttl = cork->ttl;
   1528	else if (rt->rt_type == RTN_MULTICAST)
   1529		ttl = inet->mc_ttl;
   1530	else
   1531		ttl = ip_select_ttl(inet, &rt->dst);
   1532
   1533	iph = ip_hdr(skb);
   1534	iph->version = 4;
   1535	iph->ihl = 5;
   1536	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
   1537	iph->frag_off = df;
   1538	iph->ttl = ttl;
   1539	iph->protocol = sk->sk_protocol;
   1540	ip_copy_addrs(iph, fl4);
   1541	ip_select_ident(net, skb, sk);
   1542
   1543	if (opt) {
   1544		iph->ihl += opt->optlen >> 2;
   1545		ip_options_build(skb, opt, cork->addr, rt);
   1546	}
   1547
   1548	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
   1549	skb->mark = cork->mark;
   1550	skb->tstamp = cork->transmit_time;
   1551	/*
   1552	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
   1553	 * on dst refcount
   1554	 */
   1555	cork->dst = NULL;
   1556	skb_dst_set(skb, &rt->dst);
   1557
   1558	if (iph->protocol == IPPROTO_ICMP)
   1559		icmp_out_count(net, ((struct icmphdr *)
   1560			skb_transport_header(skb))->type);
   1561
   1562	ip_cork_release(cork);
   1563out:
   1564	return skb;
   1565}
   1566
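        /*
         * Hand a finished datagram to ip_local_out(), translating positive
         * congestion-notification codes via net_xmit_errno() and counting
         * real failures as OUTDISCARDS.
         */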
   1567int ip_send_skb(struct net *net, struct sk_buff *skb)
   1568{
   1569	int err;
   1570
   1571	err = ip_local_out(net, skb->sk, skb);
   1572	if (err) {
   1573		if (err > 0)
   1574			err = net_xmit_errno(err);
   1575		if (err)
   1576			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
   1577	}
   1578
   1579	return err;
   1580}
   1581
   1582int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
   1583{
   1584	struct sk_buff *skb;
   1585
   1586	skb = ip_finish_skb(sk, fl4);
   1587	if (!skb)
   1588		return 0;
   1589
   1590	/* Netfilter gets the whole, not yet fragmented skb. */
   1591	return ip_send_skb(sock_net(sk), skb);
   1592}
   1593
   1594/*
   1595 *	Throw away all pending data on the socket.
   1596 */
   1597static void __ip_flush_pending_frames(struct sock *sk,
   1598				      struct sk_buff_head *queue,
   1599				      struct inet_cork *cork)
   1600{
   1601	struct sk_buff *skb;
   1602
   1603	while ((skb = __skb_dequeue_tail(queue)) != NULL)
   1604		kfree_skb(skb);
   1605
   1606	ip_cork_release(cork);
   1607}
   1608
   1609void ip_flush_pending_frames(struct sock *sk)
   1610{
   1611	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
   1612}
   1613
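        /*
         * Single-shot variant of the append/push interface: build the whole
         * datagram on a private queue in one call (used e.g. by the UDP
         * fast path) and return the assembled skb for the caller to send
         * with ip_send_skb().
         */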
   1614struct sk_buff *ip_make_skb(struct sock *sk,
   1615			    struct flowi4 *fl4,
   1616			    int getfrag(void *from, char *to, int offset,
   1617					int len, int odd, struct sk_buff *skb),
   1618			    void *from, int length, int transhdrlen,
   1619			    struct ipcm_cookie *ipc, struct rtable **rtp,
   1620			    struct inet_cork *cork, unsigned int flags)
   1621{
   1622	struct sk_buff_head queue;
   1623	int err;
   1624
   1625	if (flags & MSG_PROBE)
   1626		return NULL;
   1627
   1628	__skb_queue_head_init(&queue);
   1629
   1630	cork->flags = 0;
   1631	cork->addr = 0;
   1632	cork->opt = NULL;
   1633	err = ip_setup_cork(sk, cork, ipc, rtp);
   1634	if (err)
   1635		return ERR_PTR(err);
   1636
   1637	err = __ip_append_data(sk, fl4, &queue, cork,
   1638			       &current->task_frag, getfrag,
   1639			       from, length, transhdrlen, flags);
   1640	if (err) {
   1641		__ip_flush_pending_frames(sk, &queue, cork);
   1642		return ERR_PTR(err);
   1643	}
   1644
   1645	return __ip_make_skb(sk, fl4, &queue, cork);
   1646}
   1647
   1648/*
   1649 *	Fetch data from kernel space and fill in checksum if needed.
   1650 */
   1651static int ip_reply_glue_bits(void *dptr, char *to, int offset,
   1652			      int len, int odd, struct sk_buff *skb)
   1653{
   1654	__wsum csum;
   1655
   1656	csum = csum_partial_copy_nocheck(dptr+offset, to, len);
   1657	skb->csum = csum_block_add(skb->csum, csum, odd);
   1658	return 0;
   1659}
   1660
   1661/*
   1662 *	Generic function to send a packet in reply to another packet.
   1663 *	So far it is used to send some TCP resets and acks.
   1664 */
   1665void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
   1666			   const struct ip_options *sopt,
   1667			   __be32 daddr, __be32 saddr,
   1668			   const struct ip_reply_arg *arg,
   1669			   unsigned int len, u64 transmit_time)
   1670{
   1671	struct ip_options_data replyopts;
   1672	struct ipcm_cookie ipc;
   1673	struct flowi4 fl4;
   1674	struct rtable *rt = skb_rtable(skb);
   1675	struct net *net = sock_net(sk);
   1676	struct sk_buff *nskb;
   1677	int err;
   1678	int oif;
   1679
   1680	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
   1681		return;
   1682
   1683	ipcm_init(&ipc);
   1684	ipc.addr = daddr;
   1685	ipc.sockc.transmit_time = transmit_time;
   1686
   1687	if (replyopts.opt.opt.optlen) {
   1688		ipc.opt = &replyopts.opt;
   1689
   1690		if (replyopts.opt.opt.srr)
   1691			daddr = replyopts.opt.opt.faddr;
   1692	}
   1693
   1694	oif = arg->bound_dev_if;
   1695	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
   1696		oif = skb->skb_iif;
   1697
   1698	flowi4_init_output(&fl4, oif,
   1699			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
   1700			   RT_TOS(arg->tos),
   1701			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
   1702			   ip_reply_arg_flowi_flags(arg),
   1703			   daddr, saddr,
   1704			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
   1705			   arg->uid);
   1706	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
   1707	rt = ip_route_output_key(net, &fl4);
   1708	if (IS_ERR(rt))
   1709		return;
   1710
   1711	inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
   1712
   1713	sk->sk_protocol = ip_hdr(skb)->protocol;
   1714	sk->sk_bound_dev_if = arg->bound_dev_if;
   1715	sk->sk_sndbuf = sysctl_wmem_default;
   1716	ipc.sockc.mark = fl4.flowi4_mark;
   1717	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
   1718			     len, 0, &ipc, &rt, MSG_DONTWAIT);
   1719	if (unlikely(err)) {
   1720		ip_flush_pending_frames(sk);
   1721		goto out;
   1722	}
   1723
   1724	nskb = skb_peek(&sk->sk_write_queue);
   1725	if (nskb) {
   1726		if (arg->csumoffset >= 0)
   1727			*((__sum16 *)skb_transport_header(nskb) +
   1728			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
   1729								arg->csum));
   1730		nskb->ip_summed = CHECKSUM_NONE;
   1731		nskb->mono_delivery_time = !!transmit_time;
   1732		ip_push_pending_frames(sk, &fl4);
   1733	}
   1734out:
   1735	ip_rt_put(rt);
   1736}
   1737
   1738void __init ip_init(void)
   1739{
   1740	ip_rt_init();
   1741	inet_initpeers();
   1742
   1743#if defined(CONFIG_IP_MULTICAST)
   1744	igmp_mc_init();
   1745#endif
   1746}