From 89527be8d8d672773eeaec910118a6e84fb597e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:56 -0700 Subject: net: add IFLA_TSO_{MAX_SIZE|SEGS} attributes New netlink attributes IFLA_TSO_MAX_SIZE and IFLA_TSO_MAX_SEGS are used to report to user-space the device TSO limits. ip -d link sh dev eth1 ... tso_max_size 65536 tso_max_segs 65535 Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d1e600816b82..5f58dcfe2787 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -368,6 +368,8 @@ enum { IFLA_PARENT_DEV_NAME, IFLA_PARENT_DEV_BUS_NAME, IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, __IFLA_MAX }; -- cgit v1.2.3-71-gd317 From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- drivers/net/ethernet/sfc/ef100_nic.c | 3 ++- drivers/net/ethernet/sfc/falcon/tx.c | 3 ++- drivers/net/ethernet/sfc/tx_common.c | 3 ++- drivers/net/ethernet/synopsys/dwc-xlgmac.h | 3 ++- drivers/net/hyperv/rndis_filter.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- include/linux/netdevice.h | 4 +++- net/bpf/test_run.c | 2 +- net/core/dev.c | 7 ++++--- net/core/rtnetlink.c | 2 +- net/core/sock.c | 14 ++++++++++++++ net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/sctp/output.c | 3 ++- 16 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 607a2c90513b..d9547552ceef 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -151,7 +151,8 @@ #define XGBE_TX_MAX_BUF_SIZE (0x3fff & ~(64 - 1)) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XGBE_TX_MAX_SPLIT ((GSO_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) +#define XGBE_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for an SKB: * - Maximum number of SKB frags diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fb11081001a0..838870bc6dbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; } static void diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index a69d756e09b9..b2536d2c218a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1008,7 +1008,8 @@ static int ef100_process_design_param(struct efx_nic *efx, } return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: - nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); + nic_data->tso_max_payload_len = min_t(u64, reader->value, + GSO_LEGACY_MAX_SIZE); netif_set_tso_max_size(efx->net_dev, nic_data->tso_max_payload_len); return 0; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index f7306e93a8b8..b9369483758c 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -98,7 +98,8 @@ unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EF4_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EF4_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EF4_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 9bc8281b7f5b..658ea2d34070 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -416,7 +416,8 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EFX_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EFX_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac.h b/drivers/net/ethernet/synopsys/dwc-xlgmac.h index 98e3a271e017..a848e10f3ea4 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac.h +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac.h @@ -38,7 +38,8 @@ #define XLGMAC_RX_DESC_MAX_DIRTY (XLGMAC_RX_DESC_CNT >> 3) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XLGMAC_TX_MAX_SPLIT ((GSO_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) +#define XLGMAC_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for a SKB */ #define XLGMAC_TX_MAX_DESC_NR (MAX_SKB_FRAGS + XLGMAC_TX_MAX_SPLIT + 2) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 866af2cc27a3..6da36cb8af80 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1349,7 +1349,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_MAX_SIZE; + unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; /* Find HW offload capabilities */ diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 44ca6110213c..79b2827e4081 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -667,7 +667,7 @@ static void fcoe_netdev_features_change(struct fc_lport *lport, if (netdev->features & NETIF_F_FSO) { lport->seq_offload = 1; - lport->lso_max = netdev->gso_max_size; + lport->lso_max = min(netdev->gso_max_size, GSO_LEGACY_MAX_SIZE); FCOE_NETDEV_DBG(netdev, "Supports LSO for max len 0x%x\n", lport->lso_max); } else { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8d54fef9a568..9b5a1f630bb0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1001,7 +1001,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || - __skb->wire_len > GSO_MAX_SIZE) + __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } diff --git a/net/core/dev.c b/net/core/dev.c index a601da3b4a7c..830beb05161a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2998,11 +2998,12 @@ EXPORT_SYMBOL(netif_set_real_num_queues); * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. - * Unless explicitly set the stack will assume the value of %GSO_MAX_SIZE. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { - dev->tso_max_size = size; + dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); } @@ -10595,7 +10596,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f35cc21298ac..f2b0f747d3d2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2817,7 +2817,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE || max_size > dev->tso_max_size) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } diff --git a/net/core/sock.c b/net/core/sock.c index 6b287eb5427b..24a46a1e4f28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2293,6 +2293,19 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -2312,6 +2325,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c7d30a3bbd81..075e744bfb48 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -310,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) */ bytes = min_t(unsigned long, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); return min(segs, 0x7FU); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b092228e4342..b4b2284ed4a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1553,7 +1553,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ - limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && diff --git a/net/sctp/output.c b/net/sctp/output.c index 72fe6669c50d..a63df055ac57 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -134,7 +134,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } - packet->max_size = sk_can_gso(sk) ? READ_ONCE(tp->dst->dev->gso_max_size) + packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), + GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } -- cgit v1.2.3-71-gd317 From 34b92e8d19da1e44070dc0ecc2747871469ac534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:58 -0700 Subject: net: limit GSO_MAX_SIZE to 524280 bytes Make sure we will not overflow shinfo->gso_segs Minimal TCP MSS size is 8 bytes, and shinfo->gso_segs is a 16bit field. TCP_MIN_GSO_SIZE is currently defined in include/net/tcp.h, it seems cleaner to not bring tcp details into include/linux/netdevice.h Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce780e352f43..fd38847d0dc7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,14 +2272,17 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ +#define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -#define GSO_MAX_SIZE UINT_MAX +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -#define GSO_MAX_SEGS 65535 u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; -- cgit v1.2.3-71-gd317 From 7c96d8ec96bb71aac54c9f872aaa65d7411ab864 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:00 -0700 Subject: ipv6: add struct hop_jumbo_hdr definition Following patches will need to add and remove local IPv6 jumbogram options to enable BIG TCP. Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ipv6.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 213612f1680c..63d019953c47 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -151,6 +151,17 @@ struct frag_hdr { __be32 identification; }; +/* + * Jumbo payload option, as described in RFC 2675 2. + */ +struct hop_jumbo_hdr { + u8 nexthdr; + u8 hdrlen; + u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ + u8 tlv_len; /* 4 */ + __be32 jumbo_payload_len; +}; + #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 -- cgit v1.2.3-71-gd317 From 09f3d1a3a52c696208008618a67e2c7c3fb16d41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:01 -0700 Subject: ipv6/gso: remove temporary HBH/jumbo header ipv6 tcp and gro stacks will soon be able to build big TCP packets, with an added temporary Hop By Hop header. If GSO is involved for these large packets, we need to remove the temporary HBH header before segmentation happens. v2: perform HBH removal from ipv6_gso_segment() instead of skb_segment() (Alexander feedback) Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++ net/ipv6/ip6_offload.c | 24 +++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 63d019953c47..b6df0314aa02 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -467,6 +467,39 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); +/* This helper is specialized for BIG TCP needs. + * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. + * It assumes headers are already in skb->head. + * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. + */ +static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) +{ + const struct hop_jumbo_hdr *jhdr; + const struct ipv6hdr *nhdr; + + if (likely(skb->len <= GRO_MAX_SIZE)) + return 0; + + if (skb->protocol != htons(ETH_P_IPV6)) + return 0; + + if (skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) + return 0; + + nhdr = ipv6_hdr(skb); + + if (nhdr->nexthdr != NEXTHDR_HOP) + return 0; + + jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); + if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || + jhdr->nexthdr != IPPROTO_TCP) + return 0; + return jhdr->nexthdr; +} + static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c4fc03c1ac99..a6a6c1539c28 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto; + int proto, nexthdr; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -87,6 +87,28 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); + nexthdr = ipv6_has_hopopt_jumbo(skb); + if (nexthdr) { + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int err; + + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + + /* remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][TCP header] + */ + memmove(skb_mac_header(skb) + hophdr_len, + skb_mac_header(skb), + ETH_HLEN + sizeof(struct ipv6hdr)); + skb->data += hophdr_len; + skb->len -= hophdr_len; + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + ipv6h = (struct ipv6hdr *)skb->data; + ipv6h->nexthdr = nexthdr; + } nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; -- cgit v1.2.3-71-gd317 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- include/linux/netdevice.h | 6 +++++- include/net/ipv6.h | 2 +- net/core/dev.c | 2 +- net/core/gro.c | 8 ++++++++ net/core/rtnetlink.c | 8 -------- 6 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 838870bc6dbd..24de37b79f5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b6df0314aa02..5b38bf1a586b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -477,7 +477,7 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; - if (likely(skb->len <= GRO_MAX_SIZE)) + if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) diff --git a/net/core/dev.c b/net/core/dev.c index 830beb05161a..d93456c75b55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10598,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; diff --git a/net/core/gro.c b/net/core/gro.c index 78110edf5d4b..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -167,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f2b0f747d3d2..ac45328607f7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2360,14 +2360,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } - if (tb[IFLA_GRO_MAX_SIZE]) { - u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); - - if (gro_max_size > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } - } return 0; } -- cgit v1.2.3-71-gd317 From 80e425b613421911f89664663a7060216abcaed2 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Fri, 13 May 2022 11:34:04 -0700 Subject: ipv6: Add hop-by-hop header to jumbograms in ip6_output Instead of simply forcing a 0 payload_len in IPv6 header, implement RFC 2675 and insert a custom extension header. Note that only TCP stack is currently potentially generating jumbograms, and that this extension header is purely local, it wont be sent on a physical link. This is needed so that packet capture (tcpdump and friends) can properly dissect these large packets. Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + net/ipv6/ip6_output.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ec5ca392eaa3..38c8203d52cb 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -145,6 +145,7 @@ struct inet6_skb_parm { #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 +#define IP6SKB_FAKEJUMBO 512 }; #if defined(CONFIG_NET_L3_MASTER_DEV) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index afa5bd4ad167..4081b12a01ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -182,7 +182,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff #endif mtu = ip6_skb_dst_mtu(skb); - if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + if (skb_is_gso(skb) && + !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && + !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); if ((skb->len > mtu && !skb_is_gso(skb)) || @@ -252,6 +254,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); + struct hop_jumbo_hdr *hop_jumbo; + int hoplen = sizeof(*hop_jumbo); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; @@ -259,7 +263,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int hlimit = -1; u32 mtu; - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); + head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; @@ -282,6 +286,20 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, &fl6->saddr); } + if (unlikely(seg_len > IPV6_MAXPLEN)) { + hop_jumbo = skb_push(skb, hoplen); + + hop_jumbo->nexthdr = proto; + hop_jumbo->hdrlen = 0; + hop_jumbo->tlv_type = IPV6_TLV_JUMBO; + hop_jumbo->tlv_len = 4; + hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); + + proto = IPPROTO_HOPOPTS; + seg_len = 0; + IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); -- cgit v1.2.3-71-gd317