From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- drivers/net/ethernet/sfc/ef100_nic.c | 3 ++- drivers/net/ethernet/sfc/falcon/tx.c | 3 ++- drivers/net/ethernet/sfc/tx_common.c | 3 ++- drivers/net/ethernet/synopsys/dwc-xlgmac.h | 3 ++- drivers/net/hyperv/rndis_filter.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- 8 files changed, 13 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 607a2c90513b..d9547552ceef 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -151,7 +151,8 @@ #define XGBE_TX_MAX_BUF_SIZE (0x3fff & ~(64 - 1)) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XGBE_TX_MAX_SPLIT ((GSO_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) +#define XGBE_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for an SKB: * - Maximum number of SKB frags diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fb11081001a0..838870bc6dbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; } static void diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index a69d756e09b9..b2536d2c218a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1008,7 +1008,8 @@ static int ef100_process_design_param(struct efx_nic *efx, } return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: - nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); + nic_data->tso_max_payload_len = min_t(u64, reader->value, + GSO_LEGACY_MAX_SIZE); netif_set_tso_max_size(efx->net_dev, nic_data->tso_max_payload_len); return 0; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index f7306e93a8b8..b9369483758c 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -98,7 +98,8 @@ unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EF4_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EF4_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EF4_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 9bc8281b7f5b..658ea2d34070 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -416,7 +416,8 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EFX_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EFX_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac.h b/drivers/net/ethernet/synopsys/dwc-xlgmac.h index 98e3a271e017..a848e10f3ea4 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac.h +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac.h @@ -38,7 +38,8 @@ #define XLGMAC_RX_DESC_MAX_DIRTY (XLGMAC_RX_DESC_CNT >> 3) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XLGMAC_TX_MAX_SPLIT ((GSO_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) +#define XLGMAC_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for a SKB */ #define XLGMAC_TX_MAX_DESC_NR (MAX_SKB_FRAGS + XLGMAC_TX_MAX_SPLIT + 2) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 866af2cc27a3..6da36cb8af80 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1349,7 +1349,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_MAX_SIZE; + unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; /* Find HW offload capabilities */ diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 44ca6110213c..79b2827e4081 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -667,7 +667,7 @@ static void fcoe_netdev_features_change(struct fc_lport *lport, if (netdev->features & NETIF_F_FSO) { lport->seq_offload = 1; - lport->lso_max = netdev->gso_max_size; + lport->lso_max = min(netdev->gso_max_size, GSO_LEGACY_MAX_SIZE); FCOE_NETDEV_DBG(netdev, "Supports LSO for max len 0x%x\n", lport->lso_max); } else { -- cgit v1.2.3-71-gd317 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- include/linux/netdevice.h | 6 +++++- include/net/ipv6.h | 2 +- net/core/dev.c | 2 +- net/core/gro.c | 8 ++++++++ net/core/rtnetlink.c | 8 -------- 6 files changed, 16 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 838870bc6dbd..24de37b79f5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b6df0314aa02..5b38bf1a586b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -477,7 +477,7 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; - if (likely(skb->len <= GRO_MAX_SIZE)) + if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) diff --git a/net/core/dev.c b/net/core/dev.c index 830beb05161a..d93456c75b55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10598,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; diff --git a/net/core/gro.c b/net/core/gro.c index 78110edf5d4b..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -167,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f2b0f747d3d2..ac45328607f7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2360,14 +2360,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } - if (tb[IFLA_GRO_MAX_SIZE]) { - u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); - - if (gro_max_size > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } - } return 0; } -- cgit v1.2.3-71-gd317 From d6f938ce52f9adb23f4c31cc371654a5f18ff328 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:05 -0700 Subject: net: loopback: enable BIG TCP packets Set the driver limit to GSO_MAX_SIZE (512 KB). This allows the admin/user to set a GSO limit up to this value. Tested: ip link set dev lo gso_max_size 200000 netperf -H ::1 -t TCP_RR -l 100 -- -r 80000,80000 & tcpdump shows : 18:28:42.962116 IP6 ::1 > ::1: HBH 40051 > 63780: Flags [P.], seq 3626480001:3626560001, ack 3626560001, win 17743, options [nop,nop,TS val 3771179265 ecr 3771179265], length 80000 18:28:42.962138 IP6 ::1.63780 > ::1.40051: Flags [.], ack 3626560001, win 17743, options [nop,nop,TS val 3771179265 ecr 3771179265], length 0 18:28:42.962152 IP6 ::1 > ::1: HBH 63780 > 40051: Flags [P.], seq 3626560001:3626640001, ack 3626560001, win 17743, options [nop,nop,TS val 3771179265 ecr 3771179265], length 80000 18:28:42.962157 IP6 ::1.40051 > ::1.63780: Flags [.], ack 3626640001, win 17743, options [nop,nop,TS val 3771179265 ecr 3771179265], length 0 18:28:42.962180 IP6 ::1 > ::1: HBH 40051 > 63780: Flags [P.], seq 3626560001:3626640001, ack 3626640001, win 17743, options [nop,nop,TS val 3771179265 ecr 3771179265], length 80000 18:28:42.962214 IP6 ::1.63780 > ::1.40051: Flags [.], ack 3626640001, win 17743, options [nop,nop,TS val 3771179266 ecr 3771179265], length 0 18:28:42.962228 IP6 ::1 > ::1: HBH 63780 > 40051: Flags [P.], seq 3626640001:3626720001, ack 3626640001, win 17743, options [nop,nop,TS val 3771179266 ecr 3771179265], length 80000 18:28:42.962233 IP6 ::1.40051 > ::1.63780: Flags [.], ack 3626720001, win 17743, options [nop,nop,TS val 3771179266 ecr 3771179266], length 0 Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/loopback.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 720394c0639b..14e8d04cb434 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -191,6 +191,8 @@ static void gen_lo_setup(struct net_device *dev, dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; dev->priv_destructor = dev_destructor; + + netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* The loopback device is special. There is only one instance -- cgit v1.2.3-71-gd317 From d406099d6a150d6ee09b46d660feca348f9f9bba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:06 -0700 Subject: veth: enable BIG TCP packets Set the TSO driver limit to GSO_MAX_SIZE (512 KB). This allows the admin/user to set a GSO limit up to this value. ip link set dev veth10 gso_max_size 200000 Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/veth.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f474e79a7745..466da01ba2e3 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1647,6 +1647,7 @@ static void veth_setup(struct net_device *dev) dev->hw_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; + netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* -- cgit v1.2.3-71-gd317 From 1169a64265c4ea7100091228c98d4267f041b0e7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:07 -0700 Subject: mlx4: support BIG TCP packets mlx4 supports LSOv2 just fine. IPv6 stack inserts a temporary Hop-by-Hop header with JUMBO TLV for big packets. We need to ignore the HBH header when populating TX descriptor. Tested: Before: (not enabling bigger TSO/GRO packets) ip link set dev eth0 gso_max_size 65536 gro_max_size 65536 netperf -H lpaa18 -t TCP_RR -T2,2 -l 10 -Cc -- -r 70000,70000 MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to lpaa18.prod.google.com () port 0 AF_INET6 : first burst 0 : cpu bind Local /Remote Socket Size Request Resp. Elapsed Trans. CPU CPU S.dem S.dem Send Recv Size Size Time Rate local remote local remote bytes bytes bytes bytes secs. per sec % S % S us/Tr us/Tr 262144 540000 70000 70000 10.00 6591.45 0.86 1.34 62.490 97.446 262144 540000 After: (enabling bigger TSO/GRO packets) ip link set dev eth0 gso_max_size 185000 gro_max_size 185000 netperf -H lpaa18 -t TCP_RR -T2,2 -l 10 -Cc -- -r 70000,70000 MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to lpaa18.prod.google.com () port 0 AF_INET6 : first burst 0 : cpu bind Local /Remote Socket Size Request Resp. Elapsed Trans. CPU CPU S.dem S.dem Send Recv Size Size Time Rate local remote local remote bytes bytes bytes bytes secs. per sec % S % S us/Tr us/Tr 262144 540000 70000 70000 10.00 8383.95 0.95 1.01 54.432 57.584 262144 540000 Signed-off-by: Eric Dumazet Reviewed-by: Tariq Toukan Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 ++ drivers/net/ethernet/mellanox/mlx4/en_tx.c | 47 +++++++++++++++++++++----- 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index c61dc7ae0c05..ca4b93a01034 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3417,6 +3417,9 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = priv->max_mtu; + /* supports LSOv2 packets. */ + netif_set_tso_max_size(dev, GSO_MAX_SIZE); + mdev->pndev[port] = dev; mdev->upper[port] = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index f777151d226f..af3b2b59a2a6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "mlx4_en.h" @@ -634,19 +635,28 @@ static int get_real_size(const struct sk_buff *skb, struct net_device *dev, int *lso_header_size, bool *inline_ok, - void **pfrag) + void **pfrag, + int *hopbyhop) { struct mlx4_en_priv *priv = netdev_priv(dev); int real_size; if (shinfo->gso_size) { *inline_ok = false; - if (skb->encapsulation) + *hopbyhop = 0; + if (skb->encapsulation) { *lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb); - else + } else { + /* Detects large IPV6 TCP packets and prepares for removal of + * HBH header that has been pushed by ip6_xmit(), + * mainly so that tcpdump can dissect them. + */ + if (ipv6_has_hopopt_jumbo(skb)) + *hopbyhop = sizeof(struct hop_jumbo_hdr); *lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb); + } real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE + - ALIGN(*lso_header_size + 4, DS_SIZE); + ALIGN(*lso_header_size - *hopbyhop + 4, DS_SIZE); if (unlikely(*lso_header_size != skb_headlen(skb))) { /* We add a segment for the skb linear buffer only if * it contains data */ @@ -873,6 +883,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) int desc_size; int real_size; u32 index, bf_index; + struct ipv6hdr *h6; __be32 op_own; int lso_header_size; void *fragptr = NULL; @@ -881,6 +892,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) bool stop_queue; bool inline_ok; u8 data_offset; + int hopbyhop; bool bf_ok; tx_ind = skb_get_queue_mapping(skb); @@ -890,7 +902,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; real_size = get_real_size(skb, shinfo, dev, &lso_header_size, - &inline_ok, &fragptr); + &inline_ok, &fragptr, &hopbyhop); if (unlikely(!real_size)) goto tx_drop_count; @@ -943,7 +955,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) data = &tx_desc->data; data_offset = offsetof(struct mlx4_en_tx_desc, data); } else { - int lso_align = ALIGN(lso_header_size + 4, DS_SIZE); + int lso_align = ALIGN(lso_header_size - hopbyhop + 4, DS_SIZE); data = (void *)&tx_desc->lso + lso_align; data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align; @@ -1008,14 +1020,31 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ((ring->prod & ring->size) ? cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); + lso_header_size -= hopbyhop; /* Fill in the LSO prefix */ tx_desc->lso.mss_hdr_size = cpu_to_be32( shinfo->gso_size << 16 | lso_header_size); - /* Copy headers; - * note that we already verified that it is linear */ - memcpy(tx_desc->lso.header, skb->data, lso_header_size); + if (unlikely(hopbyhop)) { + /* remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][TCP header] + */ + memcpy(tx_desc->lso.header, skb->data, ETH_HLEN + sizeof(*h6)); + h6 = (struct ipv6hdr *)((char *)tx_desc->lso.header + ETH_HLEN); + h6->nexthdr = IPPROTO_TCP; + /* Copy the TCP header after the IPv6 one */ + memcpy(h6 + 1, + skb->data + ETH_HLEN + sizeof(*h6) + + sizeof(struct hop_jumbo_hdr), + tcp_hdrlen(skb)); + /* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */ + } else { + /* Copy headers; + * note that we already verified that it is linear + */ + memcpy(tx_desc->lso.header, skb->data, lso_header_size); + } ring->tso_packets++; i = shinfo->gso_segs; -- cgit v1.2.3-71-gd317 From de78960e025f0f361db76f90de609ca2c26d1366 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:34:08 -0700 Subject: mlx5: support BIG TCP packets mlx5 supports LSOv2. IPv6 gro/tcp stacks insert a temporary Hop-by-Hop header with JUMBO TLV for big packets. We need to ignore/skip this HBH header when populating TX descriptor. Note that ipv6_has_hopopt_jumbo() only recognizes very specific packet layout, thus mlx5e_sq_xmit_wqe() is taking care of this layout only. v7: adopt unsafe_memcpy() and MLX5_UNSAFE_MEMCPY_DISCLAIMER v2: clear hopbyhop in mlx5e_tx_get_gso_ihs() v4: fix compile error for CONFIG_MLX5_CORE_IPOIB=y Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Reviewed-by: Tariq Toukan Cc: Saeed Mahameed Cc: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 111 +++++++++++++++++----- 2 files changed, 89 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d27986869b8b..bf3bca79e160 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4920,6 +4920,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->priv_flags |= IFF_UNICAST_FLT; + netif_set_tso_max_size(netdev, GSO_MAX_SIZE); mlx5e_set_netdev_dev_addr(netdev); mlx5e_ipsec_build_netdev(priv); mlx5e_ktls_build_netdev(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 5855d8f9c509..50d14cec4894 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -40,6 +40,7 @@ #include "en_accel/en_accel.h" #include "en_accel/ipsec_rxtx.h" #include "en/ptp.h" +#include static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) { @@ -91,6 +92,13 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode, return min_t(u16, hlen, skb_headlen(skb)); } +#define MLX5_UNSAFE_MEMCPY_DISCLAIMER \ + "This copy has been bounds-checked earlier in " \ + "mlx5i_sq_calc_wqe_attr() and intentionally " \ + "crosses a flex array boundary. Since it is " \ + "performance sensitive, splitting the copy is " \ + "undesirable." + static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) { struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start; @@ -100,7 +108,10 @@ static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) memcpy(&vhdr->addrs, skb->data, cpy1_sz); vhdr->h_vlan_proto = skb->vlan_proto; vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); - memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); + unsafe_memcpy(&vhdr->h_vlan_encapsulated_proto, + skb->data + cpy1_sz, + cpy2_sz, + MLX5_UNSAFE_MEMCPY_DISCLAIMER); } static inline void @@ -130,23 +141,32 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, sq->stats->csum_none++; } +/* Returns the number of header bytes that we plan + * to inline later in the transmit descriptor + */ static inline u16 -mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb) +mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb, int *hopbyhop) { struct mlx5e_sq_stats *stats = sq->stats; u16 ihs; + *hopbyhop = 0; if (skb->encapsulation) { ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); stats->tso_inner_packets++; stats->tso_inner_bytes += skb->len - ihs; } else { - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { ihs = skb_transport_offset(skb) + sizeof(struct udphdr); - else + } else { ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); + if (ipv6_has_hopopt_jumbo(skb)) { + *hopbyhop = sizeof(struct hop_jumbo_hdr); + ihs -= sizeof(struct hop_jumbo_hdr); + } + } stats->tso_packets++; - stats->tso_bytes += skb->len - ihs; + stats->tso_bytes += skb->len - ihs - *hopbyhop; } return ihs; @@ -208,6 +228,7 @@ struct mlx5e_tx_attr { __be16 mss; u16 insz; u8 opcode; + u8 hopbyhop; }; struct mlx5e_tx_wqe_attr { @@ -244,14 +265,16 @@ static void mlx5e_sq_xmit_prepare(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_sq_stats *stats = sq->stats; if (skb_is_gso(skb)) { - u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb); + int hopbyhop; + u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb, &hopbyhop); *attr = (struct mlx5e_tx_attr) { .opcode = MLX5_OPCODE_LSO, .mss = cpu_to_be16(skb_shinfo(skb)->gso_size), .ihs = ihs, .num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs, - .headlen = skb_headlen(skb) - ihs, + .headlen = skb_headlen(skb) - ihs - hopbyhop, + .hopbyhop = hopbyhop, }; stats->packets += skb_shinfo(skb)->gso_segs; @@ -365,7 +388,8 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg; struct mlx5_wqe_data_seg *dseg; struct mlx5e_tx_wqe_info *wi; - + u16 ihs = attr->ihs; + struct ipv6hdr *h6; struct mlx5e_sq_stats *stats = sq->stats; int num_dma; @@ -379,21 +403,40 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, eseg->mss = attr->mss; - if (attr->ihs) { - if (skb_vlan_tag_present(skb)) { - eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs + VLAN_HLEN); - mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs); + if (ihs) { + u8 *start = eseg->inline_hdr.start; + + if (unlikely(attr->hopbyhop)) { + /* remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][TCP header] + */ + if (skb_vlan_tag_present(skb)) { + mlx5e_insert_vlan(start, skb, ETH_HLEN + sizeof(*h6)); + ihs += VLAN_HLEN; + h6 = (struct ipv6hdr *)(start + sizeof(struct vlan_ethhdr)); + } else { + unsafe_memcpy(start, skb->data, + ETH_HLEN + sizeof(*h6), + MLX5_UNSAFE_MEMCPY_DISCLAIMER); + h6 = (struct ipv6hdr *)(start + ETH_HLEN); + } + h6->nexthdr = IPPROTO_TCP; + /* Copy the TCP header after the IPv6 one */ + memcpy(h6 + 1, + skb->data + ETH_HLEN + sizeof(*h6) + + sizeof(struct hop_jumbo_hdr), + tcp_hdrlen(skb)); + /* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */ + } else if (skb_vlan_tag_present(skb)) { + mlx5e_insert_vlan(start, skb, ihs); + ihs += VLAN_HLEN; stats->added_vlan_packets++; } else { - eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs); - unsafe_memcpy(eseg->inline_hdr.start, skb->data, attr->ihs, - /* This copy has been bounds-checked earlier in - * mlx5i_sq_calc_wqe_attr() and intentionally - * crosses a flex array boundary. Since it is - * performance sensitive, splitting the copy is - * undesirable. - */); + unsafe_memcpy(eseg->inline_hdr.start, skb->data, + attr->ihs, + MLX5_UNSAFE_MEMCPY_DISCLAIMER); } + eseg->inline_hdr.sz |= cpu_to_be16(ihs); dseg += wqe_attr->ds_cnt_inl; } else if (skb_vlan_tag_present(skb)) { eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); @@ -404,7 +447,7 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, } dseg += wqe_attr->ds_cnt_ids; - num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs, + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs + attr->hopbyhop, attr->headlen, dseg); if (unlikely(num_dma < 0)) goto err_drop; @@ -924,12 +967,34 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, eseg->mss = attr.mss; if (attr.ihs) { - memcpy(eseg->inline_hdr.start, skb->data, attr.ihs); + if (unlikely(attr.hopbyhop)) { + struct ipv6hdr *h6; + + /* remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][TCP header] + */ + unsafe_memcpy(eseg->inline_hdr.start, skb->data, + ETH_HLEN + sizeof(*h6), + MLX5_UNSAFE_MEMCPY_DISCLAIMER); + h6 = (struct ipv6hdr *)((char *)eseg->inline_hdr.start + ETH_HLEN); + h6->nexthdr = IPPROTO_TCP; + /* Copy the TCP header after the IPv6 one */ + unsafe_memcpy(h6 + 1, + skb->data + ETH_HLEN + sizeof(*h6) + + sizeof(struct hop_jumbo_hdr), + tcp_hdrlen(skb), + MLX5_UNSAFE_MEMCPY_DISCLAIMER); + /* Leave ipv6 payload_len set to 0, as LSO v2 specs request. */ + } else { + unsafe_memcpy(eseg->inline_hdr.start, skb->data, + attr.ihs, + MLX5_UNSAFE_MEMCPY_DISCLAIMER); + } eseg->inline_hdr.sz = cpu_to_be16(attr.ihs); dseg += wqe_attr.ds_cnt_inl; } - num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs, + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs + attr.hopbyhop, attr.headlen, dseg); if (unlikely(num_dma < 0)) goto err_drop; -- cgit v1.2.3-71-gd317