From 0c317a02ca982ca093e71bf07cb562265ba40032 Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Wed, 12 Oct 2016 18:26:51 +0530 Subject: cfg80211: support virtual interfaces with different beacon intervals This commit provides a mechanism for the host drivers to advertise the support for different beacon intervals among the respective interface combinations in a group, through NL80211_IFACE_COMB_BI_MIN_GCD (u32). This value will be compared against GCD of all beaconing interfaces of matching combinations. If the driver doesn't advertise this value, the old behaviour where all beacon intervals must be identical is retained. If it is specified, then any beacon interval for an interface in the interface combination as well as the GCD of all active beacon intervals in the combination must be greater or equal to this value. Signed-off-by: Purushottam Kushwaha [change commit message, some variable names, small other things] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 56368e9b4622..1362d24957b5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4280,6 +4280,9 @@ enum nl80211_iface_limit_attrs { * of supported channel widths for radar detection. * @NL80211_IFACE_COMB_RADAR_DETECT_REGIONS: u32 attribute containing the bitmap * of supported regulatory regions for radar detection. + * @NL80211_IFACE_COMB_BI_MIN_GCD: u32 attribute specifying the minimum GCD of + * different beacon intervals supported by all the interface combinations + * in this group (if not present, all beacon intervals be identical). * @NUM_NL80211_IFACE_COMB: number of attributes * @MAX_NL80211_IFACE_COMB: highest attribute number * @@ -4287,8 +4290,8 @@ enum nl80211_iface_limit_attrs { * limits = [ #{STA} <= 1, #{AP} <= 1 ], matching BI, channels = 1, max = 2 * => allows an AP and a STA that must match BIs * - * numbers = [ #{AP, P2P-GO} <= 8 ], channels = 1, max = 8 - * => allows 8 of AP/GO + * numbers = [ #{AP, P2P-GO} <= 8 ], BI min gcd, channels = 1, max = 8, + * => allows 8 of AP/GO that can have BI gcd >= min gcd * * numbers = [ #{STA} <= 2 ], channels = 2, max = 2 * => allows two STAs on different channels @@ -4314,6 +4317,7 @@ enum nl80211_if_combination_attrs { NL80211_IFACE_COMB_NUM_CHANNELS, NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS, + NL80211_IFACE_COMB_BI_MIN_GCD, /* keep last */ NUM_NL80211_IFACE_COMB, -- cgit v1.2.3-71-gd317 From a52ad514fdf3b8a57ca4322c92d2d8d5c6182485 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Fri, 7 Oct 2016 22:04:34 -0400 Subject: net: deprecate eth_change_mtu, remove usage With centralized MTU checking, there's nothing productive done by eth_change_mtu that isn't already done in dev_set_mtu, so mark it as deprecated and remove all usage of it in the kernel. All callers have been audited for calls to alloc_etherdev* or ether_setup directly, which means they all have a valid dev->min_mtu and dev->max_mtu. Now eth_change_mtu prints out a netdev_warn about being deprecated, for the benefit of out-of-tree drivers that might be utilizing it. Of note, dvb_net.c actually had dev->mtu = 4096, while using eth_change_mtu, meaning that if you ever tried changing it's mtu, you couldn't set it above 1500 anymore. It's now getting dev->max_mtu also set to 4096 to remedy that. v2: fix up lantiq_etop, missed breakage due to drive not compiling on x86 CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- arch/m68k/emu/nfeth.c | 1 - drivers/isdn/hysdn/hysdn_net.c | 1 - drivers/media/dvb-core/dvb_net.c | 2 +- drivers/net/appletalk/ipddp.c | 1 - drivers/net/cris/eth_v10.c | 1 - drivers/net/ethernet/3com/3c509.c | 1 - drivers/net/ethernet/3com/3c515.c | 1 - drivers/net/ethernet/3com/3c574_cs.c | 1 - drivers/net/ethernet/3com/3c589_cs.c | 1 - drivers/net/ethernet/3com/3c59x.c | 2 -- drivers/net/ethernet/3com/typhoon.c | 1 - drivers/net/ethernet/8390/8390.c | 1 - drivers/net/ethernet/8390/8390p.c | 1 - drivers/net/ethernet/8390/ax88796.c | 1 - drivers/net/ethernet/8390/axnet_cs.c | 1 - drivers/net/ethernet/8390/etherh.c | 1 - drivers/net/ethernet/8390/hydra.c | 1 - drivers/net/ethernet/8390/mac8390.c | 1 - drivers/net/ethernet/8390/mcf8390.c | 1 - drivers/net/ethernet/8390/ne2k-pci.c | 1 - drivers/net/ethernet/8390/pcnet_cs.c | 1 - drivers/net/ethernet/8390/smc-ultra.c | 1 - drivers/net/ethernet/8390/wd.c | 1 - drivers/net/ethernet/8390/zorro8390.c | 1 - drivers/net/ethernet/adaptec/starfire.c | 1 - drivers/net/ethernet/adi/bfin_mac.c | 1 - drivers/net/ethernet/allwinner/sun4i-emac.c | 1 - drivers/net/ethernet/amd/a2065.c | 1 - drivers/net/ethernet/amd/am79c961a.c | 1 - drivers/net/ethernet/amd/ariadne.c | 1 - drivers/net/ethernet/amd/atarilance.c | 1 - drivers/net/ethernet/amd/au1000_eth.c | 1 - drivers/net/ethernet/amd/declance.c | 1 - drivers/net/ethernet/amd/hplance.c | 1 - drivers/net/ethernet/amd/lance.c | 1 - drivers/net/ethernet/amd/mvme147.c | 1 - drivers/net/ethernet/amd/ni65.c | 1 - drivers/net/ethernet/amd/nmclan_cs.c | 1 - drivers/net/ethernet/amd/pcnet32.c | 1 - drivers/net/ethernet/amd/sun3lance.c | 1 - drivers/net/ethernet/amd/sunlance.c | 1 - drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 1 - drivers/net/ethernet/apple/bmac.c | 1 - drivers/net/ethernet/apple/mace.c | 1 - drivers/net/ethernet/apple/macmace.c | 1 - drivers/net/ethernet/aurora/nb8800.c | 1 - drivers/net/ethernet/cadence/macb.c | 1 - drivers/net/ethernet/cirrus/cs89x0.c | 1 - drivers/net/ethernet/cirrus/ep93xx_eth.c | 1 - drivers/net/ethernet/cirrus/mac89x0.c | 1 - drivers/net/ethernet/davicom/dm9000.c | 1 - drivers/net/ethernet/dec/tulip/de2104x.c | 1 - drivers/net/ethernet/dec/tulip/de4x5.c | 1 - drivers/net/ethernet/dec/tulip/dmfe.c | 1 - drivers/net/ethernet/dec/tulip/tulip_core.c | 1 - drivers/net/ethernet/dec/tulip/uli526x.c | 1 - drivers/net/ethernet/dec/tulip/winbond-840.c | 1 - drivers/net/ethernet/dec/tulip/xircom_cb.c | 1 - drivers/net/ethernet/dnet.c | 1 - drivers/net/ethernet/ec_bhf.c | 1 - drivers/net/ethernet/fealnx.c | 1 - drivers/net/ethernet/freescale/fec_main.c | 1 - drivers/net/ethernet/freescale/fec_mpc52xx.c | 1 - drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 1 - drivers/net/ethernet/freescale/ucc_geth.c | 1 - drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 1 - drivers/net/ethernet/hisilicon/hip04_eth.c | 1 - drivers/net/ethernet/hisilicon/hisi_femac.c | 1 - drivers/net/ethernet/hp/hp100.c | 2 -- drivers/net/ethernet/i825xx/82596.c | 1 - drivers/net/ethernet/i825xx/ether1.c | 1 - drivers/net/ethernet/i825xx/lib82596.c | 1 - drivers/net/ethernet/i825xx/sun3_82586.c | 1 - drivers/net/ethernet/ibm/emac/core.c | 1 - drivers/net/ethernet/korina.c | 1 - drivers/net/ethernet/lantiq_etop.c | 18 ++++++++---------- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 - drivers/net/ethernet/micrel/ks8851.c | 1 - drivers/net/ethernet/micrel/ks8851_mll.c | 1 - drivers/net/ethernet/microchip/enc28j60.c | 1 - drivers/net/ethernet/moxa/moxart_ether.c | 1 - drivers/net/ethernet/natsemi/jazzsonic.c | 1 - drivers/net/ethernet/natsemi/macsonic.c | 1 - drivers/net/ethernet/natsemi/xtsonic.c | 1 - drivers/net/ethernet/netx-eth.c | 1 - drivers/net/ethernet/nuvoton/w90p910_ether.c | 1 - drivers/net/ethernet/nxp/lpc_eth.c | 1 - drivers/net/ethernet/packetengines/hamachi.c | 1 - drivers/net/ethernet/packetengines/yellowfin.c | 1 - drivers/net/ethernet/qlogic/qla3xxx.c | 1 - drivers/net/ethernet/rdc/r6040.c | 1 - drivers/net/ethernet/realtek/atp.c | 1 - drivers/net/ethernet/renesas/ravb_main.c | 1 - drivers/net/ethernet/renesas/sh_eth.c | 2 -- drivers/net/ethernet/seeq/ether3.c | 1 - drivers/net/ethernet/seeq/sgiseeq.c | 1 - drivers/net/ethernet/sgi/ioc3-eth.c | 1 - drivers/net/ethernet/sgi/meth.c | 1 - drivers/net/ethernet/silan/sc92031.c | 1 - drivers/net/ethernet/sis/sis190.c | 1 - drivers/net/ethernet/sis/sis900.c | 1 - drivers/net/ethernet/smsc/epic100.c | 1 - drivers/net/ethernet/smsc/smc911x.c | 1 - drivers/net/ethernet/smsc/smc9194.c | 1 - drivers/net/ethernet/smsc/smc91c92_cs.c | 1 - drivers/net/ethernet/smsc/smc91x.c | 1 - drivers/net/ethernet/smsc/smsc911x.c | 1 - drivers/net/ethernet/sun/sunbmac.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/net/ethernet/sun/sunqe.c | 1 - drivers/net/ethernet/ti/cpmac.c | 1 - drivers/net/ethernet/ti/cpsw.c | 1 - drivers/net/ethernet/ti/tlan.c | 1 - drivers/net/ethernet/toshiba/tc35815.c | 1 - drivers/net/ethernet/tundra/tsi108_eth.c | 1 - drivers/net/ethernet/via/via-rhine.c | 1 - drivers/net/ethernet/wiznet/w5100.c | 1 - drivers/net/ethernet/wiznet/w5300.c | 1 - drivers/net/ethernet/xircom/xirc2ps_cs.c | 1 - drivers/net/ethernet/xscale/ixp4xx_eth.c | 1 - drivers/net/plip/plip.c | 1 - drivers/net/sb1000.c | 1 - drivers/net/usb/catc.c | 1 - drivers/net/usb/kaweth.c | 1 - drivers/net/usb/pegasus.c | 1 - drivers/net/usb/r8152.c | 3 ++- drivers/net/usb/rtl8150.c | 1 - drivers/net/wan/sbni.c | 1 - drivers/net/wireless/intersil/prism54/islpci_dev.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - drivers/net/wireless/marvell/libertas/main.c | 1 - drivers/net/wireless/ray_cs.c | 1 - drivers/net/wireless/wl3501_cs.c | 1 - drivers/net/wireless/zydas/zd1201.c | 1 - drivers/staging/rtl8188eu/os_dep/mon.c | 1 - drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 1 - drivers/staging/rtl8192u/r8192U_core.c | 1 - drivers/staging/slicoss/slicoss.c | 1 - include/uapi/linux/if_ether.h | 2 ++ net/atm/br2684.c | 2 -- net/bluetooth/bnep/netdev.c | 1 - net/ethernet/eth.c | 5 +++-- net/irda/irlan/irlan_eth.c | 1 - 143 files changed, 16 insertions(+), 156 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c index a0985fd088d1..fc4be028c418 100644 --- a/arch/m68k/emu/nfeth.c +++ b/arch/m68k/emu/nfeth.c @@ -184,7 +184,6 @@ static const struct net_device_ops nfeth_netdev_ops = { .ndo_start_xmit = nfeth_xmit, .ndo_tx_timeout = nfeth_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c index 5609deee7cd3..b93a4e9a8d34 100644 --- a/drivers/isdn/hysdn/hysdn_net.c +++ b/drivers/isdn/hysdn/hysdn_net.c @@ -232,7 +232,6 @@ static const struct net_device_ops hysdn_netdev_ops = { .ndo_open = net_open, .ndo_stop = net_close, .ndo_start_xmit = net_send_packet, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c index 9914f69a4a02..0da622f5fe69 100644 --- a/drivers/media/dvb-core/dvb_net.c +++ b/drivers/media/dvb-core/dvb_net.c @@ -1198,7 +1198,6 @@ static const struct net_device_ops dvb_netdev_ops = { .ndo_start_xmit = dvb_net_tx, .ndo_set_rx_mode = dvb_net_set_multicast_list, .ndo_set_mac_address = dvb_net_set_mac, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; @@ -1209,6 +1208,7 @@ static void dvb_net_setup(struct net_device *dev) dev->header_ops = &dvb_header_ops; dev->netdev_ops = &dvb_netdev_ops; dev->mtu = 4096; + dev->max_mtu = 4096; dev->flags |= IFF_NOARP; } diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index e90c6a7333d7..31f89f1c6123 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -59,7 +59,6 @@ static int ipddp_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static const struct net_device_ops ipddp_netdev_ops = { .ndo_start_xmit = ipddp_xmit, .ndo_do_ioctl = ipddp_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 221f5f011ff9..e128826aa117 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -264,7 +264,6 @@ static const struct net_device_ops e100_netdev_ops = { .ndo_do_ioctl = e100_ioctl, .ndo_set_mac_address = e100_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_config = e100_set_config, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = e100_netpoll, diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index 91ada52f776b..9f9a5f440e2f 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -508,7 +508,6 @@ static const struct net_device_ops netdev_ops = { .ndo_get_stats = el3_get_stats, .ndo_set_rx_mode = set_multicast_list, .ndo_tx_timeout = el3_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index b26e038b4a0e..b9f4c463e516 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -570,7 +570,6 @@ static const struct net_device_ops netdev_ops = { .ndo_tx_timeout = corkscrew_timeout, .ndo_get_stats = corkscrew_get_stats, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index b88afd759307..9359a37fedc0 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -254,7 +254,6 @@ static const struct net_device_ops el3_netdev_ops = { .ndo_get_stats = el3_get_stats, .ndo_do_ioctl = el3_ioctl, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/3com/3c589_cs.c b/drivers/net/ethernet/3com/3c589_cs.c index 71396e4b87e3..e28254a00599 100644 --- a/drivers/net/ethernet/3com/3c589_cs.c +++ b/drivers/net/ethernet/3com/3c589_cs.c @@ -188,7 +188,6 @@ static const struct net_device_ops el3_netdev_ops = { .ndo_set_config = el3_config, .ndo_get_stats = el3_get_stats, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 9133e7926da5..3ecf61382269 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -1062,7 +1062,6 @@ static const struct net_device_ops boomrang_netdev_ops = { .ndo_do_ioctl = vortex_ioctl, #endif .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1080,7 +1079,6 @@ static const struct net_device_ops vortex_netdev_ops = { .ndo_do_ioctl = vortex_ioctl, #endif .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 8f8418d2ac4a..506b507b4158 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2255,7 +2255,6 @@ static const struct net_device_ops typhoon_netdev_ops = { .ndo_get_stats = typhoon_get_stats, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, }; static int diff --git a/drivers/net/ethernet/8390/8390.c b/drivers/net/ethernet/8390/8390.c index 5db1f55abef4..a43544af257b 100644 --- a/drivers/net/ethernet/8390/8390.c +++ b/drivers/net/ethernet/8390/8390.c @@ -64,7 +64,6 @@ const struct net_device_ops ei_netdev_ops = { .ndo_set_rx_mode = ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ei_poll, #endif diff --git a/drivers/net/ethernet/8390/8390p.c b/drivers/net/ethernet/8390/8390p.c index e8fc2e87e840..46d2257c4430 100644 --- a/drivers/net/ethernet/8390/8390p.c +++ b/drivers/net/ethernet/8390/8390p.c @@ -69,7 +69,6 @@ const struct net_device_ops eip_netdev_ops = { .ndo_set_rx_mode = eip_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = eip_poll, #endif diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index 39ca9350d1b2..b0a3b85fc6f8 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -536,7 +536,6 @@ static const struct net_device_ops ax_netdev_ops = { .ndo_set_rx_mode = ax_ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ax_ei_poll, #endif diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index 4ea717d68c95..1d84a0544ace 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -134,7 +134,6 @@ static const struct net_device_ops axnet_netdev_ops = { .ndo_tx_timeout = axnet_tx_timeout, .ndo_get_stats = get_stats, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index d686b9cac29f..11cbf22ad201 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -654,7 +654,6 @@ static const struct net_device_ops etherh_netdev_ops = { .ndo_set_rx_mode = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = __ei_poll, #endif diff --git a/drivers/net/ethernet/8390/hydra.c b/drivers/net/ethernet/8390/hydra.c index 0fe19d609c2e..8ae249195301 100644 --- a/drivers/net/ethernet/8390/hydra.c +++ b/drivers/net/ethernet/8390/hydra.c @@ -105,7 +105,6 @@ static const struct net_device_ops hydra_netdev_ops = { .ndo_set_rx_mode = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = __ei_poll, #endif diff --git a/drivers/net/ethernet/8390/mac8390.c b/drivers/net/ethernet/8390/mac8390.c index b9283901136e..9497f18eaba0 100644 --- a/drivers/net/ethernet/8390/mac8390.c +++ b/drivers/net/ethernet/8390/mac8390.c @@ -483,7 +483,6 @@ static const struct net_device_ops mac8390_netdev_ops = { .ndo_set_rx_mode = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = __ei_poll, #endif diff --git a/drivers/net/ethernet/8390/mcf8390.c b/drivers/net/ethernet/8390/mcf8390.c index e1c055574a11..4bb967bc879e 100644 --- a/drivers/net/ethernet/8390/mcf8390.c +++ b/drivers/net/ethernet/8390/mcf8390.c @@ -308,7 +308,6 @@ static const struct net_device_ops mcf8390_netdev_ops = { .ndo_set_rx_mode = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = __ei_poll, #endif diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c index 57e97910c728..07355302443d 100644 --- a/drivers/net/ethernet/8390/ne2k-pci.c +++ b/drivers/net/ethernet/8390/ne2k-pci.c @@ -209,7 +209,6 @@ static const struct net_device_ops ne2k_netdev_ops = { .ndo_set_rx_mode = ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ei_poll, #endif diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c index 2f79d29f17f2..63079a6e20d9 100644 --- a/drivers/net/ethernet/8390/pcnet_cs.c +++ b/drivers/net/ethernet/8390/pcnet_cs.c @@ -227,7 +227,6 @@ static const struct net_device_ops pcnet_netdev_ops = { .ndo_do_ioctl = ei_ioctl, .ndo_set_rx_mode = ei_set_multicast_list, .ndo_tx_timeout = ei_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/8390/smc-ultra.c b/drivers/net/ethernet/8390/smc-ultra.c index 139385dcdaa7..364b6514f65f 100644 --- a/drivers/net/ethernet/8390/smc-ultra.c +++ b/drivers/net/ethernet/8390/smc-ultra.c @@ -195,7 +195,6 @@ static const struct net_device_ops ultra_netdev_ops = { .ndo_set_rx_mode = ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ultra_poll, #endif diff --git a/drivers/net/ethernet/8390/wd.c b/drivers/net/ethernet/8390/wd.c index dd7d816bde52..ad019cbc698f 100644 --- a/drivers/net/ethernet/8390/wd.c +++ b/drivers/net/ethernet/8390/wd.c @@ -156,7 +156,6 @@ static const struct net_device_ops wd_netdev_ops = { .ndo_set_rx_mode = ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ei_poll, #endif diff --git a/drivers/net/ethernet/8390/zorro8390.c b/drivers/net/ethernet/8390/zorro8390.c index 8308728fad05..6d93956b293b 100644 --- a/drivers/net/ethernet/8390/zorro8390.c +++ b/drivers/net/ethernet/8390/zorro8390.c @@ -284,7 +284,6 @@ static const struct net_device_ops zorro8390_netdev_ops = { .ndo_set_rx_mode = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = __ei_poll, #endif diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 8af2c88d5b33..4a9a16e25666 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -634,7 +634,6 @@ static const struct net_device_ops netdev_ops = { .ndo_get_stats = get_stats, .ndo_set_rx_mode = set_rx_mode, .ndo_do_ioctl = netdev_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef VLAN_SUPPORT diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c index 00f9ee3fc3e5..88164529b52a 100644 --- a/drivers/net/ethernet/adi/bfin_mac.c +++ b/drivers/net/ethernet/adi/bfin_mac.c @@ -1571,7 +1571,6 @@ static const struct net_device_ops bfin_mac_netdev_ops = { .ndo_set_rx_mode = bfin_mac_set_multicast_list, .ndo_do_ioctl = bfin_mac_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = bfin_mac_poll_controller, #endif diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 6ffdff68bfc4..af27f9dbedf2 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -773,7 +773,6 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_tx_timeout = emac_timeout, .ndo_set_rx_mode = emac_set_rx_mode, .ndo_do_ioctl = emac_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = emac_set_mac_address, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/amd/a2065.c b/drivers/net/ethernet/amd/a2065.c index a83cd1c4ce1d..ee4b94e3cda9 100644 --- a/drivers/net/ethernet/amd/a2065.c +++ b/drivers/net/ethernet/amd/a2065.c @@ -665,7 +665,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_tx_timeout = lance_tx_timeout, .ndo_set_rx_mode = lance_set_multicast, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/amd/am79c961a.c b/drivers/net/ethernet/amd/am79c961a.c index fcdf5dda448f..b11e910850f7 100644 --- a/drivers/net/ethernet/amd/am79c961a.c +++ b/drivers/net/ethernet/amd/am79c961a.c @@ -663,7 +663,6 @@ static const struct net_device_ops am79c961_netdev_ops = { .ndo_set_rx_mode = am79c961_setmulticastlist, .ndo_tx_timeout = am79c961_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = am79c961_poll_controller, diff --git a/drivers/net/ethernet/amd/ariadne.c b/drivers/net/ethernet/amd/ariadne.c index 968b7bfac8fc..5fd7b15b0574 100644 --- a/drivers/net/ethernet/amd/ariadne.c +++ b/drivers/net/ethernet/amd/ariadne.c @@ -706,7 +706,6 @@ static const struct net_device_ops ariadne_netdev_ops = { .ndo_get_stats = ariadne_get_stats, .ndo_set_rx_mode = set_multicast_list, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c index d2bc8e5dcd23..e53ccc3b7d8d 100644 --- a/drivers/net/ethernet/amd/atarilance.c +++ b/drivers/net/ethernet/amd/atarilance.c @@ -460,7 +460,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_set_mac_address = lance_set_mac_address, .ndo_tx_timeout = lance_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static unsigned long __init lance_probe1( struct net_device *dev, diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index df664187cd82..a3c90fe5de00 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -1103,7 +1103,6 @@ static const struct net_device_ops au1000_netdev_ops = { .ndo_tx_timeout = au1000_tx_timeout, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int au1000_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index b799c7ac899b..76e5fc7adff5 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1013,7 +1013,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_start_xmit = lance_start_xmit, .ndo_tx_timeout = lance_tx_timeout, .ndo_set_rx_mode = lance_set_multicast, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/amd/hplance.c b/drivers/net/ethernet/amd/hplance.c index 6c9de117ffc6..c3dbf1c8a269 100644 --- a/drivers/net/ethernet/amd/hplance.c +++ b/drivers/net/ethernet/amd/hplance.c @@ -72,7 +72,6 @@ static const struct net_device_ops hplance_netdev_ops = { .ndo_stop = hplance_close, .ndo_start_xmit = lance_start_xmit, .ndo_set_rx_mode = lance_set_multicast, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/amd/lance.c b/drivers/net/ethernet/amd/lance.c index abb1ba228b26..61a641f23149 100644 --- a/drivers/net/ethernet/amd/lance.c +++ b/drivers/net/ethernet/amd/lance.c @@ -461,7 +461,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_get_stats = lance_get_stats, .ndo_set_rx_mode = set_multicast_list, .ndo_tx_timeout = lance_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/amd/mvme147.c b/drivers/net/ethernet/amd/mvme147.c index 0660ac5846bb..0a920448522f 100644 --- a/drivers/net/ethernet/amd/mvme147.c +++ b/drivers/net/ethernet/amd/mvme147.c @@ -62,7 +62,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_start_xmit = lance_start_xmit, .ndo_set_rx_mode = lance_set_multicast, .ndo_tx_timeout = lance_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/amd/ni65.c b/drivers/net/ethernet/amd/ni65.c index cda53db75f17..5985bf220a8d 100644 --- a/drivers/net/ethernet/amd/ni65.c +++ b/drivers/net/ethernet/amd/ni65.c @@ -407,7 +407,6 @@ static const struct net_device_ops ni65_netdev_ops = { .ndo_start_xmit = ni65_send_packet, .ndo_tx_timeout = ni65_timeout, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 2807e181647b..113a3b3cc50c 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -427,7 +427,6 @@ static const struct net_device_ops mace_netdev_ops = { .ndo_set_config = mace_config, .ndo_get_stats = mace_get_stats, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index c22bf52d3320..adc7ab99a2f6 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -1527,7 +1527,6 @@ static const struct net_device_ops pcnet32_netdev_ops = { .ndo_get_stats = pcnet32_get_stats, .ndo_set_rx_mode = pcnet32_set_multicast_list, .ndo_do_ioctl = pcnet32_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/amd/sun3lance.c b/drivers/net/ethernet/amd/sun3lance.c index 3d8c6b2cdea4..12bb4f1489fc 100644 --- a/drivers/net/ethernet/amd/sun3lance.c +++ b/drivers/net/ethernet/amd/sun3lance.c @@ -299,7 +299,6 @@ static const struct net_device_ops lance_netdev_ops = { .ndo_start_xmit = lance_start_xmit, .ndo_set_rx_mode = set_multicast_list, .ndo_set_mac_address = NULL, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c index 9b56b40259dc..291ca5187f12 100644 --- a/drivers/net/ethernet/amd/sunlance.c +++ b/drivers/net/ethernet/amd/sunlance.c @@ -1294,7 +1294,6 @@ static const struct net_device_ops sparc_lance_ops = { .ndo_start_xmit = lance_start_xmit, .ndo_set_rx_mode = lance_set_multicast, .ndo_tx_timeout = lance_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index f75d9556152f..3fc7b0db952b 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -1252,7 +1252,6 @@ static const struct net_device_ops xgene_ndev_ops = { .ndo_start_xmit = xgene_enet_start_xmit, .ndo_tx_timeout = xgene_enet_timeout, .ndo_get_stats64 = xgene_enet_get_stats64, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = xgene_enet_set_mac_address, }; diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index a65d7a60f116..2b2d87089987 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -1237,7 +1237,6 @@ static const struct net_device_ops bmac_netdev_ops = { .ndo_start_xmit = bmac_output, .ndo_set_rx_mode = bmac_set_multicast, .ndo_set_mac_address = bmac_set_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/apple/mace.c b/drivers/net/ethernet/apple/mace.c index e58a7c73766e..96dd5300e0e5 100644 --- a/drivers/net/ethernet/apple/mace.c +++ b/drivers/net/ethernet/apple/mace.c @@ -102,7 +102,6 @@ static const struct net_device_ops mace_netdev_ops = { .ndo_start_xmit = mace_xmit_start, .ndo_set_rx_mode = mace_set_multicast, .ndo_set_mac_address = mace_set_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c index 89914ca17a49..857df9c45f04 100644 --- a/drivers/net/ethernet/apple/macmace.c +++ b/drivers/net/ethernet/apple/macmace.c @@ -186,7 +186,6 @@ static const struct net_device_ops mace_netdev_ops = { .ndo_tx_timeout = mace_tx_timeout, .ndo_set_rx_mode = mace_set_multicast, .ndo_set_mac_address = mace_set_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index b047fd607b83..453dc0967125 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -1032,7 +1032,6 @@ static const struct net_device_ops nb8800_netdev_ops = { .ndo_set_mac_address = nb8800_set_mac_address, .ndo_set_rx_mode = nb8800_set_rx_mode, .ndo_do_ioctl = nb8800_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index b32444a3ed79..20e72044b765 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -2793,7 +2793,6 @@ static const struct net_device_ops at91ether_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_do_ioctl = macb_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = at91ether_poll_controller, #endif diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index c363b58552e9..3647b28e8de0 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -1266,7 +1266,6 @@ static const struct net_device_ops net_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = net_poll_controller, #endif - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c index de9f7c97d916..9119af088821 100644 --- a/drivers/net/ethernet/cirrus/ep93xx_eth.c +++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c @@ -749,7 +749,6 @@ static const struct net_device_ops ep93xx_netdev_ops = { .ndo_start_xmit = ep93xx_xmit, .ndo_do_ioctl = ep93xx_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/cirrus/mac89x0.c b/drivers/net/ethernet/cirrus/mac89x0.c index 07719676c305..b600fbbbf679 100644 --- a/drivers/net/ethernet/cirrus/mac89x0.c +++ b/drivers/net/ethernet/cirrus/mac89x0.c @@ -172,7 +172,6 @@ static const struct net_device_ops mac89x0_netdev_ops = { .ndo_set_rx_mode = set_multicast_list, .ndo_set_mac_address = set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; /* Probe for the CS8900 card in slot E. We won't bother looking diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index f45385f5c6e5..f1a81c52afe3 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1382,7 +1382,6 @@ static const struct net_device_ops dm9000_netdev_ops = { .ndo_tx_timeout = dm9000_timeout, .ndo_set_rx_mode = dm9000_hash_table, .ndo_do_ioctl = dm9000_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_features = dm9000_set_features, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c index cadcee645f74..90c573b8ccaf 100644 --- a/drivers/net/ethernet/dec/tulip/de2104x.c +++ b/drivers/net/ethernet/dec/tulip/de2104x.c @@ -1956,7 +1956,6 @@ static const struct net_device_ops de_netdev_ops = { .ndo_start_xmit = de_start_xmit, .ndo_get_stats = de_get_stats, .ndo_tx_timeout = de_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 6620fc861c47..51fda3a6b13f 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -1085,7 +1085,6 @@ static const struct net_device_ops de4x5_netdev_ops = { .ndo_get_stats = de4x5_get_stats, .ndo_set_rx_mode = set_multicast_list, .ndo_do_ioctl = de4x5_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address= eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c index 8ed0fd8b1dda..df4994919456 100644 --- a/drivers/net/ethernet/dec/tulip/dmfe.c +++ b/drivers/net/ethernet/dec/tulip/dmfe.c @@ -352,7 +352,6 @@ static const struct net_device_ops netdev_ops = { .ndo_stop = dmfe_stop, .ndo_start_xmit = dmfe_start_xmit, .ndo_set_rx_mode = dmfe_set_filter_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index bbde90bc74fe..5f1377449b8f 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1282,7 +1282,6 @@ static const struct net_device_ops tulip_netdev_ops = { .ndo_get_stats = tulip_get_stats, .ndo_do_ioctl = private_ioctl, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c index e750b5ddc0fb..e1c4133b8787 100644 --- a/drivers/net/ethernet/dec/tulip/uli526x.c +++ b/drivers/net/ethernet/dec/tulip/uli526x.c @@ -269,7 +269,6 @@ static const struct net_device_ops netdev_ops = { .ndo_stop = uli526x_stop, .ndo_start_xmit = uli526x_start_xmit, .ndo_set_rx_mode = uli526x_set_filter_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index 1f62b9423851..feda96d585e7 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -353,7 +353,6 @@ static const struct net_device_ops netdev_ops = { .ndo_set_rx_mode = set_rx_mode, .ndo_do_ioctl = netdev_ioctl, .ndo_tx_timeout = tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/dec/tulip/xircom_cb.c b/drivers/net/ethernet/dec/tulip/xircom_cb.c index 0e721cedfa67..19e4ea15b504 100644 --- a/drivers/net/ethernet/dec/tulip/xircom_cb.c +++ b/drivers/net/ethernet/dec/tulip/xircom_cb.c @@ -174,7 +174,6 @@ static const struct net_device_ops netdev_ops = { .ndo_open = xircom_open, .ndo_stop = xircom_close, .ndo_start_xmit = xircom_start_xmit, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c index c3b64cdd0dec..2a17c59f69f9 100644 --- a/drivers/net/ethernet/dnet.c +++ b/drivers/net/ethernet/dnet.c @@ -767,7 +767,6 @@ static const struct net_device_ops dnet_netdev_ops = { .ndo_do_ioctl = dnet_ioctl, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int dnet_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index f7b42483921c..57650953ff83 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -482,7 +482,6 @@ static const struct net_device_ops ec_bhf_netdev_ops = { .ndo_open = ec_bhf_open, .ndo_stop = ec_bhf_stop, .ndo_get_stats64 = ec_bhf_get_stats, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr }; diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c index c08bd763172a..6967b287b6e7 100644 --- a/drivers/net/ethernet/fealnx.c +++ b/drivers/net/ethernet/fealnx.c @@ -472,7 +472,6 @@ static const struct net_device_ops netdev_ops = { .ndo_set_rx_mode = set_rx_mode, .ndo_do_ioctl = mii_ioctl, .ndo_tx_timeout = fealnx_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 48a033e64423..4ce8179aa4ff 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3055,7 +3055,6 @@ static const struct net_device_ops fec_netdev_ops = { .ndo_stop = fec_enet_close, .ndo_start_xmit = fec_enet_start_xmit, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_tx_timeout = fec_timeout, .ndo_set_mac_address = fec_set_mac_address, diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index 446ae9d60c71..aa8cf5d2a53c 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -802,7 +802,6 @@ static const struct net_device_ops mpc52xx_fec_netdev_ops = { .ndo_set_mac_address = mpc52xx_fec_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = mpc52xx_fec_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_tx_timeout = mpc52xx_fec_tx_timeout, .ndo_get_stats = mpc52xx_fec_get_stats, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index dc120c148d97..925d1bca6a26 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -912,7 +912,6 @@ static const struct net_device_ops fs_enet_netdev_ops = { .ndo_do_ioctl = fs_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = fs_enet_netpoll, #endif diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 186ef8f16c80..786182480a73 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3681,7 +3681,6 @@ static const struct net_device_ops ucc_geth_netdev_ops = { .ndo_start_xmit = ucc_geth_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ucc_geth_set_mac_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_rx_mode = ucc_geth_set_multi, .ndo_tx_timeout = ucc_geth_timeout, .ndo_do_ioctl = ucc_geth_ioctl, diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 399cfd217288..51c4abc51bf4 100644 --- a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -225,7 +225,6 @@ static const struct net_device_ops fjn_netdev_ops = { .ndo_tx_timeout = fjn_tx_timeout, .ndo_set_config = fjn_config, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index 39778892b3b3..3c99ca420f57 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -769,7 +769,6 @@ static const struct net_device_ops hip04_netdev_ops = { .ndo_set_mac_address = hip04_set_mac_address, .ndo_tx_timeout = hip04_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int hip04_alloc_ring(struct net_device *ndev, struct device *d) diff --git a/drivers/net/ethernet/hisilicon/hisi_femac.c b/drivers/net/ethernet/hisilicon/hisi_femac.c index ced185962ef8..49863068c59e 100644 --- a/drivers/net/ethernet/hisilicon/hisi_femac.c +++ b/drivers/net/ethernet/hisilicon/hisi_femac.c @@ -712,7 +712,6 @@ static const struct net_device_ops hisi_femac_netdev_ops = { .ndo_do_ioctl = hisi_femac_net_ioctl, .ndo_set_mac_address = hisi_femac_set_mac_address, .ndo_set_rx_mode = hisi_femac_net_set_rx_mode, - .ndo_change_mtu = eth_change_mtu, }; static void hisi_femac_core_reset(struct hisi_femac_priv *priv) diff --git a/drivers/net/ethernet/hp/hp100.c b/drivers/net/ethernet/hp/hp100.c index 631dbc7b4dbb..1a31bee6e728 100644 --- a/drivers/net/ethernet/hp/hp100.c +++ b/drivers/net/ethernet/hp/hp100.c @@ -427,7 +427,6 @@ static const struct net_device_ops hp100_bm_netdev_ops = { .ndo_start_xmit = hp100_start_xmit_bm, .ndo_get_stats = hp100_get_stats, .ndo_set_rx_mode = hp100_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; @@ -438,7 +437,6 @@ static const struct net_device_ops hp100_netdev_ops = { .ndo_start_xmit = hp100_start_xmit, .ndo_get_stats = hp100_get_stats, .ndo_set_rx_mode = hp100_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/i825xx/82596.c b/drivers/net/ethernet/i825xx/82596.c index ce235b776793..945883842533 100644 --- a/drivers/net/ethernet/i825xx/82596.c +++ b/drivers/net/ethernet/i825xx/82596.c @@ -1118,7 +1118,6 @@ static const struct net_device_ops i596_netdev_ops = { .ndo_start_xmit = i596_start_xmit, .ndo_set_rx_mode = set_multicast_list, .ndo_tx_timeout = i596_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index 5d353c660068..dc983450354b 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -981,7 +981,6 @@ static const struct net_device_ops ether1_netdev_ops = { .ndo_set_rx_mode = ether1_setmulticastlist, .ndo_tx_timeout = ether1_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c index 3dbc53c21baa..e86773325cbe 100644 --- a/drivers/net/ethernet/i825xx/lib82596.c +++ b/drivers/net/ethernet/i825xx/lib82596.c @@ -1037,7 +1037,6 @@ static const struct net_device_ops i596_netdev_ops = { .ndo_start_xmit = i596_start_xmit, .ndo_set_rx_mode = set_multicast_list, .ndo_tx_timeout = i596_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index 21c84cc9c871..8bb15a8c2a40 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -337,7 +337,6 @@ static const struct net_device_ops sun3_82586_netdev_ops = { .ndo_get_stats = sun3_82586_get_stats, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, }; static int __init sun3_82586_probe1(struct net_device *dev,int ioaddr) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 8f139197f1aa..5d804a544674 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2718,7 +2718,6 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = emac_set_mac_address, .ndo_start_xmit = emac_start_xmit, - .ndo_change_mtu = eth_change_mtu, }; static const struct net_device_ops emac_gige_netdev_ops = { diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 1799fe1415df..cbeea915f026 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -1085,7 +1085,6 @@ static const struct net_device_ops korina_netdev_ops = { .ndo_set_rx_mode = korina_multicast_list, .ndo_tx_timeout = korina_tx_timeout, .ndo_do_ioctl = korina_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 91e09d68b7e2..1a739d71f1c2 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -519,18 +519,16 @@ ltq_etop_tx(struct sk_buff *skb, struct net_device *dev) static int ltq_etop_change_mtu(struct net_device *dev, int new_mtu) { - int ret = eth_change_mtu(dev, new_mtu); + struct ltq_etop_priv *priv = netdev_priv(dev); + unsigned long flags; - if (!ret) { - struct ltq_etop_priv *priv = netdev_priv(dev); - unsigned long flags; + dev->mtu = new_mtu; - spin_lock_irqsave(&priv->lock, flags); - ltq_etop_w32((ETOP_PLEN_UNDER << 16) | new_mtu, - LTQ_ETOP_IGPLEN); - spin_unlock_irqrestore(&priv->lock, flags); - } - return ret; + spin_lock_irqsave(&priv->lock, flags); + ltq_etop_w32((ETOP_PLEN_UNDER << 16) | new_mtu, LTQ_ETOP_IGPLEN); + spin_unlock_irqrestore(&priv->lock, flags); + + return 0; } static int diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 4a62ffd7729d..8f80e618ae59 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2243,7 +2243,6 @@ static const struct net_device_ops mtk_netdev_ops = { .ndo_set_mac_address = mtk_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = mtk_do_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_tx_timeout = mtk_tx_timeout, .ndo_get_stats64 = mtk_get_stats64, .ndo_fix_features = mtk_fix_features, diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 1edc973df4c4..e7e1aff40bd9 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1063,7 +1063,6 @@ static const struct net_device_ops ks8851_netdev_ops = { .ndo_start_xmit = ks8851_start_xmit, .ndo_set_mac_address = ks8851_set_mac_address, .ndo_set_rx_mode = ks8851_set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c index 2fc5cd56c0a8..db628078a4e6 100644 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ b/drivers/net/ethernet/micrel/ks8851_mll.c @@ -1285,7 +1285,6 @@ static const struct net_device_ops ks_netdev_ops = { .ndo_start_xmit = ks_start_xmit, .ndo_set_mac_address = ks_set_mac_address, .ndo_set_rx_mode = ks_set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/microchip/enc28j60.c b/drivers/net/ethernet/microchip/enc28j60.c index 0a26b11ca8f6..045b9106c0ff 100644 --- a/drivers/net/ethernet/microchip/enc28j60.c +++ b/drivers/net/ethernet/microchip/enc28j60.c @@ -1544,7 +1544,6 @@ static const struct net_device_ops enc28j60_netdev_ops = { .ndo_set_rx_mode = enc28j60_set_multicast_list, .ndo_set_mac_address = enc28j60_set_mac_address, .ndo_tx_timeout = enc28j60_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index 4367dd6879a2..9774b50cff6e 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -444,7 +444,6 @@ static struct net_device_ops moxart_netdev_ops = { .ndo_set_rx_mode = moxart_mac_set_rx_mode, .ndo_set_mac_address = moxart_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int moxart_mac_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/natsemi/jazzsonic.c b/drivers/net/ethernet/natsemi/jazzsonic.c index acf3f11e38cc..a6caeb567c0d 100644 --- a/drivers/net/ethernet/natsemi/jazzsonic.c +++ b/drivers/net/ethernet/natsemi/jazzsonic.c @@ -110,7 +110,6 @@ static const struct net_device_ops sonic_netdev_ops = { .ndo_get_stats = sonic_get_stats, .ndo_set_rx_mode = sonic_multicast_list, .ndo_tx_timeout = sonic_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c index d98f5b8a1c66..3ca6ae7caf55 100644 --- a/drivers/net/ethernet/natsemi/macsonic.c +++ b/drivers/net/ethernet/natsemi/macsonic.c @@ -190,7 +190,6 @@ static const struct net_device_ops macsonic_netdev_ops = { .ndo_tx_timeout = sonic_tx_timeout, .ndo_get_stats = sonic_get_stats, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/natsemi/xtsonic.c b/drivers/net/ethernet/natsemi/xtsonic.c index 7007d212f3e4..9ee0f69a83c0 100644 --- a/drivers/net/ethernet/natsemi/xtsonic.c +++ b/drivers/net/ethernet/natsemi/xtsonic.c @@ -124,7 +124,6 @@ static const struct net_device_ops xtsonic_netdev_ops = { .ndo_set_rx_mode = sonic_multicast_list, .ndo_tx_timeout = sonic_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/netx-eth.c b/drivers/net/ethernet/netx-eth.c index adbc47f2d132..df4188cb43e0 100644 --- a/drivers/net/ethernet/netx-eth.c +++ b/drivers/net/ethernet/netx-eth.c @@ -304,7 +304,6 @@ static const struct net_device_ops netx_eth_netdev_ops = { .ndo_start_xmit = netx_eth_hard_start_xmit, .ndo_tx_timeout = netx_eth_timeout, .ndo_set_rx_mode = netx_eth_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c index 712d8bcb7d8c..119f6dca71f0 100644 --- a/drivers/net/ethernet/nuvoton/w90p910_ether.c +++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c @@ -915,7 +915,6 @@ static const struct net_device_ops w90p910_ether_netdev_ops = { .ndo_set_mac_address = w90p910_set_mac_address, .ndo_do_ioctl = w90p910_ether_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static void __init get_mac_address(struct net_device *dev) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index 8e13ec84c538..dd6b0d0f7fa5 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1256,7 +1256,6 @@ static const struct net_device_ops lpc_netdev_ops = { .ndo_do_ioctl = lpc_eth_ioctl, .ndo_set_mac_address = lpc_set_mac_address, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int lpc_eth_drv_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index 91be2f02ef1c..2d04679a923a 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -568,7 +568,6 @@ static const struct net_device_ops hamachi_netdev_ops = { .ndo_start_xmit = hamachi_start_xmit, .ndo_get_stats = hamachi_get_stats, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_tx_timeout = hamachi_tx_timeout, diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index fb1d1031b091..2a2ca5fa0c69 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -360,7 +360,6 @@ static const struct net_device_ops netdev_ops = { .ndo_stop = yellowfin_close, .ndo_start_xmit = yellowfin_start_xmit, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_do_ioctl = netdev_ioctl, diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index b09a6b80d107..5c100ab86c00 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3755,7 +3755,6 @@ static const struct net_device_ops ql3xxx_netdev_ops = { .ndo_open = ql3xxx_open, .ndo_start_xmit = ql3xxx_send, .ndo_stop = ql3xxx_close, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ql3xxx_set_mac_address, .ndo_tx_timeout = ql3xxx_tx_timeout, diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c index 5ef5d728c250..4ff4e0491406 100644 --- a/drivers/net/ethernet/rdc/r6040.c +++ b/drivers/net/ethernet/rdc/r6040.c @@ -969,7 +969,6 @@ static const struct net_device_ops r6040_netdev_ops = { .ndo_start_xmit = r6040_start_xmit, .ndo_get_stats = r6040_get_stats, .ndo_set_rx_mode = r6040_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_do_ioctl = r6040_ioctl, diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c index 5cb96785fb63..570ed3bd3cbf 100644 --- a/drivers/net/ethernet/realtek/atp.c +++ b/drivers/net/ethernet/realtek/atp.c @@ -245,7 +245,6 @@ static const struct net_device_ops atp_netdev_ops = { .ndo_start_xmit = atp_send_packet, .ndo_set_rx_mode = set_rx_mode, .ndo_tx_timeout = tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 630536bc72f9..27cfec3154c8 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1780,7 +1780,6 @@ static const struct net_device_ops ravb_netdev_ops = { .ndo_do_ioctl = ravb_do_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, }; /* MDIO bus init function */ diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 05b0dc55de77..e443695c2757 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2914,7 +2914,6 @@ static const struct net_device_ops sh_eth_netdev_ops = { .ndo_do_ioctl = sh_eth_do_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, }; static const struct net_device_ops sh_eth_netdev_ops_tsu = { @@ -2929,7 +2928,6 @@ static const struct net_device_ops sh_eth_netdev_ops_tsu = { .ndo_do_ioctl = sh_eth_do_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = eth_change_mtu, }; #ifdef CONFIG_OF diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index bdac936a68bc..244c1e171017 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -745,7 +745,6 @@ static const struct net_device_ops ether3_netdev_ops = { .ndo_set_rx_mode = ether3_setmulticastlist, .ndo_tx_timeout = ether3_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index c2bd5378ffda..ed34196028b8 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -714,7 +714,6 @@ static const struct net_device_ops sgiseeq_netdev_ops = { .ndo_tx_timeout = timeout, .ndo_set_rx_mode = sgiseeq_set_multicast, .ndo_set_mac_address = sgiseeq_set_mac_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 7a254da85dd7..42051ab98cf0 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -1225,7 +1225,6 @@ static const struct net_device_ops ioc3_netdev_ops = { .ndo_do_ioctl = ioc3_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ioc3_set_mac_address, - .ndo_change_mtu = eth_change_mtu, }; static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index aaa80f13859b..69d2d30e5ef1 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -815,7 +815,6 @@ static const struct net_device_ops meth_netdev_ops = { .ndo_start_xmit = meth_tx, .ndo_do_ioctl = meth_ioctl, .ndo_tx_timeout = meth_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_set_rx_mode = meth_set_rx_mode, diff --git a/drivers/net/ethernet/silan/sc92031.c b/drivers/net/ethernet/silan/sc92031.c index 7426f8b21252..6c2e2b311c16 100644 --- a/drivers/net/ethernet/silan/sc92031.c +++ b/drivers/net/ethernet/silan/sc92031.c @@ -1386,7 +1386,6 @@ static const struct net_device_ops sc92031_netdev_ops = { .ndo_open = sc92031_open, .ndo_stop = sc92031_stop, .ndo_set_rx_mode = sc92031_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_tx_timeout = sc92031_tx_timeout, diff --git a/drivers/net/ethernet/sis/sis190.c b/drivers/net/ethernet/sis/sis190.c index 27be6c869315..210e35d079dd 100644 --- a/drivers/net/ethernet/sis/sis190.c +++ b/drivers/net/ethernet/sis/sis190.c @@ -1833,7 +1833,6 @@ static const struct net_device_ops sis190_netdev_ops = { .ndo_start_xmit = sis190_start_xmit, .ndo_tx_timeout = sis190_tx_timeout, .ndo_set_rx_mode = sis190_set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = sis190_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 6f85276376e8..39fca6c0b68d 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -400,7 +400,6 @@ static const struct net_device_ops sis900_netdev_ops = { .ndo_start_xmit = sis900_start_xmit, .ndo_set_config = sis900_set_config, .ndo_set_rx_mode = set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_do_ioctl = mii_ioctl, diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index 7186b89269ad..fe9760ffab51 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -313,7 +313,6 @@ static const struct net_device_ops epic_netdev_ops = { .ndo_get_stats = epic_get_stats, .ndo_set_rx_mode = set_rx_mode, .ndo_do_ioctl = netdev_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index cb49c9654f0a..4f19c6166182 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1753,7 +1753,6 @@ static const struct net_device_ops smc911x_netdev_ops = { .ndo_start_xmit = smc911x_hard_start_xmit, .ndo_tx_timeout = smc911x_timeout, .ndo_set_rx_mode = smc911x_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/smsc/smc9194.c b/drivers/net/ethernet/smsc/smc9194.c index d496888b85d3..c8d84679ede7 100644 --- a/drivers/net/ethernet/smsc/smc9194.c +++ b/drivers/net/ethernet/smsc/smc9194.c @@ -809,7 +809,6 @@ static const struct net_device_ops smc_netdev_ops = { .ndo_start_xmit = smc_wait_to_send_packet, .ndo_tx_timeout = smc_timeout, .ndo_set_rx_mode = smc_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index db3c696d7002..f1c75e291e55 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -294,7 +294,6 @@ static const struct net_device_ops smc_netdev_ops = { .ndo_set_config = s9k_config, .ndo_set_rx_mode = set_rx_mode, .ndo_do_ioctl = smc_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/smsc/smc91x.c b/drivers/net/ethernet/smsc/smc91x.c index 73212590d04a..9b4780f87863 100644 --- a/drivers/net/ethernet/smsc/smc91x.c +++ b/drivers/net/ethernet/smsc/smc91x.c @@ -1762,7 +1762,6 @@ static const struct net_device_ops smc_netdev_ops = { .ndo_start_xmit = smc_hard_start_xmit, .ndo_tx_timeout = smc_timeout, .ndo_set_rx_mode = smc_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index e9b8579e6241..cdb343f0c6e0 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -2152,7 +2152,6 @@ static const struct net_device_ops smsc911x_netdev_ops = { .ndo_get_stats = smsc911x_get_stats, .ndo_set_rx_mode = smsc911x_set_multicast_list, .ndo_do_ioctl = smsc911x_do_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = smsc911x_set_mac_address, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index aa4f9d2d8fa9..ea89ef3b48fb 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -1064,7 +1064,6 @@ static const struct net_device_ops bigmac_ops = { .ndo_get_stats = bigmac_get_stats, .ndo_set_rx_mode = bigmac_set_multicast, .ndo_tx_timeout = bigmac_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index cf4dcff051d5..ca96408058b0 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -2669,7 +2669,6 @@ static const struct net_device_ops hme_netdev_ops = { .ndo_tx_timeout = happy_meal_tx_timeout, .ndo_get_stats = happy_meal_get_stats, .ndo_set_rx_mode = happy_meal_set_multicast, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c index 9b825780b3be..c5ef711f6567 100644 --- a/drivers/net/ethernet/sun/sunqe.c +++ b/drivers/net/ethernet/sun/sunqe.c @@ -823,7 +823,6 @@ static const struct net_device_ops qec_ops = { .ndo_start_xmit = qe_start_xmit, .ndo_set_rx_mode = qe_set_multicast, .ndo_tx_timeout = qe_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c index fa0cfda24fd9..c56e7030c44e 100644 --- a/drivers/net/ethernet/ti/cpmac.c +++ b/drivers/net/ethernet/ti/cpmac.c @@ -1068,7 +1068,6 @@ static const struct net_device_ops cpmac_netdev_ops = { .ndo_tx_timeout = cpmac_tx_timeout, .ndo_set_rx_mode = cpmac_set_multicast_list, .ndo_do_ioctl = cpmac_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 6d4b8a2e0cf6..b1ddf89a19be 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1883,7 +1883,6 @@ static const struct net_device_ops cpsw_netdev_ops = { .ndo_set_mac_address = cpsw_ndo_set_mac_address, .ndo_do_ioctl = cpsw_ndo_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_tx_timeout = cpsw_ndo_tx_timeout, .ndo_set_rx_mode = cpsw_ndo_set_rx_mode, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index ece0ea0f6b38..4a3eeb10d45b 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -772,7 +772,6 @@ static const struct net_device_ops tlan_netdev_ops = { .ndo_get_stats = tlan_get_stats, .ndo_set_rx_mode = tlan_set_multicast_list, .ndo_do_ioctl = tlan_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index 5b01b3fa9fec..3be61ed28741 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -747,7 +747,6 @@ static const struct net_device_ops tc35815_netdev_ops = { .ndo_tx_timeout = tc35815_tx_timeout, .ndo_do_ioctl = tc35815_ioctl, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = tc35815_poll_controller, diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c index 8fd131207ee1..f153ad729ce5 100644 --- a/drivers/net/ethernet/tundra/tsi108_eth.c +++ b/drivers/net/ethernet/tundra/tsi108_eth.c @@ -1548,7 +1548,6 @@ static const struct net_device_ops tsi108_netdev_ops = { .ndo_do_ioctl = tsi108_do_ioctl, .ndo_set_mac_address = tsi108_set_mac, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index 9d14731cdcb1..ba5c54249055 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -890,7 +890,6 @@ static const struct net_device_ops rhine_netdev_ops = { .ndo_start_xmit = rhine_start_tx, .ndo_get_stats64 = rhine_get_stats64, .ndo_set_rx_mode = rhine_set_rx_mode, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_do_ioctl = netdev_ioctl, diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index d2349a1bc6ba..e1296ef2cf66 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -1045,7 +1045,6 @@ static const struct net_device_ops w5100_netdev_ops = { .ndo_set_rx_mode = w5100_set_rx_mode, .ndo_set_mac_address = w5100_set_macaddr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int w5100_mmio_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c index ca31a57dbc86..724fabd38a23 100644 --- a/drivers/net/ethernet/wiznet/w5300.c +++ b/drivers/net/ethernet/wiznet/w5300.c @@ -536,7 +536,6 @@ static const struct net_device_ops w5300_netdev_ops = { .ndo_set_rx_mode = w5300_set_rx_mode, .ndo_set_mac_address = w5300_set_macaddr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static int w5300_hw_probe(struct platform_device *pdev) diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index ddced28e8247..3b08ec766076 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -466,7 +466,6 @@ static const struct net_device_ops netdev_ops = { .ndo_set_config = do_config, .ndo_do_ioctl = do_ioctl, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 7f127dc1b7ba..46cc33b9e926 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -1379,7 +1379,6 @@ static const struct net_device_ops ixp4xx_netdev_ops = { .ndo_start_xmit = eth_xmit, .ndo_set_rx_mode = eth_set_mcast_list, .ndo_do_ioctl = eth_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c index 9c4b41a4df7d..3c55ea357f35 100644 --- a/drivers/net/plip/plip.c +++ b/drivers/net/plip/plip.c @@ -270,7 +270,6 @@ static const struct net_device_ops plip_netdev_ops = { .ndo_stop = plip_close, .ndo_start_xmit = plip_tx_packet, .ndo_do_ioctl = plip_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index aad0b59d41e3..8b8b53259783 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -141,7 +141,6 @@ static const struct net_device_ops sb1000_netdev_ops = { .ndo_start_xmit = sb1000_start_xmit, .ndo_do_ioctl = sb1000_dev_ioctl, .ndo_stop = sb1000_close, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index d9ca05d3ac8e..a1f2f6f1e614 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -761,7 +761,6 @@ static const struct net_device_ops catc_netdev_ops = { .ndo_tx_timeout = catc_tx_timeout, .ndo_set_rx_mode = catc_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index 66b34ddbe216..338aed5da14d 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -982,7 +982,6 @@ static const struct net_device_ops kaweth_netdev_ops = { .ndo_tx_timeout = kaweth_tx_timeout, .ndo_set_rx_mode = kaweth_set_rx_mode, .ndo_get_stats = kaweth_netdev_stats, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 1434e5dd5f9c..399f7ee57aea 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -1273,7 +1273,6 @@ static const struct net_device_ops pegasus_netdev_ops = { .ndo_set_rx_mode = pegasus_set_multicast, .ndo_get_stats = pegasus_netdev_stats, .ndo_tx_timeout = pegasus_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 44d439f50961..2886946ca371 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4113,7 +4113,8 @@ static int rtl8152_change_mtu(struct net_device *dev, int new_mtu) switch (tp->version) { case RTL_VER_01: case RTL_VER_02: - return eth_change_mtu(dev, new_mtu); + dev->mtu = new_mtu; + return 0; default: break; } diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 7c72bfac89d0..93a1bda1c1e5 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -847,7 +847,6 @@ static const struct net_device_ops rtl8150_netdev_ops = { .ndo_set_rx_mode = rtl8150_set_multicast, .ndo_set_mac_address = rtl8150_set_mac_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 3a421ca8a4d0..3f83be98d469 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -211,7 +211,6 @@ static const struct net_device_ops sbni_netdev_ops = { .ndo_start_xmit = sbni_start_xmit, .ndo_set_rx_mode = set_multicast_list, .ndo_do_ioctl = sbni_ioctl, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/intersil/prism54/islpci_dev.c b/drivers/net/wireless/intersil/prism54/islpci_dev.c index 84a42012aeae..325176d4d796 100644 --- a/drivers/net/wireless/intersil/prism54/islpci_dev.c +++ b/drivers/net/wireless/intersil/prism54/islpci_dev.c @@ -808,7 +808,6 @@ static const struct net_device_ops islpci_netdev_ops = { .ndo_start_xmit = islpci_eth_transmit, .ndo_tx_timeout = islpci_eth_tx_timeout, .ndo_set_mac_address = prism54_set_mac_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 431f13b4faf6..e95b79bccf9b 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2791,7 +2791,6 @@ static void mac80211_hwsim_free(void) static const struct net_device_ops hwsim_netdev_ops = { .ndo_start_xmit = hwsim_mon_xmit, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c index 8541cbed786d..e3500203715c 100644 --- a/drivers/net/wireless/marvell/libertas/main.c +++ b/drivers/net/wireless/marvell/libertas/main.c @@ -945,7 +945,6 @@ static const struct net_device_ops lbs_netdev_ops = { .ndo_start_xmit = lbs_hard_start_xmit, .ndo_set_mac_address = lbs_set_mac_address, .ndo_set_rx_mode = lbs_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 0881ba8535f4..4fdc7223c894 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -272,7 +272,6 @@ static const struct net_device_ops ray_netdev_ops = { .ndo_set_config = ray_dev_config, .ndo_get_stats = ray_get_stats, .ndo_set_rx_mode = set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index 932f3f81e8cf..d9d29ab88184 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -1853,7 +1853,6 @@ static const struct net_device_ops wl3501_netdev_ops = { .ndo_stop = wl3501_close, .ndo_start_xmit = wl3501_hard_start_xmit, .ndo_tx_timeout = wl3501_tx_timeout, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/wireless/zydas/zd1201.c b/drivers/net/wireless/zydas/zd1201.c index dea049b2556f..de7ff395977a 100644 --- a/drivers/net/wireless/zydas/zd1201.c +++ b/drivers/net/wireless/zydas/zd1201.c @@ -1724,7 +1724,6 @@ static const struct net_device_ops zd1201_netdev_ops = { .ndo_tx_timeout = zd1201_tx_timeout, .ndo_set_rx_mode = zd1201_set_multicast, .ndo_set_mac_address = zd1201_set_mac_address, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/staging/rtl8188eu/os_dep/mon.c b/drivers/staging/rtl8188eu/os_dep/mon.c index d976e5e18d50..c9c9821cfc32 100644 --- a/drivers/staging/rtl8188eu/os_dep/mon.c +++ b/drivers/staging/rtl8188eu/os_dep/mon.c @@ -145,7 +145,6 @@ static netdev_tx_t mon_xmit(struct sk_buff *skb, struct net_device *dev) static const struct net_device_ops mon_netdev_ops = { .ndo_start_xmit = mon_xmit, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c index 4c30eea45f89..5f53fbd565ef 100644 --- a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c +++ b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c @@ -2545,7 +2545,6 @@ static const struct net_device_ops rtl8192_netdev_ops = { .ndo_set_rx_mode = _rtl92e_set_multicast, .ndo_set_mac_address = _rtl92e_set_mac_adr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_start_xmit = rtllib_xmit, }; diff --git a/drivers/staging/rtl8192u/r8192U_core.c b/drivers/staging/rtl8192u/r8192U_core.c index 457eeb5f5239..fdb03dccb449 100644 --- a/drivers/staging/rtl8192u/r8192U_core.c +++ b/drivers/staging/rtl8192u/r8192U_core.c @@ -4930,7 +4930,6 @@ static const struct net_device_ops rtl8192_netdev_ops = { .ndo_set_rx_mode = r8192_set_multicast, .ndo_set_mac_address = r8192_set_mac_adr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_start_xmit = ieee80211_xmit, }; diff --git a/drivers/staging/slicoss/slicoss.c b/drivers/staging/slicoss/slicoss.c index 062307ad7fed..2802b900f8ee 100644 --- a/drivers/staging/slicoss/slicoss.c +++ b/drivers/staging/slicoss/slicoss.c @@ -2880,7 +2880,6 @@ static const struct net_device_ops slic_netdev_ops = { .ndo_get_stats = slic_get_stats, .ndo_set_rx_mode = slic_mcast_set_list, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, }; static u32 slic_card_locate(struct adapter *adapter) diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 117d02e0fc31..864d6f2b2cb0 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -35,6 +35,8 @@ #define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */ #define ETH_FCS_LEN 4 /* Octets in the FCS */ +#define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */ + /* * These are the defined Ethernet Protocol ID's. */ diff --git a/net/atm/br2684.c b/net/atm/br2684.c index aa0047c5c467..c7d82f4e8422 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -620,14 +620,12 @@ error: static const struct net_device_ops br2684_netdev_ops = { .ndo_start_xmit = br2684_start_xmit, .ndo_set_mac_address = br2684_mac_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; static const struct net_device_ops br2684_netdev_ops_routed = { .ndo_start_xmit = br2684_start_xmit, .ndo_set_mac_address = br2684_mac_addr, - .ndo_change_mtu = eth_change_mtu }; static void br2684_setup(struct net_device *netdev) diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index f4fcb4a9d5c1..0f25ddc319e2 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -211,7 +211,6 @@ static const struct net_device_ops bnep_netdev_ops = { .ndo_set_rx_mode = bnep_net_set_mc_list, .ndo_set_mac_address = bnep_net_set_mac_addr, .ndo_tx_timeout = bnep_net_timeout, - .ndo_change_mtu = eth_change_mtu, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 66dff5e3d772..f983c102ebe3 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -322,8 +322,7 @@ EXPORT_SYMBOL(eth_mac_addr); */ int eth_change_mtu(struct net_device *dev, int new_mtu) { - if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) - return -EINVAL; + netdev_warn(dev, "%s is deprecated\n", __func__); dev->mtu = new_mtu; return 0; } @@ -357,6 +356,8 @@ void ether_setup(struct net_device *dev) dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = 1000; /* Ethernet wants good queues */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index d8b7267280c3..8192eaea4ecd 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -51,7 +51,6 @@ static const struct net_device_ops irlan_eth_netdev_ops = { .ndo_stop = irlan_eth_close, .ndo_start_xmit = irlan_eth_xmit, .ndo_set_rx_mode = irlan_eth_set_multicast_list, - .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, }; -- cgit v1.2.3-71-gd317 From d894be57ca92c8a8819ab544d550809e8731137b Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Thu, 20 Oct 2016 13:55:16 -0400 Subject: ethernet: use net core MTU range checking in more drivers Somehow, I missed a healthy number of ethernet drivers in the last pass. Most of these drivers either were in need of an updated max_mtu to make jumbo frames possible to enable again. In a few cases, also setting a different min_mtu to match previous lower bounds. There are also a few drivers that had no upper bounds checking, so they're getting a brand new ETH_MAX_MTU that is identical to IP_MAX_MTU, but accessible by includes all ethernet and ethernet-like drivers all have already. acenic: - min_mtu = 0, max_mtu = 9000 amazon/ena: - min_mtu = 128, max_mtu = adapter->max_mtu amd/xgbe: - min_mtu = 0, max_mtu = 9000 sb1250: - min_mtu = 0, max_mtu = 1518 cxgb3: - min_mtu = 81, max_mtu = 65535 cxgb4: - min_mtu = 81, max_mtu = 9600 cxgb4vf: - min_mtu = 81, max_mtu = 65535 benet: - min_mtu = 256, max_mtu = 9000 ibmveth: - min_mtu = 68, max_mtu = 65535 ibmvnic: - min_mtu = adapter->min_mtu, max_mtu = adapter->max_mtu - remove now redundant ibmvnic_change_mtu jme: - min_mtu = 1280, max_mtu = 9202 mv643xx_eth: - min_mtu = 64, max_mtu = 9500 mlxsw: - min_mtu = 0, max_mtu = 65535 - Basically bypassing the core checks, and instead relying on dynamic checks in the respective switch drivers' ndo_change_mtu functions ns83820: - min_mtu = 0 - remove redundant ns83820_change_mtu, only checked for mtu > 1500 netxen: - min_mtu = 0, max_mtu = 8000 (P2), max_mtu = 9600 (P3) qlge: - min_mtu = 1500, max_mtu = 9000 - driver only supports setting mtu to 1500 or 9000, so the core check only rules out < 1500 and > 9000, qlge_change_mtu still needs to check that the value is 1500 or 9000 qualcomm/emac: - min_mtu = 46, max_mtu = 9194 xilinx_axienet: - min_mtu = 64, max_mtu = 9000 Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking") CC: netdev@vger.kernel.org CC: Jes Sorensen CC: Netanel Belgazal CC: Tom Lendacky CC: Santosh Raspatur CC: Hariprasad S CC: Sathya Perla CC: Ajit Khaparde CC: Sriharsha Basavapatna CC: Somnath Kotur CC: Thomas Falcon CC: John Allen CC: Guo-Fu Tseng CC: Sebastian Hesselbarth CC: Jiri Pirko CC: Ido Schimmel CC: Manish Chopra CC: Sony Chacko CC: Rajesh Borundia CC: Timur Tabi CC: Anirudha Sarangi CC: John Linn Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- drivers/net/ethernet/alteon/acenic.c | 5 ++--- drivers/net/ethernet/amazon/ena/ena_netdev.c | 9 ++------- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 5 ----- drivers/net/ethernet/amd/xgbe/xgbe-main.c | 2 ++ drivers/net/ethernet/broadcom/sb1250-mac.c | 12 ++---------- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 6 ++++-- .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 6 ++---- drivers/net/ethernet/emulex/benet/be_main.c | 22 ++++------------------ drivers/net/ethernet/ibm/ibmveth.c | 6 +++--- drivers/net/ethernet/ibm/ibmvnic.c | 16 ++++------------ drivers/net/ethernet/jme.c | 12 ++++-------- drivers/net/ethernet/marvell/mv643xx_eth.c | 7 ++++--- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 3 +++ drivers/net/ethernet/mellanox/mlxsw/switchx2.c | 3 +++ drivers/net/ethernet/natsemi/ns83820.c | 11 ++--------- drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c | 12 ------------ .../net/ethernet/qlogic/netxen/netxen_nic_main.c | 7 +++++++ drivers/net/ethernet/qlogic/qlge/qlge_main.c | 7 +++++++ drivers/net/ethernet/qualcomm/emac/emac.c | 13 ++++++------- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 7 ++++--- include/uapi/linux/if_ether.h | 1 + 22 files changed, 68 insertions(+), 108 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index b90a26b13fdf..a5c1e290677a 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -474,6 +474,8 @@ static int acenic_probe_one(struct pci_dev *pdev, dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; dev->watchdog_timeo = 5*HZ; + dev->min_mtu = 0; + dev->max_mtu = ACE_JUMBO_MTU; dev->netdev_ops = &ace_netdev_ops; dev->ethtool_ops = &ace_ethtool_ops; @@ -2548,9 +2550,6 @@ static int ace_change_mtu(struct net_device *dev, int new_mtu) struct ace_private *ap = netdev_priv(dev); struct ace_regs __iomem *regs = ap->regs; - if (new_mtu > ACE_JUMBO_MTU) - return -EINVAL; - writel(new_mtu + ETH_HLEN + 4, ®s->IfMtu); dev->mtu = new_mtu; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index bfeaec5bd7b9..2a55ab00686a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -103,13 +103,6 @@ static int ena_change_mtu(struct net_device *dev, int new_mtu) struct ena_adapter *adapter = netdev_priv(dev); int ret; - if ((new_mtu > adapter->max_mtu) || (new_mtu < ENA_MIN_MTU)) { - netif_err(adapter, drv, dev, - "Invalid MTU setting. new_mtu: %d\n", new_mtu); - - return -EINVAL; - } - ret = ena_com_set_dev_mtu(adapter->ena_dev, new_mtu); if (!ret) { netif_dbg(adapter, drv, dev, "set MTU to %d\n", new_mtu); @@ -2755,6 +2748,8 @@ static void ena_set_conf_feat_params(struct ena_adapter *adapter, ena_set_dev_offloads(feat, netdev); adapter->max_mtu = feat->dev_attr.max_mtu; + netdev->max_mtu = adapter->max_mtu; + netdev->min_mtu = ENA_MIN_MTU; } static int ena_rss_init_default(struct ena_adapter *adapter) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7f9216db026f..c4e668208e04 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -257,11 +257,6 @@ static int xgbe_calc_rx_buf_size(struct net_device *netdev, unsigned int mtu) { unsigned int rx_buf_size; - if (mtu > XGMAC_JUMBO_PACKET_MTU) { - netdev_alert(netdev, "MTU exceeds maximum supported value\n"); - return -EINVAL; - } - rx_buf_size = mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; rx_buf_size = clamp_val(rx_buf_size, XGBE_RX_MIN_BUF_SIZE, PAGE_SIZE); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index 9de078819aa6..667e1209a2f5 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -738,6 +738,8 @@ static int xgbe_probe(struct platform_device *pdev) pdata->netdev_features = netdev->features; netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->min_mtu = 0; + netdev->max_mtu = XGMAC_JUMBO_PACKET_MTU; /* Use default watchdog timeout */ netdev->watchdog_timeo = 0; diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c index f1b81187a201..cb312e4c89f4 100644 --- a/drivers/net/ethernet/broadcom/sb1250-mac.c +++ b/drivers/net/ethernet/broadcom/sb1250-mac.c @@ -2147,15 +2147,6 @@ static void sbmac_setmulti(struct sbmac_softc *sc) } } -static int sb1250_change_mtu(struct net_device *_dev, int new_mtu) -{ - if (new_mtu > ENET_PACKET_SIZE) - return -EINVAL; - _dev->mtu = new_mtu; - pr_info("changing the mtu to %d\n", new_mtu); - return 0; -} - static const struct net_device_ops sbmac_netdev_ops = { .ndo_open = sbmac_open, .ndo_stop = sbmac_close, @@ -2163,7 +2154,6 @@ static const struct net_device_ops sbmac_netdev_ops = { .ndo_set_rx_mode = sbmac_set_rx_mode, .ndo_tx_timeout = sbmac_tx_timeout, .ndo_do_ioctl = sbmac_mii_ioctl, - .ndo_change_mtu = sb1250_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -2229,6 +2219,8 @@ static int sbmac_init(struct platform_device *pldev, long long base) dev->netdev_ops = &sbmac_netdev_ops; dev->watchdog_timeo = TX_TIMEOUT; + dev->max_mtu = 0; + dev->max_mtu = ENET_PACKET_SIZE; netif_napi_add(dev, &sc->napi, sbmac_poll, 16); diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 43da891fab97..092b3c16440b 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2531,8 +2531,6 @@ static int cxgb_change_mtu(struct net_device *dev, int new_mtu) struct adapter *adapter = pi->adapter; int ret; - if (new_mtu < 81) /* accommodate SACK */ - return -EINVAL; if ((ret = t3_mac_set_mtu(&pi->mac, new_mtu))) return ret; dev->mtu = new_mtu; @@ -3295,6 +3293,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->netdev_ops = &cxgb_netdev_ops; netdev->ethtool_ops = &cxgb_ethtool_ops; + netdev->min_mtu = 81; + netdev->max_mtu = ETH_MAX_MTU; } pci_set_drvdata(pdev, adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index f320497368f4..b0bb23f95beb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2502,8 +2502,6 @@ static int cxgb_change_mtu(struct net_device *dev, int new_mtu) int ret; struct port_info *pi = netdev_priv(dev); - if (new_mtu < 81 || new_mtu > MAX_MTU) /* accommodate SACK */ - return -EINVAL; ret = t4_set_rxmode(pi->adapter, pi->adapter->pf, pi->viid, new_mtu, -1, -1, -1, -1, true); if (!ret) @@ -4803,6 +4801,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->priv_flags |= IFF_UNICAST_FLT; + /* MTU range: 81 - 9600 */ + netdev->min_mtu = 81; + netdev->max_mtu = MAX_MTU; + netdev->netdev_ops = &cxgb4_netdev_ops; #ifdef CONFIG_CHELSIO_T4_DCB netdev->dcbnl_ops = &cxgb4_dcb_ops; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 100b2cc064a3..5d4da0e8acaa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1108,10 +1108,6 @@ static int cxgb4vf_change_mtu(struct net_device *dev, int new_mtu) int ret; struct port_info *pi = netdev_priv(dev); - /* accommodate SACK */ - if (new_mtu < 81) - return -EINVAL; - ret = t4vf_set_rxmode(pi->adapter, pi->viid, new_mtu, -1, -1, -1, -1, true); if (!ret) @@ -2966,6 +2962,8 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_HIGHDMA; netdev->priv_flags |= IFF_UNICAST_FLT; + netdev->min_mtu = 81; + netdev->max_mtu = ETH_MAX_MTU; netdev->netdev_ops = &cxgb4vf_netdev_ops; netdev->ethtool_ops = &cxgb4vf_ethtool_ops; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index cece8a08edca..3f6152cc648c 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1406,23 +1406,6 @@ drop: return NETDEV_TX_OK; } -static int be_change_mtu(struct net_device *netdev, int new_mtu) -{ - struct be_adapter *adapter = netdev_priv(netdev); - struct device *dev = &adapter->pdev->dev; - - if (new_mtu < BE_MIN_MTU || new_mtu > BE_MAX_MTU) { - dev_info(dev, "MTU must be between %d and %d bytes\n", - BE_MIN_MTU, BE_MAX_MTU); - return -EINVAL; - } - - dev_info(dev, "MTU changed from %d to %d bytes\n", - netdev->mtu, new_mtu); - netdev->mtu = new_mtu; - return 0; -} - static inline bool be_in_all_promisc(struct be_adapter *adapter) { return (adapter->if_flags & BE_IF_FLAGS_ALL_PROMISCUOUS) == @@ -5216,7 +5199,6 @@ static const struct net_device_ops be_netdev_ops = { .ndo_start_xmit = be_xmit, .ndo_set_rx_mode = be_set_rx_mode, .ndo_set_mac_address = be_mac_addr_set, - .ndo_change_mtu = be_change_mtu, .ndo_get_stats64 = be_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = be_vlan_add_vid, @@ -5266,6 +5248,10 @@ static void be_netdev_init(struct net_device *netdev) netdev->netdev_ops = &be_netdev_ops; netdev->ethtool_ops = &be_ethtool_ops; + + /* MTU range: 256 - 9000 */ + netdev->min_mtu = BE_MIN_MTU; + netdev->max_mtu = BE_MAX_MTU; } static void be_cleanup(struct be_adapter *adapter) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index ebe60719e489..29c05d0d79a9 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1349,9 +1349,6 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu) int i, rc; int need_restart = 0; - if (new_mtu < IBMVETH_MIN_MTU) - return -EINVAL; - for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) break; @@ -1551,6 +1548,9 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id) netdev->hw_features |= NETIF_F_TSO; } + netdev->min_mtu = IBMVETH_MIN_MTU; + netdev->min_mtu = ETH_MAX_MTU; + memcpy(netdev->dev_addr, mac_addr_p, ETH_ALEN); if (firmware_has_feature(FW_FEATURE_CMO)) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index bfe17d9c022d..657206be7ba9 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -902,17 +902,6 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) return 0; } -static int ibmvnic_change_mtu(struct net_device *netdev, int new_mtu) -{ - struct ibmvnic_adapter *adapter = netdev_priv(netdev); - - if (new_mtu > adapter->req_mtu || new_mtu < adapter->min_mtu) - return -EINVAL; - - netdev->mtu = new_mtu; - return 0; -} - static void ibmvnic_tx_timeout(struct net_device *dev) { struct ibmvnic_adapter *adapter = netdev_priv(dev); @@ -1029,7 +1018,6 @@ static const struct net_device_ops ibmvnic_netdev_ops = { .ndo_set_rx_mode = ibmvnic_set_multi, .ndo_set_mac_address = ibmvnic_set_mac, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = ibmvnic_change_mtu, .ndo_tx_timeout = ibmvnic_tx_timeout, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ibmvnic_netpoll_controller, @@ -2638,10 +2626,12 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, break; case MIN_MTU: adapter->min_mtu = be64_to_cpu(crq->query_capability.number); + netdev->min_mtu = adapter->min_mtu; netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu); break; case MAX_MTU: adapter->max_mtu = be64_to_cpu(crq->query_capability.number); + netdev->max_mtu = adapter->max_mtu; netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu); break; case MAX_MULTICAST_FILTERS: @@ -3654,6 +3644,8 @@ static void handle_crq_init_rsp(struct work_struct *work) goto task_failed; netdev->real_num_tx_queues = adapter->req_tx_queues; + netdev->min_mtu = adapter->min_mtu; + netdev->max_mtu = adapter->max_mtu; if (adapter->failover) { adapter->failover = false; diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 836ebd8ee768..f9fcab54783c 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2357,14 +2357,6 @@ jme_change_mtu(struct net_device *netdev, int new_mtu) { struct jme_adapter *jme = netdev_priv(netdev); - if (new_mtu == jme->old_mtu) - return 0; - - if (((new_mtu + ETH_HLEN) > MAX_ETHERNET_JUMBO_PACKET_SIZE) || - ((new_mtu) < IPV6_MIN_MTU)) - return -EINVAL; - - netdev->mtu = new_mtu; netdev_update_features(netdev); @@ -3063,6 +3055,10 @@ jme_init_one(struct pci_dev *pdev, if (using_dac) netdev->features |= NETIF_F_HIGHDMA; + /* MTU range: 1280 - 9202*/ + netdev->min_mtu = IPV6_MIN_MTU; + netdev->max_mtu = MAX_ETHERNET_JUMBO_PACKET_SIZE - ETH_HLEN; + SET_NETDEV_DEV(netdev, &pdev->dev); pci_set_drvdata(pdev, netdev); diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 18e6bb6e3867..68675d83bdc5 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2585,9 +2585,6 @@ static int mv643xx_eth_change_mtu(struct net_device *dev, int new_mtu) { struct mv643xx_eth_private *mp = netdev_priv(dev); - if (new_mtu < 64 || new_mtu > 9500) - return -EINVAL; - dev->mtu = new_mtu; mv643xx_eth_recalc_skb_size(mp); tx_set_rate(mp, 1000000000, 16777216); @@ -3206,6 +3203,10 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->priv_flags |= IFF_UNICAST_FLT; dev->gso_max_segs = MV643XX_MAX_TSO_SEGS; + /* MTU range: 64 - 9500 */ + dev->min_mtu = 64; + dev->max_mtu = 9500; + SET_NETDEV_DEV(dev, &pdev->dev); if (mp->shared->win_protect) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 99805fd3d110..6d8cb22579ee 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2284,6 +2284,9 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC; dev->hw_features |= NETIF_F_HW_TC; + dev->min_mtu = 0; + dev->max_mtu = ETH_MAX_MTU; + /* Each packet needs to have a Tx header (metadata) on top all other * headers. */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c index c0c23e2f3275..66af63d99b8b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c +++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c @@ -994,6 +994,9 @@ static int mlxsw_sx_port_create(struct mlxsw_sx *mlxsw_sx, u8 local_port) dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG | NETIF_F_VLAN_CHALLENGED; + dev->min_mtu = 0; + dev->max_mtu = ETH_MAX_MTU; + /* Each packet needs to have a Tx header (metadata) on top all other * headers. */ diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 569ade6cf85c..a34631ed741d 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -1679,14 +1679,6 @@ static void ns83820_getmac(struct ns83820 *dev, u8 *mac) } } -static int ns83820_change_mtu(struct net_device *ndev, int new_mtu) -{ - if (new_mtu > RX_BUF_SIZE) - return -EINVAL; - ndev->mtu = new_mtu; - return 0; -} - static void ns83820_set_multicast(struct net_device *ndev) { struct ns83820 *dev = PRIV(ndev); @@ -1933,7 +1925,6 @@ static const struct net_device_ops netdev_ops = { .ndo_stop = ns83820_stop, .ndo_start_xmit = ns83820_hard_start_xmit, .ndo_get_stats = ns83820_get_stats, - .ndo_change_mtu = ns83820_change_mtu, .ndo_set_rx_mode = ns83820_set_multicast, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, @@ -2190,6 +2181,8 @@ static int ns83820_init_one(struct pci_dev *pci_dev, ndev->features |= NETIF_F_SG; ndev->features |= NETIF_F_IP_CSUM; + ndev->min_mtu = 0; + #ifdef NS83820_VLAN_ACCEL_SUPPORT /* We also support hardware vlan acceleration */ ndev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c index 2b10f1bcd151..a996801d442d 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c @@ -987,20 +987,8 @@ int netxen_send_lro_cleanup(struct netxen_adapter *adapter) int netxen_nic_change_mtu(struct net_device *netdev, int mtu) { struct netxen_adapter *adapter = netdev_priv(netdev); - int max_mtu; int rc = 0; - if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) - max_mtu = P3_MAX_MTU; - else - max_mtu = P2_MAX_MTU; - - if (mtu > max_mtu) { - printk(KERN_ERR "%s: mtu > %d bytes unsupported\n", - netdev->name, max_mtu); - return -EINVAL; - } - if (adapter->set_mtu) rc = adapter->set_mtu(adapter, mtu); diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 7a0281a36c28..561fb94c7267 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1572,6 +1572,13 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->physical_port = i; } + /* MTU range: 0 - 8000 (P2) or 9600 (P3) */ + netdev->min_mtu = 0; + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) + netdev->max_mtu = P3_MAX_MTU; + else + netdev->max_mtu = P2_MAX_MTU; + netxen_nic_clear_stats(adapter); err = netxen_setup_intr(adapter); diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index fd4a8e473f11..1409412ab39d 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4788,6 +4788,13 @@ static int qlge_probe(struct pci_dev *pdev, ndev->ethtool_ops = &qlge_ethtool_ops; ndev->watchdog_timeo = 10 * HZ; + /* MTU range: this driver only supports 1500 or 9000, so this only + * filters out values above or below, and we'll rely on + * qlge_change_mtu to make sure only 1500 or 9000 are allowed + */ + ndev->min_mtu = ETH_DATA_LEN; + ndev->max_mtu = 9000; + err = register_netdev(ndev); if (err) { dev_err(&pdev->dev, "net device registration failed.\n"); diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 9bf3b2b82e95..e4e1925d18a4 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -239,15 +239,8 @@ static void emac_rx_mode_set(struct net_device *netdev) /* Change the Maximum Transfer Unit (MTU) */ static int emac_change_mtu(struct net_device *netdev, int new_mtu) { - unsigned int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; struct emac_adapter *adpt = netdev_priv(netdev); - if ((max_frame < EMAC_MIN_ETH_FRAME_SIZE) || - (max_frame > EMAC_MAX_ETH_FRAME_SIZE)) { - netdev_err(adpt->netdev, "error: invalid MTU setting\n"); - return -EINVAL; - } - netif_info(adpt, hw, adpt->netdev, "changing MTU from %d to %d\n", netdev->mtu, new_mtu); @@ -679,6 +672,12 @@ static int emac_probe(struct platform_device *pdev) netdev->vlan_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_TSO | NETIF_F_TSO6; + /* MTU range: 46 - 9194 */ + netdev->min_mtu = EMAC_MIN_ETH_FRAME_SIZE - + (ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN); + netdev->max_mtu = EMAC_MAX_ETH_FRAME_SIZE - + (ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN); + INIT_WORK(&adpt->work_thread, emac_work_thread); /* Initialize queues */ diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index c688d68c39aa..c9c8a3be9f1b 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1034,9 +1034,6 @@ static int axienet_change_mtu(struct net_device *ndev, int new_mtu) XAE_TRL_SIZE) > lp->rxmem) return -EINVAL; - if ((new_mtu > XAE_JUMBO_MTU) || (new_mtu < 64)) - return -EINVAL; - ndev->mtu = new_mtu; return 0; @@ -1475,6 +1472,10 @@ static int axienet_probe(struct platform_device *pdev) ndev->netdev_ops = &axienet_netdev_ops; ndev->ethtool_ops = &axienet_ethtool_ops; + /* MTU range: 64 - 9000 */ + ndev->min_mtu = 64; + ndev->max_mtu = XAE_JUMBO_MTU; + lp = netdev_priv(ndev); lp->ndev = ndev; lp->dev = &pdev->dev; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 864d6f2b2cb0..3e5185e9ef03 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -36,6 +36,7 @@ #define ETH_FCS_LEN 4 /* Octets in the FCS */ #define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */ +#define ETH_MAX_MTU 0xFFFFU /* 65535, same as IP_MAX_MTU */ /* * These are the defined Ethernet Protocol ID's. -- cgit v1.2.3-71-gd317 From 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Oct 2016 12:46:33 +0200 Subject: bpf: add helper for retrieving current numa node id Use case is mainly for soreuseport to select sockets for the local numa node, but since generic, lets also add this for other networking and tracing program types. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 2 ++ 6 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c201017b5730..edcd96ded8aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -319,6 +319,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f09c70b97eca..374ef582ae18 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -426,6 +426,12 @@ enum bpf_func_id { */ BPF_FUNC_set_hash_invalid, + /** + * bpf_get_numa_node_id() + * Returns the id of the current NUMA node. + */ + BPF_FUNC_get_numa_node_id, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..82a04143368e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1043,6 +1043,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5dcb99281259..fa77311dadb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -422,6 +422,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: diff --git a/net/core/filter.c b/net/core/filter.c index 00351cdf7d0c..cd9e2ba66b0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2492,6 +2492,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3-71-gd317 From 432490f9d455fb842d70219f22d9d2c812371676 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 21 Oct 2016 13:03:44 +0300 Subject: net: ip, diag -- Add diag interface for raw sockets In criu we are actively using diag interface to collect sockets present in the system when dumping applications. And while for unix, tcp, udp[lite], packet, netlink it works as expected, the raw sockets do not have. Thus add it. v2: - add missing sock_put calls in raw_diag_dump_one (by eric.dumazet@) - implement @destroy for diag requests (by dsa@) v3: - add export of raw_abort for IPv6 (by dsa@) - pass net-admin flag into inet_sk_diag_fill due to changes in net-next branch (by dsa@) v4: - use @pad in struct inet_diag_req_v2 for raw socket protocol specification: raw module carries sockets which may have custom protocol passed from socket() syscall and sole @sdiag_protocol is not enough to match underlied ones - start reporting protocol specifed in socket() call when sockets are raw ones for the same reason: user space tools like ss may parse this attribute and use it for socket matching v5 (by eric.dumazet@): - use sock_hold in raw_sock_get instead of atomic_inc, we're holding (raw_v4_hashinfo|raw_v6_hashinfo)->lock when looking up so counter won't be zero here. v6: - use sdiag_raw_protocol() helper which will access @pad structure used for raw sockets protocol specification: we can't simply rename this member without breaking uapi v7: - sine sdiag_raw_protocol() helper is not suitable for uapi lets rather make an alias structure with proper names. __check_inet_diag_req_raw helper will catch if any of structure unintentionally changed. CC: David S. Miller CC: Eric Dumazet CC: David Ahern CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Andrey Vagin CC: Stephen Hemminger Signed-off-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- include/net/raw.h | 6 + include/net/rawv6.h | 7 ++ include/uapi/linux/inet_diag.h | 17 +++ net/ipv4/Kconfig | 8 ++ net/ipv4/Makefile | 1 + net/ipv4/inet_diag.c | 9 ++ net/ipv4/raw.c | 21 +++- net/ipv4/raw_diag.c | 261 +++++++++++++++++++++++++++++++++++++++++ net/ipv6/raw.c | 7 +- 9 files changed, 333 insertions(+), 4 deletions(-) create mode 100644 net/ipv4/raw_diag.c (limited to 'include/uapi/linux') diff --git a/include/net/raw.h b/include/net/raw.h index 3e789008394d..57c33dd22ec4 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -23,6 +23,12 @@ extern struct proto raw_prot; +extern struct raw_hashinfo raw_v4_hashinfo; +struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, + unsigned short num, __be32 raddr, + __be32 laddr, int dif); + +int raw_abort(struct sock *sk, int err); void raw_icmp_error(struct sk_buff *, int, u32); int raw_local_deliver(struct sk_buff *, int); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 87783dea0791..cbe4e9de1894 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -3,6 +3,13 @@ #include +extern struct raw_hashinfo raw_v6_hashinfo; +struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, + unsigned short num, const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif); + +int raw_abort(struct sock *sk, int err); + void raw6_icmp_error(struct sk_buff *, int nexthdr, u8 type, u8 code, int inner_offset, __be32); bool raw6_local_deliver(struct sk_buff *, int); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 509cd961068d..bbe201047df6 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -43,6 +43,23 @@ struct inet_diag_req_v2 { struct inet_diag_sockid id; }; +/* + * SOCK_RAW sockets require the underlied protocol to be + * additionally specified so we can use @pad member for + * this, but we can't rename it because userspace programs + * still may depend on this name. Instead lets use another + * structure definition as an alias for struct + * @inet_diag_req_v2. + */ +struct inet_diag_req_raw { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u8 idiag_ext; + __u8 sdiag_raw_protocol; + __u32 idiag_states; + struct inet_diag_sockid id; +}; + enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 300b06888fdf..28e051a8e847 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -430,6 +430,14 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config INET_RAW_DIAG + tristate "RAW: socket monitoring interface" + depends on INET_DIAG && (IPV6 || IPV6=n) + default n + ---help--- + Support for RAW socket monitoring interface used by the ss tool. + If unsure, say Y. + config INET_DIAG_DESTROY bool "INET: allow privileged process to administratively close sockets" depends on INET_DIAG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bc6a6c8b9bcd..48af58a5686e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o +obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 0a1d4a896a26..3b34024202d8 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -200,6 +200,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; + /* + * RAW sockets might have user-defined protocols assigned, + * so report the one supplied on socket creation. + */ + if (sk->sk_type == SOCK_RAW) { + if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol)) + goto errout; + } + if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 90a85c955872..03618ed03532 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -89,9 +89,10 @@ struct raw_frag_vec { int hlen; }; -static struct raw_hashinfo raw_v4_hashinfo = { +struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; +EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { @@ -120,7 +121,7 @@ void raw_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(raw_unhash_sk); -static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, +struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) { sk_for_each_from(sk) { @@ -136,6 +137,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, found: return sk; } +EXPORT_SYMBOL_GPL(__raw_v4_lookup); /* * 0 - deliver @@ -912,6 +914,20 @@ static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg } #endif +int raw_abort(struct sock *sk, int err) +{ + lock_sock(sk); + + sk->sk_err = err; + sk->sk_error_report(sk); + udp_disconnect(sk, 0); + + release_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(raw_abort); + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -937,6 +953,7 @@ struct proto raw_prot = { .compat_getsockopt = compat_raw_getsockopt, .compat_ioctl = compat_raw_ioctl, #endif + .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c new file mode 100644 index 000000000000..ef3bea061b75 --- /dev/null +++ b/net/ipv4/raw_diag.c @@ -0,0 +1,261 @@ +#include + +#include +#include + +#include +#include + +#ifdef pr_fmt +# undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static struct raw_hashinfo * +raw_get_hashinfo(const struct inet_diag_req_v2 *r) +{ + if (r->sdiag_family == AF_INET) { + return &raw_v4_hashinfo; +#if IS_ENABLED(CONFIG_IPV6) + } else if (r->sdiag_family == AF_INET6) { + return &raw_v6_hashinfo; +#endif + } else { + pr_warn_once("Unexpected inet family %d\n", + r->sdiag_family); + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); + } +} + +/* + * Due to requirement of not breaking user API we can't simply + * rename @pad field in inet_diag_req_v2 structure, instead + * use helper to figure it out. + */ + +static struct sock *raw_lookup(struct net *net, struct sock *from, + const struct inet_diag_req_v2 *req) +{ + struct inet_diag_req_raw *r = (void *)req; + struct sock *sk = NULL; + + if (r->sdiag_family == AF_INET) + sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, + r->id.idiag_dst[0], + r->id.idiag_src[0], + r->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + else + sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, + (const struct in6_addr *)r->id.idiag_src, + (const struct in6_addr *)r->id.idiag_dst, + r->id.idiag_if); +#endif + return sk; +} + +static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) +{ + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct sock *sk = NULL, *s; + int slot; + + if (IS_ERR(hashinfo)) + return ERR_CAST(hashinfo); + + read_lock(&hashinfo->lock); + for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { + sk_for_each(s, &hashinfo->ht[slot]) { + sk = raw_lookup(net, s, r); + if (sk) { + /* + * Grab it and keep until we fill + * diag meaage to be reported, so + * caller should call sock_put then. + * We can do that because we're keeping + * hashinfo->lock here. + */ + sock_hold(sk); + break; + } + } + } + read_unlock(&hashinfo->lock); + + return sk ? sk : ERR_PTR(-ENOENT); +} + +static int raw_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + rep = nlmsg_new(sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + 64, + GFP_KERNEL); + if (!rep) { + sock_put(sk); + return -ENOMEM; + } + + err = inet_sk_diag_fill(sk, NULL, rep, r, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + sock_put(sk); + + if (err < 0) { + kfree_skb(rep); + return err; + } + + err = netlink_unicast(net->diag_nlsk, rep, + NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + return err; +} + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh, net_admin); +} + +static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); + struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; + struct sock *sk = NULL; + + if (IS_ERR(hashinfo)) + return; + + s_slot = cb->args[0]; + num = s_num = cb->args[1]; + + read_lock(&hashinfo->lock); + for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { + num = 0; + + sk_for_each(sk, &hashinfo->ht[slot]) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next; + if (sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + goto out_unlock; +next: + num++; + } + } + +out_unlock: + read_unlock(&hashinfo->lock); + + cb->args[0] = slot; + cb->args[1] = num; +} + +static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); +} + +#ifdef CONFIG_INET_DIAG_DESTROY +static int raw_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *r) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + + sk = raw_sock_get(net, r); + if (IS_ERR(sk)) + return PTR_ERR(sk); + return sock_diag_destroy(sk, ECONNABORTED); +} +#endif + +static const struct inet_diag_handler raw_diag_handler = { + .dump = raw_diag_dump, + .dump_one = raw_diag_dump_one, + .idiag_get_info = raw_diag_get_info, + .idiag_type = IPPROTO_RAW, + .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = raw_diag_destroy, +#endif +}; + +static void __always_unused __check_inet_diag_req_raw(void) +{ + /* + * Make sure the two structures are identical, + * except the @pad field. + */ +#define __offset_mismatch(m1, m2) \ + (offsetof(struct inet_diag_req_v2, m1) != \ + offsetof(struct inet_diag_req_raw, m2)) + + BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != + sizeof(struct inet_diag_req_raw)); + BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); + BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); + BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); + BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); + BUILD_BUG_ON(__offset_mismatch(id, id)); +#undef __offset_mismatch +} + +static int __init raw_diag_init(void) +{ + return inet_diag_register(&raw_diag_handler); +} + +static void __exit raw_diag_exit(void) +{ + inet_diag_unregister(&raw_diag_handler); +} + +module_init(raw_diag_init); +module_exit(raw_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 54404f08efcc..d7e8b955ade8 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -65,11 +65,12 @@ #define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ -static struct raw_hashinfo raw_v6_hashinfo = { +struct raw_hashinfo raw_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock), }; +EXPORT_SYMBOL_GPL(raw_v6_hashinfo); -static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, +struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif) { @@ -102,6 +103,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, found: return sk; } +EXPORT_SYMBOL_GPL(__raw_v6_lookup); /* * 0 - deliver @@ -1259,6 +1261,7 @@ struct proto rawv6_prot = { .compat_getsockopt = compat_rawv6_getsockopt, .compat_ioctl = compat_rawv6_ioctl, #endif + .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS -- cgit v1.2.3-71-gd317 From 11b6b5a4ced2f2c76073b97ee08ca0eab8358fde Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:41:58 +0300 Subject: cfg80211: Rename SAE_DATA to more generic AUTH_DATA This adds defines and nl80211 extensions to allow FILS Authentication to be implemented similarly to SAE. FILS does not need the special rules for the Authentication transaction number and Status code fields, but it does need to add non-IE fields. The previously used NL80211_ATTR_SAE_DATA can be reused for this to avoid having to duplicate that implementation. Rename that attribute to more generic NL80211_ATTR_AUTH_DATA (with backwards compatibility define for NL80211_SAE_DATA). Also document the special rules related to the Authentication transaction number and Status code fiels. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++----- include/uapi/linux/nl80211.h | 15 ++++++++++++--- net/mac80211/mlme.c | 12 ++++++------ net/wireless/core.h | 2 +- net/wireless/mlme.c | 6 +++--- net/wireless/nl80211.c | 18 +++++++++--------- 6 files changed, 38 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d1ffbc3a8e55..dffc265a4fd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1785,9 +1785,11 @@ const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie); * @key_len: length of WEP key for shared key authentication * @key_idx: index of WEP key for shared key authentication * @key: WEP key for shared key authentication - * @sae_data: Non-IE data to use with SAE or %NULL. This starts with - * Authentication transaction sequence number field. - * @sae_data_len: Length of sae_data buffer in octets + * @auth_data: Fields and elements in Authentication frames. This contains + * the authentication frame body (non-IE and IE data), excluding the + * Authentication algorithm number, i.e., starting at the Authentication + * transaction sequence number field. + * @auth_data_len: Length of auth_data buffer in octets */ struct cfg80211_auth_request { struct cfg80211_bss *bss; @@ -1796,8 +1798,8 @@ struct cfg80211_auth_request { enum nl80211_auth_type auth_type; const u8 *key; u8 key_len, key_idx; - const u8 *sae_data; - size_t sae_data_len; + const u8 *auth_data; + size_t auth_data_len; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1362d24957b5..18bcf44899aa 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1638,8 +1638,16 @@ enum nl80211_commands { * the connection request from a station. nl80211_connect_failed_reason * enum has different reasons of connection failure. * - * @NL80211_ATTR_SAE_DATA: SAE elements in Authentication frames. This starts - * with the Authentication transaction sequence number field. + * @NL80211_ATTR_AUTH_DATA: Fields and elements in Authentication frames. + * This contains the authentication frame body (non-IE and IE data), + * excluding the Authentication algorithm number, i.e., starting at the + * Authentication transaction sequence number field. It is used with + * authentication algorithms that need special fields to be added into + * the frames (SAE and FILS). Currently, only the SAE cases use the + * initial two fields (Authentication transaction sequence number and + * Status code). However, those fields are included in the attribute data + * for all authentication algorithms to keep the attribute definition + * consistent. * * @NL80211_ATTR_VHT_CAPABILITY: VHT Capability information element (from * association request when used with NL80211_CMD_NEW_STATION) @@ -2195,7 +2203,7 @@ enum nl80211_attrs { NL80211_ATTR_CONN_FAILED_REASON, - NL80211_ATTR_SAE_DATA, + NL80211_ATTR_AUTH_DATA, NL80211_ATTR_VHT_CAPABILITY, @@ -2347,6 +2355,7 @@ enum nl80211_attrs { #define NL80211_ATTR_SCAN_GENERATION NL80211_ATTR_GENERATION #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER +#define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA /* * Allow user space programs to use #ifdef on new attributes by defining them diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c8d3a9b02fb6..32fd29581d4d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4483,20 +4483,20 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, return -EOPNOTSUPP; } - auth_data = kzalloc(sizeof(*auth_data) + req->sae_data_len + + auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + req->ie_len, GFP_KERNEL); if (!auth_data) return -ENOMEM; auth_data->bss = req->bss; - if (req->sae_data_len >= 4) { - __le16 *pos = (__le16 *) req->sae_data; + if (req->auth_data_len >= 4) { + __le16 *pos = (__le16 *) req->auth_data; auth_data->sae_trans = le16_to_cpu(pos[0]); auth_data->sae_status = le16_to_cpu(pos[1]); - memcpy(auth_data->data, req->sae_data + 4, - req->sae_data_len - 4); - auth_data->data_len += req->sae_data_len - 4; + memcpy(auth_data->data, req->auth_data + 4, + req->auth_data_len - 4); + auth_data->data_len += req->auth_data_len - 4; } if (req->ie && req->ie_len) { diff --git a/net/wireless/core.h b/net/wireless/core.h index 21e31888cfa9..fb2fcd5581fe 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -345,7 +345,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, const u8 *ssid, int ssid_len, const u8 *ie, int ie_len, const u8 *key, int key_len, int key_idx, - const u8 *sae_data, int sae_data_len); + const u8 *auth_data, int auth_data_len); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index cbb48e26a871..bd1f7a159d6a 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -204,14 +204,14 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, const u8 *ssid, int ssid_len, const u8 *ie, int ie_len, const u8 *key, int key_len, int key_idx, - const u8 *sae_data, int sae_data_len) + const u8 *auth_data, int auth_data_len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_auth_request req = { .ie = ie, .ie_len = ie_len, - .sae_data = sae_data, - .sae_data_len = sae_data_len, + .auth_data = auth_data, + .auth_data_len = auth_data_len, .auth_type = auth_type, .key = key, .key_len = key_len, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7e746c90156e..704851142eed 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -357,7 +357,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 }, [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, - [NL80211_ATTR_SAE_DATA] = { .type = NLA_BINARY, }, + [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN }, [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 }, @@ -7729,8 +7729,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct ieee80211_channel *chan; - const u8 *bssid, *ssid, *ie = NULL, *sae_data = NULL; - int err, ssid_len, ie_len = 0, sae_data_len = 0; + const u8 *bssid, *ssid, *ie = NULL, *auth_data = NULL; + int err, ssid_len, ie_len = 0, auth_data_len = 0; enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; @@ -7811,16 +7811,16 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (auth_type == NL80211_AUTHTYPE_SAE && - !info->attrs[NL80211_ATTR_SAE_DATA]) + !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; - if (info->attrs[NL80211_ATTR_SAE_DATA]) { + if (info->attrs[NL80211_ATTR_AUTH_DATA]) { if (auth_type != NL80211_AUTHTYPE_SAE) return -EINVAL; - sae_data = nla_data(info->attrs[NL80211_ATTR_SAE_DATA]); - sae_data_len = nla_len(info->attrs[NL80211_ATTR_SAE_DATA]); + auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); + auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); /* need to include at least Auth Transaction and Status Code */ - if (sae_data_len < 4) + if (auth_data_len < 4) return -EINVAL; } @@ -7837,7 +7837,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, ssid, ssid_len, ie, ie_len, key.p.key, key.p.key_len, key.idx, - sae_data, sae_data_len); + auth_data, auth_data_len); wdev_unlock(dev->ieee80211_ptr); return err; } -- cgit v1.2.3-71-gd317 From 60b8084e844814631b57da3d35f272e0ff799ab2 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:42:00 +0300 Subject: cfg80211: Add feature flag for Fast Initial Link Setup (FILS) as STA This defines a feature flag that drivers can use to indicate that they support FILS authentication/association (IEEE 802.11ai) when using user space SME (NL80211_CMD_AUTHENTICATE) in station mode. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 18bcf44899aa..7825fd4db19e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4647,6 +4647,8 @@ enum nl80211_feature_flags { * configuration (AP/mesh) with HT rates. * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate * configuration (AP/mesh) with VHT rates. + * @NL80211_EXT_FEATURE_FILS_STA: This driver supports Fast Initial Link Setup + * with user space SME (NL80211_CMD_AUTHENTICATE) in station mode. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4661,6 +4663,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, NL80211_EXT_FEATURE_BEACON_RATE_HT, NL80211_EXT_FEATURE_BEACON_RATE_VHT, + NL80211_EXT_FEATURE_FILS_STA, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3-71-gd317 From 631810603a20874554b2f17adf42b72d0f15eda5 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:42:02 +0300 Subject: cfg80211: Add Fast Initial Link Setup (FILS) auth algs This defines authentication algorithms for FILS (IEEE 802.11ai). Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 21 +++++++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d428adf51446..793a0174ba29 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1576,6 +1576,9 @@ struct ieee80211_vht_operation { #define WLAN_AUTH_SHARED_KEY 1 #define WLAN_AUTH_FT 2 #define WLAN_AUTH_SAE 3 +#define WLAN_AUTH_FILS_SK 4 +#define WLAN_AUTH_FILS_SK_PFS 5 +#define WLAN_AUTH_FILS_PK 6 #define WLAN_AUTH_LEAP 128 #define WLAN_AUTH_CHALLENGE_LEN 128 diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 7825fd4db19e..4dc21265cd12 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3669,6 +3669,9 @@ enum nl80211_bss_status { * @NL80211_AUTHTYPE_FT: Fast BSS Transition (IEEE 802.11r) * @NL80211_AUTHTYPE_NETWORK_EAP: Network EAP (some Cisco APs and mainly LEAP) * @NL80211_AUTHTYPE_SAE: Simultaneous authentication of equals + * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key + * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS + * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key * @__NL80211_AUTHTYPE_NUM: internal * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by @@ -3681,6 +3684,9 @@ enum nl80211_auth_type { NL80211_AUTHTYPE_FT, NL80211_AUTHTYPE_NETWORK_EAP, NL80211_AUTHTYPE_SAE, + NL80211_AUTHTYPE_FILS_SK, + NL80211_AUTHTYPE_FILS_SK_PFS, + NL80211_AUTHTYPE_FILS_PK, /* keep last */ __NL80211_AUTHTYPE_NUM, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 704851142eed..ff798620e929 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3778,12 +3778,23 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && auth_type == NL80211_AUTHTYPE_SAE) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_STA) && + (auth_type == NL80211_AUTHTYPE_FILS_SK || + auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK)) + return false; return true; case NL80211_CMD_CONNECT: case NL80211_CMD_START_AP: /* SAE not supported yet */ if (auth_type == NL80211_AUTHTYPE_SAE) return false; + /* FILS not supported yet */ + if (auth_type == NL80211_AUTHTYPE_FILS_SK || + auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK) + return false; return true; default: return false; @@ -7810,12 +7821,18 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE)) return -EINVAL; - if (auth_type == NL80211_AUTHTYPE_SAE && + if ((auth_type == NL80211_AUTHTYPE_SAE || + auth_type == NL80211_AUTHTYPE_FILS_SK || + auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK) && !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; if (info->attrs[NL80211_ATTR_AUTH_DATA]) { - if (auth_type != NL80211_AUTHTYPE_SAE) + if (auth_type != NL80211_AUTHTYPE_SAE && + auth_type != NL80211_AUTHTYPE_FILS_SK && + auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && + auth_type != NL80211_AUTHTYPE_FILS_PK) return -EINVAL; auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); -- cgit v1.2.3-71-gd317 From 348bd456699801920a309c66e382380809fbdf41 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:42:03 +0300 Subject: cfg80211: Add KEK/nonces for FILS association frames The new nl80211 attributes can be used to provide KEK and nonces to allow the driver to encrypt and decrypt FILS (Re)Association Request/Response frames in station mode. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ include/net/cfg80211.h | 9 +++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 12 ++++++++++++ 4 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 793a0174ba29..fe849329511a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2096,6 +2096,9 @@ enum ieee80211_key_len { #define IEEE80211_GCMP_MIC_LEN 16 #define IEEE80211_GCMP_PN_LEN 6 +#define FILS_NONCE_LEN 16 +#define FILS_MAX_KEK_LEN 64 + /* Public action codes */ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8ca2e9f354f7..738b4d8a4666 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1840,6 +1840,12 @@ enum cfg80211_assoc_req_flags { * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT capability override * @vht_capa_mask: VHT capability mask indicating which fields to use + * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or + * %NULL if FILS is not used. + * @fils_kek_len: Length of fils_kek in octets + * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association + * Request/Response frame or %NULL if FILS is not used. This field starts + * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -1851,6 +1857,9 @@ struct cfg80211_assoc_request { struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa, vht_capa_mask; + const u8 *fils_kek; + size_t fils_kek_len; + const u8 *fils_nonces; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4dc21265cd12..a268a009528a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1944,6 +1944,11 @@ enum nl80211_commands { * attribute. * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute. * See &enum nl80211_nan_match_attributes. + * @NL80211_ATTR_FILS_KEK: KEK for FILS (Re)Association Request/Response frame + * protection. + * @NL80211_ATTR_FILS_NONCES: Nonces (part of AAD) for FILS (Re)Association + * Request/Response frame protection. This attribute contains the 16 octet + * STA Nonce followed by 16 octets of AP Nonce. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2344,6 +2349,9 @@ enum nl80211_attrs { NL80211_ATTR_NAN_FUNC, NL80211_ATTR_NAN_MATCH, + NL80211_ATTR_FILS_KEK, + NL80211_ATTR_FILS_NONCES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ff798620e929..667d5f719c22 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -414,6 +414,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, + [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, + .len = FILS_MAX_KEK_LEN }, + [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, }; /* policy for the key attributes */ @@ -8033,6 +8036,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) req.flags |= ASSOC_REQ_USE_RRM; } + if (info->attrs[NL80211_ATTR_FILS_KEK]) { + req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]); + req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]); + if (!info->attrs[NL80211_ATTR_FILS_NONCES]) + return -EINVAL; + req.fils_nonces = + nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]); + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); -- cgit v1.2.3-71-gd317 From ce0ce13a1c89ff8b94b7f8fb32eb4c43e111c82e Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Mon, 10 Oct 2016 19:12:22 +0200 Subject: cfg80211: configure multicast to unicast for AP interfaces Add the ability to configure if an AP (and associated VLANs) will do multicast-to-unicast conversion for ARP, IPv4 and IPv6 frames (possibly within 802.1Q). If enabled, such frames are to be sent to each station separately, with the DA replaced by their own MAC address rather than the group address. Note that this may break certain expectations of the receiver, such as the ability to drop unicast IP packets received within multicast L2 frames, or the ability to not send ICMP destination unreachable messages for packets received in L2 multicast (which is required, but the receiver can't tell the difference if this new option is enabled.) This also doesn't implement the 802.11 DMS (directed multicast service). Signed-off-by: Michael Braun [fix disabling, add better documentation & commit message] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 21 +++++++++++++++++++++ net/wireless/nl80211.c | 35 +++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 12 ++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 5 files changed, 93 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 738b4d8a4666..41ae3f500957 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2742,6 +2742,8 @@ struct cfg80211_nan_func { * @nan_change_conf: changes NAN configuration. The changed parameters must * be specified in @changes (using &enum cfg80211_nan_conf_changes); * All other parameters must be ignored. + * + * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3018,6 +3020,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes); + + int (*set_multicast_to_unicast)(struct wiphy *wiphy, + struct net_device *dev, + const bool enabled); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a268a009528a..e21d23dcb588 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -600,6 +600,20 @@ * * @NL80211_CMD_SET_WDS_PEER: Set the MAC address of the peer on a WDS interface. * + * @NL80211_CMD_SET_MULTICAST_TO_UNICAST: Configure if this AP should perform + * multicast to unicast conversion. When enabled, all multicast packets + * with ethertype ARP, IPv4 or IPv6 (possibly within an 802.1Q header) + * will be sent out to each station once with the destination (multicast) + * MAC address replaced by the station's MAC address. Note that this may + * break certain expectations of the receiver, e.g. the ability to drop + * unicast IP packets encapsulated in multicast L2 frames, or the ability + * to not send destination unreachable messages in such cases. + * This can only be toggled per BSS. Configure this on an interface of + * type %NL80211_IFTYPE_AP. It applies to all its VLAN interfaces + * (%NL80211_IFTYPE_AP_VLAN), except for those in 4addr (WDS) mode. + * If %NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED is not present with this + * command, the feature is disabled. + * * @NL80211_CMD_JOIN_MESH: Join a mesh. The mesh ID must be given, and initial * mesh config parameters may be given. * @NL80211_CMD_LEAVE_MESH: Leave the mesh network -- no special arguments, the @@ -1069,6 +1083,8 @@ enum nl80211_commands { NL80211_CMD_CHANGE_NAN_CONFIG, NL80211_CMD_NAN_MATCH, + NL80211_CMD_SET_MULTICAST_TO_UNICAST, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1950,6 +1966,9 @@ enum nl80211_commands { * Request/Response frame protection. This attribute contains the 16 octet * STA Nonce followed by 16 octets of AP Nonce. * + * @NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED: Indicates whether or not multicast + * packets should be send out as unicast to all stations (flag attribute). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2352,6 +2371,8 @@ enum nl80211_attrs { NL80211_ATTR_FILS_KEK, NL80211_ATTR_FILS_NONCES, + NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 667d5f719c22..0c9d00c9cef1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -417,6 +417,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, + [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, }; /* policy for the key attributes */ @@ -1659,6 +1660,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.features & NL80211_FEATURE_SUPPORTS_WMM_ADMISSION) CMD(add_tx_ts, ADD_TX_TS); + CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST); } #undef CMD @@ -11766,6 +11768,31 @@ static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb, return 0; } +static int nl80211_set_multicast_to_unicast(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const struct nlattr *nla; + bool enabled; + + if (netif_running(dev)) + return -EBUSY; + + if (!rdev->ops->set_multicast_to_unicast) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + nla = info->attrs[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED]; + enabled = nla_get_flag(nla); + + return rdev_set_multicast_to_unicast(rdev, dev, enabled); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -12625,6 +12652,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST, + .doit = nl80211_set_multicast_to_unicast, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 11cf83c8ad4f..e9ecf23e7942 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -562,6 +562,18 @@ static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const bool enabled) +{ + int ret; + trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); + ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index a3d0a91b1e09..25f8318b3e0b 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3030,6 +3030,25 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); + +TRACE_EVENT(rdev_set_multicast_to_unicast, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const bool enabled), + TP_ARGS(wiphy, netdev, enabled), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(bool, enabled) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->enabled = enabled; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, + BOOL_TO_STR(__entry->enabled)) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-71-gd317 From 088e8df82f91a24728d49d9532cab7ebdee5117f Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Thu, 27 Oct 2016 16:51:11 +0300 Subject: cfg80211: Add support to update connection parameters Add functionality to update the connection parameters when in connected state, so that driver/firmware uses the updated parameters for subsequent roaming. This is for drivers that support internal BSS selection and roaming. The new command does not change the current association state, i.e., it can be used to update IE contents for future (re)associations without causing an immediate disassociation or reassociation with the current BSS. This commit implements the required functionality for updating IEs for (Re)Association Request frame only. Other parameters can be added in future when required. Signed-off-by: vamsi krishna Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 24 ++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 40 ++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 12 ++++++++++++ net/wireless/trace.h | 18 ++++++++++++++++++ 5 files changed, 102 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 41ae3f500957..c575583b50fb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2050,6 +2050,18 @@ struct cfg80211_connect_params { const u8 *prev_bssid; }; +/** + * enum cfg80211_connect_params_changed - Connection parameters being updated + * + * This enum provides information of all connect parameters that + * have to be updated as part of update_connect_params() call. + * + * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated + */ +enum cfg80211_connect_params_changed { + UPDATE_ASSOC_IES = BIT(0), +}; + /** * enum wiphy_params_flags - set_wiphy_params bitfield values * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed @@ -2571,6 +2583,14 @@ struct cfg80211_nan_func { * cases, the result of roaming is indicated with a call to * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) + * @update_connect_params: Update the connect parameters while connected to a + * BSS. The updated parameters can be used by driver/firmware for + * subsequent BSS selection (roaming) decisions and to form the + * Authentication/(Re)Association Request frames. This call does not + * request an immediate disassociation or reassociation with the current + * BSS, i.e., this impacts only subsequent (re)associations. The bits in + * changed are defined in &enum cfg80211_connect_params_changed. + * (invoked with the wireless_dev mutex held) * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if * connection is in progress. Once done, call cfg80211_disconnected() in * case connection was already established (invoked with the @@ -2858,6 +2878,10 @@ struct cfg80211_ops { int (*connect)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_connect_params *sme); + int (*update_connect_params)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_connect_params *sme, + u32 changed); int (*disconnect)(struct wiphy *wiphy, struct net_device *dev, u16 reason_code); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e21d23dcb588..259c9c77fdc1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -888,6 +888,12 @@ * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. * + * @NL80211_CMD_UPDATE_CONNECT_PARAMS: Update one or more connect parameters + * for subsequent roaming cases if the driver or firmware uses internal + * BSS selection. This command can be issued only while connected and it + * does not result in a change for the current association. Currently, + * only the %NL80211_ATTR_IE data is used and updated with this command. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1085,6 +1091,8 @@ enum nl80211_commands { NL80211_CMD_SET_MULTICAST_TO_UNICAST, + NL80211_CMD_UPDATE_CONNECT_PARAMS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0c9d00c9cef1..bb30fa1969f9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1661,6 +1661,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, NL80211_FEATURE_SUPPORTS_WMM_ADMISSION) CMD(add_tx_ts, ADD_TX_TS); CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST); + CMD(update_connect_params, UPDATE_CONNECT_PARAMS); } #undef CMD @@ -8779,6 +8780,37 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_update_connect_params(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_connect_params connect = {}; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u32 changed = 0; + int ret; + + if (!rdev->ops->update_connect_params) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_IE]) { + if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) + return -EINVAL; + connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]); + connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + changed |= UPDATE_ASSOC_IES; + } + + wdev_lock(dev->ieee80211_ptr); + if (!wdev->current_bss) + ret = -ENOLINK; + else + ret = rdev_update_connect_params(rdev, dev, &connect, changed); + wdev_unlock(dev->ieee80211_ptr); + + return ret; +} + static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -12231,6 +12263,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS, + .doit = nl80211_update_connect_params, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_DISCONNECT, .doit = nl80211_disconnect, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e9ecf23e7942..2f425075ada8 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -490,6 +490,18 @@ static inline int rdev_connect(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_update_connect_params(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_connect_params *sme, u32 changed) +{ + int ret; + trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); + ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 25f8318b3e0b..ea1b47e04fa4 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1281,6 +1281,24 @@ TRACE_EVENT(rdev_connect, __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid)) ); +TRACE_EVENT(rdev_update_connect_params, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_connect_params *sme, u32 changed), + TP_ARGS(wiphy, netdev, sme, changed), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u32, changed) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->changed = changed; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed) +); + TRACE_EVENT(rdev_set_cqm_rssi_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, s32 rssi_thold, -- cgit v1.2.3-71-gd317 From 4fe77d82ef80c77031c9c6f8554cd0dee2aa423a Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 24 Oct 2016 20:32:57 +0800 Subject: skbedit: allow the user to specify bitmask for mark The user may want to use only some bits of the skb mark in his skbedit rules because the remaining part might be used by something else. Introduce the "mask" parameter to the skbedit actor in order to implement such functionality. When the mask is specified, only those bits selected by the latter are altered really changed by the actor, while the rest is left untouched. Signed-off-by: Antonio Quartulli Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 1 + include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 21 ++++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 5767e9dbcf92..19cd3d345804 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -27,6 +27,7 @@ struct tcf_skbedit { u32 flags; u32 priority; u32 mark; + u32 mask; u16 queue_mapping; u16 ptype; }; diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index a4d00c608d8f..2884425738ce 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -28,6 +28,7 @@ #define SKBEDIT_F_QUEUE_MAPPING 0x2 #define SKBEDIT_F_MARK 0x4 #define SKBEDIT_F_PTYPE 0x8 +#define SKBEDIT_F_MASK 0x10 struct tc_skbedit { tc_gen; @@ -42,6 +43,7 @@ enum { TCA_SKBEDIT_MARK, TCA_SKBEDIT_PAD, TCA_SKBEDIT_PTYPE, + TCA_SKBEDIT_MASK, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index a133dcb82132..024f3a3afeff 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -46,8 +46,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, if (d->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > d->queue_mapping) skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) - skb->mark = d->mark; + if (d->flags & SKBEDIT_F_MARK) { + skb->mark &= ~d->mask; + skb->mark |= d->mark & d->mask; + } if (d->flags & SKBEDIT_F_PTYPE) skb->pkt_type = d->ptype; @@ -61,6 +63,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, + [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -71,7 +74,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; - u32 flags = 0, *priority = NULL, *mark = NULL; + u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; u16 *queue_mapping = NULL, *ptype = NULL; bool exists = false; int ret = 0, err; @@ -108,6 +111,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mark = nla_data(tb[TCA_SKBEDIT_MARK]); } + if (tb[TCA_SKBEDIT_MASK] != NULL) { + flags |= SKBEDIT_F_MASK; + mask = nla_data(tb[TCA_SKBEDIT_MASK]); + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); exists = tcf_hash_check(tn, parm->index, a, bind); @@ -145,6 +153,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, d->mark = *mark; if (flags & SKBEDIT_F_PTYPE) d->ptype = *ptype; + /* default behaviour is to use all the bits */ + d->mask = 0xffffffff; + if (flags & SKBEDIT_F_MASK) + d->mask = *mask; d->tcf_action = parm->action; @@ -182,6 +194,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if ((d->flags & SKBEDIT_F_PTYPE) && nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) goto nla_put_failure; + if ((d->flags & SKBEDIT_F_MASK) && + nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) -- cgit v1.2.3-71-gd317 From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:02 +0200 Subject: genetlink: no longer support using static family IDs Static family IDs have never really been used, the only use case was the workaround I introduced for those users that assumed their family ID was also their multicast group ID. Additionally, because static family IDs would never be reserved by the generic netlink code, using a relatively low ID would only work for built-in families that can be registered immediately after generic netlink is started, which is basically only the control family (apart from the workaround code, which I also had to add code for so it would reserve those IDs) Thus, anything other than GENL_ID_GENERATE is flawed and luckily not used except in the cases I mentioned. Move those workarounds into a few lines of code, and then get rid of GENL_ID_GENERATE entirely, making it more robust. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 1 - drivers/net/gtp.c | 1 - drivers/net/macsec.c | 1 - drivers/net/team/team.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - drivers/scsi/pmcraid.c | 6 ------ drivers/target/target_core_user.c | 1 - drivers/thermal/thermal_core.c | 1 - fs/dlm/netlink.c | 1 - fs/quota/netlink.c | 7 ------- include/linux/genl_magic_func.h | 1 - include/net/genetlink.h | 7 ++----- include/uapi/linux/genetlink.h | 1 - kernel/taskstats.c | 1 - net/batman-adv/netlink.c | 1 - net/core/devlink.c | 1 - net/core/drop_monitor.c | 1 - net/hsr/hsr_netlink.c | 1 - net/ieee802154/netlink.c | 1 - net/ieee802154/nl802154.c | 1 - net/ipv4/fou.c | 1 - net/ipv4/tcp_metrics.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - net/irda/irnetlink.c | 1 - net/l2tp/l2tp_netlink.c | 1 - net/netfilter/ipvs/ip_vs_ctl.c | 1 - net/netlabel/netlabel_calipso.c | 1 - net/netlabel/netlabel_cipso_v4.c | 1 - net/netlabel/netlabel_mgmt.c | 1 - net/netlabel/netlabel_unlabeled.c | 1 - net/netlink/genetlink.c | 37 +++++++++++++++++++++-------------- net/nfc/netlink.c | 1 - net/openvswitch/datapath.c | 4 ---- net/tipc/netlink.c | 1 - net/tipc/netlink_compat.c | 1 - net/wimax/stack.c | 1 - net/wireless/nl80211.c | 1 - 37 files changed, 24 insertions(+), 69 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index e24ea4e796e4..8dfca3d53131 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -83,7 +83,6 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { }; static struct genl_family acpi_event_genl_family = { - .id = GENL_ID_GENERATE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, .maxattr = ACPI_GENL_ATTR_MAX, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 97e0cbca0a08..f66737ba1299 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1095,7 +1095,6 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) } static struct genl_family gtp_genl_family = { - .id = GENL_ID_GENERATE, .name = "gtp", .version = 0, .hdrsize = 0, diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1a134cb2d52c..a5309b81a786 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1422,7 +1422,6 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa) } static struct genl_family macsec_fam = { - .id = GENL_ID_GENERATE, .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index a380649bf6b5..0b50205764ff 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2151,7 +2151,6 @@ static struct rtnl_link_ops team_link_ops __read_mostly = { ***********************************/ static struct genl_family team_nl_family = { - .id = GENL_ID_GENERATE, .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index e95b79bccf9b..54b6cd62676e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -589,7 +589,6 @@ struct hwsim_radiotap_ack_hdr { /* MAC80211_HWSIM netlinf family */ static struct genl_family hwsim_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "MAC80211_HWSIM", .version = 1, diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 68a5c347fae9..cc50eb87b28a 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1369,12 +1369,6 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { }; static struct genl_family pmcraid_event_family = { - /* - * Due to prior multicast group abuse (the code having assumed that - * the family ID can be used as a multicast group ID) we need to - * statically allocate a family (and thus group) ID. - */ - .id = GENL_ID_PMCRAID, .name = "pmcraid", .version = 1, .maxattr = PMCRAID_AEN_ATTR_MAX, diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 62bf4fe5704a..313a0ef3cda7 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -148,7 +148,6 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { /* Our generic netlink family */ static struct genl_family tcmu_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "TCM-USER", .version = 1, diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 226b0b4aced6..68d7503f6417 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2164,7 +2164,6 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { }; static struct genl_family thermal_event_genl_family = { - .id = GENL_ID_GENERATE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..00d226956264 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -17,7 +17,6 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, }; diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..3965a5cdfaa2 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -13,13 +13,6 @@ static const struct genl_multicast_group quota_mcgrps[] = { /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 667c31101b8b..7c070c1fe457 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -260,7 +260,6 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = { */ #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) static struct genl_family ZZZ_genl_family __read_mostly = { - .id = GENL_ID_GENERATE, .name = __stringify(GENL_MAGIC_FAMILY), .version = GENL_MAGIC_VERSION, #ifdef GENL_MAGIC_FAMILY_HDRSZ diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ef9defb3f5bc..43a5c3975a2f 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -20,7 +20,7 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family idenfitier + * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -48,7 +48,7 @@ struct genl_info; * @n_ops: number of operations supported by this family (private) */ struct genl_family { - unsigned int id; + unsigned int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; @@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family) * Registers the specified family and operations from the specified table. * Only one family may be registered with the same family name or identifier. * - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. - * * Either a doit or dumpit callback must be specified for every registered * operation or the function will fail. Only one operation structure per * command identifier may be registered. diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 5512c90af7e3..d9b2db4a29c6 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -26,7 +26,6 @@ struct genlmsghdr { /* * List of reserved static generic netlink identifiers: */ -#define GENL_ID_GENERATE 0 #define GENL_ID_CTRL NLMSG_MIN_TYPE #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..d7a1a9461a10 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -42,7 +42,6 @@ static int family_registered; struct kmem_cache *taskstats_cache; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 64cb6acbe0a6..a03b0ed7e8dd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -49,7 +49,6 @@ #include "translation-table.h" struct genl_family batadv_netlink_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, diff --git a/net/core/devlink.c b/net/core/devlink.c index d2fd736de6a2..3008d9c33875 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -342,7 +342,6 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, } static struct genl_family devlink_nl_family = { - .id = GENL_ID_GENERATE, .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, .maxattr = DEVLINK_ATTR_MAX, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 72cfb0c61125..a5320dfcd978 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -60,7 +60,6 @@ struct dm_hw_stat_delta { }; static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "NET_DM", .version = 2, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index d4d1617f43a8..2ad039492bee 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -132,7 +132,6 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { }; static struct genl_family hsr_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "HSR", .version = 1, diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index c8133c07ceee..19144158b696 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -29,7 +29,6 @@ static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); struct genl_family nl802154_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 21aabadccd0e..182299858f1d 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -34,7 +34,6 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl802154_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..e3fc527c5d37 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -623,7 +623,6 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) } static struct genl_family fou_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..3da305127b32 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -743,7 +743,6 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, } static struct genl_family tcp_metrics_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e604013dd814..0d57e27d1cdd 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -119,7 +119,6 @@ static const struct rhashtable_params rht_params = { }; static struct genl_family ila_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index e15c40e86660..f23b81aa91fe 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -25,7 +25,6 @@ static struct genl_family irda_nl_family = { - .id = GENL_ID_GENERATE, .name = IRDA_NL_NAME, .hdrsize = 0, .version = IRDA_NL_VERSION, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index bf3117771822..4fbf1f41ac52 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -32,7 +32,6 @@ static struct genl_family l2tp_nl_family = { - .id = GENL_ID_GENERATE, .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c3c809b2e712..ceed66cdd03e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2841,7 +2841,6 @@ static struct nf_sockopt_ops ip_vs_sockopts = { /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 2ec93c5e77bb..152e503b8c5d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 7fd1104ba900..755b284e7ad4 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -60,7 +60,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index f85d0e07af2d..3b00f2368fcd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_mgmt_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 4528cff9138b..c2ea8d1f653a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -124,7 +124,6 @@ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 01291b7a27bb..f19ec969edee 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -349,8 +349,6 @@ static int genl_validate_ops(const struct genl_family *family) * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. * * The family's ops array must already be assigned, you can use the * genl_register_family_with_ops() helper function. @@ -359,13 +357,7 @@ static int genl_validate_ops(const struct genl_family *family) */ int __genl_register_family(struct genl_family *family) { - int err = -EINVAL, i; - - if (family->id && family->id < GENL_MIN_ID) - goto errout; - - if (family->id > GENL_MAX_ID) - goto errout; + int err, i; err = genl_validate_ops(family); if (err) @@ -378,8 +370,27 @@ int __genl_register_family(struct genl_family *family) goto errout_locked; } - if (family->id == GENL_ID_GENERATE) { - u16 newid = genl_generate_id(); + if (family == &genl_ctrl) { + family->id = GENL_ID_CTRL; + } else { + u16 newid; + + /* this should be left zero in the struct */ + WARN_ON(family->id); + + /* + * Sadly, a few cases need to be special-cased + * due to them having previously abused the API + * and having used their family ID also as their + * multicast group ID, so we use reserved IDs + * for both to be sure we can do that mapping. + */ + if (strcmp(family->name, "pmcraid") == 0) + newid = GENL_ID_PMCRAID; + else if (strcmp(family->name, "VFS_DQUOT") == 0) + newid = GENL_ID_VFS_DQUOT; + else + newid = genl_generate_id(); if (!newid) { err = -ENOMEM; @@ -387,9 +398,6 @@ int __genl_register_family(struct genl_family *family) } family->id = newid; - } else if (genl_family_find_byid(family->id)) { - err = -EEXIST; - goto errout_locked; } if (family->maxattr && !family->parallel_ops) { @@ -419,7 +427,6 @@ int __genl_register_family(struct genl_family *family) errout_locked: genl_unlock_all(); -errout: return err; } EXPORT_SYMBOL(__genl_register_family); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 79786bf62b88..c230403e066c 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -39,7 +39,6 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = { }; static struct genl_family nfc_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 194435aa1165..f9fef7dfba15 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -671,7 +671,6 @@ static const struct genl_ops dp_packet_genl_ops[] = { }; static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -1436,7 +1435,6 @@ static const struct genl_ops dp_flow_genl_ops[] = { }; static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1822,7 +1820,6 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }; static struct genl_family dp_datapath_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -2244,7 +2241,6 @@ static const struct genl_ops dp_vport_genl_ops[] = { }; struct genl_family dp_vport_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 4b94f3cfe3af..383b8fedabc7 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -136,7 +136,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { * so we have a separate genl handling for the new API. */ struct genl_family tipc_genl_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_V2_NAME, .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1fd464764765..f04428e4c8e5 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1216,7 +1216,6 @@ send: } static struct genl_family tipc_genl_compat_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 3f816e2971ee..8ac83a41585f 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -573,7 +573,6 @@ size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); struct genl_family wimax_gnl_family = { - .id = GENL_ID_GENERATE, .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7d8cb3330c86..714beafe05e0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -39,7 +39,6 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl80211_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL80211_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ -- cgit v1.2.3-71-gd317 From 2ae0f17df1cd52aafd1ab0415ea1f1dd56dc0e2a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:04 +0200 Subject: genetlink: use idr to track families Since generic netlink family IDs are small integers, allocated densely, IDR is an ideal match for lookups. Replace the existing hand-written hash-table with IDR for allocation and lookup. This lets the families only be written to once, during register, since the list_head can be removed and removal of a family won't cause any writes. It also slightly reduces the code size (by about 1.3k on x86-64). Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 31 +++-- include/uapi/linux/genetlink.h | 2 + net/netlink/genetlink.c | 271 ++++++++++++++++------------------------- 3 files changed, 123 insertions(+), 181 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 2298b50cee34..3ec87bacc0f5 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -40,7 +40,6 @@ struct genl_info; * generic netlink family is removed while there are still open * sockets. * @attrbuf: buffer to store parsed attributes (private) - * @family_list: family list (private) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -70,11 +69,10 @@ struct genl_family { unsigned int n_ops; unsigned int n_mcgrps; unsigned int mcgrp_offset; /* private */ - struct list_head family_list; /* private */ struct module *module; }; -struct nlattr **genl_family_attrbuf(struct genl_family *family); +struct nlattr **genl_family_attrbuf(const struct genl_family *family); /** * struct genl_info - receiving information @@ -134,12 +132,12 @@ struct genl_ops { }; int genl_register_family(struct genl_family *family); -int genl_unregister_family(struct genl_family *family); -void genl_notify(struct genl_family *family, struct sk_buff *skb, +int genl_unregister_family(const struct genl_family *family); +void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, - struct genl_family *family, int flags, u8 cmd); + const struct genl_family *family, int flags, u8 cmd); /** * genlmsg_nlhdr - Obtain netlink header from user specified header @@ -148,8 +146,8 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, * * Returns pointer to netlink header. */ -static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr, - struct genl_family *family) +static inline struct nlmsghdr * +genlmsg_nlhdr(void *user_hdr, const struct genl_family *family) { return (struct nlmsghdr *)((char *)user_hdr - family->hdrsize - @@ -185,7 +183,7 @@ static inline int genlmsg_parse(const struct nlmsghdr *nlh, */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr, - struct genl_family *family) + const struct genl_family *family) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family)); } @@ -202,7 +200,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, - struct genl_family *family, + const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, @@ -239,7 +237,7 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) * @group: offset of multicast group in groups array * @flags: allocation flags */ -static inline int genlmsg_multicast_netns(struct genl_family *family, +static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { @@ -257,7 +255,7 @@ static inline int genlmsg_multicast_netns(struct genl_family *family, * @group: offset of multicast group in groups array * @flags: allocation flags */ -static inline int genlmsg_multicast(struct genl_family *family, +static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { @@ -275,7 +273,7 @@ static inline int genlmsg_multicast(struct genl_family *family, * * This function must hold the RTNL or rcu_read_lock(). */ -int genlmsg_multicast_allns(struct genl_family *family, +int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags); @@ -359,8 +357,9 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ -static inline int genl_set_err(struct genl_family *family, struct net *net, - u32 portid, u32 group, int code) +static inline int genl_set_err(const struct genl_family *family, + struct net *net, u32 portid, + u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; @@ -368,7 +367,7 @@ static inline int genl_set_err(struct genl_family *family, struct net *net, return netlink_set_err(net->genl_sock, portid, group, code); } -static inline int genl_has_listeners(struct genl_family *family, +static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index d9b2db4a29c6..adc899381e0d 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -29,6 +29,8 @@ struct genlmsghdr { #define GENL_ID_CTRL NLMSG_MIN_TYPE #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) +/* must be last reserved + 1 */ +#define GENL_START_ALLOC (NLMSG_MIN_TYPE + 3) /************************************************************************** * Controller diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index ca582ee4ae05..85659921e7b2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -58,10 +59,8 @@ static void genl_unlock_all(void) up_write(&cb_lock); } -#define GENL_FAM_TAB_SIZE 16 -#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) +static DEFINE_IDR(genl_fam_idr); -static struct list_head family_ht[GENL_FAM_TAB_SIZE]; /* * Bitmap of multicast groups that are currently in use. * @@ -86,45 +85,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; -static int genl_ctrl_event(int event, struct genl_family *family, +static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); -static inline unsigned int genl_family_hash(unsigned int id) +static const struct genl_family *genl_family_find_byid(unsigned int id) { - return id & GENL_FAM_TAB_MASK; + return idr_find(&genl_fam_idr, id); } -static inline struct list_head *genl_family_chain(unsigned int id) +static const struct genl_family *genl_family_find_byname(char *name) { - return &family_ht[genl_family_hash(id)]; -} - -static struct genl_family *genl_family_find_byid(unsigned int id) -{ - struct genl_family *f; + const struct genl_family *family; + unsigned int id; - list_for_each_entry(f, genl_family_chain(id), family_list) - if (f->id == id) - return f; + idr_for_each_entry(&genl_fam_idr, family, id) + if (strcmp(family->name, name) == 0) + return family; return NULL; } -static struct genl_family *genl_family_find_byname(char *name) -{ - struct genl_family *f; - int i; - - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) - list_for_each_entry(f, genl_family_chain(i), family_list) - if (strcmp(f->name, name) == 0) - return f; - - return NULL; -} - -static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +static const struct genl_ops *genl_get_cmd(u8 cmd, + const struct genl_family *family) { int i; @@ -135,26 +118,6 @@ static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) return NULL; } -/* Of course we are going to have problems once we hit - * 2^16 alive types, but that can only happen by year 2K -*/ -static u16 genl_generate_id(void) -{ - static u16 id_gen_idx = GENL_MIN_ID; - int i; - - for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) { - if (id_gen_idx != GENL_ID_VFS_DQUOT && - id_gen_idx != GENL_ID_PMCRAID && - !genl_family_find_byid(id_gen_idx)) - return id_gen_idx; - if (++id_gen_idx > GENL_MAX_ID) - id_gen_idx = GENL_MIN_ID; - } - - return 0; -} - static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; @@ -295,7 +258,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family) return err; } -static void genl_unregister_mc_groups(struct genl_family *family) +static void genl_unregister_mc_groups(const struct genl_family *family) { struct net *net; int i; @@ -358,6 +321,7 @@ static int genl_validate_ops(const struct genl_family *family) int genl_register_family(struct genl_family *family) { int err, i; + int start = GENL_START_ALLOC, end = GENL_MAX_ID; err = genl_validate_ops(family); if (err) @@ -370,34 +334,20 @@ int genl_register_family(struct genl_family *family) goto errout_locked; } + /* + * Sadly, a few cases need to be special-cased + * due to them having previously abused the API + * and having used their family ID also as their + * multicast group ID, so we use reserved IDs + * for both to be sure we can do that mapping. + */ if (family == &genl_ctrl) { - family->id = GENL_ID_CTRL; - } else { - u16 newid; - - /* this should be left zero in the struct */ - WARN_ON(family->id); - - /* - * Sadly, a few cases need to be special-cased - * due to them having previously abused the API - * and having used their family ID also as their - * multicast group ID, so we use reserved IDs - * for both to be sure we can do that mapping. - */ - if (strcmp(family->name, "pmcraid") == 0) - newid = GENL_ID_PMCRAID; - else if (strcmp(family->name, "VFS_DQUOT") == 0) - newid = GENL_ID_VFS_DQUOT; - else - newid = genl_generate_id(); - - if (!newid) { - err = -ENOMEM; - goto errout_locked; - } - - family->id = newid; + /* and this needs to be special for initial family lookups */ + start = end = GENL_ID_CTRL; + } else if (strcmp(family->name, "pmcraid") == 0) { + start = end = GENL_ID_PMCRAID; + } else if (strcmp(family->name, "VFS_DQUOT") == 0) { + start = end = GENL_ID_VFS_DQUOT; } if (family->maxattr && !family->parallel_ops) { @@ -410,11 +360,15 @@ int genl_register_family(struct genl_family *family) } else family->attrbuf = NULL; + family->id = idr_alloc(&genl_fam_idr, family, + start, end + 1, GFP_KERNEL); + if (!family->id) + goto errout_locked; + err = genl_validate_assign_mc_groups(family); if (err) - goto errout_locked; + goto errout_remove; - list_add_tail(&family->family_list, genl_family_chain(family->id)); genl_unlock_all(); /* send all events */ @@ -425,6 +379,8 @@ int genl_register_family(struct genl_family *family) return 0; +errout_remove: + idr_remove(&genl_fam_idr, family->id); errout_locked: genl_unlock_all(); return err; @@ -439,32 +395,29 @@ EXPORT_SYMBOL(genl_register_family); * * Returns 0 on success or a negative error code. */ -int genl_unregister_family(struct genl_family *family) +int genl_unregister_family(const struct genl_family *family) { - struct genl_family *rc; - genl_lock_all(); - list_for_each_entry(rc, genl_family_chain(family->id), family_list) { - if (family->id != rc->id || strcmp(rc->name, family->name)) - continue; + if (genl_family_find_byid(family->id)) { + genl_unlock_all(); + return -ENOENT; + } - genl_unregister_mc_groups(family); + genl_unregister_mc_groups(family); - list_del(&rc->family_list); - up_write(&cb_lock); - wait_event(genl_sk_destructing_waitq, - atomic_read(&genl_sk_destructing_cnt) == 0); - genl_unlock(); + idr_remove(&genl_fam_idr, family->id); - kfree(family->attrbuf); - genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); - return 0; - } + up_write(&cb_lock); + wait_event(genl_sk_destructing_waitq, + atomic_read(&genl_sk_destructing_cnt) == 0); + genl_unlock(); - genl_unlock_all(); + kfree(family->attrbuf); - return -ENOENT; + genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); + + return 0; } EXPORT_SYMBOL(genl_unregister_family); @@ -480,7 +433,7 @@ EXPORT_SYMBOL(genl_unregister_family); * Returns pointer to user specific header */ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, - struct genl_family *family, int flags, u8 cmd) + const struct genl_family *family, int flags, u8 cmd) { struct nlmsghdr *nlh; struct genlmsghdr *hdr; @@ -539,7 +492,7 @@ static int genl_lock_done(struct netlink_callback *cb) return rc; } -static int genl_family_rcv_msg(struct genl_family *family, +static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -651,7 +604,7 @@ out: static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct genl_family *family; + const struct genl_family *family; int err; family = genl_family_find_byid(nlh->nlmsg_type); @@ -682,7 +635,7 @@ static void genl_rcv(struct sk_buff *skb) static struct genl_family genl_ctrl; -static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, +static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; @@ -769,7 +722,7 @@ nla_put_failure: return -EMSGSIZE; } -static int ctrl_fill_mcgrp_info(struct genl_family *family, +static int ctrl_fill_mcgrp_info(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) @@ -812,37 +765,30 @@ nla_put_failure: static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) { - - int i, n = 0; + int n = 0; struct genl_family *rt; struct net *net = sock_net(skb->sk); - int chains_to_skip = cb->args[0]; - int fams_to_skip = cb->args[1]; - - for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) { - n = 0; - list_for_each_entry(rt, genl_family_chain(i), family_list) { - if (!rt->netnsok && !net_eq(net, &init_net)) - continue; - if (++n < fams_to_skip) - continue; - if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - skb, CTRL_CMD_NEWFAMILY) < 0) - goto errout; - } + int fams_to_skip = cb->args[0]; + unsigned int id; - fams_to_skip = 0; - } + idr_for_each_entry(&genl_fam_idr, rt, id) { + if (!rt->netnsok && !net_eq(net, &init_net)) + continue; + + if (n++ < fams_to_skip) + continue; -errout: - cb->args[0] = i; - cb->args[1] = n; + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY) < 0) + break; + } + cb->args[0] = n; return skb->len; } -static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, +static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; @@ -862,7 +808,7 @@ static struct sk_buff *ctrl_build_family_msg(struct genl_family *family, } static struct sk_buff * -ctrl_build_mcgrp_msg(struct genl_family *family, +ctrl_build_mcgrp_msg(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, int seq, u8 cmd) { @@ -892,7 +838,7 @@ static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = { static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; - struct genl_family *res = NULL; + const struct genl_family *res = NULL; int err = -EINVAL; if (info->attrs[CTRL_ATTR_FAMILY_ID]) { @@ -936,7 +882,7 @@ static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) return genlmsg_reply(msg, info); } -static int genl_ctrl_event(int event, struct genl_family *family, +static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id) { @@ -1005,25 +951,24 @@ static struct genl_family genl_ctrl = { static int genl_bind(struct net *net, int group) { - int i, err = -ENOENT; + struct genl_family *f; + int err = -ENOENT; + unsigned int id; down_read(&cb_lock); - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { - struct genl_family *f; - - list_for_each_entry(f, genl_family_chain(i), family_list) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (!f->netnsok && net != &init_net) - err = -ENOENT; - else if (f->mcast_bind) - err = f->mcast_bind(net, fam_grp); - else - err = 0; - break; - } + + idr_for_each_entry(&genl_fam_idr, f, id) { + if (group >= f->mcgrp_offset && + group < f->mcgrp_offset + f->n_mcgrps) { + int fam_grp = group - f->mcgrp_offset; + + if (!f->netnsok && net != &init_net) + err = -ENOENT; + else if (f->mcast_bind) + err = f->mcast_bind(net, fam_grp); + else + err = 0; + break; } } up_read(&cb_lock); @@ -1033,21 +978,19 @@ static int genl_bind(struct net *net, int group) static void genl_unbind(struct net *net, int group) { - int i; + struct genl_family *f; + unsigned int id; down_read(&cb_lock); - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { - struct genl_family *f; - list_for_each_entry(f, genl_family_chain(i), family_list) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; + idr_for_each_entry(&genl_fam_idr, f, id) { + if (group >= f->mcgrp_offset && + group < f->mcgrp_offset + f->n_mcgrps) { + int fam_grp = group - f->mcgrp_offset; - if (f->mcast_unbind) - f->mcast_unbind(net, fam_grp); - break; - } + if (f->mcast_unbind) + f->mcast_unbind(net, fam_grp); + break; } } up_read(&cb_lock); @@ -1087,10 +1030,7 @@ static struct pernet_operations genl_pernet_ops = { static int __init genl_init(void) { - int i, err; - - for (i = 0; i < GENL_FAM_TAB_SIZE; i++) - INIT_LIST_HEAD(&family_ht[i]); + int err; err = genl_register_family(&genl_ctrl); if (err < 0) @@ -1118,7 +1058,7 @@ subsys_initcall(genl_init); * You cannot use this function with a family that has parallel_ops * and you can only use it within (pre/post) doit/dumpit callbacks. */ -struct nlattr **genl_family_attrbuf(struct genl_family *family) +struct nlattr **genl_family_attrbuf(const struct genl_family *family) { if (!WARN_ON(family->parallel_ops)) lockdep_assert_held(&genl_mutex); @@ -1156,8 +1096,9 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, return err; } -int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +int genlmsg_multicast_allns(const struct genl_family *family, + struct sk_buff *skb, u32 portid, + unsigned int group, gfp_t flags) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; @@ -1166,7 +1107,7 @@ int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb, } EXPORT_SYMBOL(genlmsg_multicast_allns); -void genl_notify(struct genl_family *family, struct sk_buff *skb, +void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags) { struct net *net = genl_info_net(info); -- cgit v1.2.3-71-gd317 From ebb676daa1a340ccef25eb769aefc09b79c01f8a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 27 Oct 2016 11:23:51 +0200 Subject: bpf: Print function name in addition to function id The verifier currently prints raw function ids when printing CALL instructions or when complaining: 5: (85) call 23 unknown func 23 print a meaningful function name instead: 5: (85) call bpf_redirect#23 unknown func bpf_redirect#23 Moves the function documentation to a single comment and renames all helpers names in the list to conform to the bpf_ prefix notation so they can be greped in the kernel source. Signed-off-by: Thomas Graf Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 574 ++++++++++++++++++++++++----------------------- kernel/bpf/verifier.c | 35 ++- 2 files changed, 316 insertions(+), 293 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 374ef582ae18..e2f38e0091b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,297 +143,301 @@ union bpf_attr { }; } __attribute__((aligned(8))); +/* BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(&map, &key) + * Return: Map value or NULL + * + * int bpf_map_update_elem(&map, &key, &value, flags) + * Return: 0 on success or negative error + * + * int bpf_map_delete_elem(&map, &key) + * Return: 0 on success or negative error + * + * int bpf_probe_read(void *dst, int size, void *src) + * Return: 0 on success or negative error + * + * u64 bpf_ktime_get_ns(void) + * Return: current ktime + * + * int bpf_trace_printk(const char *fmt, int fmt_size, ...) + * Return: length of buffer written or negative error + * + * u32 bpf_prandom_u32(void) + * Return: random value + * + * u32 bpf_raw_smp_processor_id(void) + * Return: SMP processor ID + * + * int bpf_skb_store_bytes(skb, offset, from, len, flags) + * store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->mac_header + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l3_csum_replace(skb, offset, from, to, flags) + * recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l4_csum_replace(skb, offset, from, to, flags) + * recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_tail_call(ctx, prog_array_map, index) + * jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success or negative error + * + * int bpf_clone_redirect(skb, ifindex, flags) + * redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success or negative error + * + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + * + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + * + * int bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success or negative error + * + * u32 bpf_get_cgroup_classid(skb) + * retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + * + * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) + * Return: 0 on success or negative error + * + * int bpf_skb_vlan_pop(skb) + * Return: 0 on success or negative error + * + * int bpf_skb_get_tunnel_key(skb, key, size, flags) + * int bpf_skb_set_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Return: 0 on success or negative error + * + * u64 bpf_perf_event_read(&map, index) + * Return: Number events read or error code + * + * int bpf_redirect(ifindex, flags) + * redirect to another netdev + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: TC_ACT_REDIRECT + * + * u32 bpf_get_route_realm(skb) + * retrieve a dst's tclassid + * @skb: pointer to skb + * Return: realm if != 0 + * + * int bpf_perf_event_output(ctx, map, index, data, size) + * output perf raw sample + * @ctx: struct pt_regs* + * @map: pointer to perf_event_array map + * @index: index of event in the map + * @data: data on stack to be output as raw data + * @size: size of data + * Return: 0 on success or negative error + * + * int bpf_get_stackid(ctx, map, flags) + * walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + * + * s64 bpf_csum_diff(from, from_size, to, to_size, seed) + * calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result or negative error code + * + * int bpf_skb_get_tunnel_opt(skb, opt, size) + * retrieve tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: option size + * + * int bpf_skb_set_tunnel_opt(skb, opt, size) + * populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success or negative error + * + * int bpf_skb_change_proto(skb, proto, flags) + * Change protocol of the skb. Currently supported is v4 -> v6, + * v6 -> v4 transitions. The helper will also resize the skb. eBPF + * program is expected to fill the new headers via skb_store_bytes + * and lX_csum_replace. + * @skb: pointer to skb + * @proto: new skb->protocol type + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_change_type(skb, type) + * Change packet type of skb. + * @skb: pointer to skb + * @type: new skb->pkt_type type + * Return: 0 on success or negative error + * + * int bpf_skb_under_cgroup(skb, map, index) + * Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + * + * u32 bpf_get_hash_recalc(skb) + * Retrieve and possibly recalculate skb->hash. + * @skb: pointer to skb + * Return: hash + * + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + * + * int bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + * + * int bpf_current_task_under_cgroup(map, index) + * Check cgroup2 membership of current task + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 current failed the cgroup2 descendant test + * == 1 current succeeded the cgroup2 descendant test + * < 0 error + * + * int bpf_skb_change_tail(skb, len, flags) + * The helper will resize the skb to the given new size, to be used f.e. + * with control messages. + * @skb: pointer to skb + * @len: new skb length + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the skb is non-linear + * and not all of len are part of the linear section. Only needed for + * read/write with direct packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + * + * s64 bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + * + * void bpf_set_hash_invalid(skb) + * Invalidate current skb->hash. + * @skb: pointer to skb + * + * int bpf_get_numa_node_id() + * Return: Id of current NUMA node. + */ +#define __BPF_FUNC_MAPPER(FN) \ + FN(unspec), \ + FN(map_lookup_elem), \ + FN(map_update_elem), \ + FN(map_delete_elem), \ + FN(probe_read), \ + FN(ktime_get_ns), \ + FN(trace_printk), \ + FN(get_prandom_u32), \ + FN(get_smp_processor_id), \ + FN(skb_store_bytes), \ + FN(l3_csum_replace), \ + FN(l4_csum_replace), \ + FN(tail_call), \ + FN(clone_redirect), \ + FN(get_current_pid_tgid), \ + FN(get_current_uid_gid), \ + FN(get_current_comm), \ + FN(get_cgroup_classid), \ + FN(skb_vlan_push), \ + FN(skb_vlan_pop), \ + FN(skb_get_tunnel_key), \ + FN(skb_set_tunnel_key), \ + FN(perf_event_read), \ + FN(redirect), \ + FN(get_route_realm), \ + FN(perf_event_output), \ + FN(skb_load_bytes), \ + FN(get_stackid), \ + FN(csum_diff), \ + FN(skb_get_tunnel_opt), \ + FN(skb_set_tunnel_opt), \ + FN(skb_change_proto), \ + FN(skb_change_type), \ + FN(skb_under_cgroup), \ + FN(get_hash_recalc), \ + FN(get_current_task), \ + FN(probe_write_user), \ + FN(current_task_under_cgroup), \ + FN(skb_change_tail), \ + FN(skb_pull_data), \ + FN(csum_update), \ + FN(set_hash_invalid), \ + FN(get_numa_node_id), + /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ +#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x enum bpf_func_id { - BPF_FUNC_unspec, - BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ - BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ - BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ - BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ - BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ - BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ - BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ - BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - - /** - * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_skb_store_bytes, - - /** - * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l3_csum_replace, - - /** - * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l4_csum_replace, - - /** - * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run - * Return: 0 on success - */ - BPF_FUNC_tail_call, - - /** - * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_clone_redirect, - - /** - * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid - */ - BPF_FUNC_get_current_pid_tgid, - - /** - * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - */ - BPF_FUNC_get_current_uid_gid, - - /** - * bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success - */ - BPF_FUNC_get_current_comm, - - /** - * bpf_get_cgroup_classid(skb) - retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - */ - BPF_FUNC_get_cgroup_classid, - BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ - BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ - - /** - * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Retrun: 0 on success - */ - BPF_FUNC_skb_get_tunnel_key, - BPF_FUNC_skb_set_tunnel_key, - BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ - /** - * bpf_redirect(ifindex, flags) - redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - */ - BPF_FUNC_redirect, - - /** - * bpf_get_route_realm(skb) - retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - */ - BPF_FUNC_get_route_realm, - - /** - * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample - * @ctx: struct pt_regs* - * @map: pointer to perf_event_array map - * @index: index of event in the map - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success - */ - BPF_FUNC_perf_event_output, - BPF_FUNC_skb_load_bytes, - - /** - * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - */ - BPF_FUNC_get_stackid, - - /** - * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result - */ - BPF_FUNC_csum_diff, - - /** - * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) - * retrieve or populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success for set, option size for get - */ - BPF_FUNC_skb_get_tunnel_opt, - BPF_FUNC_skb_set_tunnel_opt, - - /** - * bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is - * v4 -> v6, v6 -> v4 transitions. The helper will also - * resize the skb. eBPF program is expected to fill the - * new headers via skb_store_bytes and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_proto, - - /** - * bpf_skb_change_type(skb, type) - * Change packet type of skb. - * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_type, - - /** - * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_skb_under_cgroup, - - /** - * bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash - */ - BPF_FUNC_get_hash_recalc, - - /** - * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - */ - BPF_FUNC_get_current_task, - - /** - * bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - */ - BPF_FUNC_probe_write_user, - - /** - * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_current_task_under_cgroup, - - /** - * bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, - * to be used f.e. with control messages. - * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_tail, - - /** - * bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the - * skb is non-linear and not all of len are part of the - * linear section. Only needed for read/write with direct - * packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_pull_data, - - /** - * bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - */ - BPF_FUNC_csum_update, - - /** - * bpf_set_hash_invalid(skb) - * Invalidate current skb>hash. - * @skb: pointer to skb - */ - BPF_FUNC_set_hash_invalid, - - /** - * bpf_get_numa_node_id() - * Returns the id of the current NUMA node. - */ - BPF_FUNC_get_numa_node_id, - + __BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; +#undef __BPF_ENUM_FN /* All flags used by eBPF helper functions, placed here. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 846d7ceaf202..900257578934 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include #include #include +#include /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + static void print_verifier_state(struct bpf_verifier_state *state) { struct bpf_reg_state *reg; @@ -354,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -1114,8 +1132,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1172,7 +1190,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1180,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1200,7 +1218,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1256,8 +1275,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } -- cgit v1.2.3-71-gd317 From c62cce2caee558e18aa05c01c2fd3b40f07174f2 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 24 Oct 2016 18:29:13 -0700 Subject: net: add an ioctl to get a socket network namespace Each socket operates in a network namespace where it has been created, so if we want to dump and restore a socket, we have to know its network namespace. We have a socket_diag to get information about sockets, it doesn't report sockets which are not bound or connected. This patch introduces a new socket ioctl, which is called SIOCGSKNS and used to get a file descriptor for a socket network namespace. A task must have CAP_NET_ADMIN in a target network namespace to use this ioctl. Cc: "David S. Miller" Cc: Eric W. Biederman Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- fs/nsfs.c | 2 +- include/linux/proc_fs.h | 4 ++++ include/uapi/linux/sockios.h | 1 + net/socket.c | 13 +++++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 8718af895eab..8c9fb29c6673 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -118,7 +118,7 @@ again: return ret; } -static int open_related_ns(struct ns_common *ns, +int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b97bf2ef996e..368c7ad06ae5 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -82,4 +82,8 @@ static inline struct proc_dir_entry *proc_net_mkdir( return proc_mkdir_data(name, 0, parent, net); } +struct ns_common; +int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)); + #endif /* _LINUX_PROC_FS_H */ diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 8e7890b26d9a..83cc54ce6081 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -84,6 +84,7 @@ #define SIOCWANDEV 0x894A /* get/set netdev parameters */ #define SIOCOUTQNSD 0x894B /* output queue size (not sent only) */ +#define SIOCGSKNS 0x894C /* get socket network namespace */ /* ARP cache control calls. */ /* 0x8950 - 0x8952 * obsolete calls, don't re-use */ diff --git a/net/socket.c b/net/socket.c index 5a9bf5ee2464..970a7ea3fc4a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -877,6 +877,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. */ +static struct ns_common *get_net_ns(struct ns_common *ns) +{ + return &get_net(container_of(ns, struct net, ns))->ns; +} + static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct socket *sock; @@ -945,6 +950,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = dlci_ioctl_hook(cmd, argp); mutex_unlock(&dlci_ioctl_mutex); break; + case SIOCGSKNS: + err = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + break; + + err = open_related_ns(&net->ns, get_net_ns); + break; default: err = sock_do_ioctl(net, sock, cmd, arg); break; @@ -3093,6 +3105,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCSIFVLAN: case SIOCADDDLCI: case SIOCDELDLCI: + case SIOCGSKNS: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: -- cgit v1.2.3-71-gd317 From 20861f26e33d76a4f3587bcc866fa1dab3e01094 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 27 Oct 2016 09:05:22 +0800 Subject: driver: tun: Use new macro SOCK_IOC_TYPE instead of literal number 0x89 The current codes use _IOC_TYPE(cmd) == 0x89 to check if the cmd is one socket ioctl command like SIOCGIFHWADDR. But the literal number 0x89 may confuse readers. So create one macro SOCK_IOC_TYPE to enhance the readability. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- include/uapi/linux/sockios.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9142db847ee1..1588469844e7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1985,7 +1985,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int le; int ret; - if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { + if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == SOCK_IOC_TYPE) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index 83cc54ce6081..79d029d25310 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -24,6 +24,8 @@ #define SIOCINQ FIONREAD #define SIOCOUTQ TIOCOUTQ /* output queue size (not sent + not acked) */ +#define SOCK_IOC_TYPE 0x89 + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ -- cgit v1.2.3-71-gd317 From f6d0cbcf09c506b9b022df8f9d7693a7cec3c732 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 24 Oct 2016 16:56:40 +0200 Subject: netfilter: nf_tables: add fib expression Add FIB expression, supported for ipv4, ipv6 and inet family (the latter just dispatches to ipv4 or ipv6 one based on nfproto). Currently supports fetching output interface index/name and the rtm_type associated with an address. This can be used for adding path filtering. rtm_type is useful to e.g. enforce a strong-end host model where packets are only accepted if daddr is configured on the interface the packet arrived on. The fib expression is a native nftables alternative to the xtables addrtype and rp_filter matches. FIB result order for oif/oifname retrieval is as follows: - if packet is local (skb has rtable, RTF_LOCAL set, this will also catch looped-back multicast packets), set oif to the loopback interface. - if fib lookup returns an error, or result points to local, store zero result. This means '--local' option of -m rpfilter is not supported. It is possible to use 'fib type local' or add explicit saddr/daddr matching rules to create exceptions if this is really needed. - store result in the destination register. In case of multiple routes, search set for desired oif in case strict matching is requested. ipv4 and ipv6 behave fib expressions are supposed to behave the same. [ I have collapsed Arnd Bergmann's ("netfilter: nf_tables: fib warnings") http://patchwork.ozlabs.org/patch/688615/ to address fallout from this patch after rebasing nf-next, that was posted to address compilation warnings. --pablo ] Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 31 ++++ include/uapi/linux/netfilter/nf_tables.h | 36 ++++ net/ipv4/netfilter/Kconfig | 8 + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_fib_ipv4.c | 238 ++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 8 + net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_fib_ipv6.c | 275 +++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 13 ++ net/netfilter/Makefile | 2 + net/netfilter/nft_fib.c | 159 ++++++++++++++++++ net/netfilter/nft_fib_inet.c | 82 +++++++++ 12 files changed, 854 insertions(+) create mode 100644 include/net/netfilter/nft_fib.h create mode 100644 net/ipv4/netfilter/nft_fib_ipv4.c create mode 100644 net/ipv6/netfilter/nft_fib_ipv6.c create mode 100644 net/netfilter/nft_fib.c create mode 100644 net/netfilter/nft_fib_inet.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h new file mode 100644 index 000000000000..cbedda077db2 --- /dev/null +++ b/include/net/netfilter/nft_fib.h @@ -0,0 +1,31 @@ +#ifndef _NFT_FIB_H_ +#define _NFT_FIB_H_ + +struct nft_fib { + enum nft_registers dreg:8; + u8 result; + u32 flags; +}; + +extern const struct nla_policy nft_fib_policy[]; + +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]); +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + + +void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_fib_store_result(void *reg, enum nft_fib_result r, + const struct nft_pktinfo *pkt, int index); +#endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c6c4477c136b..a054ad2c8853 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1109,6 +1109,42 @@ enum nft_gen_attributes { }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) +/* + * enum nft_fib_attributes - nf_tables fib expression netlink attributes + * + * @NFTA_FIB_DREG: destination register (NLA_U32) + * @NFTA_FIB_RESULT: desired result (NLA_U32) + * @NFTA_FIB_FLAGS: flowi fields to initialize when querying the FIB (NLA_U32) + * + * The FIB expression performs a route lookup according + * to the packet data. + */ +enum nft_fib_attributes { + NFTA_FIB_UNSPEC, + NFTA_FIB_DREG, + NFTA_FIB_RESULT, + NFTA_FIB_FLAGS, + __NFTA_FIB_MAX +}; +#define NFTA_FIB_MAX (__NFTA_FIB_MAX - 1) + +enum nft_fib_result { + NFT_FIB_RESULT_UNSPEC, + NFT_FIB_RESULT_OIF, + NFT_FIB_RESULT_OIFNAME, + NFT_FIB_RESULT_ADDRTYPE, + __NFT_FIB_RESULT_MAX +}; +#define NFT_FIB_RESULT_MAX (__NFT_FIB_RESULT_MAX - 1) + +enum nft_fib_flags { + NFTA_FIB_F_SADDR = 1 << 0, /* look up src */ + NFTA_FIB_F_DADDR = 1 << 1, /* look up dst */ + NFTA_FIB_F_MARK = 1 << 2, /* use skb->mark */ + NFTA_FIB_F_IIF = 1 << 3, /* restrict to iif */ + NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ +}; + /** * enum nft_trace_attributes - nf_tables trace netlink attributes * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d613309e3e5d..f80bb8f457c8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -54,6 +54,14 @@ config NFT_DUP_IPV4 help This module enables IPv4 packet duplication support for nf_tables. +config NFT_FIB_IPV4 + select NFT_FIB + tristate "nf_tables fib / ip route lookup support" + help + This module enables IPv4 FIB lookups, e.g. for reverse path filtering. + It also allows query of the FIB for the route type, e.g. local, unicast, + multicast or blackhole. + endif # NF_TABLES_IPV4 config NF_TABLES_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 853328f8fd05..59d7786ddce1 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c new file mode 100644 index 000000000000..db91fd42db67 --- /dev/null +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -0,0 +1,238 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* don't try to find route from mcast/bcast/zeronet */ +static __be32 get_saddr(__be32 addr) +{ + if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) || + ipv4_is_zeronet(addr)) + return 0; + return addr; +} + +static bool fib4_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + + return rt && (rt->rt_flags & RTCF_LOCAL); +} + +#define DSCP_BITS 0xfc + +void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dst = ®s->data[priv->dreg]; + const struct net_device *dev = NULL; + const struct iphdr *iph; + __be32 addr; + + if (priv->flags & NFTA_FIB_F_IIF) + dev = pkt->in; + else if (priv->flags & NFTA_FIB_F_OIF) + dev = pkt->out; + + iph = ip_hdr(pkt->skb); + if (priv->flags & NFTA_FIB_F_DADDR) + addr = iph->daddr; + else + addr = iph->saddr; + + *dst = inet_dev_addr_type(pkt->net, dev, addr); +} +EXPORT_SYMBOL_GPL(nft_fib4_eval_type); + +static int get_ifindex(const struct net_device *dev) +{ + return dev ? dev->ifindex : 0; +} + +void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct iphdr *iph; + struct fib_result res; + struct flowi4 fl4 = { + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_iif = LOOPBACK_IFINDEX, + }; + const struct net_device *oif; + struct net_device *found; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int i; +#endif + + /* + * Do not set flowi4_oif, it restricts results (for example, asking + * for oif 3 will get RTN_UNICAST result even if the daddr exits + * on another interface. + * + * Search results for the desired outinterface instead. + */ + if (priv->flags & NFTA_FIB_F_OIF) + oif = pkt->out; + else if (priv->flags & NFTA_FIB_F_IIF) + oif = pkt->in; + else + oif = NULL; + + if (pkt->hook == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) { + nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + return; + } + + iph = ip_hdr(pkt->skb); + if (ipv4_is_multicast(iph->daddr) && + ipv4_is_zeronet(iph->saddr) && + ipv4_is_local_multicast(iph->daddr)) { + nft_fib_store_result(dest, priv->result, pkt, + get_ifindex(pkt->skb->dev)); + return; + } + + if (priv->flags & NFTA_FIB_F_MARK) + fl4.flowi4_mark = pkt->skb->mark; + + fl4.flowi4_tos = iph->tos & DSCP_BITS; + + if (priv->flags & NFTA_FIB_F_DADDR) { + fl4.daddr = iph->daddr; + fl4.saddr = get_saddr(iph->saddr); + } else { + fl4.daddr = iph->saddr; + fl4.saddr = get_saddr(iph->daddr); + } + + if (fib_lookup(pkt->net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) + return; + + switch (res.type) { + case RTN_UNICAST: + break; + case RTN_LOCAL: /* should not appear here, see fib4_is_local() above */ + return; + default: + break; + } + + if (!oif) { + found = FIB_RES_DEV(res); + goto ok; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + for (i = 0; i < res.fi->fib_nhs; i++) { + struct fib_nh *nh = &res.fi->fib_nh[i]; + + if (nh->nh_dev == oif) { + found = nh->nh_dev; + goto ok; + } + } + return; +#else + found = FIB_RES_DEV(res); + if (found != oif) + return; +#endif +ok: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + *dest = found->ifindex; + break; + case NFT_FIB_RESULT_OIFNAME: + strncpy((char *)dest, found->name, IFNAMSIZ); + break; + default: + WARN_ON_ONCE(1); + break; + } +} +EXPORT_SYMBOL_GPL(nft_fib4_eval); + +static struct nft_expr_type nft_fib4_type; + +static const struct nft_expr_ops nft_fib4_type_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval_type, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops nft_fib4_ops = { + .type = &nft_fib4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib4_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops * +nft_fib4_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_fib_result result; + + if (!tb[NFTA_FIB_RESULT]) + return ERR_PTR(-EINVAL); + + result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + + switch (result) { + case NFT_FIB_RESULT_OIF: + return &nft_fib4_ops; + case NFT_FIB_RESULT_OIFNAME: + return &nft_fib4_ops; + case NFT_FIB_RESULT_ADDRTYPE: + return &nft_fib4_type_ops; + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static struct nft_expr_type nft_fib4_type __read_mostly = { + .name = "fib", + .select_ops = &nft_fib4_select_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, +}; + +static int __init nft_fib4_module_init(void) +{ + return nft_register_expr(&nft_fib4_type); +} + +static void __exit nft_fib4_module_exit(void) +{ + nft_unregister_expr(&nft_fib4_type); +} + +module_init(nft_fib4_module_init); +module_exit(nft_fib4_module_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_ALIAS_NFT_AF_EXPR(2, "fib"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index e10a04c9cdc7..144f1de33836 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -54,6 +54,14 @@ config NFT_DUP_IPV6 help This module enables IPv6 packet duplication support for nf_tables. +config NFT_FIB_IPV6 + tristate "nf_tables fib / ipv6 route lookup support" + select NFT_FIB + help + This module enables IPv6 FIB lookups, e.g. for reverse path filtering. + It also allows query of the FIB for the route type, e.g. local, unicast, + multicast or blackhole. + endif # NF_TABLES_IPV6 endif # NF_TABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b4f7d0b4e2af..7984f3071f05 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o +obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c new file mode 100644 index 000000000000..ff1f1b6b4a4a --- /dev/null +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -0,0 +1,275 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool fib6_is_local(const struct sk_buff *skb) +{ + const struct rt6_info *rt = (const void *)skb_dst(skb); + + return rt && (rt->rt6i_flags & RTF_LOCAL); +} + +static int get_ifindex(const struct net_device *dev) +{ + return dev ? dev->ifindex : 0; +} + +static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, + const struct nft_pktinfo *pkt, + const struct net_device *dev) +{ + const struct ipv6hdr *iph = ipv6_hdr(pkt->skb); + int lookup_flags = 0; + + if (priv->flags & NFTA_FIB_F_DADDR) { + fl6->daddr = iph->daddr; + fl6->saddr = iph->saddr; + } else { + fl6->daddr = iph->saddr; + fl6->saddr = iph->daddr; + } + + if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { + lookup_flags |= RT6_LOOKUP_F_IFACE; + fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); + } + + if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) + lookup_flags |= RT6_LOOKUP_F_HAS_SADDR; + + if (priv->flags & NFTA_FIB_F_MARK) + fl6->flowi6_mark = pkt->skb->mark; + + fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + + return lookup_flags; +} + +static u32 __nft_fib6_eval_type(const struct nft_fib *priv, + const struct nft_pktinfo *pkt) +{ + const struct net_device *dev = NULL; + const struct nf_ipv6_ops *v6ops; + const struct nf_afinfo *afinfo; + int route_err, addrtype; + struct rt6_info *rt; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_proto = pkt->tprot, + }; + u32 ret = 0; + + afinfo = nf_get_afinfo(NFPROTO_IPV6); + if (!afinfo) + return RTN_UNREACHABLE; + + if (priv->flags & NFTA_FIB_F_IIF) + dev = pkt->in; + else if (priv->flags & NFTA_FIB_F_OIF) + dev = pkt->out; + + nft_fib6_flowi_init(&fl6, priv, pkt, dev); + + v6ops = nf_get_ipv6_ops(); + if (dev && v6ops && v6ops->chk_addr(pkt->net, &fl6.daddr, dev, true)) + ret = RTN_LOCAL; + + route_err = afinfo->route(pkt->net, (struct dst_entry **)&rt, + flowi6_to_flowi(&fl6), false); + if (route_err) + goto err; + + if (rt->rt6i_flags & RTF_REJECT) { + route_err = rt->dst.error; + dst_release(&rt->dst); + goto err; + } + + if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr)) + ret = RTN_ANYCAST; + else if (!dev && rt->rt6i_flags & RTF_LOCAL) + ret = RTN_LOCAL; + + dst_release(&rt->dst); + + if (ret) + return ret; + + addrtype = ipv6_addr_type(&fl6.daddr); + + if (addrtype & IPV6_ADDR_MULTICAST) + return RTN_MULTICAST; + if (addrtype & IPV6_ADDR_UNICAST) + return RTN_UNICAST; + + return RTN_UNSPEC; + err: + switch (route_err) { + case -EINVAL: + return RTN_BLACKHOLE; + case -EACCES: + return RTN_PROHIBIT; + case -EAGAIN: + return RTN_THROW; + default: + break; + } + + return RTN_UNREACHABLE; +} + +void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + + *dest = __nft_fib6_eval_type(priv, pkt); +} +EXPORT_SYMBOL_GPL(nft_fib6_eval_type); + +void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + const struct net_device *oif = NULL; + u32 *dest = ®s->data[priv->dreg]; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_proto = pkt->tprot, + }; + struct rt6_info *rt; + int lookup_flags; + + if (priv->flags & NFTA_FIB_F_IIF) + oif = pkt->in; + else if (priv->flags & NFTA_FIB_F_OIF) + oif = pkt->out; + + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif); + + if (pkt->hook == NF_INET_PRE_ROUTING && fib6_is_local(pkt->skb)) { + nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + return; + } + + *dest = 0; + again: + rt = (void *)ip6_route_lookup(pkt->net, &fl6, lookup_flags); + if (rt->dst.error) + goto put_rt_err; + + /* Should not see RTF_LOCAL here */ + if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) + goto put_rt_err; + + if (oif && oif != rt->rt6i_idev->dev) { + /* multipath route? Try again with F_IFACE */ + if ((lookup_flags & RT6_LOOKUP_F_IFACE) == 0) { + lookup_flags |= RT6_LOOKUP_F_IFACE; + fl6.flowi6_oif = oif->ifindex; + ip6_rt_put(rt); + goto again; + } + } + + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + *dest = rt->rt6i_idev->dev->ifindex; + break; + case NFT_FIB_RESULT_OIFNAME: + strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ); + break; + default: + WARN_ON_ONCE(1); + break; + } + + put_rt_err: + ip6_rt_put(rt); +} +EXPORT_SYMBOL_GPL(nft_fib6_eval); + +static struct nft_expr_type nft_fib6_type; + +static const struct nft_expr_ops nft_fib6_type_ops = { + .type = &nft_fib6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib6_eval_type, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops nft_fib6_ops = { + .type = &nft_fib6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib6_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static const struct nft_expr_ops * +nft_fib6_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + enum nft_fib_result result; + + if (!tb[NFTA_FIB_RESULT]) + return ERR_PTR(-EINVAL); + + result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + + switch (result) { + case NFT_FIB_RESULT_OIF: + return &nft_fib6_ops; + case NFT_FIB_RESULT_OIFNAME: + return &nft_fib6_ops; + case NFT_FIB_RESULT_ADDRTYPE: + return &nft_fib6_type_ops; + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static struct nft_expr_type nft_fib6_type __read_mostly = { + .name = "fib", + .select_ops = &nft_fib6_select_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .family = NFPROTO_IPV6, + .owner = THIS_MODULE, +}; + +static int __init nft_fib6_module_init(void) +{ + return nft_register_expr(&nft_fib6_type); +} + +static void __exit nft_fib6_module_exit(void) +{ + nft_unregister_expr(&nft_fib6_type); +} +module_init(nft_fib6_module_init); +module_exit(nft_fib6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_ALIAS_NFT_AF_EXPR(10, "fib"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e8d56d9a4df2..9bcf899ce16e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -581,6 +581,19 @@ config NFT_HASH This option adds the "hash" expression that you can use to perform a hash operation on registers. +config NFT_FIB + tristate + +config NFT_FIB_INET + depends on NF_TABLES_INET + depends on NFT_FIB_IPV4 + depends on NFT_FIB_IPV6 + tristate "Netfilter nf_tables fib inet support" + help + This option allows using the FIB expression from the inet table. + The lookup will be delegated to the IPv4 or IPv6 FIB depending + on the protocol of the packet. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index c23c3c84416f..8faa36c0686d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -96,6 +96,8 @@ obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o obj-$(CONFIG_NFT_HASH) += nft_hash.o +obj-$(CONFIG_NFT_FIB) += nft_fib.o +obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c new file mode 100644 index 000000000000..4944a8b7f7a7 --- /dev/null +++ b/net/netfilter/nft_fib.c @@ -0,0 +1,159 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Generic part shared by ipv4 and ipv6 backends. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { + [NFTA_FIB_DREG] = { .type = NLA_U32 }, + [NFTA_FIB_RESULT] = { .type = NLA_U32 }, + [NFTA_FIB_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL(nft_fib_policy); + +#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF) + +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->result) { + case NFT_FIB_RESULT_OIF: /* fallthrough */ + case NFT_FIB_RESULT_OIFNAME: + hooks = (1 << NF_INET_PRE_ROUTING); + break; + case NFT_FIB_RESULT_ADDRTYPE: + if (priv->flags & NFTA_FIB_F_IIF) + hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD); + else if (priv->flags & NFTA_FIB_F_OIF) + hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_FORWARD); + else + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING); + + break; + default: + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} +EXPORT_SYMBOL_GPL(nft_fib_validate); + +int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_fib *priv = nft_expr_priv(expr); + unsigned int len; + int err; + + if (!tb[NFTA_FIB_DREG] || !tb[NFTA_FIB_RESULT] || !tb[NFTA_FIB_FLAGS]) + return -EINVAL; + + priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS])); + + if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL)) + return -EINVAL; + + if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == + (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) + return -EINVAL; + if ((priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) == + (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) + return -EINVAL; + if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0) + return -EINVAL; + + priv->result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT])); + priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]); + + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + if (priv->flags & NFTA_FIB_F_OIF) + return -EINVAL; + len = sizeof(int); + break; + case NFT_FIB_RESULT_OIFNAME: + if (priv->flags & NFTA_FIB_F_OIF) + return -EINVAL; + len = IFNAMSIZ; + break; + case NFT_FIB_RESULT_ADDRTYPE: + len = sizeof(u32); + break; + default: + return -EINVAL; + } + + err = nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); + if (err < 0) + return err; + + return nft_fib_validate(ctx, expr, NULL); +} +EXPORT_SYMBOL_GPL(nft_fib_init); + +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_FIB_DREG, priv->dreg)) + return -1; + + if (nla_put_be32(skb, NFTA_FIB_RESULT, htonl(priv->result))) + return -1; + + if (nla_put_be32(skb, NFTA_FIB_FLAGS, htonl(priv->flags))) + return -1; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_fib_dump); + +void nft_fib_store_result(void *reg, enum nft_fib_result r, + const struct nft_pktinfo *pkt, int index) +{ + struct net_device *dev; + u32 *dreg = reg; + + switch (r) { + case NFT_FIB_RESULT_OIF: + *dreg = index; + break; + case NFT_FIB_RESULT_OIFNAME: + dev = dev_get_by_index_rcu(pkt->net, index); + strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + break; + default: + WARN_ON_ONCE(1); + *dreg = 0; + break; + } +} +EXPORT_SYMBOL_GPL(nft_fib_store_result); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal "); diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c new file mode 100644 index 000000000000..fe8943b572b7 --- /dev/null +++ b/net/netfilter/nft_fib_inet.c @@ -0,0 +1,82 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void nft_fib_inet_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_fib *priv = nft_expr_priv(expr); + + switch (pkt->pf) { + case NFPROTO_IPV4: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + case NFT_FIB_RESULT_OIFNAME: + return nft_fib4_eval(expr, regs, pkt); + case NFT_FIB_RESULT_ADDRTYPE: + return nft_fib4_eval_type(expr, regs, pkt); + } + break; + case NFPROTO_IPV6: + switch (priv->result) { + case NFT_FIB_RESULT_OIF: + case NFT_FIB_RESULT_OIFNAME: + return nft_fib6_eval(expr, regs, pkt); + case NFT_FIB_RESULT_ADDRTYPE: + return nft_fib6_eval_type(expr, regs, pkt); + } + break; + } + + regs->verdict.code = NF_DROP; +} + +static struct nft_expr_type nft_fib_inet_type; +static const struct nft_expr_ops nft_fib_inet_ops = { + .type = &nft_fib_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), + .eval = nft_fib_inet_eval, + .init = nft_fib_init, + .dump = nft_fib_dump, + .validate = nft_fib_validate, +}; + +static struct nft_expr_type nft_fib_inet_type __read_mostly = { + .family = NFPROTO_INET, + .name = "fib", + .ops = &nft_fib_inet_ops, + .policy = nft_fib_policy, + .maxattr = NFTA_FIB_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_fib_inet_module_init(void) +{ + return nft_register_expr(&nft_fib_inet_type); +} + +static void __exit nft_fib_inet_module_exit(void) +{ + nft_unregister_expr(&nft_fib_inet_type); +} + +module_init(nft_fib_inet_module_init); +module_exit(nft_fib_inet_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_ALIAS_NFT_AF_EXPR(1, "fib"); -- cgit v1.2.3-71-gd317 From 2fa841938c648fe4359691f41e8e1f37ff1a3aa2 Mon Sep 17 00:00:00 2001 From: "Anders K. Pedersen" Date: Fri, 28 Oct 2016 05:54:15 +0000 Subject: netfilter: nf_tables: introduce routing expression Introduces an nftables rt expression for routing related data with support for nexthop (i.e. the directly connected IP address that an outgoing packet is sent to), which can be used either for matching or accounting, eg. # nft add rule filter postrouting \ ip daddr 192.168.1.0/24 rt nexthop != 192.168.0.1 drop This will drop any traffic to 192.168.1.0/24 that is not routed via 192.168.0.1. # nft add rule filter postrouting \ flow table acct { rt nexthop timeout 600s counter } # nft add rule ip6 filter postrouting \ flow table acct { rt nexthop timeout 600s counter } These rules count outgoing traffic per nexthop. Note that the timeout releases an entry if no traffic is seen for this nexthop within 10 minutes. # nft add rule inet filter postrouting \ ether type ip \ flow table acct { rt nexthop timeout 600s counter } # nft add rule inet filter postrouting \ ether type ip6 \ flow table acct { rt nexthop timeout 600s counter } Same as above, but via the inet family, where the ether type must be specified explicitly. "rt classid" is also implemented identical to "meta rtclassid", since it is more logical to have this match in the routing expression going forward. Signed-off-by: Anders K. Pedersen Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 27 ++++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_rt.c | 153 +++++++++++++++++++++++++++++++ 4 files changed, 187 insertions(+) create mode 100644 net/netfilter/nft_rt.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a054ad2c8853..14e5f619167e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -758,6 +758,19 @@ enum nft_meta_keys { NFT_META_PRANDOM, }; +/** + * enum nft_rt_keys - nf_tables routing expression keys + * + * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) + * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 + * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + */ +enum nft_rt_keys { + NFT_RT_CLASSID, + NFT_RT_NEXTHOP4, + NFT_RT_NEXTHOP6, +}; + /** * enum nft_hash_attributes - nf_tables hash expression netlink attributes * @@ -796,6 +809,20 @@ enum nft_meta_attributes { }; #define NFTA_META_MAX (__NFTA_META_MAX - 1) +/** + * enum nft_rt_attributes - nf_tables routing expression netlink attributes + * + * @NFTA_RT_DREG: destination register (NLA_U32) + * @NFTA_RT_KEY: routing data item to load (NLA_U32: nft_rt_keys) + */ +enum nft_rt_attributes { + NFTA_RT_UNSPEC, + NFTA_RT_DREG, + NFTA_RT_KEY, + __NFTA_RT_MAX +}; +#define NFTA_RT_MAX (__NFTA_RT_MAX - 1) + /** * enum nft_ct_keys - nf_tables ct expression keys * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ed00ec114ea2..44410d30d461 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -478,6 +478,12 @@ config NFT_META This option adds the "meta" expression that you can use to match and to set packet metainformation such as the packet mark. +config NFT_RT + tristate "Netfilter nf_tables routing module" + help + This option adds the "rt" expression that you can use to match + packet routing information such as the packet nexthop. + config NFT_NUMGEN tristate "Netfilter nf_tables number generator module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 8edd791743fe..5bbf767672ec 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o +obj-$(CONFIG_NFT_RT) += nft_rt.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c new file mode 100644 index 000000000000..9e5ec1f67020 --- /dev/null +++ b/net/netfilter/nft_rt.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2016 Anders K. Pedersen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_rt { + enum nft_rt_keys key:8; + enum nft_registers dreg:8; +}; + +void nft_rt_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_rt *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = ®s->data[priv->dreg]; + const struct dst_entry *dst; + + dst = skb_dst(skb); + if (!dst) + goto err; + + switch (priv->key) { +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_RT_CLASSID: + *dest = dst->tclassid; + break; +#endif + case NFT_RT_NEXTHOP4: + if (pkt->pf != NFPROTO_IPV4) + goto err; + + *dest = rt_nexthop((const struct rtable *)dst, + ip_hdr(skb)->daddr); + break; + case NFT_RT_NEXTHOP6: + if (pkt->pf != NFPROTO_IPV6) + goto err; + + memcpy(dest, rt6_nexthop((struct rt6_info *)dst, + &ipv6_hdr(skb)->daddr), + sizeof(struct in6_addr)); + break; + default: + WARN_ON(1); + goto err; + } + return; + +err: + regs->verdict.code = NFT_BREAK; +} + +const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { + [NFTA_RT_DREG] = { .type = NLA_U32 }, + [NFTA_RT_KEY] = { .type = NLA_U32 }, +}; + +int nft_rt_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_rt *priv = nft_expr_priv(expr); + unsigned int len; + + if (tb[NFTA_RT_KEY] == NULL || + tb[NFTA_RT_DREG] == NULL) + return -EINVAL; + + priv->key = ntohl(nla_get_be32(tb[NFTA_RT_KEY])); + switch (priv->key) { +#ifdef CONFIG_IP_ROUTE_CLASSID + case NFT_RT_CLASSID: +#endif + case NFT_RT_NEXTHOP4: + len = sizeof(u32); + break; + case NFT_RT_NEXTHOP6: + len = sizeof(struct in6_addr); + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +int nft_rt_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_rt *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_RT_KEY, htonl(priv->key))) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_RT_DREG, priv->dreg)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_rt_type; +static const struct nft_expr_ops nft_rt_get_ops = { + .type = &nft_rt_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)), + .eval = nft_rt_get_eval, + .init = nft_rt_get_init, + .dump = nft_rt_get_dump, +}; + +static struct nft_expr_type nft_rt_type __read_mostly = { + .name = "rt", + .ops = &nft_rt_get_ops, + .policy = nft_rt_policy, + .maxattr = NFTA_RT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_rt_module_init(void) +{ + return nft_register_expr(&nft_rt_type); +} + +static void __exit nft_rt_module_exit(void) +{ + nft_unregister_expr(&nft_rt_type); +} + +module_init(nft_rt_module_init); +module_exit(nft_rt_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Anders K. Pedersen "); +MODULE_ALIAS_NFT_EXPR("rt"); -- cgit v1.2.3-71-gd317 From 06fd3a392bb36ff162d10cb7d5794185b94edb2f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 3 Nov 2016 10:56:17 +0100 Subject: netfilter: deprecate NF_STOP NF_STOP is only used by br_netfilter these days, and it can be emulated with a combination of NF_STOLEN plus explicit call to the ->okfn() function as Florian suggests. To retain binary compatibility with userspace nf_queue application, we have to keep NF_STOP around, so libnetfilter_queue userspace userspace applications still work if they use NF_STOP for some exotic reason. Out of tree modules using NF_STOP would break, but we don't care about those. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 2 +- net/bridge/br_netfilter_hooks.c | 6 ++++-- net/netfilter/core.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index d93f949d1d9a..7550e9176a54 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -13,7 +13,7 @@ #define NF_STOLEN 2 #define NF_QUEUE 3 #define NF_REPEAT 4 -#define NF_STOP 5 +#define NF_STOP 5 /* Deprecated, for userspace nf_queue compatibility. */ #define NF_MAX_VERDICT NF_STOP /* we overload the higher bits for encoding auxiliary data such as the queue diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index d0d66faebe90..7e3645fa6339 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -845,8 +845,10 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) - return NF_STOP; + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + state->okfn(state->net, state->sk, skb); + return NF_STOLEN; + } return NF_ACCEPT; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index cb0232c11bc8..14f97b624f98 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -333,7 +333,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) entry = rcu_dereference(state->hook_entries); next_hook: verdict = nf_iterate(skb, state, &entry); - if (verdict == NF_ACCEPT || verdict == NF_STOP) { + if (verdict == NF_ACCEPT) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { kfree_skb(skb); -- cgit v1.2.3-71-gd317 From 70ecc24841326396a827deb55c3fefac582a729d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 2 Nov 2016 11:02:16 -0400 Subject: ipv4: add IP_RECVFRAGSIZE cmsg The IP stack records the largest fragment of a reassembled packet in IPCB(skb)->frag_max_size. When reading a datagram or raw packet that arrived fragmented, expose the value to allow applications to estimate receive path MTU. Tested: Sent data over a veth pair of which the source has a small mtu. Sent data using netcat, received using a dedicated process. Verified that the cmsg IP_RECVFRAGSIZE is returned only when data arrives fragmented, and in that cases matches the veth mtu. ip link add veth0 type veth peer name veth1 ip netns add from ip netns add to ip link set dev veth1 netns to ip netns exec to ip addr add dev veth1 192.168.10.1/24 ip netns exec to ip link set dev veth1 up ip link set dev veth0 netns from ip netns exec from ip addr add dev veth0 192.168.10.2/24 ip netns exec from ip link set dev veth0 up ip netns exec from ip link set dev veth0 mtu 1300 ip netns exec from ethtool -K veth0 ufo off dd if=/dev/zero bs=1 count=1400 2>/dev/null > payload ip netns exec to ./recv_cmsg_recvfragsize -4 -u -p 6000 & ip netns exec from nc -q 1 -u 192.168.10.1 6000 < payload using github.com/wdebruij/kerneltools/blob/master/tests/recvfragsize.c Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + include/uapi/linux/in.h | 1 + net/ipv4/ip_sockglue.c | 26 ++++++++++++++++++++++++++ 3 files changed, 28 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 236a81034fef..c9cff977a7fb 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -228,6 +228,7 @@ struct inet_sock { #define IP_CMSG_PASSSEC BIT(5) #define IP_CMSG_ORIGDSTADDR BIT(6) #define IP_CMSG_CHECKSUM BIT(7) +#define IP_CMSG_RECVFRAGSIZE BIT(8) /** * sk_to_full_sk - Access to a full socket diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index eaf94919291a..4e557f4e9553 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -117,6 +117,7 @@ struct in_addr { #define IP_NODEFRAG 22 #define IP_CHECKSUM 23 #define IP_BIND_ADDRESS_NO_PORT 24 +#define IP_RECVFRAGSIZE 25 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b8a2d63d1fb8..ecbaae200131 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -97,6 +97,17 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); } +static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb) +{ + int val; + + if (IPCB(skb)->frag_max_size == 0) + return; + + val = IPCB(skb)->frag_max_size; + put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val); +} + static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset) { @@ -218,6 +229,9 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, if (flags & IP_CMSG_CHECKSUM) ip_cmsg_recv_checksum(msg, skb, tlen, offset); + + if (flags & IP_CMSG_RECVFRAGSIZE) + ip_cmsg_recv_fragsize(msg, skb); } EXPORT_SYMBOL(ip_cmsg_recv_offset); @@ -614,6 +628,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_MULTICAST_LOOP: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: + case IP_RECVFRAGSIZE: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -726,6 +741,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, } } break; + case IP_RECVFRAGSIZE: + if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) + goto e_inval; + if (val) + inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE; + else + inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; + break; case IP_TOS: /* This sets both TOS and Precedence */ if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; @@ -1357,6 +1380,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_CHECKSUM: val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; break; + case IP_RECVFRAGSIZE: + val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0; + break; case IP_TOS: val = inet->tos; break; -- cgit v1.2.3-71-gd317 From 0cc0aa614b4c24b21b2492c0a1753035ee8c6edb Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 2 Nov 2016 11:02:17 -0400 Subject: ipv6: add IPV6_RECVFRAGSIZE cmsg When reading a datagram or raw packet that arrived fragmented, expose the maximum fragment size if recorded to allow applications to estimate receive path MTU. At this point, the field is only recorded when ipv6 connection tracking is enabled. A follow-up patch will record this field also in the ipv6 input path. Tested using the test for IP_RECVFRAGSIZE plus ip netns exec to ip addr add dev veth1 fc07::1/64 ip netns exec from ip addr add dev veth0 fc07::2/64 ip netns exec to ./recv_cmsg_recvfragsize -6 -u -p 6000 & ip netns exec from nc -q 1 -u fc07::1 6000 < payload Both with and without enabling connection tracking ip6tables -A INPUT -m state --state NEW -p udp -j LOG Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ipv6.h | 5 +++-- include/uapi/linux/in6.h | 1 + net/ipv6/datagram.c | 5 +++++ net/ipv6/ipv6_sockglue.c | 8 ++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ca1ad9ebbc92..1afb6e8d35c3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -229,8 +229,9 @@ struct ipv6_pinfo { rxflow:1, rxtclass:1, rxpmtu:1, - rxorigdstaddr:1; - /* 2 bits hole */ + rxorigdstaddr:1, + recvfragsize:1; + /* 1 bits hole */ } bits; __u16 all; } rxopt; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index b39ea4f2e701..46444f8fbee4 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -283,6 +283,7 @@ struct in6_flowlabel_req { #define IPV6_RECVORIGDSTADDR IPV6_ORIGDSTADDR #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 +#define IPV6_RECVFRAGSIZE 77 /* * Multicast Routing: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 37874e2f30ed..620c79a0130a 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -715,6 +715,11 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } + if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { + int val = opt->frag_max_size; + + put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); + } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 636ec56f5f50..6c126780fcf2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -868,6 +868,10 @@ pref_skip_coa: np->autoflowlabel = valbool; retv = 0; break; + case IPV6_RECVFRAGSIZE: + np->rxopt.bits.recvfragsize = valbool; + retv = 0; + break; } release_sock(sk); @@ -1310,6 +1314,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->autoflowlabel; break; + case IPV6_RECVFRAGSIZE: + val = np->rxopt.bits.recvfragsize; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3-71-gd317 From 5976c5f45c40588b90dda173ded9010917f8f45e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 3 Nov 2016 13:24:21 +0100 Subject: net/sched: cls_flower: Support matching on SCTP ports Support matching on SCTP ports in the same way that matching on TCP and UDP ports is already supported. Example usage: tc qdisc add dev eth0 ingress tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 ip_proto sctp dst_port 80 \ action drop Signed-off-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 8fd715f806a2..eb94781757ee 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -447,6 +447,11 @@ enum { TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a8fb1ca03b3e..db4cd882a989 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -344,6 +344,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -453,6 +457,13 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); + } else if (key->basic.ip_proto == IPPROTO_SCTP) { + fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC, + &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK, + sizeof(key->tp.src)); + fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, + &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, + sizeof(key->tp.dst)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -897,6 +908,14 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_SCTP && + (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC, + &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK, + sizeof(key->tp.src)) || + fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, + &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, + sizeof(key->tp.dst)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, -- cgit v1.2.3-71-gd317 From 622ec2c9d52405973c9f1ca5116eb1c393adfc7d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:42 +0900 Subject: net: core: add UID to flows, rules, and routes - Define a new FIB rule attributes, FRA_UID_RANGE, to describe a range of UIDs. - Define a RTA_UID attribute for per-UID route lookups and dumps. - Support passing these attributes to and from userspace via rtnetlink. The value INVALID_UID indicates no UID was specified. - Add a UID field to the flow structures. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/fib_rules.h | 9 ++++- include/net/flow.h | 5 +++ include/uapi/linux/fib_rules.h | 6 ++++ include/uapi/linux/rtnetlink.h | 1 + net/core/fib_rules.c | 74 ++++++++++++++++++++++++++++++++++++++++-- net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 11 +++++++ net/ipv6/route.c | 7 ++++ 8 files changed, 111 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 456e4a6006ab..8dbfdf728cd8 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -8,6 +8,11 @@ #include #include +struct fib_kuid_range { + kuid_t start; + kuid_t end; +}; + struct fib_rule { struct list_head list; int iifindex; @@ -30,6 +35,7 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; + struct fib_kuid_range uid_range; struct rcu_head rcu; }; @@ -92,7 +98,8 @@ struct fib_rules_ops { [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ - [FRA_L3MDEV] = { .type = NLA_U8 } + [FRA_L3MDEV] = { .type = NLA_U8 }, \ + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/net/flow.h b/include/net/flow.h index 035aa7716967..51373f3a5e31 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -37,6 +38,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + kuid_t flowic_uid; }; union flowi_uli { @@ -74,6 +76,7 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key +#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -131,6 +134,7 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key +#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -176,6 +180,7 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key +#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 14404b3ebb89..bbf02a63a011 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -29,6 +29,11 @@ struct fib_rule_hdr { __u32 flags; }; +struct fib_rule_uid_range { + __u32 start; + __u32 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -51,6 +56,7 @@ enum { FRA_OIFNAME, FRA_PAD, FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ + FRA_UID_RANGE, /* UID range */ __FRA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5a78be518101..e14377f2ec27 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -318,6 +318,7 @@ enum rtattr_type_t { RTA_ENCAP, RTA_EXPIRES, RTA_PAD, + RTA_UID, __RTA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index be4629c344a6..5de436a73be2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -18,6 +18,11 @@ #include #include +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -429,6 +467,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (rule->l3mdev && rule->table) goto errout_free; + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + goto errout_free; + } + + rule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&rule->uid_range) || + !uid_lte(rule->uid_range.start, rule->uid_range.end)) + goto errout_free; + } else { + rule->uid_range = fib_kuid_range_unset; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -497,6 +550,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + struct fib_kuid_range range; int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) @@ -516,6 +570,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; + if (tb[FRA_UID_RANGE]) { + range = nla_get_kuid_range(tb); + if (!uid_range_set(&range)) + goto errout; + } else { + range = fib_kuid_range_unset; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -552,6 +614,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) continue; + if (uid_range_set(&range) && + (!uid_eq(rule->uid_range.start, range.start) || + !uid_eq(rule->uid_range.end, range.end))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -619,7 +686,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size_64bit(8); /* FRA_TUN_ID */ + + nla_total_size_64bit(8) /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)); if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -679,7 +747,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && - nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c3b80478226e..d93eea8e2409 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -610,6 +610,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4392db83d540..92e59a638d3b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2504,6 +2504,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) + goto nla_put_failure; + error = rt->dst.error; if (rt_is_input_route(rt)) { @@ -2556,6 +2561,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) int mark; struct sk_buff *skb; u32 table_id = RT_TABLE_MAIN; + kuid_t uid; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) @@ -2583,6 +2589,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + if (tb[RTA_UID]) + uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); + else + uid = (iif ? INVALID_UID : current_uid()); memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; @@ -2590,6 +2600,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_tos = rtm->rtm_tos; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; + fl4.flowi4_uid = uid; if (iif) { struct net_device *dev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 947ed1ded026..fdb9c87137bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2797,6 +2797,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, + [RTA_UID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3371,6 +3372,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); + if (tb[RTA_UID]) + fl6.flowi6_uid = make_kuid(current_user_ns(), + nla_get_u32(tb[RTA_UID])); + else + fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); + if (iif) { struct net_device *dev; int flags = 0; -- cgit v1.2.3-71-gd317 From f4d997fd613001e612543339e0275c037f94ffe9 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:39 +0200 Subject: net/sched: cls_flower: Add UDP port to tunnel parameters The current IP tunneling classification supports only IP addresses and key. Enhance UDP based IP tunneling classification parameters by adding UDP src and dst port. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4e7cf38a7750..c4f31666afd2 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -132,6 +132,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ + FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index eb94781757ee..86786d45ee66 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -452,6 +452,11 @@ enum { TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6369b74c8f5b..e8dd09af0d0c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -43,6 +43,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv4_addrs enc_ipv4; struct flow_dissector_key_ipv6_addrs enc_ipv6; }; + struct flow_dissector_key_ports enc_tp; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -155,6 +156,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, } skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); + skb_key.enc_tp.src = key->tp_src; + skb_key.enc_tp.dst = key->tp_dst; } skb_key.indev_ifindex = skb->skb_iif; @@ -348,6 +351,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -500,6 +507,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC, sizeof(key->enc_key_id.keyid)); + fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, + &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, + sizeof(key->enc_tp.src)); + + fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, + &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, + sizeof(key->enc_tp.dst)); + return 0; } @@ -577,6 +592,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); skb_flow_dissector_init(&head->dissector, keys, cnt); } @@ -951,7 +968,17 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, &mask->enc_key_id, TCA_FLOWER_UNSPEC, - sizeof(key->enc_key_id))) + sizeof(key->enc_key_id)) || + fl_dump_key_val(skb, &key->enc_tp.src, + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, + &mask->enc_tp.src, + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, + sizeof(key->enc_tp.src)) || + fl_dump_key_val(skb, &key->enc_tp.dst, + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, + &mask->enc_tp.dst, + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, + sizeof(key->enc_tp.dst))) goto nla_put_failure; nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); -- cgit v1.2.3-71-gd317 From 75bfbca01e48d2d62e8321609ae32aaf6c6fab0e Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:41 +0200 Subject: net/sched: act_tunnel_key: Add UDP dst port option The current tunnel set action supports only IP addresses and key options. Add UDP dst port option. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 1 + net/sched/act_tunnel_key.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 890106ff16e6..84ea55e1076b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -33,6 +33,7 @@ enum { TCA_TUNNEL_KEY_ENC_IPV6_DST, /* struct in6_addr */ TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */ TCA_TUNNEL_KEY_PAD, + TCA_TUNNEL_KEY_ENC_DST_PORT, /* be16 */ __TCA_TUNNEL_KEY_MAX, }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index bd2f63e9cf5c..edc720f11687 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -66,6 +66,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -80,6 +81,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tc_tunnel_key *parm; struct tcf_tunnel_key *t; bool exists = false; + __be16 dst_port = 0; __be64 key_id; int ret = 0; int err; @@ -110,6 +112,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) + dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -119,7 +124,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, - 0, TUNNEL_KEY, key_id, 0); + dst_port, TUNNEL_KEY, + key_id, 0); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; @@ -129,7 +135,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, - 0, TUNNEL_KEY, key_id, 0); + dst_port, TUNNEL_KEY, + key_id, 0); } if (!metadata) { @@ -257,7 +264,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || tunnel_key_dump_addresses(skb, - ¶ms->tcft_enc_metadata->u.tun_info)) + ¶ms->tcft_enc_metadata->u.tun_info) || + nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) goto nla_put_failure; } -- cgit v1.2.3-71-gd317 From 3f11ec045fecf2c0fb21f08f68ebc9237bd1d03c Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Mon, 7 Nov 2016 20:39:24 +0000 Subject: net: l2tp: change L2TP_ATTR_UDP_ZERO_CSUM6_{RX, TX} attribute types The attributes L2TP_ATTR_UDP_ZERO_CSUM6_RX and L2TP_ATTR_UDP_ZERO_CSUM6_TX are used as flags, but is defined as a u8 in a comment. This patch redocuments them as flags. Adding nla_policy entries would break API, so not doing that. CC: Tom Herbert Signed-off-by: Asbjoern Sloth Toennesen Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 4bd27d0270a2..5daa48e2571e 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -124,8 +124,8 @@ enum { L2TP_ATTR_STATS, /* nested */ L2TP_ATTR_IP6_SADDR, /* struct in6_addr */ L2TP_ATTR_IP6_DADDR, /* struct in6_addr */ - L2TP_ATTR_UDP_ZERO_CSUM6_TX, /* u8 */ - L2TP_ATTR_UDP_ZERO_CSUM6_RX, /* u8 */ + L2TP_ATTR_UDP_ZERO_CSUM6_TX, /* flag */ + L2TP_ATTR_UDP_ZERO_CSUM6_RX, /* flag */ L2TP_ATTR_PAD, __L2TP_ATTR_MAX, }; -- cgit v1.2.3-71-gd317 From 1ababeba4a21f3dba3da3523c670b207fb2feb62 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:39 +0100 Subject: ipv6: implement dataplane support for rthdr type 4 (Segment Routing Header) Implement minimal support for processing of SR-enabled packets as described in https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-02. This patch implements the following operations: - Intermediate segment endpoint: incrementation of active segment and rerouting. - Egress for SR-encapsulated packets: decapsulation of outer IPv6 header + SRH and routing of inner packet. - Cleanup flag support for SR-inlined packets: removal of SRH if we are the penultimate segment endpoint. A per-interface sysctl seg6_enabled is provided, to accept/deny SR-enabled packets. Default is deny. This patch does not provide support for HMAC-signed packets. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/linux/seg6.h | 6 ++ include/net/seg6.h | 36 ++++++++++ include/uapi/linux/ipv6.h | 2 + include/uapi/linux/seg6.h | 54 ++++++++++++++ net/ipv6/addrconf.c | 10 +++ net/ipv6/exthdrs.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 284 insertions(+) create mode 100644 include/linux/seg6.h create mode 100644 include/net/seg6.h create mode 100644 include/uapi/linux/seg6.h (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1afb6e8d35c3..68d3f71f0abf 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -64,6 +64,7 @@ struct ipv6_devconf { } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; + __s32 seg6_enabled; struct ctl_table_header *sysctl_header; }; diff --git a/include/linux/seg6.h b/include/linux/seg6.h new file mode 100644 index 000000000000..7a66d2b4c5a6 --- /dev/null +++ b/include/linux/seg6.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_H +#define _LINUX_SEG6_H + +#include + +#endif diff --git a/include/net/seg6.h b/include/net/seg6.h new file mode 100644 index 000000000000..4dd52a7e95f1 --- /dev/null +++ b/include/net/seg6.h @@ -0,0 +1,36 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_H +#define _NET_SEG6_H + +static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, + __be32 to) +{ + __be32 diff[] = { ~from, to }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); +} + +static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, + __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], ~from[2], ~from[3], + to[0], to[1], to[2], to[3], + }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); +} + +#endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 8c2772340c3f..7ff1d654e333 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -39,6 +39,7 @@ struct in6_ifreq { #define IPV6_SRCRT_STRICT 0x01 /* Deprecated; will be removed */ #define IPV6_SRCRT_TYPE_0 0 /* Deprecated; will be removed */ #define IPV6_SRCRT_TYPE_2 2 /* IPv6 type 2 Routing Header */ +#define IPV6_SRCRT_TYPE_4 4 /* Segment Routing with IPv6 */ /* * routing header @@ -178,6 +179,7 @@ enum { DEVCONF_DROP_UNSOLICITED_NA, DEVCONF_KEEP_ADDR_ON_DOWN, DEVCONF_RTR_SOLICIT_MAX_INTERVAL, + DEVCONF_SEG6_ENABLED, DEVCONF_MAX }; diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h new file mode 100644 index 000000000000..c396a8052f73 --- /dev/null +++ b/include/uapi/linux/seg6.h @@ -0,0 +1,54 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_SEG6_H +#define _UAPI_LINUX_SEG6_H + +/* + * SRH + */ +struct ipv6_sr_hdr { + __u8 nexthdr; + __u8 hdrlen; + __u8 type; + __u8 segments_left; + __u8 first_segment; + __u8 flag_1; + __u8 flag_2; + __u8 reserved; + + struct in6_addr segments[0]; +}; + +#define SR6_FLAG1_CLEANUP (1 << 7) +#define SR6_FLAG1_PROTECTED (1 << 6) +#define SR6_FLAG1_OAM (1 << 5) +#define SR6_FLAG1_ALERT (1 << 4) +#define SR6_FLAG1_HMAC (1 << 3) + +#define SR6_TLV_INGRESS 1 +#define SR6_TLV_EGRESS 2 +#define SR6_TLV_OPAQUE 3 +#define SR6_TLV_PADDING 4 +#define SR6_TLV_HMAC 5 + +#define sr_has_cleanup(srh) ((srh)->flag_1 & SR6_FLAG1_CLEANUP) +#define sr_has_hmac(srh) ((srh)->flag_1 & SR6_FLAG1_HMAC) + +struct sr6_tlv { + __u8 type; + __u8 len; + __u8 data[0]; +}; + +#endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 060dd9922018..2ac6cb460af0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -238,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, .keep_addr_on_down = 0, + .seg6_enabled = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -284,6 +285,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .use_oif_addrs_only = 0, .ignore_routes_with_linkdown = 0, .keep_addr_on_down = 0, + .seg6_enabled = 0, }; /* Check if a valid qdisc is available */ @@ -4944,6 +4946,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast; array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; + array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled; } static inline size_t inet6_ifla6_size(void) @@ -6035,6 +6038,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, + { + .procname = "seg6_enabled", + .data = &ipv6_devconf.seg6_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 139ceb68bd37..b8ba3961ff8a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -47,6 +47,8 @@ #if IS_ENABLED(CONFIG_IPV6_MIP6) #include #endif +#include +#include #include @@ -286,6 +288,175 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) return -1; } +static void seg6_update_csum(struct sk_buff *skb) +{ + struct ipv6_sr_hdr *hdr; + struct in6_addr *addr; + __be32 from, to; + + /* srh is at transport offset and seg_left is already decremented + * but daddr is not yet updated with next segment + */ + + hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + addr = hdr->segments + hdr->segments_left; + + hdr->segments_left++; + from = *(__be32 *)hdr; + + hdr->segments_left--; + to = *(__be32 *)hdr; + + /* update skb csum with diff resulting from seg_left decrement */ + + update_csum_diff4(skb, from, to); + + /* compute csum diff between current and next segment and update */ + + update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), + (__be32 *)addr); +} + +static int ipv6_srh_rcv(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + struct ipv6_sr_hdr *hdr; + struct inet6_dev *idev; + struct in6_addr *addr; + bool cleanup = false; + int accept_seg6; + + hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + + idev = __in6_dev_get(skb->dev); + + accept_seg6 = net->ipv6.devconf_all->seg6_enabled; + if (accept_seg6 > idev->cnf.seg6_enabled) + accept_seg6 = idev->cnf.seg6_enabled; + + if (!accept_seg6) { + kfree_skb(skb); + return -1; + } + +looped_back: + if (hdr->segments_left > 0) { + if (hdr->nexthdr != NEXTHDR_IPV6 && hdr->segments_left == 1 && + sr_has_cleanup(hdr)) + cleanup = true; + } else { + if (hdr->nexthdr == NEXTHDR_IPV6) { + int offset = (hdr->hdrlen + 1) << 3; + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + + if (!pskb_pull(skb, offset)) { + kfree_skb(skb); + return -1; + } + skb_postpull_rcsum(skb, skb_transport_header(skb), + offset); + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + __skb_tunnel_rx(skb, skb->dev, net); + + netif_rx(skb); + return -1; + } + + opt->srcrt = skb_network_header_len(skb); + opt->lastopt = opt->srcrt; + skb->transport_header += (hdr->hdrlen + 1) << 3; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); + + return 1; + } + + if (hdr->segments_left >= (hdr->hdrlen >> 1)) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); + kfree_skb(skb); + return -1; + } + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -1; + } + } + + hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); + + hdr->segments_left--; + addr = hdr->segments + hdr->segments_left; + + skb_push(skb, sizeof(struct ipv6hdr)); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + seg6_update_csum(skb); + + ipv6_hdr(skb)->daddr = *addr; + + if (cleanup) { + int srhlen = (hdr->hdrlen + 1) << 3; + int nh = hdr->nexthdr; + + skb_pull_rcsum(skb, sizeof(struct ipv6hdr) + srhlen); + memmove(skb_network_header(skb) + srhlen, + skb_network_header(skb), + (unsigned char *)hdr - skb_network_header(skb)); + skb->network_header += srhlen; + ipv6_hdr(skb)->nexthdr = nh; + ipv6_hdr(skb)->payload_len = htons(skb->len - + sizeof(struct ipv6hdr)); + skb_push_rcsum(skb, sizeof(struct ipv6hdr)); + } + + skb_dst_drop(skb); + + ip6_route_input(skb); + + if (skb_dst(skb)->error) { + dst_input(skb); + return -1; + } + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (ipv6_hdr(skb)->hop_limit <= 1) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0); + kfree_skb(skb); + return -1; + } + ipv6_hdr(skb)->hop_limit--; + + /* be sure that srh is still present before reinjecting */ + if (!cleanup) { + skb_pull(skb, sizeof(struct ipv6hdr)); + goto looped_back; + } + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + } + + dst_input(skb); + + return -1; +} + /******************************** Routing header. ********************************/ @@ -326,6 +497,10 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) return -1; } + /* segment routing */ + if (hdr->type == IPV6_SRCRT_TYPE_4) + return ipv6_srh_rcv(skb); + looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { -- cgit v1.2.3-71-gd317 From 915d7e5e5930b4f01d0971d93b9b25ed17d221aa Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:40 +0100 Subject: ipv6: sr: add code base for control plane support of SR-IPv6 This patch adds the necessary hooks and structures to provide support for SR-IPv6 control plane, essentially the Generic Netlink commands that will be used for userspace control over the Segment Routing kernel structures. The genetlink commands provide control over two different structures: tunnel source and HMAC data. The tunnel source is the source address that will be used by default when encapsulating packets into an outer IPv6 header + SRH. If the tunnel source is set to :: then an address of the outgoing interface will be selected as the source. The HMAC commands currently just return ENOTSUPP and will be implemented in a future patch. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/seg6_genl.h | 6 ++ include/net/netns/ipv6.h | 1 + include/net/seg6.h | 16 +++ include/uapi/linux/seg6_genl.h | 32 ++++++ net/ipv6/Makefile | 2 +- net/ipv6/af_inet6.c | 9 +- net/ipv6/seg6.c | 214 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 278 insertions(+), 2 deletions(-) create mode 100644 include/linux/seg6_genl.h create mode 100644 include/uapi/linux/seg6_genl.h create mode 100644 net/ipv6/seg6.c (limited to 'include/uapi/linux') diff --git a/include/linux/seg6_genl.h b/include/linux/seg6_genl.h new file mode 100644 index 000000000000..d6c3fb4f3734 --- /dev/null +++ b/include/linux/seg6_genl.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_GENL_H +#define _LINUX_SEG6_GENL_H + +#include + +#endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 10d0848f5b8a..de7745e2edcc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -85,6 +85,7 @@ struct netns_ipv6 { #endif atomic_t dev_addr_genid; atomic_t fib6_sernum; + struct seg6_pernet_data *seg6_data; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/include/net/seg6.h b/include/net/seg6.h index 4dd52a7e95f1..7c7b8ed39661 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -14,6 +14,9 @@ #ifndef _NET_SEG6_H #define _NET_SEG6_H +#include +#include + static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) { @@ -33,4 +36,17 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } +struct seg6_pernet_data { + struct mutex lock; + struct in6_addr __rcu *tun_src; +}; + +static inline struct seg6_pernet_data *seg6_pernet(struct net *net) +{ + return net->ipv6.seg6_data; +} + +extern int seg6_init(void); +extern void seg6_exit(void); + #endif diff --git a/include/uapi/linux/seg6_genl.h b/include/uapi/linux/seg6_genl.h new file mode 100644 index 000000000000..fcf1c60d7df3 --- /dev/null +++ b/include/uapi/linux/seg6_genl.h @@ -0,0 +1,32 @@ +#ifndef _UAPI_LINUX_SEG6_GENL_H +#define _UAPI_LINUX_SEG6_GENL_H + +#define SEG6_GENL_NAME "SEG6" +#define SEG6_GENL_VERSION 0x1 + +enum { + SEG6_ATTR_UNSPEC, + SEG6_ATTR_DST, + SEG6_ATTR_DSTLEN, + SEG6_ATTR_HMACKEYID, + SEG6_ATTR_SECRET, + SEG6_ATTR_SECRETLEN, + SEG6_ATTR_ALGID, + SEG6_ATTR_HMACINFO, + __SEG6_ATTR_MAX, +}; + +#define SEG6_ATTR_MAX (__SEG6_ATTR_MAX - 1) + +enum { + SEG6_CMD_UNSPEC, + SEG6_CMD_SETHMAC, + SEG6_CMD_DUMPHMAC, + SEG6_CMD_SET_TUNSRC, + SEG6_CMD_GET_TUNSRC, + __SEG6_CMD_MAX, +}; + +#define SEG6_CMD_MAX (__SEG6_CMD_MAX - 1) + +#endif diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index c174ccb340a1..c92010d62afc 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o + udp_offload.o seg6.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c86911b63f8a..d424f3a3737a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -61,6 +61,7 @@ #include #endif #include +#include #include #include @@ -991,6 +992,10 @@ static int __init inet6_init(void) if (err) goto calipso_fail; + err = seg6_init(); + if (err) + goto seg6_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -1001,8 +1006,10 @@ out: #ifdef CONFIG_SYSCTL sysctl_fail: - calipso_exit(); + seg6_exit(); #endif +seg6_fail: + calipso_exit(); calipso_fail: pingv6_exit(); pingv6_fail: diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c new file mode 100644 index 000000000000..e246b0ba12ac --- /dev/null +++ b/net/ipv6/seg6.c @@ -0,0 +1,214 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static struct genl_family seg6_genl_family; + +static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { + [SEG6_ATTR_DST] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [SEG6_ATTR_DSTLEN] = { .type = NLA_S32, }, + [SEG6_ATTR_HMACKEYID] = { .type = NLA_U32, }, + [SEG6_ATTR_SECRET] = { .type = NLA_BINARY, }, + [SEG6_ATTR_SECRETLEN] = { .type = NLA_U8, }, + [SEG6_ATTR_ALGID] = { .type = NLA_U8, }, + [SEG6_ATTR_HMACINFO] = { .type = NLA_NESTED, }, +}; + +static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct in6_addr *val, *t_old, *t_new; + struct seg6_pernet_data *sdata; + + sdata = seg6_pernet(net); + + if (!info->attrs[SEG6_ATTR_DST]) + return -EINVAL; + + val = nla_data(info->attrs[SEG6_ATTR_DST]); + t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); + + mutex_lock(&sdata->lock); + + t_old = sdata->tun_src; + rcu_assign_pointer(sdata->tun_src, t_new); + + mutex_unlock(&sdata->lock); + + synchronize_net(); + kfree(t_old); + + return 0; +} + +static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct in6_addr *tun_src; + struct sk_buff *msg; + void *hdr; + + msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC); + if (!hdr) + goto free_msg; + + rcu_read_lock(); + tun_src = rcu_dereference(seg6_pernet(net)->tun_src); + + if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src)) + goto nla_put_failure; + + rcu_read_unlock(); + + genlmsg_end(msg, hdr); + genlmsg_reply(msg, info); + + return 0; + +nla_put_failure: + rcu_read_unlock(); + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -ENOMEM; +} + +static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -ENOTSUPP; +} + +static int __net_init seg6_net_init(struct net *net) +{ + struct seg6_pernet_data *sdata; + + sdata = kzalloc(sizeof(*sdata), GFP_KERNEL); + if (!sdata) + return -ENOMEM; + + mutex_init(&sdata->lock); + + sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL); + if (!sdata->tun_src) { + kfree(sdata); + return -ENOMEM; + } + + net->ipv6.seg6_data = sdata; + + return 0; +} + +static void __net_exit seg6_net_exit(struct net *net) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + + kfree(sdata->tun_src); + kfree(sdata); +} + +static struct pernet_operations ip6_segments_ops = { + .init = seg6_net_init, + .exit = seg6_net_exit, +}; + +static const struct genl_ops seg6_genl_ops[] = { + { + .cmd = SEG6_CMD_SETHMAC, + .doit = seg6_genl_sethmac, + .policy = seg6_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = SEG6_CMD_DUMPHMAC, + .dumpit = seg6_genl_dumphmac, + .policy = seg6_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = SEG6_CMD_SET_TUNSRC, + .doit = seg6_genl_set_tunsrc, + .policy = seg6_genl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = SEG6_CMD_GET_TUNSRC, + .doit = seg6_genl_get_tunsrc, + .policy = seg6_genl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family seg6_genl_family __ro_after_init = { + .hdrsize = 0, + .name = SEG6_GENL_NAME, + .version = SEG6_GENL_VERSION, + .maxattr = SEG6_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = seg6_genl_ops, + .n_ops = ARRAY_SIZE(seg6_genl_ops), + .module = THIS_MODULE, +}; + +int __init seg6_init(void) +{ + int err = -ENOMEM; + + err = genl_register_family(&seg6_genl_family); + if (err) + goto out; + + err = register_pernet_subsys(&ip6_segments_ops); + if (err) + goto out_unregister_genl; + + pr_info("Segment Routing with IPv6\n"); + +out: + return err; +out_unregister_genl: + genl_unregister_family(&seg6_genl_family); + goto out; +} + +void seg6_exit(void) +{ + unregister_pernet_subsys(&ip6_segments_ops); + genl_unregister_family(&seg6_genl_family); +} -- cgit v1.2.3-71-gd317 From 6c8702c60b88651072460f3f4026c7dfe2521d12 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:41 +0100 Subject: ipv6: sr: add support for SRH encapsulation and injection with lwtunnels This patch creates a new type of interfaceless lightweight tunnel (SEG6), enabling the encapsulation and injection of SRH within locally emitted packets and forwarded packets. >From a configuration viewpoint, a seg6 tunnel would be configured as follows: ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0 Any packet whose destination address is fc00::1 would thus be encapsulated within an outer IPv6 header containing the SRH with three segments, and would actually be routed to the first segment of the list. If `mode inline' was specified instead of `mode encap', then the SRH would be directly inserted after the IPv6 header without outer encapsulation. The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This feature was made configurable because direct header insertion may break several mechanisms such as PMTUD or IPSec AH. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/seg6_iptunnel.h | 6 + include/net/seg6.h | 6 + include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/seg6_iptunnel.h | 44 ++++ net/core/lwtunnel.c | 2 + net/ipv6/Kconfig | 12 ++ net/ipv6/Makefile | 2 +- net/ipv6/seg6.c | 44 ++++ net/ipv6/seg6_iptunnel.c | 410 +++++++++++++++++++++++++++++++++++++ 9 files changed, 526 insertions(+), 1 deletion(-) create mode 100644 include/linux/seg6_iptunnel.h create mode 100644 include/uapi/linux/seg6_iptunnel.h create mode 100644 net/ipv6/seg6_iptunnel.c (limited to 'include/uapi/linux') diff --git a/include/linux/seg6_iptunnel.h b/include/linux/seg6_iptunnel.h new file mode 100644 index 000000000000..5377cf6a5a02 --- /dev/null +++ b/include/linux/seg6_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_IPTUNNEL_H +#define _LINUX_SEG6_IPTUNNEL_H + +#include + +#endif diff --git a/include/net/seg6.h b/include/net/seg6.h index 7c7b8ed39661..ff5da0ce83e9 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -16,6 +16,8 @@ #include #include +#include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) @@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net) extern int seg6_init(void); extern void seg6_exit(void); +extern int seg6_iptunnel_init(void); +extern void seg6_iptunnel_exit(void); + +extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); #endif diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index a478fe80e203..453cc6215bfd 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -9,6 +9,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_IP, LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_IP6, + LWTUNNEL_ENCAP_SEG6, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h new file mode 100644 index 000000000000..0f7dbd280a9c --- /dev/null +++ b/include/uapi/linux/seg6_iptunnel.h @@ -0,0 +1,44 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_SEG6_IPTUNNEL_H +#define _UAPI_LINUX_SEG6_IPTUNNEL_H + +enum { + SEG6_IPTUNNEL_UNSPEC, + SEG6_IPTUNNEL_SRH, + __SEG6_IPTUNNEL_MAX, +}; +#define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1) + +struct seg6_iptunnel_encap { + int mode; + struct ipv6_sr_hdr srh[0]; +}; + +#define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3)) + +enum { + SEG6_IPTUN_MODE_INLINE, + SEG6_IPTUN_MODE_ENCAP, +}; + +static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP); + + return ((tuninfo->srh->hdrlen + 1) << 3) + + (encap * sizeof(struct ipv6hdr)); +} + +#endif diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 88fd64250b02..03976e939818 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -39,6 +39,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; + case LWTUNNEL_ENCAP_SEG6: + return "SEG6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2343e4f2e0bf..1123a001d729 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -289,4 +289,16 @@ config IPV6_PIMSM_V2 Support for IPv6 PIM multicast routing protocol PIM-SMv2. If unsure, say N. +config IPV6_SEG6_INLINE + bool "IPv6: direct Segment Routing Header insertion " + depends on IPV6 + ---help--- + Support for direct insertion of the Segment Routing Header, + also known as inline mode. Be aware that direct insertion of + extension headers (as opposed to encapsulation) may break + multiple mechanisms such as PMTUD or IPSec AH. Use this feature + only if you know exactly what you are doing. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index c92010d62afc..59ee92fb3689 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o + udp_offload.o seg6.o seg6_iptunnel.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index e246b0ba12ac..9c78053e67e0 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -26,6 +26,43 @@ #include #include +bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) +{ + int trailing; + unsigned int tlv_offset; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (((srh->hdrlen + 1) << 3) != len) + return false; + + if (srh->segments_left != srh->first_segment) + return false; + + tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); + + trailing = len - tlv_offset; + if (trailing < 0) + return false; + + while (trailing) { + struct sr6_tlv *tlv; + unsigned int tlv_len; + + tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset); + tlv_len = sizeof(*tlv) + tlv->len; + + trailing -= tlv_len; + if (trailing < 0) + return false; + + tlv_offset += tlv_len; + } + + return true; +} + static struct genl_family seg6_genl_family; static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { @@ -198,10 +235,16 @@ int __init seg6_init(void) if (err) goto out_unregister_genl; + err = seg6_iptunnel_init(); + if (err) + goto out_unregister_pernet; + pr_info("Segment Routing with IPv6\n"); out: return err; +out_unregister_pernet: + unregister_pernet_subsys(&ip6_segments_ops); out_unregister_genl: genl_unregister_family(&seg6_genl_family); goto out; @@ -209,6 +252,7 @@ out_unregister_genl: void seg6_exit(void) { + seg6_iptunnel_exit(); unregister_pernet_subsys(&ip6_segments_ops); genl_unregister_family(&seg6_genl_family); } diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c new file mode 100644 index 000000000000..39762b2fa7a2 --- /dev/null +++ b/net/ipv6/seg6_iptunnel.c @@ -0,0 +1,410 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_DST_CACHE +#include +#endif + +struct seg6_lwt { +#ifdef CONFIG_DST_CACHE + struct dst_cache cache; +#endif + struct seg6_iptunnel_encap tuninfo[0]; +}; + +static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct seg6_lwt *)lwt->data; +} + +static inline struct seg6_iptunnel_encap * +seg6_encap_lwtunnel(struct lwtunnel_state *lwt) +{ + return seg6_lwt_lwtunnel(lwt)->tuninfo; +} + +static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = { + [SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY }, +}; + +int nla_put_srh(struct sk_buff *skb, int attrtype, + struct seg6_iptunnel_encap *tuninfo) +{ + struct seg6_iptunnel_encap *data; + struct nlattr *nla; + int len; + + len = SEG6_IPTUN_ENCAP_SIZE(tuninfo); + + nla = nla_reserve(skb, attrtype, len); + if (!nla) + return -EMSGSIZE; + + data = nla_data(nla); + memcpy(data, tuninfo, len); + + return 0; +} + +static void set_tun_src(struct net *net, struct net_device *dev, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + struct in6_addr *tun_src; + + rcu_read_lock(); + + tun_src = rcu_dereference(sdata->tun_src); + + if (!ipv6_addr_any(tun_src)) { + memcpy(saddr, tun_src, sizeof(struct in6_addr)); + } else { + ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC, + saddr); + } + + rcu_read_unlock(); +} + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct ipv6hdr *hdr, *inner_hdr; + struct ipv6_sr_hdr *isrh; + int hdrlen, tot_len, err; + + hdrlen = (osrh->hdrlen + 1) << 3; + tot_len = hdrlen + sizeof(*hdr); + + err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC); + if (unlikely(err)) + return err; + + inner_hdr = ipv6_hdr(skb); + + skb_push(skb, tot_len); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + hdr = ipv6_hdr(skb); + + /* inherit tc, flowlabel and hlim + * hlim will be decremented in ip6_forward() afterwards and + * decapsulation will overwrite inner hlim with outer hlim + */ + ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), + ip6_flowlabel(inner_hdr)); + hdr->hop_limit = inner_hdr->hop_limit; + hdr->nexthdr = NEXTHDR_ROUTING; + + isrh = (void *)hdr + sizeof(*hdr); + memcpy(isrh, osrh, hdrlen); + + isrh->nexthdr = NEXTHDR_IPV6; + + hdr->daddr = isrh->segments[isrh->first_segment]; + set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr); + + skb_postpush_rcsum(skb, hdr, tot_len); + + return 0; +} + +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +#ifdef CONFIG_IPV6_SEG6_INLINE +static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + struct ipv6hdr *hdr, *oldhdr; + struct ipv6_sr_hdr *isrh; + int hdrlen, err; + + hdrlen = (osrh->hdrlen + 1) << 3; + + err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC); + if (unlikely(err)) + return err; + + oldhdr = ipv6_hdr(skb); + + skb_pull(skb, sizeof(struct ipv6hdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), + sizeof(struct ipv6hdr)); + + skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + + memmove(hdr, oldhdr, sizeof(*hdr)); + + isrh = (void *)hdr + sizeof(*hdr); + memcpy(isrh, osrh, hdrlen); + + isrh->nexthdr = hdr->nexthdr; + hdr->nexthdr = NEXTHDR_ROUTING; + + isrh->segments[0] = hdr->daddr; + hdr->daddr = isrh->segments[isrh->first_segment]; + + skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); + + return 0; +} +#endif + +static int seg6_do_srh(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct seg6_iptunnel_encap *tinfo; + int err = 0; + + tinfo = seg6_encap_lwtunnel(dst->lwtstate); + + if (likely(!skb->encapsulation)) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + switch (tinfo->mode) { +#ifdef CONFIG_IPV6_SEG6_INLINE + case SEG6_IPTUN_MODE_INLINE: + err = seg6_do_srh_inline(skb, tinfo->srh); + skb_reset_inner_headers(skb); + break; +#endif + case SEG6_IPTUN_MODE_ENCAP: + err = seg6_do_srh_encap(skb, tinfo->srh); + break; + } + + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + skb_set_inner_protocol(skb, skb->protocol); + + return 0; +} + +int seg6_input(struct sk_buff *skb) +{ + int err; + + err = seg6_do_srh(skb); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + skb_dst_drop(skb); + ip6_route_input(skb); + + return dst_input(skb); +} + +int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; + struct seg6_lwt *slwt; + int err = -EINVAL; + + err = seg6_do_srh(skb); + if (unlikely(err)) + goto drop; + + slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + +#ifdef CONFIG_DST_CACHE + dst = dst_cache_get(&slwt->cache); +#endif + + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + dst_release(dst); + goto drop; + } + +#ifdef CONFIG_DST_CACHE + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); +#endif + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return dst_output(net, sk, skb); +drop: + kfree_skb(skb); + return err; +} + +static int seg6_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1]; + struct seg6_iptunnel_encap *tuninfo; + struct lwtunnel_state *newts; + int tuninfo_len, min_size; + struct seg6_lwt *slwt; + int err; + + err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, + seg6_iptunnel_policy); + + if (err < 0) + return err; + + if (!tb[SEG6_IPTUNNEL_SRH]) + return -EINVAL; + + tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]); + tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]); + + /* tuninfo must contain at least the iptunnel encap structure, + * the SRH and one segment + */ + min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) + + sizeof(struct in6_addr); + if (tuninfo_len < min_size) + return -EINVAL; + + switch (tuninfo->mode) { +#ifdef CONFIG_IPV6_SEG6_INLINE + case SEG6_IPTUN_MODE_INLINE: + break; +#endif + case SEG6_IPTUN_MODE_ENCAP: + break; + default: + return -EINVAL; + } + + /* verify that SRH is consistent */ + if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo))) + return -EINVAL; + + newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); + if (!newts) + return -ENOMEM; + + slwt = seg6_lwt_lwtunnel(newts); + +#ifdef CONFIG_DST_CACHE + err = dst_cache_init(&slwt->cache, GFP_KERNEL); + if (err) { + kfree(newts); + return err; + } +#endif + + memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); + + newts->type = LWTUNNEL_ENCAP_SEG6; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | + LWTUNNEL_STATE_INPUT_REDIRECT; + newts->headroom = seg6_lwt_headroom(tuninfo); + + *ts = newts; + + return 0; +} + +#ifdef CONFIG_DST_CACHE +static void seg6_destroy_state(struct lwtunnel_state *lwt) +{ + dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); +} +#endif + +static int seg6_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); + + if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo)) + return -EMSGSIZE; + + return 0; +} + +static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate); + + return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo)); +} + +static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a); + struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b); + int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr); + + if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr)) + return 1; + + return memcmp(a_hdr, b_hdr, len); +} + +static const struct lwtunnel_encap_ops seg6_iptun_ops = { + .build_state = seg6_build_state, +#ifdef CONFIG_DST_CACHE + .destroy_state = seg6_destroy_state, +#endif + .output = seg6_output, + .input = seg6_input, + .fill_encap = seg6_fill_encap_info, + .get_encap_size = seg6_encap_nlsize, + .cmp_encap = seg6_encap_cmp, +}; + +int __init seg6_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); +} + +void seg6_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6); +} -- cgit v1.2.3-71-gd317 From bf355b8d2c30a289232042cacc1cfaea4923936c Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:42 +0100 Subject: ipv6: sr: add core files for SR HMAC support This patch adds the necessary functions to compute and check the HMAC signature of an SR-enabled packet. Two HMAC algorithms are supported: hmac(sha1) and hmac(sha256). In order to avoid dynamic memory allocation for each HMAC computation, a per-cpu ring buffer is allocated for this purpose. A new per-interface sysctl called seg6_require_hmac is added, allowing a user-defined policy for processing HMAC-signed SR-enabled packets. A value of -1 means that the HMAC field will always be ignored. A value of 0 means that if an HMAC field is present, its validity will be enforced (the packet is dropped is the signature is incorrect). Finally, a value of 1 means that any SR-enabled packet that does not contain an HMAC signature or whose signature is incorrect will be dropped. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 + include/linux/seg6_hmac.h | 6 + include/net/seg6.h | 4 + include/net/seg6_hmac.h | 62 ++++++ include/uapi/linux/ipv6.h | 1 + include/uapi/linux/seg6_hmac.h | 21 ++ net/ipv6/Kconfig | 12 + net/ipv6/Makefile | 1 + net/ipv6/addrconf.c | 18 ++ net/ipv6/seg6_hmac.c | 484 +++++++++++++++++++++++++++++++++++++++++ 10 files changed, 612 insertions(+) create mode 100644 include/linux/seg6_hmac.h create mode 100644 include/net/seg6_hmac.h create mode 100644 include/uapi/linux/seg6_hmac.h create mode 100644 net/ipv6/seg6_hmac.c (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 68d3f71f0abf..93756585521f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -65,6 +65,9 @@ struct ipv6_devconf { __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; +#ifdef CONFIG_IPV6_SEG6_HMAC + __s32 seg6_require_hmac; +#endif struct ctl_table_header *sysctl_header; }; diff --git a/include/linux/seg6_hmac.h b/include/linux/seg6_hmac.h new file mode 100644 index 000000000000..da437ebdc6cd --- /dev/null +++ b/include/linux/seg6_hmac.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_HMAC_H +#define _LINUX_SEG6_HMAC_H + +#include + +#endif diff --git a/include/net/seg6.h b/include/net/seg6.h index ff5da0ce83e9..4e0357517d79 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -18,6 +18,7 @@ #include #include #include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) @@ -41,6 +42,9 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, struct seg6_pernet_data { struct mutex lock; struct in6_addr __rcu *tun_src; +#ifdef CONFIG_IPV6_SEG6_HMAC + struct rhashtable hmac_infos; +#endif }; static inline struct seg6_pernet_data *seg6_pernet(struct net *net) diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h new file mode 100644 index 000000000000..69c3a106056b --- /dev/null +++ b/include/net/seg6_hmac.h @@ -0,0 +1,62 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_HMAC_H +#define _NET_SEG6_HMAC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SEG6_HMAC_MAX_DIGESTSIZE 160 +#define SEG6_HMAC_RING_SIZE 256 + +struct seg6_hmac_info { + struct rhash_head node; + struct rcu_head rcu; + + u32 hmackeyid; + char secret[SEG6_HMAC_SECRET_LEN]; + u8 slen; + u8 alg_id; +}; + +struct seg6_hmac_algo { + u8 alg_id; + char name[64]; + struct crypto_shash * __percpu *tfms; + struct shash_desc * __percpu *shashs; +}; + +extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, + struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, + u8 *output); +extern struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key); +extern int seg6_hmac_info_add(struct net *net, u32 key, + struct seg6_hmac_info *hinfo); +extern int seg6_hmac_info_del(struct net *net, u32 key); +extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, + struct ipv6_sr_hdr *srh); +extern bool seg6_hmac_validate_skb(struct sk_buff *skb); +extern int seg6_hmac_init(void); +extern void seg6_hmac_exit(void); +extern int seg6_hmac_net_init(struct net *net); +extern void seg6_hmac_net_exit(struct net *net); + +#endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 7ff1d654e333..53561be1ac21 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -180,6 +180,7 @@ enum { DEVCONF_KEEP_ADDR_ON_DOWN, DEVCONF_RTR_SOLICIT_MAX_INTERVAL, DEVCONF_SEG6_ENABLED, + DEVCONF_SEG6_REQUIRE_HMAC, DEVCONF_MAX }; diff --git a/include/uapi/linux/seg6_hmac.h b/include/uapi/linux/seg6_hmac.h new file mode 100644 index 000000000000..b652dfd51bc5 --- /dev/null +++ b/include/uapi/linux/seg6_hmac.h @@ -0,0 +1,21 @@ +#ifndef _UAPI_LINUX_SEG6_HMAC_H +#define _UAPI_LINUX_SEG6_HMAC_H + +#include + +#define SEG6_HMAC_SECRET_LEN 64 +#define SEG6_HMAC_FIELD_LEN 32 + +struct sr6_tlv_hmac { + struct sr6_tlv tlvhdr; + __u16 reserved; + __be32 hmackeyid; + __u8 hmac[SEG6_HMAC_FIELD_LEN]; +}; + +enum { + SEG6_HMAC_ALGO_SHA1 = 1, + SEG6_HMAC_ALGO_SHA256 = 2, +}; + +#endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1123a001d729..0f00811a785f 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -301,4 +301,16 @@ config IPV6_SEG6_INLINE If unsure, say N. +config IPV6_SEG6_HMAC + bool "IPv6: Segment Routing HMAC support" + depends on IPV6 + select CRYPTO_HMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + ---help--- + Support for HMAC signature generation and verification + of SR-enabled packets. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 59ee92fb3689..129cad2ba960 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o obj-$(CONFIG_IPV6_FOU) += fou6.o +obj-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2ac6cb460af0..86219c0a0104 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -239,6 +239,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ignore_routes_with_linkdown = 0, .keep_addr_on_down = 0, .seg6_enabled = 0, +#ifdef CONFIG_IPV6_SEG6_HMAC + .seg6_require_hmac = 0, +#endif }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -286,6 +289,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ignore_routes_with_linkdown = 0, .keep_addr_on_down = 0, .seg6_enabled = 0, +#ifdef CONFIG_IPV6_SEG6_HMAC + .seg6_require_hmac = 0, +#endif }; /* Check if a valid qdisc is available */ @@ -4947,6 +4953,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na; array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down; array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled; +#ifdef CONFIG_IPV6_SEG6_HMAC + array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; +#endif } static inline size_t inet6_ifla6_size(void) @@ -6045,6 +6054,15 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_IPV6_SEG6_HMAC + { + .procname = "seg6_require_hmac", + .data = &ipv6_devconf.seg6_require_hmac, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { /* sentinel */ } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c new file mode 100644 index 000000000000..ef1c8a46e7ac --- /dev/null +++ b/net/ipv6/seg6_hmac.c @@ -0,0 +1,484 @@ +/* + * SR-IPv6 implementation -- HMAC functions + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static char * __percpu *hmac_ring; + +static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct seg6_hmac_info *hinfo = obj; + + return (hinfo->hmackeyid != *(__u32 *)arg->key); +} + +static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo) +{ + kfree_rcu(hinfo, rcu); +} + +static void seg6_free_hi(void *ptr, void *arg) +{ + struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr; + + if (hinfo) + seg6_hinfo_release(hinfo); +} + +static const struct rhashtable_params rht_params = { + .head_offset = offsetof(struct seg6_hmac_info, node), + .key_offset = offsetof(struct seg6_hmac_info, hmackeyid), + .key_len = sizeof(u32), + .automatic_shrinking = true, + .obj_cmpfn = seg6_hmac_cmpfn, +}; + +static struct seg6_hmac_algo hmac_algos[] = { + { + .alg_id = SEG6_HMAC_ALGO_SHA1, + .name = "hmac(sha1)", + }, + { + .alg_id = SEG6_HMAC_ALGO_SHA256, + .name = "hmac(sha256)", + }, +}; + +static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) +{ + struct sr6_tlv_hmac *tlv; + + if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5) + return NULL; + + if (!sr_has_hmac(srh)) + return NULL; + + tlv = (struct sr6_tlv_hmac *) + ((char *)srh + ((srh->hdrlen + 1) << 3) - 40); + + if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38) + return NULL; + + return tlv; +} + +static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) +{ + struct seg6_hmac_algo *algo; + int i, alg_count; + + alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + for (i = 0; i < alg_count; i++) { + algo = &hmac_algos[i]; + if (algo->alg_id == alg_id) + return algo; + } + + return NULL; +} + +static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, + u8 *output, int outlen) +{ + struct seg6_hmac_algo *algo; + struct crypto_shash *tfm; + struct shash_desc *shash; + int ret, dgsize; + + algo = __hmac_get_algo(hinfo->alg_id); + if (!algo) + return -ENOENT; + + tfm = *this_cpu_ptr(algo->tfms); + + dgsize = crypto_shash_digestsize(tfm); + if (dgsize > outlen) { + pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", + dgsize, outlen); + return -ENOMEM; + } + + ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); + if (ret < 0) { + pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); + goto failed; + } + + shash = *this_cpu_ptr(algo->shashs); + shash->tfm = tfm; + + ret = crypto_shash_digest(shash, text, psize, output); + if (ret < 0) { + pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); + goto failed; + } + + return dgsize; + +failed: + return ret; +} + +int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, + struct in6_addr *saddr, u8 *output) +{ + __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); + u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; + int plen, i, dgsize, wrsize; + char *ring, *off; + + /* a 160-byte buffer for digest output allows to store highest known + * hash function (RadioGatun) with up to 1216 bits + */ + + /* saddr(16) + first_seg(1) + cleanup(1) + keyid(4) + seglist(16n) */ + plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; + + /* this limit allows for 14 segments */ + if (plen >= SEG6_HMAC_RING_SIZE) + return -EMSGSIZE; + + /* Let's build the HMAC text on the ring buffer. The text is composed + * as follows, in order: + * + * 1. Source IPv6 address (128 bits) + * 2. first_segment value (8 bits) + * 3. cleanup flag (8 bits: highest bit is cleanup value, others are 0) + * 4. HMAC Key ID (32 bits) + * 5. All segments in the segments list (n * 128 bits) + */ + + local_bh_disable(); + ring = *this_cpu_ptr(hmac_ring); + off = ring; + + /* source address */ + memcpy(off, saddr, 16); + off += 16; + + /* first_segment value */ + *off++ = hdr->first_segment; + + /* cleanup flag */ + *off++ = !!(sr_has_cleanup(hdr)) << 7; + + /* HMAC Key ID */ + memcpy(off, &hmackeyid, 4); + off += 4; + + /* all segments in the list */ + for (i = 0; i < hdr->first_segment + 1; i++) { + memcpy(off, hdr->segments + i, 16); + off += 16; + } + + dgsize = __do_hmac(hinfo, ring, plen, tmp_out, + SEG6_HMAC_MAX_DIGESTSIZE); + local_bh_enable(); + + if (dgsize < 0) + return dgsize; + + wrsize = SEG6_HMAC_FIELD_LEN; + if (wrsize > dgsize) + wrsize = dgsize; + + memset(output, 0, SEG6_HMAC_FIELD_LEN); + memcpy(output, tmp_out, wrsize); + + return 0; +} +EXPORT_SYMBOL(seg6_hmac_compute); + +/* checks if an incoming SR-enabled packet's HMAC status matches + * the incoming policy. + * + * called with rcu_read_lock() + */ +bool seg6_hmac_validate_skb(struct sk_buff *skb) +{ + u8 hmac_output[SEG6_HMAC_FIELD_LEN]; + struct net *net = dev_net(skb->dev); + struct seg6_hmac_info *hinfo; + struct sr6_tlv_hmac *tlv; + struct ipv6_sr_hdr *srh; + struct inet6_dev *idev; + + idev = __in6_dev_get(skb->dev); + + srh = (struct ipv6_sr_hdr *)skb_transport_header(skb); + + tlv = seg6_get_tlv_hmac(srh); + + /* mandatory check but no tlv */ + if (idev->cnf.seg6_require_hmac > 0 && !tlv) + return false; + + /* no check */ + if (idev->cnf.seg6_require_hmac < 0) + return true; + + /* check only if present */ + if (idev->cnf.seg6_require_hmac == 0 && !tlv) + return true; + + /* now, seg6_require_hmac >= 0 && tlv */ + + hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); + if (!hinfo) + return false; + + if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) + return false; + + if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + return false; + + return true; +} +EXPORT_SYMBOL(seg6_hmac_validate_skb); + +/* called with rcu_read_lock() */ +struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + struct seg6_hmac_info *hinfo; + + hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); + + return hinfo; +} +EXPORT_SYMBOL(seg6_hmac_info_lookup); + +int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + int err; + + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, + rht_params); + + return err; +} +EXPORT_SYMBOL(seg6_hmac_info_add); + +int seg6_hmac_info_del(struct net *net, u32 key) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + struct seg6_hmac_info *hinfo; + int err = -ENOENT; + + hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); + if (!hinfo) + goto out; + + err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node, + rht_params); + if (err) + goto out; + + seg6_hinfo_release(hinfo); + +out: + return err; +} +EXPORT_SYMBOL(seg6_hmac_info_del); + +int seg6_push_hmac(struct net *net, struct in6_addr *saddr, + struct ipv6_sr_hdr *srh) +{ + struct seg6_hmac_info *hinfo; + struct sr6_tlv_hmac *tlv; + int err = -ENOENT; + + tlv = seg6_get_tlv_hmac(srh); + if (!tlv) + return -EINVAL; + + rcu_read_lock(); + + hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); + if (!hinfo) + goto out; + + memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN); + err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac); + +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(seg6_push_hmac); + +static int seg6_hmac_init_ring(void) +{ + int i; + + hmac_ring = alloc_percpu(char *); + + if (!hmac_ring) + return -ENOMEM; + + for_each_possible_cpu(i) { + char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL); + + if (!ring) + return -ENOMEM; + + *per_cpu_ptr(hmac_ring, i) = ring; + } + + return 0; +} + +static int seg6_hmac_init_algo(void) +{ + struct seg6_hmac_algo *algo; + struct crypto_shash *tfm; + struct shash_desc *shash; + int i, alg_count, cpu; + + alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + + for (i = 0; i < alg_count; i++) { + struct crypto_shash **p_tfm; + int shsize; + + algo = &hmac_algos[i]; + algo->tfms = alloc_percpu(struct crypto_shash *); + if (!algo->tfms) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + p_tfm = per_cpu_ptr(algo->tfms, cpu); + *p_tfm = tfm; + } + + p_tfm = this_cpu_ptr(algo->tfms); + tfm = *p_tfm; + + shsize = sizeof(*shash) + crypto_shash_descsize(tfm); + + algo->shashs = alloc_percpu(struct shash_desc *); + if (!algo->shashs) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + shash = kzalloc(shsize, GFP_KERNEL); + if (!shash) + return -ENOMEM; + *per_cpu_ptr(algo->shashs, cpu) = shash; + } + } + + return 0; +} + +int __init seg6_hmac_init(void) +{ + int ret; + + ret = seg6_hmac_init_ring(); + if (ret < 0) + goto out; + + ret = seg6_hmac_init_algo(); + +out: + return ret; +} +EXPORT_SYMBOL(seg6_hmac_init); + +int __net_init seg6_hmac_net_init(struct net *net) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + + rhashtable_init(&sdata->hmac_infos, &rht_params); + + return 0; +} +EXPORT_SYMBOL(seg6_hmac_net_init); + +void seg6_hmac_exit(void) +{ + struct seg6_hmac_algo *algo = NULL; + int i, alg_count, cpu; + + for_each_possible_cpu(i) { + char *ring = *per_cpu_ptr(hmac_ring, i); + + kfree(ring); + } + free_percpu(hmac_ring); + + alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); + for (i = 0; i < alg_count; i++) { + algo = &hmac_algos[i]; + for_each_possible_cpu(cpu) { + struct crypto_shash *tfm; + struct shash_desc *shash; + + shash = *per_cpu_ptr(algo->shashs, cpu); + kfree(shash); + tfm = *per_cpu_ptr(algo->tfms, cpu); + crypto_free_shash(tfm); + } + free_percpu(algo->tfms); + free_percpu(algo->shashs); + } +} +EXPORT_SYMBOL(seg6_hmac_exit); + +void __net_exit seg6_hmac_net_exit(struct net *net) +{ + struct seg6_pernet_data *sdata = seg6_pernet(net); + + rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL); +} +EXPORT_SYMBOL(seg6_hmac_net_exit); -- cgit v1.2.3-71-gd317 From 91820da6ae85904d95ed53bf3a83f9ec44a6b80a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 10 Nov 2016 16:28:23 +0100 Subject: openvswitch: add Ethernet push and pop actions It's not allowed to push Ethernet header in front of another Ethernet header. It's not allowed to pop Ethernet header if there's a vlan tag. This preserves the invariant that L3 packet never has a vlan tag. Based on previous versions by Lorand Jakab and Simon Horman. Signed-off-by: Lorand Jakab Signed-off-by: Simon Horman Signed-off-by: Jiri Benc Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 15 ++++++++++++ net/openvswitch/actions.c | 49 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow_netlink.c | 18 +++++++++++++++ 3 files changed, 82 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 59ed3992c760..375d812fea36 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -705,6 +705,15 @@ enum ovs_nat_attr { #define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) +/* + * struct ovs_action_push_eth - %OVS_ACTION_ATTR_PUSH_ETH action argument. + * @addresses: Source and destination MAC addresses. + * @eth_type: Ethernet type + */ +struct ovs_action_push_eth { + struct ovs_key_ethernet addresses; +}; + /** * enum ovs_action_attr - Action types. * @@ -738,6 +747,10 @@ enum ovs_nat_attr { * is no MPLS label stack, as determined by ethertype, no action is taken. * @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related * entries in the flow key. + * @OVS_ACTION_ATTR_PUSH_ETH: Push a new outermost Ethernet header onto the + * packet. + * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the + * packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -765,6 +778,8 @@ enum ovs_action_attr { * bits. */ OVS_ACTION_ATTR_CT, /* Nested OVS_CT_ATTR_* . */ OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */ + OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ + OVS_ACTION_ATTR_POP_ETH, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 064cbcb7b0c5..514f7bcf7c63 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -317,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } +/* pop_eth does not support VLAN packets as this action is never called + * for them. + */ +static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) +{ + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} + +static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_eth *ethh) +{ + struct ethhdr *hdr; + + /* Add the new Ethernet header */ + if (skb_cow_head(skb, ETH_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + hdr = eth_hdr(skb); + ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); + ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); + hdr->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, hdr, ETH_HLEN); + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); + return 0; +} + static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, __be32 addr, __be32 new_addr) { @@ -1200,6 +1241,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (err) return err == -EINPROGRESS ? 0 : err; break; + + case OVS_ACTION_ATTR_PUSH_ETH: + err = push_eth(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_ETH: + err = pop_eth(skb, key); + break; } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c3d0cc4321c3..d19044f2b1f4 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2383,6 +2383,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), + [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), + [OVS_ACTION_ATTR_POP_ETH] = 0, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2517,6 +2519,22 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_PUSH_ETH: + /* Disallow pushing an Ethernet header if one + * is already present */ + if (mac_proto != MAC_PROTO_NONE) + return -EINVAL; + mac_proto = MAC_PROTO_NONE; + break; + + case OVS_ACTION_ATTR_POP_ETH: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + if (vlan_tci & htons(VLAN_TAG_PRESENT)) + return -EINVAL; + mac_proto = MAC_PROTO_ETHERNET; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; -- cgit v1.2.3-71-gd317 From 29ba732acbeece1e34c68483d1ec1f3720fa1bb3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:09 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_HASH. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 ++ kernel/bpf/hashtab.c | 266 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 260 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2f38e0091b6..ed8c6799fb14 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, }; enum bpf_prog_type { @@ -106,6 +107,13 @@ enum bpf_prog_type { #define BPF_EXIST 2 /* update existing element */ #define BPF_F_NO_PREALLOC (1U << 0) +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ +#define BPF_F_NO_COMMON_LRU (1U << 1) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b478d80f9771..4a9e71a7c41f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include #include #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,19 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -87,7 +99,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -110,12 +137,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +165,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -144,14 +196,34 @@ static int alloc_extra_elems(struct bpf_htab *htab) static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +243,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +325,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +410,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. + */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -579,6 +706,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -671,6 +862,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -703,12 +927,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -728,6 +951,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -798,6 +1035,7 @@ static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); return 0; } late_initcall(register_htab_map); -- cgit v1.2.3-71-gd317 From 8f8449384ec364ba2a654f11f94e754e4ff719e0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:10 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_PERCPU_HASH Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +- kernel/bpf/hashtab.c | 129 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 8 ++- 3 files changed, 131 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ed8c6799fb14..7d9b2832c280 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, }; enum bpf_prog_type { @@ -108,7 +109,7 @@ enum bpf_prog_type { #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the - * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9e71a7c41f..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -62,7 +62,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, @@ -85,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -122,7 +129,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -195,8 +202,10 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; - bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -823,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. + */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -976,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -993,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -1008,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -1031,11 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 233e3ac836a6..ce1b7de7d72c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); -- cgit v1.2.3-71-gd317 From 0d27f4e437e448c4ff440a31567b9729d1634d66 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Thu, 17 Nov 2016 13:07:20 +0100 Subject: ethtool: (uapi) Add ETHTOOL_PHY_GTUNABLE and ETHTOOL_PHY_STUNABLE Defines a generic API to get/set phy tunables. The API is using the existing ethtool_tunable/tunable_type_id types which is already being used for mac level tunables. Signed-off-by: Raju Lakkaraju Reviewed-by: Andrew Lunn Signed-off-by: Allan W. Nielsen Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8e547231c1b7..42f696f139ec 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -248,6 +248,16 @@ struct ethtool_tunable { void *data[0]; }; +enum phy_tunable_id { + ETHTOOL_PHY_ID_UNSPEC, + + /* + * Add your fresh new phy tunable attribute above and remember to update + * phy_tunable_strings[] in net/core/ethtool.c + */ + __ETHTOOL_PHY_TUNABLE_COUNT, +}; + /** * struct ethtool_regs - hardware register dump * @cmd: Command number = %ETHTOOL_GREGS @@ -548,6 +558,7 @@ struct ethtool_pauseparam { * @ETH_SS_FEATURES: Device feature names * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS + * @ETH_SS_PHY_TUNABLES: PHY tunable names */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -558,6 +569,7 @@ enum ethtool_stringset { ETH_SS_RSS_HASH_FUNCS, ETH_SS_TUNABLES, ETH_SS_PHY_STATS, + ETH_SS_PHY_TUNABLES, }; /** @@ -1313,7 +1325,8 @@ struct ethtool_per_queue_op { #define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ #define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ - +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET -- cgit v1.2.3-71-gd317 From 607c7029146790201e90b58c4235ddff0304d6e0 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Thu, 17 Nov 2016 13:07:22 +0100 Subject: ethtool: (uapi) Add ETHTOOL_PHY_DOWNSHIFT to PHY tunables For operation in cabling environments that are incompatible with 1000BASE-T, PHY device may provide an automatic link speed downshift operation. When enabled, the device automatically changes its 1000BASE-T auto-negotiation to the next slower speed after a configured number of failed attempts at 1000BASE-T. This feature is useful in setting up in networks using older cable installations that include only pairs A and B, and not pairs C and D. Signed-off-by: Raju Lakkaraju Signed-off-by: Allan W. Nielsen Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 42f696f139ec..f0db7788f887 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -248,9 +248,12 @@ struct ethtool_tunable { void *data[0]; }; +#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff +#define DOWNSHIFT_DEV_DISABLE 0 + enum phy_tunable_id { ETHTOOL_PHY_ID_UNSPEC, - + ETHTOOL_PHY_DOWNSHIFT, /* * Add your fresh new phy tunable attribute above and remember to update * phy_tunable_strings[] in net/core/ethtool.c -- cgit v1.2.3-71-gd317 From 5e9235853d652a295d5f56cb8652950b6b5bf56b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 21 Nov 2016 13:03:24 +0100 Subject: bridge: mcast: add IGMPv3 query support This patch adds basic support for IGMPv3 queries, the default is IGMPv2 as before. A new multicast option - multicast_igmp_version, adds the ability to change it between 2 and 3 via netlink and sysfs. The option struct member is in a 4 byte hole in net_bridge. There also a few minor style adjustments in br_multicast_new_group and br_multicast_add_group. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_multicast.c | 79 ++++++++++++++++++++++++++++++++++---------- net/bridge/br_netlink.c | 15 ++++++++- net/bridge/br_private.h | 3 ++ net/bridge/br_sysfs_br.c | 18 ++++++++++ 5 files changed, 98 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b4fba662cd32..325d2601150d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -275,6 +275,7 @@ enum { IFLA_BR_PAD, IFLA_BR_VLAN_STATS_ENABLED, IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 073d54afa056..66192c11aa45 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -365,13 +365,18 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, __be32 group, u8 *igmp_type) { + struct igmpv3_query *ihv3; + size_t igmp_hdr_size; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; struct iphdr *iph; + igmp_hdr_size = sizeof(*ih); + if (br->multicast_igmp_version == 3) + igmp_hdr_size = sizeof(*ihv3); skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) + - sizeof(*ih) + 4); + igmp_hdr_size + 4); if (!skb) goto out; @@ -396,7 +401,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; - iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4); + iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4); iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; @@ -412,17 +417,37 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_put(skb, 24); skb_set_transport_header(skb, skb->len); - ih = igmp_hdr(skb); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / - (HZ / IGMP_TIMER_SCALE); - ih->group = group; - ih->csum = 0; - ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - skb_put(skb, sizeof(*ih)); + switch (br->multicast_igmp_version) { + case 2: + ih = igmp_hdr(skb); + ih->type = IGMP_HOST_MEMBERSHIP_QUERY; + ih->code = (group ? br->multicast_last_member_interval : + br->multicast_query_response_interval) / + (HZ / IGMP_TIMER_SCALE); + ih->group = group; + ih->csum = 0; + ih->csum = ip_compute_csum((void *)ih, sizeof(*ih)); + break; + case 3: + ihv3 = igmpv3_query_hdr(skb); + ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; + ihv3->code = (group ? br->multicast_last_member_interval : + br->multicast_query_response_interval) / + (HZ / IGMP_TIMER_SCALE); + ihv3->group = group; + ihv3->qqic = br->multicast_query_interval / HZ; + ihv3->nsrcs = 0; + ihv3->resv = 0; + ihv3->suppress = 0; + ihv3->qrv = 2; + ihv3->csum = 0; + ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3)); + break; + } + + skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); out: @@ -608,7 +633,8 @@ err: } struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, - struct net_bridge_port *port, struct br_ip *group) + struct net_bridge_port *p, + struct br_ip *group) { struct net_bridge_mdb_htable *mdb; struct net_bridge_mdb_entry *mp; @@ -624,7 +650,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, } hash = br_ip_hash(mdb, group); - mp = br_multicast_get_group(br, port, group, hash); + mp = br_multicast_get_group(br, p, group, hash); switch (PTR_ERR(mp)) { case 0: break; @@ -681,9 +707,9 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_ip *group) { - struct net_bridge_mdb_entry *mp; - struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; int err; @@ -861,9 +887,9 @@ static void br_multicast_send_query(struct net_bridge *br, struct net_bridge_port *port, struct bridge_mcast_own_query *own_query) { - unsigned long time; - struct br_ip br_group; struct bridge_mcast_other_query *other_query = NULL; + struct br_ip br_group; + unsigned long time; if (!netif_running(br->dev) || br->multicast_disabled || !br->multicast_querier) @@ -1816,6 +1842,7 @@ void br_multicast_init(struct net_bridge *br) br->hash_elasticity = 4; br->hash_max = 512; + br->multicast_igmp_version = 2; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_querier = 0; br->multicast_query_use_ifaddr = 0; @@ -2132,6 +2159,24 @@ unlock: return err; } +int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) +{ + /* Currently we support only version 2 and 3 */ + switch (val) { + case 2: + case 3: + break; + default: + return -EINVAL; + } + + spin_lock_bh(&br->multicast_lock); + br->multicast_igmp_version = val; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} + /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e99037c6f7b7..10b9b80f778f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -858,6 +858,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1069,6 +1070,15 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]); br->multicast_stats_enabled = !!mcast_stats; } + + if (data[IFLA_BR_MCAST_IGMP_VERSION]) { + __u8 igmp_version; + + igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); + err = br_multicast_set_igmp_version(br, igmp_version); + if (err) + return err; + } #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { @@ -1135,6 +1145,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1210,7 +1221,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, br->multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, - br->multicast_startup_query_count)) + br->multicast_startup_query_count) || + nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, + br->multicast_igmp_version)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_last_member_interval); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1b63177e0ccd..3d207d92d899 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -333,6 +333,8 @@ struct net_bridge u32 multicast_last_member_count; u32 multicast_startup_query_count; + u8 multicast_igmp_version; + unsigned long multicast_last_member_interval; unsigned long multicast_membership_interval; unsigned long multicast_querier_interval; @@ -582,6 +584,7 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); int br_multicast_toggle(struct net_bridge *br, unsigned long val); int br_multicast_set_querier(struct net_bridge *br, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); +int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); struct net_bridge_mdb_entry * diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index e120307c6e36..f00d1690658c 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -440,6 +440,23 @@ static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR_RW(hash_max); +static ssize_t multicast_igmp_version_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_igmp_version); +} + +static ssize_t multicast_igmp_version_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version); +} +static DEVICE_ATTR_RW(multicast_igmp_version); + static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) @@ -809,6 +826,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, + &dev_attr_multicast_igmp_version.attr, #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, -- cgit v1.2.3-71-gd317 From aa2ae3e71c74cc00ec22f133dc900b3817415785 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 21 Nov 2016 13:03:25 +0100 Subject: bridge: mcast: add MLDv2 querier support This patch adds basic support for MLDv2 queries, the default is MLDv1 as before. A new multicast option - multicast_mld_version, adds the ability to change it between 1 and 2 via netlink and sysfs. The MLD option is disabled if CONFIG_IPV6 is disabled. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_multicast.c | 89 +++++++++++++++++++++++++++++++++----------- net/bridge/br_netlink.c | 19 +++++++++- net/bridge/br_private.h | 4 ++ net/bridge/br_sysfs_br.c | 22 +++++++++++ 5 files changed, 113 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 325d2601150d..92b2d4928bf1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -276,6 +276,7 @@ enum { IFLA_BR_VLAN_STATS_ENABLED, IFLA_BR_MCAST_STATS_ENABLED, IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66192c11aa45..b30e77e8427c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -459,15 +459,20 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, const struct in6_addr *grp, u8 *igmp_type) { - struct sk_buff *skb; + struct mld2_query *mld2q; + unsigned long interval; struct ipv6hdr *ip6h; struct mld_msg *mldq; + size_t mld_hdr_size; + struct sk_buff *skb; struct ethhdr *eth; u8 *hopopt; - unsigned long interval; + mld_hdr_size = sizeof(*mldq); + if (br->multicast_mld_version == 2) + mld_hdr_size = sizeof(*mld2q); skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) + - 8 + sizeof(*mldq)); + 8 + mld_hdr_size); if (!skb) goto out; @@ -486,7 +491,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h = ipv6_hdr(skb); *(__force __be32 *)ip6h = htonl(0x60000000); - ip6h->payload_len = htons(8 + sizeof(*mldq)); + ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); @@ -514,26 +519,47 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); - mldq = (struct mld_msg *) icmp6_hdr(skb); - interval = ipv6_addr_any(grp) ? br->multicast_query_response_interval : br->multicast_last_member_interval; - *igmp_type = ICMPV6_MGM_QUERY; - mldq->mld_type = ICMPV6_MGM_QUERY; - mldq->mld_code = 0; - mldq->mld_cksum = 0; - mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); - mldq->mld_reserved = 0; - mldq->mld_mca = *grp; - - /* checksum */ - mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - sizeof(*mldq), IPPROTO_ICMPV6, - csum_partial(mldq, - sizeof(*mldq), 0)); - skb_put(skb, sizeof(*mldq)); + switch (br->multicast_mld_version) { + case 1: + mldq = (struct mld_msg *)icmp6_hdr(skb); + mldq->mld_type = ICMPV6_MGM_QUERY; + mldq->mld_code = 0; + mldq->mld_cksum = 0; + mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); + mldq->mld_reserved = 0; + mldq->mld_mca = *grp; + mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + sizeof(*mldq), IPPROTO_ICMPV6, + csum_partial(mldq, + sizeof(*mldq), + 0)); + break; + case 2: + mld2q = (struct mld2_query *)icmp6_hdr(skb); + mld2q->mld2q_mrc = ntohs((u16)jiffies_to_msecs(interval)); + mld2q->mld2q_type = ICMPV6_MGM_QUERY; + mld2q->mld2q_code = 0; + mld2q->mld2q_cksum = 0; + mld2q->mld2q_resv1 = 0; + mld2q->mld2q_resv2 = 0; + mld2q->mld2q_suppress = 0; + mld2q->mld2q_qrv = 2; + mld2q->mld2q_nsrcs = 0; + mld2q->mld2q_qqic = br->multicast_query_interval / HZ; + mld2q->mld2q_mca = *grp; + mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + sizeof(*mld2q), + IPPROTO_ICMPV6, + csum_partial(mld2q, + sizeof(*mld2q), + 0)); + break; + } + skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); @@ -1842,7 +1868,6 @@ void br_multicast_init(struct net_bridge *br) br->hash_elasticity = 4; br->hash_max = 512; - br->multicast_igmp_version = 2; br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; br->multicast_querier = 0; br->multicast_query_use_ifaddr = 0; @@ -1858,7 +1883,9 @@ void br_multicast_init(struct net_bridge *br) br->ip4_other_query.delay_time = 0; br->ip4_querier.port = NULL; + br->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) + br->multicast_mld_version = 1; br->ip6_other_query.delay_time = 0; br->ip6_querier.port = NULL; #endif @@ -2177,6 +2204,26 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) return 0; } +#if IS_ENABLED(CONFIG_IPV6) +int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) +{ + /* Currently we support version 1 and 2 */ + switch (val) { + case 1: + case 2: + break; + default: + return -EINVAL; + } + + spin_lock_bh(&br->multicast_lock); + br->multicast_mld_version = val; + spin_unlock_bh(&br->multicast_lock); + + return 0; +} +#endif + /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 10b9b80f778f..71c7453268c1 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -859,6 +859,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, + [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1079,6 +1080,17 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + +#if IS_ENABLED(CONFIG_IPV6) + if (data[IFLA_BR_MCAST_MLD_VERSION]) { + __u8 mld_version; + + mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); + err = br_multicast_set_mld_version(br, mld_version); + if (err) + return err; + } +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (data[IFLA_BR_NF_CALL_IPTABLES]) { @@ -1146,6 +1158,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1225,7 +1238,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, br->multicast_igmp_version)) return -EMSGSIZE; - +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, + br->multicast_mld_version)) + return -EMSGSIZE; +#endif clockval = jiffies_to_clock_t(br->multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3d207d92d899..26aec2366bc3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -355,6 +355,7 @@ struct net_bridge struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; + u8 multicast_mld_version; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif @@ -585,6 +586,9 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val); int br_multicast_set_querier(struct net_bridge *br, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); +#if IS_ENABLED(CONFIG_IPV6) +int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); +#endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst); struct net_bridge_mdb_entry * diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index f00d1690658c..c9d2e0abfb89 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -659,6 +659,25 @@ static ssize_t multicast_stats_enabled_store(struct device *d, return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); + +#if IS_ENABLED(CONFIG_IPV6) +static ssize_t multicast_mld_version_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + + return sprintf(buf, "%u\n", br->multicast_mld_version); +} + +static ssize_t multicast_mld_version_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_multicast_set_mld_version); +} +static DEVICE_ATTR_RW(multicast_mld_version); +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( @@ -827,6 +846,9 @@ static struct attribute *bridge_attrs[] = { &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, +#if IS_ENABLED(CONFIG_IPV6) + &dev_attr_multicast_mld_version.attr, +#endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, -- cgit v1.2.3-71-gd317 From 59bfde01fab0c4550778cd53e8d266f1dfddf7b7 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 22 Nov 2016 23:09:57 +0200 Subject: devlink: Add E-Switch inline mode control Some HWs need the VF driver to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering. The supported modes: none, link, network, transport. Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 8 +++++ net/core/devlink.c | 70 ++++++++++++++++++++++++++++++++------------ 3 files changed, 61 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 211bd3c37028..d29e5fc82582 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -92,6 +92,8 @@ struct devlink_ops { int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode); int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); + int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode); + int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 915bfa74458c..9014c33d4e77 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -102,6 +102,13 @@ enum devlink_eswitch_mode { DEVLINK_ESWITCH_MODE_SWITCHDEV, }; +enum devlink_eswitch_inline_mode { + DEVLINK_ESWITCH_INLINE_MODE_NONE, + DEVLINK_ESWITCH_INLINE_MODE_LINK, + DEVLINK_ESWITCH_INLINE_MODE_NETWORK, + DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -133,6 +140,7 @@ enum devlink_attr { DEVLINK_ATTR_SB_OCC_CUR, /* u32 */ DEVLINK_ATTR_SB_OCC_MAX, /* u32 */ DEVLINK_ATTR_ESWITCH_MODE, /* u16 */ + DEVLINK_ATTR_ESWITCH_INLINE_MODE, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index c14f8b661db9..2b5bf9efa720 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1394,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, - u32 seq, int flags, u16 mode) + u32 seq, int flags) { + const struct devlink_ops *ops = devlink->ops; void *hdr; + int err = 0; + u16 mode; + u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out; - if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) - goto nla_put_failure; + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto out; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto out; + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto out; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto out; + } genlmsg_end(msg, hdr); return 0; -nla_put_failure: +out: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, @@ -1422,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; - u16 mode; int err; if (!ops || !ops->eswitch_mode_get) return -EOPNOTSUPP; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - return err; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0, mode); + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1453,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; u16 mode; + u8 inline_mode; + int err = 0; - if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) - return -EINVAL; + if (!ops) + return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = ops->eswitch_mode_set(devlink, mode); + if (err) + return err; + } - if (ops && ops->eswitch_mode_set) - return ops->eswitch_mode_set(devlink, mode); - return -EOPNOTSUPP; + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8( + info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode); + if (err) + return err; + } + + return 0; } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1478,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { -- cgit v1.2.3-71-gd317 From 0e33661de493db325435d565a4a722120ae4cbf3 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:25 +0100 Subject: bpf: add new prog type for cgroup socket filtering This program type is similar to BPF_PROG_TYPE_SOCKET_FILTER, except that it does not allow BPF_LD_[ABS|IND] instructions and hooks up the bpf_skb_load_bytes() helper. Programs of this type will be attached to cgroups for network filtering and accounting. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++++++++ net/core/filter.c | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d9b2832c280..5ae679fac993 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -98,8 +98,17 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, }; +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ diff --git a/net/core/filter.c b/net/core/filter.c index dece94fef005..2de302d68038 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2630,6 +2630,17 @@ xdp_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2992,6 +3003,12 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3012,12 +3029,18 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); return 0; } -- cgit v1.2.3-71-gd317 From f4324551489e8781d838f941b7aee4208e52e8bf Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:27 +0100 Subject: bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and BPF_PROG_DETACH which allow attaching and detaching eBPF programs to a target. On the API level, the target could be anything that has an fd in userspace, hence the name of the field in union bpf_attr is called 'target_fd'. When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is expected to be a valid file descriptor of a cgroup v2 directory which has the bpf controller enabled. These are the only use-cases implemented by this patch at this point, but more can be added. If a program of the given type already exists in the given cgroup, the program is swapped automically, so userspace does not have to drop an existing program first before installing a new one, which would otherwise leave a gap in which no program is attached. For more information on the propagation logic to subcgroups, please refer to the bpf cgroup controller implementation. The API is guarded by CAP_NET_ADMIN. Signed-off-by: Daniel Mack Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++++ kernel/bpf/syscall.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ae679fac993..1370a9d1456f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,6 +73,8 @@ enum bpf_cmd { BPF_PROG_LOAD, BPF_OBJ_PIN, BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, }; enum bpf_map_type { @@ -159,6 +161,12 @@ union bpf_attr { __aligned_u64 pathname; __u32 bpf_fd; }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb15498b8d55..1090d16a31c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -835,6 +835,77 @@ static int bpf_obj_get(const union bpf_attr *attr) return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } +#ifdef CONFIG_CGROUP_BPF + +#define BPF_PROG_ATTACH_LAST_FIELD attach_type + +static int bpf_prog_attach(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_ATTACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_CGROUP_SKB); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} + +#define BPF_PROG_DETACH_LAST_FIELD attach_type + +static int bpf_prog_detach(const union bpf_attr *attr) +{ + struct cgroup *cgrp; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_PROG_DETACH)) + return -EINVAL; + + switch (attr->attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + cgroup_bpf_update(cgrp, NULL, attr->attach_type); + cgroup_put(cgrp); + break; + + default: + return -EINVAL; + } + + return 0; +} +#endif /* CONFIG_CGROUP_BPF */ + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -901,6 +972,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; -- cgit v1.2.3-71-gd317 From efd90174167530c67a54273fd5d8369c87f9bd32 Mon Sep 17 00:00:00 2001 From: Francis Yan Date: Sun, 27 Nov 2016 23:07:17 -0800 Subject: tcp: export sender limits chronographs to TCP_INFO This patch exports all the sender chronograph measurements collected in the previous patches to TCP_INFO interface. Note that busy time exported includes all the other sending limits (rwnd-limited, sndbuf-limited). Internally the time unit is jiffy but externally the measurements are in microseconds for future extensions. Signed-off-by: Francis Yan Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 4 ++++ net/ipv4/tcp.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 73ac0db487f8..2863b661d6e1 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -214,6 +214,10 @@ struct tcp_info { __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ __u64 tcpi_delivery_rate; + + __u64 tcpi_busy_time; /* Time (usec) busy sending data */ + __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ + __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 259ffb50e429..cdde20f49999 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2708,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_setsockopt); #endif +static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, + struct tcp_info *info) +{ + u64 stats[__TCP_CHRONO_MAX], total = 0; + enum tcp_chrono i; + + for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { + stats[i] = tp->chrono_stat[i - 1]; + if (i == tp->chrono_type) + stats[i] += tcp_time_stamp - tp->chrono_start; + stats[i] *= USEC_PER_SEC / HZ; + total += stats[i]; + } + + info->tcpi_busy_time = total; + info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; + info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; +} + /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { @@ -2800,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); + tcp_get_info_chrono_stats(tp, info); unlock_sock_fast(sk, slow); -- cgit v1.2.3-71-gd317 From 1c885808e45601b2b6f68b30ac1d999e10b6f606 Mon Sep 17 00:00:00 2001 From: Francis Yan Date: Sun, 27 Nov 2016 23:07:18 -0800 Subject: tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING This patch exports the sender chronograph stats via the socket SO_TIMESTAMPING channel. Currently we can instrument how long a particular application unit of data was queued in TCP by tracking SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having these sender chronograph stats exported simultaneously along with these timestamps allow further breaking down the various sender limitation. For example, a video server can tell if a particular chunk of video on a connection takes a long time to deliver because TCP was experiencing small receive window. It is not possible to tell before this patch without packet traces. To prepare these stats, the user needs to set SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags while requesting other SOF_TIMESTAMPING TX timestamps. When the timestamps are available in the error queue, the stats are returned in a separate control message of type SCM_TIMESTAMPING_OPT_STATS, in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME, TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond. Signed-off-by: Francis Yan Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 10 ++++++++++ arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/linux/tcp.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/net_tstamp.h | 3 ++- include/uapi/linux/tcp.h | 8 ++++++++ net/core/skbuff.c | 14 +++++++++++--- net/core/sock.c | 7 +++++++ net/ipv4/tcp.c | 20 ++++++++++++++++++++ net/socket.c | 7 ++++++- 20 files changed, 90 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 671cccf0dcd2..96f50694a748 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY: the timestamp even if sysctl net.core.tstamp_allow_data is 0. This option disables SOF_TIMESTAMPING_OPT_CMSG. +SOF_TIMESTAMPING_OPT_STATS: + + Optional stats that are obtained along with the transmit timestamps. + It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the + transmit timestamp is available, the stats are available in a + separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a + list of TLVs (struct nlattr) of types. These stats allow the + application to associate various transport layer stats with + the transmit timestamps, such as how long a certain block of + data was limited by peer's receiver window. New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 9e46d6e656d9..afc901b7a6f6 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -97,4 +97,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index afbc98f02d27..81e03530ed39 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -90,5 +90,7 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 0018fad9039f..57feb0c1f7d7 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -99,4 +99,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 5fe42fc7b6c5..5853f8e92c20 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -90,4 +90,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 2027240aafbb..566ecdcb5b4b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 5129f23a9ee1..0e12527c4b0e 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -90,4 +90,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 9c935d717df9..7a109b73ddf7 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -89,4 +89,6 @@ #define SO_CNX_ADVICE 0x402E +#define SCM_TIMESTAMPING_OPT_STATS 0x402F + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 1672e3398270..44583a52f882 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -97,4 +97,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 41b51c2f4f1b..b24a64cbfeb1 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -96,4 +96,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 31aede3af088..a25dc32f5d6a 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -86,6 +86,8 @@ #define SO_CNX_ADVICE 0x0037 +#define SCM_TIMESTAMPING_OPT_STATS 0x0038 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 81435d995e11..9fdbe1fe0473 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d5d3bd814338..00e0ee8f001f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -428,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk); + #endif /* _LINUX_TCP_H */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 67d632f1743d..2c748ddad5f8 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -92,4 +92,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 264e515de16f..464dcca5ed68 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -25,8 +25,9 @@ enum { SOF_TIMESTAMPING_TX_ACK = (1<<9), SOF_TIMESTAMPING_OPT_CMSG = (1<<10), SOF_TIMESTAMPING_OPT_TSONLY = (1<<11), + SOF_TIMESTAMPING_OPT_STATS = (1<<12), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 2863b661d6e1..c53de2691cec 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -220,6 +220,14 @@ struct tcp_info { __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ }; +/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ +enum { + TCP_NLA_PAD, + TCP_NLA_BUSY, /* Time (usec) busy sending data */ + TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */ + TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ +}; + /* for TCP_MD5SIG socket option */ #define TCP_MD5SIG_MAXKEYLEN 80 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d1d1a5a5ad24..ea6fa954c7a0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3839,10 +3839,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!skb_may_tx_timestamp(sk, tsonly)) return; - if (tsonly) - skb = alloc_skb(0, GFP_ATOMIC); - else + if (tsonly) { +#ifdef CONFIG_INET + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + skb = tcp_get_timestamping_opt_stats(sk); + else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + } if (!skb) return; diff --git a/net/core/sock.c b/net/core/sock.c index 14e6145be33b..d8c7f8c877ca 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -854,6 +854,13 @@ set_rcvbuf: sk->sk_tskey = 0; } } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { + ret = -EINVAL; + break; + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cdde20f49999..1149b48700a1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2841,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } EXPORT_SYMBOL_GPL(tcp_get_info); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *stats; + struct tcp_info info; + + stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + if (!stats) + return NULL; + + tcp_get_info_chrono_stats(tp, &info); + nla_put_u64_64bit(stats, TCP_NLA_BUSY, + info.tcpi_busy_time, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, + info.tcpi_rwnd_limited, TCP_NLA_PAD); + nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, + info.tcpi_sndbuf_limited, TCP_NLA_PAD); + return stats; +} + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/socket.c b/net/socket.c index e2584c51aa1f..e6318943ad07 100644 --- a/net/socket.c +++ b/net/socket.c @@ -693,9 +693,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; - if (!empty) + if (!empty) { put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, sizeof(tss), &tss); + + if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS)) + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS, + skb->len, skb->data); + } } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); -- cgit v1.2.3-71-gd317 From 85de8576a0b14aecc99136cfbf90e367fa2142cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 28 Nov 2016 23:16:54 +0100 Subject: bpf, xdp: allow to pass flags to dev_change_xdp_fd Add an IFLA_XDP_FLAGS attribute that can be passed for setting up XDP along with IFLA_XDP_FD, which eventually allows user space to implement typical add/replace/delete logic for programs. Right now, calling into dev_change_xdp_fd() will always replace previous programs. When passed XDP_FLAGS_UPDATE_IF_NOEXIST, we can handle this more graceful when requested by returning -EBUSY in case we try to attach a new program, but we find that another one is already attached. This will be used by upcoming front-end for iproute2 as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/uapi/linux/if_link.h | 4 ++++ net/core/dev.c | 20 ++++++++++++++++++-- net/core/rtnetlink.c | 14 +++++++++++++- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4ffcd874cc20..3755317cc6a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3253,7 +3253,7 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, int fd); +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 92b2d4928bf1..6b13e591abc9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -876,10 +876,14 @@ enum { /* XDP section */ +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) + enum { IFLA_XDP_UNSPEC, IFLA_XDP_FD, IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, __IFLA_XDP_MAX, }; diff --git a/net/core/dev.c b/net/core/dev.c index 048b46b7c92a..bffb5253e778 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6692,26 +6692,42 @@ EXPORT_SYMBOL(dev_change_proto_down); * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @fd: new program fd or negative value to clear + * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd) +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp = {}; + struct netdev_xdp xdp; int err; + ASSERT_RTNL(); + if (!ops->ndo_xdp) return -EOPNOTSUPP; if (fd >= 0) { + if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + err = ops->ndo_xdp(dev, &xdp); + if (err < 0) + return err; + if (xdp.prog_attached) + return -EBUSY; + } + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } + memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4e60525ea586..bd85570e6e4b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1505,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -2164,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy); @@ -2174,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + } + if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, - nla_get_s32(xdp[IFLA_XDP_FD])); + nla_get_s32(xdp[IFLA_XDP_FD]), + xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.2.3-71-gd317 From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Nov 2016 17:10:10 +0100 Subject: bpf: BPF for lightweight tunnel infrastructure Registers new BPF program types which correspond to the LWT hooks: - BPF_PROG_TYPE_LWT_IN => dst_input() - BPF_PROG_TYPE_LWT_OUT => dst_output() - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit() The separate program types are required to differentiate between the capabilities each LWT hook allows: * Programs attached to dst_input() or dst_output() are restricted and may only read the data of an skb. This prevent modification and possible invalidation of already validated packet headers on receive and the construction of illegal headers while the IP headers are still being assembled. * Programs attached to lwtunnel_xmit() are allowed to modify packet content as well as prepending an L2 header via a newly introduced helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is invoked after the IP header has been assembled completely. All BPF programs receive an skb with L3 headers attached and may return one of the following error codes: BPF_OK - Continue routing as per nexthop BPF_DROP - Drop skb and return EPERM BPF_REDIRECT - Redirect skb to device as per redirect() helper. (Only valid in lwtunnel_xmit() context) The return codes are binary compatible with their TC_ACT_ relatives to ease compatibility. Signed-off-by: Thomas Graf Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 32 +++- include/uapi/linux/lwtunnel.h | 23 +++ kernel/bpf/verifier.c | 14 +- net/Kconfig | 8 + net/core/Makefile | 1 + net/core/filter.c | 173 ++++++++++++++++++ net/core/lwt_bpf.c | 396 ++++++++++++++++++++++++++++++++++++++++++ net/core/lwtunnel.c | 2 + 9 files changed, 646 insertions(+), 5 deletions(-) create mode 100644 net/core/lwt_bpf.c (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7f246a281435..7ba644626553 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -438,7 +438,7 @@ struct xdp_buff { }; /* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf and act_bpf programs + * will be accessed by cls_bpf, act_bpf and lwt programs */ static inline void bpf_compute_data_end(struct sk_buff *skb) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1370a9d1456f..22ac82792687 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, }; enum bpf_attach_type { @@ -409,6 +412,16 @@ union bpf_attr { * * int bpf_get_numa_node_id() * Return: Id of current NUMA node. + * + * int bpf_skb_change_head() + * Grows headroom of skb and adjusts MAC header offset accordingly. + * Will extends/reallocae as required automatically. + * May change skb data pointer and will thus invalidate any check + * performed for direct packet access. + * @skb: pointer to skb + * @len: length of header to be pushed in front + * @flags: Flags (unused for now) + * Return: 0 on success or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -453,7 +466,8 @@ union bpf_attr { FN(skb_pull_data), \ FN(csum_update), \ FN(set_hash_invalid), \ - FN(get_numa_node_id), + FN(get_numa_node_id), \ + FN(skb_change_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -537,6 +551,22 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes */ +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 453cc6215bfd..92724cba1eba 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -10,6 +10,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, + LWTUNNEL_ENCAP_BPF, __LWTUNNEL_ENCAP_MAX, }; @@ -43,4 +44,26 @@ enum lwtunnel_ip6_t { #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWT_BPF_PROG_UNSPEC, + LWT_BPF_PROG_FD, + LWT_BPF_PROG_NAME, + __LWT_BPF_PROG_MAX, +}; + +#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1) + +enum { + LWT_BPF_UNSPEC, + LWT_BPF_IN, + LWT_BPF_OUT, + LWT_BPF_XMIT, + LWT_BPF_XMIT_HEADROOM, + __LWT_BPF_MAX, +}; + +#define LWT_BPF_MAX (__LWT_BPF_MAX - 1) + +#define LWT_BPF_MAX_HEADROOM 256 + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8740c5fa02fc..8135cb1077ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } diff --git a/net/Kconfig b/net/Kconfig index 7b6cd340b72b..a1005007224c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -402,6 +402,14 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config LWTUNNEL_BPF + bool "Execute BPF program as route nexthop action" + depends on LWTUNNEL + default y if LWTUNNEL=y + ---help--- + Allows to run BPF programs as a nexthop action following a route + lookup for incoming and outgoing packets. + config DST_CACHE bool default n diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/filter.c b/net/core/filter.c index 698a262b8ebb..1c4d0faf22c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. + */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || @@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size, type); +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = { .type = BPF_PROG_TYPE_CGROUP_SKB, }; +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. + */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : ""); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. + */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 03976e939818..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: -- cgit v1.2.3-71-gd317 From 61023658760032e97869b07d54be9681d2529e77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:04 -0800 Subject: bpf: Add new cgroup attach type to enable sock modifications Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run any time a process in the cgroup opens an AF_INET or AF_INET6 socket. Currently only sk_bound_dev_if is exported to userspace for modification by a bpf program. This allows a cgroup to be configured such that AF_INET{6} sockets opened by processes are automatically bound to a specific device. In turn, this enables the running of programs that do not support SO_BINDTODEVICE in a specific VRF context / L3 domain. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 14 +++++++++++ include/uapi/linux/bpf.h | 6 +++++ kernel/bpf/cgroup.c | 33 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 5 +++- net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 12 ++++++++- net/ipv6/af_inet6.c | 8 ++++++ 7 files changed, 138 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index af2ca8b432c0..7b6e5d168c95 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, \ + BPF_CGROUP_INET_SOCK_CREATE); \ + } \ + __ret; \ +}) + #else struct cgroup_bpf {}; @@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 22ac82792687..bfe5e31a1288 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, @@ -109,6 +110,7 @@ enum bpf_prog_type { enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, __MAX_BPF_ATTACH_TYPE }; @@ -567,6 +569,10 @@ enum bpf_ret_code { /* >127 are reserved for prog type specific return codes */ }; +struct bpf_sock { + __u32 bound_dev_if; +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8fe55ffd109d..a515f7b007c6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be exectuted + * + * socket is passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5518a6839ab1..85af86c496cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_EGRESS: ptype = BPF_PROG_TYPE_CGROUP_SKB; break; - + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; default: return -EINVAL; } @@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); diff --git a/net/core/filter.c b/net/core/filter.c index 1c4d0faf22c8..0ab252e462aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2818,6 +2818,32 @@ static bool lwt_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (size != sizeof(__u32)) + return false; + + return true; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3076,6 +3102,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -3162,6 +3212,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .gen_prologue = tc_cls_act_prologue, }; +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3202,6 +3258,11 @@ static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { .type = BPF_PROG_TYPE_LWT_XMIT, }; +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3209,6 +3270,7 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); bpf_register_prog_type(&lwt_in_type); bpf_register_prog_type(&lwt_out_type); bpf_register_prog_type(&lwt_xmit_type); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ddf5cda07f4..24d2550492ee 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -374,8 +374,18 @@ lookup_protocol: if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d424f3a3737a..237e654ba717 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -258,6 +258,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: -- cgit v1.2.3-71-gd317 From aa4c1037a30f4e88f444e83d42c2befbe0d5caf5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:06 -0800 Subject: bpf: Add support for reading socket family, type, protocol Add socket family, type and protocol to bpf_sock allowing bpf programs read-only access. Add __sk_flags_offset[0] to struct sock before the bitfield to programmtically determine the offset of the unsigned int containing protocol and type. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sock.h | 15 +++++++++++++++ include/uapi/linux/bpf.h | 3 +++ net/core/filter.c | 21 +++++++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/sock.h b/include/net/sock.h index 442cbb118a07..69afda6bea15 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -389,6 +389,21 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. */ + unsigned int __sk_flags_offset[0]; +#ifdef __BIG_ENDIAN_BITFIELD +#define SK_FL_PROTO_SHIFT 16 +#define SK_FL_PROTO_MASK 0x00ff0000 + +#define SK_FL_TYPE_SHIFT 0 +#define SK_FL_TYPE_MASK 0x0000ffff +#else +#define SK_FL_PROTO_SHIFT 8 +#define SK_FL_PROTO_MASK 0x0000ff00 + +#define SK_FL_TYPE_SHIFT 16 +#define SK_FL_TYPE_MASK 0xffff0000 +#endif + kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 2, sk_no_check_tx : 1, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfe5e31a1288..6123d9b8e828 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -571,6 +571,9 @@ enum bpf_ret_code { struct bpf_sock { __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; }; /* User return codes for XDP prog type. diff --git a/net/core/filter.c b/net/core/filter.c index 0ab252e462aa..56b43587d200 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3121,6 +3121,27 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sock, sk_bound_dev_if)); break; + + case offsetof(struct bpf_sock, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sock, sk_family)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + break; } return insn - insn_buf; -- cgit v1.2.3-71-gd317 From adc176c5472214971d77c1a61c83db9b01e9cdc7 Mon Sep 17 00:00:00 2001 From: Erik Nordmark Date: Fri, 2 Dec 2016 14:00:08 -0800 Subject: ipv6 addrconf: Implemented enhanced DAD (RFC7527) Implemented RFC7527 Enhanced DAD. IPv6 duplicate address detection can fail if there is some temporary loopback of Ethernet frames. RFC7527 solves this by including a random nonce in the NS messages used for DAD, and if an NS is received with the same nonce it is assumed to be a looped back DAD probe and is ignored. RFC7527 is enabled by default. Can be disabled by setting both of conf/{all,interface}/enhanced_dad to zero. Signed-off-by: Erik Nordmark Signed-off-by: Bob Gilligan Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/linux/ipv6.h | 1 + include/net/if_inet6.h | 1 + include/net/ndisc.h | 5 ++++- include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 22 +++++++++++++++++++++- net/ipv6/ndisc.c | 29 ++++++++++++++++++++++++++--- net/ipv6/route.c | 2 +- 8 files changed, 64 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5ca567fa6b8c..7dd65c9cf707 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1734,6 +1734,15 @@ drop_unsolicited_na - BOOLEAN By default this is turned off. +enhanced_dad - BOOLEAN + Include a nonce option in the IPv6 neighbor solicitation messages used for + duplicate address detection per RFC7527. A received DAD NS will only signal + a duplicate address if the nonce is different. This avoids any false + detection of duplicates due to loopback of the NS messages that we send. + The nonce option will be sent on an interface unless both of + conf/{all,interface}/enhanced_dad are set to FALSE. + Default: TRUE + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 3f95233b2733..671d014e6429 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -68,6 +68,7 @@ struct ipv6_devconf { #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif + __u32 enhanced_dad; struct ctl_table_header *sysctl_header; }; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b0576cb2ab25..0fa4c324b713 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -55,6 +55,7 @@ struct inet6_ifaddr { __u8 stable_privacy_retry; __u16 scope; + __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ diff --git a/include/net/ndisc.h b/include/net/ndisc.h index be1fe2283254..d562a2fe4860 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -31,6 +31,7 @@ enum { ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4, /* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ + ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ @@ -121,6 +122,7 @@ struct ndisc_options { #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] @@ -398,7 +400,8 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 53561be1ac21..eaf65dc82e22 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -181,6 +181,7 @@ enum { DEVCONF_RTR_SOLICIT_MAX_INTERVAL, DEVCONF_SEG6_ENABLED, DEVCONF_SEG6_REQUIRE_HMAC, + DEVCONF_ENHANCED_DAD, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c387dc338e3..c1e124bc8e1e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -242,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { #ifdef CONFIG_IPV6_SEG6_HMAC .seg6_require_hmac = 0, #endif + .enhanced_dad = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -292,6 +293,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #ifdef CONFIG_IPV6_SEG6_HMAC .seg6_require_hmac = 0, #endif + .enhanced_dad = 1, }; /* Check if a valid qdisc is available */ @@ -3735,12 +3737,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) { unsigned long rand_num; struct inet6_dev *idev = ifp->idev; + u64 nonce; if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); + nonce = 0; + if (idev->cnf.enhanced_dad || + dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) { + do + get_random_bytes(&nonce, 6); + while (nonce == 0); + } + ifp->dad_nonce = nonce; ifp->dad_probes = idev->cnf.dad_transmits; addrconf_mod_dad_work(ifp, rand_num); } @@ -3918,7 +3929,8 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, + ifp->dad_nonce); out: in6_ifa_put(ifp); rtnl_unlock(); @@ -4962,6 +4974,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #ifdef CONFIG_IPV6_SEG6_HMAC array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac; #endif + array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; } static inline size_t inet6_ifla6_size(void) @@ -6069,6 +6082,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "enhanced_dad", + .data = &ipv6_devconf.enhanced_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d8e671457d10..7ebac630d3c6 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: + case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, @@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev) } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr) + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); + if (nonce != 0) + optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); + if (nonce != 0) { + u8 *opt = skb_put(skb, 8); + + opt[0] = ND_OPT_NONCE; + opt[1] = 8 >> 3; + memcpy(opt + 2, &nonce, 6); + } ndisc_send_skb(skb, daddr, saddr); } @@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, target, target, saddr); + ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, target, &mcaddr, saddr); + ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } @@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) int dad = ipv6_addr_any(saddr); bool inc; int is_router = -1; + u64 nonce = 0; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NS: packet too short\n"); @@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } } + if (ndopts.nd_opts_nonce) + memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); @@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb) have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { + if (nonce != 0 && ifp->dad_nonce == nonce) { + u8 *np = (u8 *)&nonce; + /* Matching nonce if looped back */ + ND_PRINTK(2, notice, + "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", + ifp->idev->dev->name, + &ifp->addr, np); + goto out; + } /* * We are colliding with another node * who is doing DAD diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b317bb135ed4..aac7818e2e0f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -527,7 +527,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); dev_put(work->dev); kfree(work); } -- cgit v1.2.3-71-gd317 From 3fefeb88d002850e591339fed291eb6a795d9f21 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 15 Nov 2016 15:08:24 +0100 Subject: netfilter: nf_conntrack_tuple_common.h: fix #include To allow usage of enum ip_conntrack_dir in include/net/netns/conntrack.h, this patch encloses #include in a #ifndef __KERNEL__ directive, so that compiler errors caused by unwanted inclusion of include/linux/netfilter.h are avoided. In addition, #include line has been added to resolve correctly CTINFO2DIR macro. Signed-off-by: Davide Caratti Acked-by: Mikko Rapeli Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_tuple_common.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h index a9c3834abdd4..526b42496b78 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h @@ -2,7 +2,10 @@ #define _NF_CONNTRACK_TUPLE_COMMON_H #include +#ifndef __KERNEL__ #include +#endif +#include /* IP_CT_IS_REPLY */ enum ip_conntrack_dir { IP_CT_DIR_ORIGINAL, -- cgit v1.2.3-71-gd317 From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 4 Dec 2016 23:19:41 +0100 Subject: bpf: add prog_digest and expose it via fdinfo/netlink When loading a BPF program via bpf(2), calculate the digest over the program's instruction stream and store it in struct bpf_prog's digest member. This is done at a point in time before any instructions are rewritten by the verifier. Any unstable map file descriptor number part of the imm field will be zeroed for the hash. fdinfo example output for progs: # cat /proc/1590/fdinfo/5 pos: 0 flags: 02000002 mnt_id: 11 prog_type: 1 prog_jited: 1 prog_digest: b27e8b06da22707513aa97363dfb11c7c3675d28 memlock: 4096 When programs are pinned and retrieved by an ELF loader, the loader can check the program's digest through fdinfo and compare it against one that was generated over the ELF file's program section to see if the program needs to be reloaded. Furthermore, this can also be exposed through other means such as netlink in case of a tc cls/act dump (or xdp in future), but also through tracepoints or other facilities to identify the program. Other than that, the digest can also serve as a base name for the work in progress kallsyms support of programs. The digest doesn't depend/select the crypto layer, since we need to keep dependencies to a minimum. iproute2 will get support for this facility. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/linux/filter.h | 7 +++- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_bpf.h | 1 + kernel/bpf/core.c | 65 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 24 +++++++++++++- kernel/bpf/verifier.c | 2 ++ net/sched/act_bpf.c | 9 ++++++ net/sched/cls_bpf.c | 8 +++++ 9 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 69d0a7f12a3b..8796ff03f472 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +void bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 97338134398f..f078d2b1cff6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -56,6 +57,9 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +/* Maximum BPF program size in bytes. */ +#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) + /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -404,8 +408,9 @@ struct bpf_prog { cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); - u32 len; /* Number of filter blocks */ enum bpf_prog_type type; /* Type of BPF program */ + u32 len; /* Number of filter blocks */ + u32 digest[SHA_DIGEST_WORDS]; /* Program digest */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const void *ctx, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 86786d45ee66..1adc0b654996 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,7 @@ enum { TCA_BPF_NAME, TCA_BPF_FLAGS, TCA_BPF_FLAGS_GEN, + TCA_BPF_DIGEST, __TCA_BPF_MAX, }; diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 063d9d465119..a6b88a6f7f71 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -27,6 +27,7 @@ enum { TCA_ACT_BPF_FD, TCA_ACT_BPF_NAME, TCA_ACT_BPF_PAD, + TCA_ACT_BPF_DIGEST, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82a04143368e..bdcc9f4ba767 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side. + */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85af86c496cd..c0d2b423ce93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38d05da84a49..cb37339ca0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3176,6 +3176,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 84c1d2da4f8b..1c60317f0121 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, + sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index f70e03d2d2c8..adc776048d1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } -- cgit v1.2.3-71-gd317 From 1814096980bbe546c4384b7b064126cbe7d40d30 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 24 Nov 2016 12:04:55 +0100 Subject: netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields This patch adds a new flag that signals the kernel to update layer 4 checksum if the packet field belongs to the layer 4 pseudoheader. This implicitly provides stateless NAT 1:1 that is useful under very specific usecases. Since rules mangling layer 3 fields that are part of the pseudoheader may potentially convey any layer 4 packet, we have to deal with the layer 4 checksum adjustment using protocol specific code. This patch adds support for TCP, UDP and ICMPv6, since they include the pseudoheader in the layer 4 checksum calculation. ICMP doesn't, so we can skip it. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + include/uapi/linux/netfilter/nf_tables.h | 6 ++ net/netfilter/nft_payload.c | 107 +++++++++++++++++++++++++++++-- 3 files changed, 109 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 862373d4ea9d..8f690effec37 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -45,6 +45,7 @@ struct nft_payload_set { enum nft_registers sreg:8; u8 csum_type; u8 csum_offset; + u8 csum_flags; }; extern const struct nft_expr_ops nft_payload_fast_ops; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 14e5f619167e..f030e59aa2ec 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -659,6 +659,10 @@ enum nft_payload_csum_types { NFT_PAYLOAD_CSUM_INET, }; +enum nft_payload_csum_flags { + NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), +}; + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * @@ -669,6 +673,7 @@ enum nft_payload_csum_types { * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: nft_registers) * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32) * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32) + * @NFTA_PAYLOAD_CSUM_FLAGS: checksum flags (NLA_U32) */ enum nft_payload_attributes { NFTA_PAYLOAD_UNSPEC, @@ -679,6 +684,7 @@ enum nft_payload_attributes { NFTA_PAYLOAD_SREG, NFTA_PAYLOAD_CSUM_TYPE, NFTA_PAYLOAD_CSUM_OFFSET, + NFTA_PAYLOAD_CSUM_FLAGS, __NFTA_PAYLOAD_MAX }; #define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 98fb5d7b8087..36d2b1096546 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2008-2009 Patrick McHardy + * Copyright (c) 2016 Pablo Neira Ayuso * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +18,10 @@ #include #include #include +/* For layer 4 checksum field offset. */ +#include +#include +#include /* add vlan header into the user buffer for if tag was removed by offloads */ static bool @@ -164,6 +169,87 @@ const struct nft_expr_ops nft_payload_fast_ops = { .dump = nft_payload_dump, }; +static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) +{ + *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + if (*sum == 0) + *sum = CSUM_MANGLED_0; +} + +static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff) +{ + struct udphdr *uh, _uh; + + uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh); + if (!uh) + return false; + + return uh->check; +} + +static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, + struct sk_buff *skb, + unsigned int *l4csum_offset) +{ + switch (pkt->tprot) { + case IPPROTO_TCP: + *l4csum_offset = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + return -1; + /* Fall through. */ + case IPPROTO_UDPLITE: + *l4csum_offset = offsetof(struct udphdr, check); + break; + case IPPROTO_ICMPV6: + *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + break; + default: + return -1; + } + + *l4csum_offset += pkt->xt.thoff; + return 0; +} + +static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, + struct sk_buff *skb, + __wsum fsum, __wsum tsum) +{ + int l4csum_offset; + __sum16 sum; + + /* If we cannot determine layer 4 checksum offset or this packet doesn't + * require layer 4 checksum recalculation, skip this packet. + */ + if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0) + return 0; + + if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + /* Checksum mangling for an arbitrary amount of bytes, based on + * inet_proto_csum_replace*() functions. + */ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + nft_csum_replace(&sum, fsum, tsum); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum), + tsum); + } + } else { + sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum), + tsum)); + } + + if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + return 0; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -204,14 +290,15 @@ static void nft_payload_set_eval(const struct nft_expr *expr, fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); - sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum), - tsum)); - if (sum == 0) - sum = CSUM_MANGLED_0; + nft_csum_replace(&sum, fsum, tsum); if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) goto err; + + if (priv->csum_flags && + nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0) + goto err; } if (!skb_make_writable(skb, max(offset + priv->len, 0)) || @@ -240,6 +327,15 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) priv->csum_offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { + u32 flags; + + flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS])); + if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR) + return -EINVAL; + + priv->csum_flags = flags; + } switch (priv->csum_type) { case NFT_PAYLOAD_CSUM_NONE: @@ -262,7 +358,8 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, - htonl(priv->csum_offset))) + htonl(priv->csum_offset)) || + nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags))) goto nla_put_failure; return 0; -- cgit v1.2.3-71-gd317 From e50092404c1bc7aaeb0a0f4077fa6f07b073a20f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:04:32 +0100 Subject: netfilter: nf_tables: add stateful objects This patch augments nf_tables to support stateful objects. This new infrastructure allows you to create, dump and delete stateful objects, that are identified by a user-defined name. This patch adds the generic infrastructure, follow up patches add support for two stateful objects: counters and quotas. This patch provides a native infrastructure for nf_tables to replace nfacct, the extended accounting infrastructure for iptables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 79 +++++ include/uapi/linux/netfilter/nf_tables.h | 29 ++ net/netfilter/nf_tables_api.c | 516 +++++++++++++++++++++++++++++++ 3 files changed, 624 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 32970cba184a..903cd618f50e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @list: used internally * @chains: chains in the table * @sets: sets in the table + * @objects: stateful objects in the table * @hgenerator: handle generator state * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) @@ -885,6 +886,7 @@ struct nft_table { struct list_head list; struct list_head chains; struct list_head sets; + struct list_head objects; u64 hgenerator; u32 use; u16 flags:14, @@ -934,6 +936,73 @@ void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); +/** + * struct nft_object - nf_tables stateful object + * + * @list: table stateful object list node + * @type: pointer to object type + * @data: pointer to object data + * @name: name of this stateful object + * @genmask: generation mask + * @use: number of references to this stateful object + * @data: object data, layout depends on type + */ +struct nft_object { + struct list_head list; + char name[NFT_OBJ_MAXNAMELEN]; + u32 genmask:2, + use:30; + /* runtime data below here */ + const struct nft_object_type *type ____cacheline_aligned; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); +}; + +static inline void *nft_obj_data(const struct nft_object *obj) +{ + return (void *)obj->data; +} + +#define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) + +struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); + +/** + * struct nft_object_type - stateful object type + * + * @eval: stateful object evaluation function + * @list: list node in list of object types + * @type: stateful object numeric type + * @size: stateful object size + * @owner: module owner + * @maxattr: maximum netlink attribute + * @policy: netlink attribute policy + * @init: initialize object from netlink attributes + * @destroy: release existing stateful object + * @dump: netlink dump stateful object + */ +struct nft_object_type { + void (*eval)(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt); + struct list_head list; + u32 type; + unsigned int size; + unsigned int maxattr; + struct module *owner; + const struct nla_policy *policy; + int (*init)(const struct nlattr * const tb[], + struct nft_object *obj); + void (*destroy)(struct nft_object *obj); + int (*dump)(struct sk_buff *skb, + const struct nft_object *obj); +}; + +int nft_register_obj(struct nft_object_type *obj_type); +void nft_unregister_obj(struct nft_object_type *obj_type); + /** * struct nft_traceinfo - nft tracing information and state * @@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_SET() \ MODULE_ALIAS("nft-set") +#define MODULE_ALIAS_NFT_OBJ(type) \ + MODULE_ALIAS("nft-obj-" __stringify(type)) + /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations @@ -1157,4 +1229,11 @@ struct nft_trans_elem { #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) +struct nft_trans_obj { + struct nft_object *obj; +}; + +#define nft_trans_obj(trans) \ + (((struct nft_trans_obj *)trans->data)->obj) + #endif /* _NET_NF_TABLES_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f030e59aa2ec..18e30dbc8c3f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -4,6 +4,7 @@ #define NFT_TABLE_MAXNAMELEN 32 #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_SET_MAXNAMELEN 32 +#define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 /** @@ -85,6 +86,9 @@ enum nft_verdicts { * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes) + * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes) + * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes) + * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -105,6 +109,9 @@ enum nf_tables_msg_types { NFT_MSG_NEWGEN, NFT_MSG_GETGEN, NFT_MSG_TRACE, + NFT_MSG_NEWOBJ, + NFT_MSG_GETOBJ, + NFT_MSG_DELOBJ, NFT_MSG_MAX, }; @@ -1178,6 +1185,28 @@ enum nft_fib_flags { NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ }; +#define NFT_OBJECT_UNSPEC 0 + +/** + * enum nft_object_attributes - nf_tables stateful object netlink attributes + * + * @NFTA_OBJ_TABLE: name of the table containing the expression (NLA_STRING) + * @NFTA_OBJ_NAME: name of this expression type (NLA_STRING) + * @NFTA_OBJ_TYPE: stateful object type (NLA_U32) + * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) + * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) + */ +enum nft_object_attributes { + NFTA_OBJ_UNSPEC, + NFTA_OBJ_TABLE, + NFTA_OBJ_NAME, + NFTA_OBJ_TYPE, + NFTA_OBJ_DATA, + NFTA_OBJ_USE, + __NFTA_OBJ_MAX +}; +#define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) + /** * enum nft_trace_attributes - nf_tables trace netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e5194f6f906c..2ae717c5dcb8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -22,6 +22,7 @@ #include static LIST_HEAD(nf_tables_expressions); +static LIST_HEAD(nf_tables_objects); /** * nft_register_afinfo - register nf_tables address family info @@ -304,6 +305,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) return err; } +static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, + struct nft_object *obj) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj)); + if (trans == NULL) + return -ENOMEM; + + if (msg_type == NFT_MSG_NEWOBJ) + nft_activate_next(ctx->net, obj); + + nft_trans_obj(trans) = obj; + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; +} + +static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) +{ + int err; + + err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj); + if (err < 0) + return err; + + nft_deactivate_next(ctx->net, obj); + ctx->table->use--; + + return err; +} + /* * Tables */ @@ -688,6 +721,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); + INIT_LIST_HEAD(&table->objects); table->flags = flags; nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); @@ -709,6 +743,7 @@ static int nft_flush_table(struct nft_ctx *ctx) { int err; struct nft_chain *chain, *nc; + struct nft_object *obj, *ne; struct nft_set *set, *ns; list_for_each_entry(chain, &ctx->table->chains, list) { @@ -735,6 +770,12 @@ static int nft_flush_table(struct nft_ctx *ctx) goto out; } + list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) { + err = nft_delobj(ctx, obj); + if (err < 0) + goto out; + } + list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) { if (!nft_is_active_next(ctx->net, chain)) continue; @@ -3838,6 +3879,434 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, } EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); +/* + * Stateful objects + */ + +/** + * nft_register_obj- register nf_tables stateful object type + * @obj: object type + * + * Registers the object type for use with nf_tables. Returns zero on + * success or a negative errno code otherwise. + */ +int nft_register_obj(struct nft_object_type *obj_type) +{ + if (obj_type->type == NFT_OBJECT_UNSPEC) + return -EINVAL; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_add_rcu(&obj_type->list, &nf_tables_objects); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return 0; +} +EXPORT_SYMBOL_GPL(nft_register_obj); + +/** + * nft_unregister_obj - unregister nf_tables object type + * @obj: object type + * + * Unregisters the object type for use with nf_tables. + */ +void nft_unregister_obj(struct nft_object_type *obj_type) +{ + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_del_rcu(&obj_type->list); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); +} +EXPORT_SYMBOL_GPL(nft_unregister_obj); + +struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) +{ + struct nft_object *obj; + + list_for_each_entry(obj, &table->objects, list) { + if (!nla_strcmp(nla, obj->name) && + objtype == obj->type->type && + nft_active_genmask(obj, genmask)) + return obj; + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); + +static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { + [NFTA_OBJ_TABLE] = { .type = NLA_STRING }, + [NFTA_OBJ_NAME] = { .type = NLA_STRING }, + [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, + [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, +}; + +static struct nft_object *nft_obj_init(const struct nft_object_type *type, + const struct nlattr *attr) +{ + struct nlattr *tb[type->maxattr + 1]; + struct nft_object *obj; + int err; + + if (attr) { + err = nla_parse_nested(tb, type->maxattr, attr, type->policy); + if (err < 0) + goto err1; + } else { + memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); + } + + err = -ENOMEM; + obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL); + if (obj == NULL) + goto err1; + + err = type->init((const struct nlattr * const *)tb, obj); + if (err < 0) + goto err2; + + obj->type = type; + return obj; +err2: + kfree(obj); +err1: + return ERR_PTR(err); +} + +static int nft_object_dump(struct sk_buff *skb, unsigned int attr, + const struct nft_object *obj) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr); + if (!nest) + goto nla_put_failure; + if (obj->type->dump(skb, obj) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_object_type *__nft_obj_type_get(u32 objtype) +{ + const struct nft_object_type *type; + + list_for_each_entry(type, &nf_tables_objects, list) { + if (objtype == type->type) + return type; + } + return NULL; +} + +static const struct nft_object_type *nft_obj_type_get(u32 objtype) +{ + const struct nft_object_type *type; + + type = __nft_obj_type_get(objtype); + if (type != NULL && try_module_get(type->owner)) + return type; + +#ifdef CONFIG_MODULES + if (type == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nft-obj-%u", objtype); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + if (__nft_obj_type_get(objtype)) + return ERR_PTR(-EAGAIN); + } +#endif + return ERR_PTR(-ENOENT); +} + +static int nf_tables_newobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nft_object_type *type; + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_object *obj; + struct nft_ctx ctx; + u32 objtype; + int err; + + if (!nla[NFTA_OBJ_TYPE] || + !nla[NFTA_OBJ_NAME] || + !nla[NFTA_OBJ_DATA]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + if (err != -ENOENT) + return err; + + obj = NULL; + } + + if (obj != NULL) { + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + return 0; + } + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + type = nft_obj_type_get(objtype); + if (IS_ERR(type)) + return PTR_ERR(type); + + obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err1; + } + nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); + + err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); + if (err < 0) + goto err2; + + list_add_tail_rcu(&obj->list, &table->objects); + table->use++; + return 0; +err2: + if (obj->type->destroy) + obj->type->destroy(obj); + kfree(obj); +err1: + module_put(type->owner); + return err; +} + +static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, + const struct nft_object *obj) +{ + struct nfgenmsg *nfmsg; + struct nlmsghdr *nlh; + + event |= NFNL_SUBSYS_NFTABLES << 8; + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || + nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || + nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) || + nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || + nft_object_dump(skb, NFTA_OBJ_DATA, obj)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -1; +} + +static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + const struct nft_af_info *afi; + const struct nft_table *table; + const struct nft_object *obj; + unsigned int idx = 0, s_idx = cb->args[0]; + struct net *net = sock_net(skb->sk); + int family = nfmsg->nfgen_family; + + rcu_read_lock(); + cb->seq = net->nft.base_seq; + + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (family != NFPROTO_UNSPEC && family != afi->family) + continue; + + list_for_each_entry_rcu(table, &afi->tables, list) { + list_for_each_entry_rcu(obj, &table->objects, list) { + if (!nft_is_active(net, obj)) + goto cont; + if (idx < s_idx) + goto cont; + if (idx > s_idx) + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); + if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NFT_MSG_NEWOBJ, + NLM_F_MULTI | NLM_F_APPEND, + afi->family, table, obj) < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } + } +done: + rcu_read_unlock(); + + cb->args[0] = idx; + return skb->len; +} + +static int nf_tables_getobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_cur(net); + int family = nfmsg->nfgen_family; + const struct nft_af_info *afi; + const struct nft_table *table; + struct nft_object *obj; + struct sk_buff *skb2; + u32 objtype; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nf_tables_dump_obj, + }; + return netlink_dump_start(nlsk, skb, nlh, &c); + } + + if (!nla[NFTA_OBJ_NAME] || + !nla[NFTA_OBJ_TYPE]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, false); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + return -ENOMEM; + + err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + family, table, obj); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; + + return 0; +} + +static void nft_obj_destroy(struct nft_object *obj) +{ + if (obj->type->destroy) + obj->type->destroy(obj); + + module_put(obj->type->owner); + kfree(obj); +} + +static int nf_tables_delobj(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + struct nft_af_info *afi; + struct nft_table *table; + struct nft_object *obj; + struct nft_ctx ctx; + u32 objtype; + + if (!nla[NFTA_OBJ_TYPE] || + !nla[NFTA_OBJ_NAME]) + return -EINVAL; + + afi = nf_tables_afinfo_lookup(net, family, true); + if (IS_ERR(afi)) + return PTR_ERR(afi); + + table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask); + if (IS_ERR(table)) + return PTR_ERR(table); + + objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); + obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) + return PTR_ERR(obj); + if (obj->use > 0) + return -EBUSY; + + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); + + return nft_delobj(&ctx, obj); +} + +static int nf_tables_obj_notify(const struct nft_ctx *ctx, + struct nft_object *obj, int event) +{ + struct sk_buff *skb; + int err; + + if (!ctx->report && + !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto err; + + err = nf_tables_fill_obj_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, + obj); + if (err < 0) { + kfree_skb(skb); + goto err; + } + + err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, + ctx->report, GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { @@ -3998,6 +4467,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_GETGEN] = { .call = nf_tables_getgen, }, + [NFT_MSG_NEWOBJ] = { + .call_batch = nf_tables_newobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_GETOBJ] = { + .call = nf_tables_getobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, + [NFT_MSG_DELOBJ] = { + .call_batch = nf_tables_delobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -4040,6 +4524,9 @@ static void nf_tables_commit_release(struct nft_trans *trans) nft_set_elem_destroy(nft_trans_elem_set(trans), nft_trans_elem(trans).priv, true); break; + case NFT_MSG_DELOBJ: + nft_obj_destroy(nft_trans_obj(trans)); + break; } kfree(trans); } @@ -4147,6 +4634,17 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) atomic_dec(&te->set->nelems); te->set->ndeact--; break; + case NFT_MSG_NEWOBJ: + nft_clear(net, nft_trans_obj(trans)); + nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), + NFT_MSG_NEWOBJ); + nft_trans_destroy(trans); + break; + case NFT_MSG_DELOBJ: + list_del_rcu(&nft_trans_obj(trans)->list); + nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans), + NFT_MSG_DELOBJ); + break; } } @@ -4181,6 +4679,9 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_set_elem_destroy(nft_trans_elem_set(trans), nft_trans_elem(trans).priv, true); break; + case NFT_MSG_NEWOBJ: + nft_obj_destroy(nft_trans_obj(trans)); + break; } kfree(trans); } @@ -4259,6 +4760,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWOBJ: + trans->ctx.table->use--; + list_del_rcu(&nft_trans_obj(trans)->list); + break; + case NFT_MSG_DELOBJ: + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; } @@ -4807,6 +5317,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) { struct nft_table *table, *nt; struct nft_chain *chain, *nc; + struct nft_object *obj, *ne; struct nft_rule *rule, *nr; struct nft_set *set, *ns; struct nft_ctx ctx = { @@ -4833,6 +5344,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi) table->use--; nft_set_destroy(set); } + list_for_each_entry_safe(obj, ne, &table->objects, list) { + list_del(&obj->list); + table->use--; + nft_obj_destroy(obj); + } list_for_each_entry_safe(chain, nc, &table->chains, list) { list_del(&chain->list); table->use--; -- cgit v1.2.3-71-gd317 From b1ce0ced101ee134c5d0bbb378b2c3cadc617f20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:04:36 +0100 Subject: netfilter: nft_counter: add stateful object type Register a new percpu counter stateful object type into the stateful object infrastructure. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_counter.c | 140 +++++++++++++++++++++++++------ 2 files changed, 114 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 18e30dbc8c3f..e352ef65d753 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1186,6 +1186,7 @@ enum nft_fib_flags { }; #define NFT_OBJECT_UNSPEC 0 +#define NFT_OBJECT_COUNTER 1 /** * enum nft_object_attributes - nf_tables stateful object netlink attributes diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 77db8358ab14..6f3dd429f865 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -31,11 +31,10 @@ struct nft_counter_percpu_priv { struct nft_counter_percpu __percpu *counter; }; -static void nft_counter_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); struct nft_counter_percpu *this_cpu; local_bh_disable(); @@ -47,6 +46,60 @@ static void nft_counter_eval(const struct nft_expr *expr, local_bh_enable(); } +static inline void nft_counter_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + nft_counter_do_eval(priv, regs, pkt); +} + +static int nft_counter_do_init(const struct nlattr * const tb[], + struct nft_counter_percpu_priv *priv) +{ + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + + cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); + if (cpu_stats == NULL) + return -ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + if (tb[NFTA_COUNTER_PACKETS]) { + this_cpu->counter.packets = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + } + if (tb[NFTA_COUNTER_BYTES]) { + this_cpu->counter.bytes = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + } + preempt_enable(); + priv->counter = cpu_stats; + return 0; +} + +static int nft_counter_obj_init(const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + return nft_counter_do_init(tb, priv); +} + +static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv) +{ + free_percpu(priv->counter); +} + +static void nft_counter_obj_destroy(struct nft_object *obj) +{ + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + nft_counter_do_destroy(priv); +} + static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, struct nft_counter *total) { @@ -69,9 +122,9 @@ static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, } } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_counter_do_dump(struct sk_buff *skb, + const struct nft_counter_percpu_priv *priv) { - struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); struct nft_counter total; nft_counter_fetch(priv->counter, &total); @@ -87,36 +140,54 @@ nla_put_failure: return -1; } +static int nft_counter_obj_dump(struct sk_buff *skb, + const struct nft_object *obj) +{ + const struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + + return nft_counter_do_dump(skb, priv); +} + static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 }, [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; +static struct nft_object_type nft_counter_obj __read_mostly = { + .type = NFT_OBJECT_COUNTER, + .size = sizeof(struct nft_counter_percpu_priv), + .maxattr = NFTA_COUNTER_MAX, + .policy = nft_counter_policy, + .eval = nft_counter_obj_eval, + .init = nft_counter_obj_init, + .destroy = nft_counter_obj_destroy, + .dump = nft_counter_obj_dump, + .owner = THIS_MODULE, +}; + +static void nft_counter_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + + nft_counter_do_eval(priv, regs, pkt); +} + +static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + + return nft_counter_do_dump(skb, priv); +} + static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - struct nft_counter_percpu __percpu *cpu_stats; - struct nft_counter_percpu *this_cpu; - cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); - if (cpu_stats == NULL) - return -ENOMEM; - - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->counter.packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); - } - if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->counter.bytes = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); - } - preempt_enable(); - priv->counter = cpu_stats; - return 0; + return nft_counter_do_init(tb, priv); } static void nft_counter_destroy(const struct nft_ctx *ctx, @@ -124,7 +195,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - free_percpu(priv->counter); + nft_counter_do_destroy(priv); } static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) @@ -174,12 +245,26 @@ static struct nft_expr_type nft_counter_type __read_mostly = { static int __init nft_counter_module_init(void) { - return nft_register_expr(&nft_counter_type); + int err; + + err = nft_register_obj(&nft_counter_obj); + if (err < 0) + return err; + + err = nft_register_expr(&nft_counter_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_counter_obj); + return err; } static void __exit nft_counter_module_exit(void) { nft_unregister_expr(&nft_counter_type); + nft_unregister_obj(&nft_counter_obj); } module_init(nft_counter_module_init); @@ -188,3 +273,4 @@ module_exit(nft_counter_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("counter"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER); -- cgit v1.2.3-71-gd317 From 173705d9a2df1490478bf0d39f1b517bd489c8fa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:04:43 +0100 Subject: netfilter: nft_quota: add stateful object type Register a new quota stateful object type into the new stateful object infrastructure. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_quota.c | 96 +++++++++++++++++++++++++++----- 2 files changed, 84 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e352ef65d753..ad0577ba5d2a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1187,6 +1187,7 @@ enum nft_fib_flags { #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 +#define NFT_OBJECT_QUOTA 2 /** * enum nft_object_attributes - nf_tables stateful object netlink attributes diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index c00104c07095..09ce72b1d6bf 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -27,12 +27,10 @@ static inline bool nft_overquota(struct nft_quota *priv, return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; } -static void nft_quota_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline void nft_quota_do_eval(struct nft_quota *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_quota *priv = nft_expr_priv(expr); - if (nft_overquota(priv, pkt) ^ priv->invert) regs->verdict.code = NFT_BREAK; } @@ -42,11 +40,18 @@ static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, }; -static int nft_quota_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static void nft_quota_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_quota *priv = nft_obj_data(obj); + + nft_quota_do_eval(priv, regs, pkt); +} + +static int nft_quota_do_init(const struct nlattr * const tb[], + struct nft_quota *priv) { - struct nft_quota *priv = nft_expr_priv(expr); u32 flags = 0; u64 quota; @@ -70,9 +75,16 @@ static int nft_quota_init(const struct nft_ctx *ctx, return 0; } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_obj_init(const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_init(tb, priv); +} + +static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv) { - const struct nft_quota *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), @@ -85,6 +97,49 @@ nla_put_failure: return -1; } +static int nft_quota_obj_dump(struct sk_buff *skb, const struct nft_object *obj) +{ + struct nft_quota *priv = nft_obj_data(obj); + + return nft_quota_do_dump(skb, priv); +} + +static struct nft_object_type nft_quota_obj __read_mostly = { + .type = NFT_OBJECT_QUOTA, + .size = sizeof(struct nft_quota), + .maxattr = NFTA_QUOTA_MAX, + .policy = nft_quota_policy, + .init = nft_quota_obj_init, + .eval = nft_quota_obj_eval, + .dump = nft_quota_obj_dump, + .owner = THIS_MODULE, +}; + +static void nft_quota_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + nft_quota_do_eval(priv, regs, pkt); +} + +static int nft_quota_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_init(tb, priv); +} + +static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_quota *priv = nft_expr_priv(expr); + + return nft_quota_do_dump(skb, priv); +} + static struct nft_expr_type nft_quota_type; static const struct nft_expr_ops nft_quota_ops = { .type = &nft_quota_type, @@ -105,12 +160,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = { static int __init nft_quota_module_init(void) { - return nft_register_expr(&nft_quota_type); + int err; + + err = nft_register_obj(&nft_quota_obj); + if (err < 0) + return err; + + err = nft_register_expr(&nft_quota_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_quota_obj); + return err; } static void __exit nft_quota_module_exit(void) { - nft_unregister_expr(&nft_quota_type); + nft_unregister_expr(&nft_quota_type); + nft_unregister_obj(&nft_quota_obj); } module_init(nft_quota_module_init); @@ -119,3 +188,4 @@ module_exit(nft_quota_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_ALIAS_NFT_EXPR("quota"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA); -- cgit v1.2.3-71-gd317 From c97d22e68bfedfacb9e752dee536c69916ae0933 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:38 +0100 Subject: netfilter: nf_tables: add stateful object reference expression This new expression allows us to refer to existing stateful objects from rules. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 ++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/nft_objref.c | 112 +++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+) create mode 100644 net/netfilter/nft_objref.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ad0577ba5d2a..1043ce4250c5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1137,6 +1137,20 @@ enum nft_fwd_attributes { }; #define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) +/** + * enum nft_objref_attributes - nf_tables stateful object expression netlink attributes + * + * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: nft_register) + * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING) + */ +enum nft_objref_attributes { + NFTA_OBJREF_UNSPEC, + NFTA_OBJREF_IMM_TYPE, + NFTA_OBJREF_IMM_NAME, + __NFTA_OBJREF_MAX +}; +#define NFTA_OBJREF_MAX (__NFTA_OBJREF_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index def4be06cda6..63729b489c2c 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -551,6 +551,12 @@ config NFT_NAT This option adds the "nat" expression that you can use to perform typical Network Address Translation (NAT) packet transformations. +config NFT_OBJREF + tristate "Netfilter nf_tables stateful object reference module" + help + This option adds the "objref" expression that allows you to refer to + stateful objects, such as counters and quotas. + config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e4c8c1d7aaed..ca30d1960f1d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o +obj-$(CONFIG_NFT_OBJREF) += nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c new file mode 100644 index 000000000000..23820f796aad --- /dev/null +++ b/net/netfilter/nft_objref.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2012-2016 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr)) + +static void nft_objref_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_object *obj = nft_objref_priv(expr); + + obj->type->eval(obj, regs, pkt); +} + +static int nft_objref_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_object *obj = nft_objref_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + u32 objtype; + + if (!tb[NFTA_OBJREF_IMM_NAME] || + !tb[NFTA_OBJREF_IMM_TYPE]) + return -EINVAL; + + objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); + obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); + if (IS_ERR(obj)) + return -ENOENT; + + nft_objref_priv(expr) = obj; + obj->use++; + + return 0; +} + +static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_object *obj = nft_objref_priv(expr); + + if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) || + nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void nft_objref_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + obj->use--; +} + +static struct nft_expr_type nft_objref_type; +static const struct nft_expr_ops nft_objref_ops = { + .type = &nft_objref_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), + .eval = nft_objref_eval, + .init = nft_objref_init, + .destroy = nft_objref_destroy, + .dump = nft_objref_dump, +}; + +static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { + [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_objref_type __read_mostly = { + .name = "objref", + .ops = &nft_objref_ops, + .policy = nft_objref_policy, + .maxattr = NFTA_OBJREF_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_objref_module_init(void) +{ + return nft_register_expr(&nft_objref_type); +} + +static void __exit nft_objref_module_exit(void) +{ + nft_unregister_expr(&nft_objref_type); +} + +module_init(nft_objref_module_init); +module_exit(nft_objref_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_EXPR("objref"); -- cgit v1.2.3-71-gd317 From 795595f68d6c787028345804bb06f5a633af24a2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:52 +0100 Subject: netfilter: nft_quota: dump consumed quota Add a new attribute NFTA_QUOTA_CONSUMED that displays the amount of quota that has been already consumed. This allows us to restore the internal state of the quota object between reboots as well as to monitor how wasted it is. This patch changes the logic to account for the consumed bytes, instead of the bytes that remain to be consumed. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_quota.c | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 1043ce4250c5..3d47582caa80 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -988,12 +988,14 @@ enum nft_quota_flags { * * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U16) * @NFTA_QUOTA_FLAGS: flags (NLA_U32) + * @NFTA_QUOTA_CONSUMED: quota already consumed in bytes (NLA_U64) */ enum nft_quota_attributes { NFTA_QUOTA_UNSPEC, NFTA_QUOTA_BYTES, NFTA_QUOTA_FLAGS, NFTA_QUOTA_PAD, + NFTA_QUOTA_CONSUMED, __NFTA_QUOTA_MAX }; #define NFTA_QUOTA_MAX (__NFTA_QUOTA_MAX - 1) diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 09ce72b1d6bf..0d344209803a 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -18,20 +18,20 @@ struct nft_quota { u64 quota; bool invert; - atomic64_t remain; + atomic64_t consumed; }; static inline bool nft_overquota(struct nft_quota *priv, - const struct nft_pktinfo *pkt) + const struct sk_buff *skb) { - return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; + return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota; } static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt) ^ priv->invert) + if (nft_overquota(priv, pkt->skb) ^ priv->invert) regs->verdict.code = NFT_BREAK; } @@ -70,7 +70,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[], priv->quota = quota; priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; - atomic64_set(&priv->remain, quota); + atomic64_set(&priv->consumed, 0); return 0; } @@ -86,9 +86,20 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv) { u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; + u64 consumed; + + consumed = atomic64_read(&priv->consumed); + /* Since we inconditionally increment consumed quota for each packet + * that we see, don't go over the quota boundary in what we send to + * userspace. + */ + if (consumed > priv->quota) + consumed = priv->quota; if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), NFTA_QUOTA_PAD) || + nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed), + NFTA_QUOTA_PAD) || nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; return 0; -- cgit v1.2.3-71-gd317 From 43da04a593d8b2626f1cf4b56efe9402f6b53652 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:44 +0100 Subject: netfilter: nf_tables: atomic dump and reset for stateful objects This patch adds a new NFT_MSG_GETOBJ_RESET command perform an atomic dump-and-reset of the stateful object. This also comes with add support for atomic dump and reset for counter and quota objects. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 29 ++++++++++++----- net/netfilter/nft_counter.c | 56 +++++++++++++++++++++++++++----- net/netfilter/nft_quota.c | 18 ++++++---- 5 files changed, 85 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 903cd618f50e..6f7d6a1dc09c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -997,7 +997,8 @@ struct nft_object_type { struct nft_object *obj); void (*destroy)(struct nft_object *obj); int (*dump)(struct sk_buff *skb, - const struct nft_object *obj); + struct nft_object *obj, + bool reset); }; int nft_register_obj(struct nft_object_type *obj_type); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3d47582caa80..399eac1eee91 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -89,6 +89,7 @@ enum nft_verdicts { * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes) * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes) * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes) + * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -112,6 +113,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWOBJ, NFT_MSG_GETOBJ, NFT_MSG_DELOBJ, + NFT_MSG_GETOBJ_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2ae717c5dcb8..bfc015af366a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3972,14 +3972,14 @@ err1: } static int nft_object_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_object *obj) + struct nft_object *obj, bool reset) { struct nlattr *nest; nest = nla_nest_start(skb, attr); if (!nest) goto nla_put_failure; - if (obj->type->dump(skb, obj) < 0) + if (obj->type->dump(skb, obj, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -4096,7 +4096,7 @@ err1: static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, - const struct nft_object *obj) + struct nft_object *obj, bool reset) { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; @@ -4115,7 +4115,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || - nft_object_dump(skb, NFTA_OBJ_DATA, obj)) + nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -4131,10 +4131,14 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); const struct nft_af_info *afi; const struct nft_table *table; - const struct nft_object *obj; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nft_object *obj; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; rcu_read_lock(); cb->seq = net->nft.base_seq; @@ -4156,7 +4160,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, NLM_F_MULTI | NLM_F_APPEND, - afi->family, table, obj) < 0) + afi->family, table, obj, reset) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -4183,6 +4187,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, const struct nft_table *table; struct nft_object *obj; struct sk_buff *skb2; + bool reset = false; u32 objtype; int err; @@ -4214,9 +4219,12 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, if (!skb2) return -ENOMEM; + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + reset = true; + err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, - family, table, obj); + family, table, obj, reset); if (err < 0) goto err; @@ -4291,7 +4299,7 @@ static int nf_tables_obj_notify(const struct nft_ctx *ctx, err = nf_tables_fill_obj_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, ctx->afi->family, ctx->table, - obj); + obj, false); if (err < 0) { kfree_skb(skb); goto err; @@ -4482,6 +4490,11 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, + [NFT_MSG_GETOBJ_RESET] = { + .call = nf_tables_getobj, + .attr_count = NFTA_OBJ_MAX, + .policy = nft_obj_policy, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 6f3dd429f865..f6a02c5071c2 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -100,10 +100,10 @@ static void nft_counter_obj_destroy(struct nft_object *obj) nft_counter_do_destroy(priv); } -static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, +static void nft_counter_fetch(struct nft_counter_percpu __percpu *counter, struct nft_counter *total) { - const struct nft_counter_percpu *cpu_stats; + struct nft_counter_percpu *cpu_stats; u64 bytes, packets; unsigned int seq; int cpu; @@ -122,12 +122,52 @@ static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter, } } +static u64 __nft_counter_reset(u64 *counter) +{ + u64 ret, old; + + do { + old = *counter; + ret = cmpxchg64(counter, old, 0); + } while (ret != old); + + return ret; +} + +static void nft_counter_reset(struct nft_counter_percpu __percpu *counter, + struct nft_counter *total) +{ + struct nft_counter_percpu *cpu_stats; + u64 bytes, packets; + unsigned int seq; + int cpu; + + memset(total, 0, sizeof(*total)); + for_each_possible_cpu(cpu) { + bytes = packets = 0; + + cpu_stats = per_cpu_ptr(counter, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + packets += __nft_counter_reset(&cpu_stats->counter.packets); + bytes += __nft_counter_reset(&cpu_stats->counter.bytes); + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + + total->packets += packets; + total->bytes += bytes; + } +} + static int nft_counter_do_dump(struct sk_buff *skb, - const struct nft_counter_percpu_priv *priv) + const struct nft_counter_percpu_priv *priv, + bool reset) { struct nft_counter total; - nft_counter_fetch(priv->counter, &total); + if (reset) + nft_counter_reset(priv->counter, &total); + else + nft_counter_fetch(priv->counter, &total); if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), NFTA_COUNTER_PAD) || @@ -141,11 +181,11 @@ nla_put_failure: } static int nft_counter_obj_dump(struct sk_buff *skb, - const struct nft_object *obj) + struct nft_object *obj, bool reset) { - const struct nft_counter_percpu_priv *priv = nft_obj_data(obj); + struct nft_counter_percpu_priv *priv = nft_obj_data(obj); - return nft_counter_do_dump(skb, priv); + return nft_counter_do_dump(skb, priv, reset); } static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { @@ -178,7 +218,7 @@ static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv); + return nft_counter_do_dump(skb, priv, false); } static int nft_counter_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0d344209803a..5d25f57497cb 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -83,12 +83,17 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], return nft_quota_do_init(tb, priv); } -static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv) +static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, + bool reset) { u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; u64 consumed; - consumed = atomic64_read(&priv->consumed); + if (reset) + consumed = atomic64_xchg(&priv->consumed, 0); + else + consumed = atomic64_read(&priv->consumed); + /* Since we inconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. @@ -108,11 +113,12 @@ nla_put_failure: return -1; } -static int nft_quota_obj_dump(struct sk_buff *skb, const struct nft_object *obj) +static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj, + bool reset) { struct nft_quota *priv = nft_obj_data(obj); - return nft_quota_do_dump(skb, priv); + return nft_quota_do_dump(skb, priv, reset); } static struct nft_object_type nft_quota_obj __read_mostly = { @@ -146,9 +152,9 @@ static int nft_quota_init(const struct nft_ctx *ctx, static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) { - const struct nft_quota *priv = nft_expr_priv(expr); + struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv); + return nft_quota_do_dump(skb, priv, false); } static struct nft_expr_type nft_quota_type; -- cgit v1.2.3-71-gd317 From 1896531710abcd9a961a17d0c5c6a9f537d479b6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:56 +0100 Subject: netfilter: nft_quota: add depleted flag for objects Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag indicates we have reached overquota. Add pointer to table from nft_object, so we can use it when sending the depletion notification to userspace. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 1 + net/netfilter/nft_quota.c | 36 +++++++++++++++++++++++++------- 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 339e374c28b5..ce6fb6e83b32 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * struct nft_object - nf_tables stateful object * * @list: table stateful object list node + * @table: table this object belongs to * @type: pointer to object type * @data: pointer to object data * @name: name of this stateful object @@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, struct nft_object { struct list_head list; char name[NFT_OBJ_MAXNAMELEN]; + struct nft_table *table; u32 genmask:2, use:30; /* runtime data below here */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 399eac1eee91..4864caca1e8e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -983,6 +983,7 @@ enum nft_queue_attributes { enum nft_quota_flags { NFT_QUOTA_F_INV = (1 << 0), + NFT_QUOTA_F_DEPLETED = (1 << 1), }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9d2ed3f520ef..c5419701ca79 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4075,6 +4075,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, err = PTR_ERR(obj); goto err1; } + obj->table = table; nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 5d25f57497cb..7f27ebdce7ab 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -17,7 +17,7 @@ struct nft_quota { u64 quota; - bool invert; + unsigned long flags; atomic64_t consumed; }; @@ -27,11 +27,16 @@ static inline bool nft_overquota(struct nft_quota *priv, return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota; } +static inline bool nft_quota_invert(struct nft_quota *priv) +{ + return priv->flags & NFT_QUOTA_F_INV; +} + static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ priv->invert) + if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -40,19 +45,29 @@ static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, }; +#define NFT_QUOTA_DEPLETED_BIT 1 /* From NFT_QUOTA_F_DEPLETED. */ + static void nft_quota_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); + bool overquota; - nft_quota_do_eval(priv, regs, pkt); + overquota = nft_overquota(priv, pkt->skb); + if (overquota ^ nft_quota_invert(priv)) + regs->verdict.code = NFT_BREAK; + + if (overquota && + !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) + nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0, + NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], struct nft_quota *priv) { - u32 flags = 0; + unsigned long flags = 0; u64 quota; if (!tb[NFTA_QUOTA_BYTES]) @@ -66,10 +81,12 @@ static int nft_quota_do_init(const struct nlattr * const tb[], flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS])); if (flags & ~NFT_QUOTA_F_INV) return -EINVAL; + if (flags & NFT_QUOTA_F_DEPLETED) + return -EOPNOTSUPP; } priv->quota = quota; - priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; + priv->flags = flags; atomic64_set(&priv->consumed, 0); return 0; @@ -86,13 +103,16 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { - u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0; + u32 flags = priv->flags; u64 consumed; - if (reset) + if (reset) { consumed = atomic64_xchg(&priv->consumed, 0); - else + if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) + flags |= NFT_QUOTA_F_DEPLETED; + } else { consumed = atomic64_read(&priv->consumed); + } /* Since we inconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to -- cgit v1.2.3-71-gd317 From 8aeff920dcc9b3f8cf43042a76428582634d9208 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:06:00 +0100 Subject: netfilter: nf_tables: add stateful object reference to set elements This patch allows you to refer to stateful objects from set elements. This provides the infrastructure to create maps where the right hand side of the mapping is a stateful object. This allows us to build dictionaries of stateful objects, that you can use to perform fast lookups using any arbitrary key combination. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 ++++ include/uapi/linux/netfilter/nf_tables.h | 8 ++++ net/netfilter/nf_tables_api.c | 72 +++++++++++++++++++++++++++----- 3 files changed, 79 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ce6fb6e83b32..85f0f03f1e87 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @name: name of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) + * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -347,6 +348,7 @@ struct nft_set { char name[NFT_SET_MAXNAMELEN]; u32 ktype; u32 dtype; + u32 objtype; u32 size; atomic_t nelems; u32 ndeact; @@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPR: expression assiociated with the element + * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -426,6 +429,7 @@ enum nft_set_extensions { NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPR, + NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; @@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, return elem + set->ops->elemsize; } +static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_OBJREF); +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4864caca1e8e..a6b52dbff08c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -255,6 +255,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_MAP: set is used as a dictionary * @NFT_SET_TIMEOUT: set uses timeouts * @NFT_SET_EVAL: set contains expressions for evaluation + * @NFT_SET_OBJECT: set contains stateful objects */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, @@ -263,6 +264,7 @@ enum nft_set_flags { NFT_SET_MAP = 0x8, NFT_SET_TIMEOUT = 0x10, NFT_SET_EVAL = 0x20, + NFT_SET_OBJECT = 0x40, }; /** @@ -304,6 +306,7 @@ enum nft_set_desc_attributes { * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) * @NFTA_SET_USERDATA: user data (NLA_BINARY) + * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -321,6 +324,7 @@ enum nft_set_attributes { NFTA_SET_GC_INTERVAL, NFTA_SET_USERDATA, NFTA_SET_PAD, + NFTA_SET_OBJ_TYPE, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) @@ -344,6 +348,7 @@ enum nft_set_elem_flags { * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64) * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY) * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes) + * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING) */ enum nft_set_elem_attributes { NFTA_SET_ELEM_UNSPEC, @@ -355,6 +360,7 @@ enum nft_set_elem_attributes { NFTA_SET_ELEM_USERDATA, NFTA_SET_ELEM_EXPR, NFTA_SET_ELEM_PAD, + NFTA_SET_ELEM_OBJREF, __NFTA_SET_ELEM_MAX }; #define NFTA_SET_ELEM_MAX (__NFTA_SET_ELEM_MAX - 1) @@ -1207,6 +1213,8 @@ enum nft_fib_flags { #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 +#define __NFT_OBJECT_MAX 3 +#define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** * enum nft_object_attributes - nf_tables stateful object netlink attributes diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c5419701ca79..8228714c42d5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2452,6 +2452,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 }, [NFTA_SET_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, + [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -2609,6 +2610,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen))) goto nla_put_failure; } + if (set->flags & NFT_SET_OBJECT && + nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype))) + goto nla_put_failure; if (set->timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, @@ -2838,7 +2842,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, unsigned int size; bool create; u64 timeout; - u32 ktype, dtype, flags, policy, gc_int; + u32 ktype, dtype, flags, policy, gc_int, objtype; struct nft_set_desc desc; unsigned char *udata; u16 udlen; @@ -2868,11 +2872,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | - NFT_SET_MAP | NFT_SET_EVAL)) + NFT_SET_MAP | NFT_SET_EVAL | + NFT_SET_OBJECT)) return -EINVAL; - /* Only one of both operations is supported */ - if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) == - (NFT_SET_MAP | NFT_SET_EVAL)) + /* Only one of these operations is supported */ + if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) == + (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) return -EOPNOTSUPP; } @@ -2897,6 +2902,19 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } else if (flags & NFT_SET_MAP) return -EINVAL; + if (nla[NFTA_SET_OBJ_TYPE] != NULL) { + if (!(flags & NFT_SET_OBJECT)) + return -EINVAL; + + objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); + if (objtype == NFT_OBJECT_UNSPEC || + objtype > NFT_OBJECT_MAX) + return -EINVAL; + } else if (flags & NFT_SET_OBJECT) + return -EINVAL; + else + objtype = NFT_OBJECT_UNSPEC; + timeout = 0; if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) @@ -2984,6 +3002,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->ktype = ktype; set->klen = desc.klen; set->dtype = dtype; + set->objtype = objtype; set->dlen = desc.dlen; set->flags = flags; set->size = desc.size; @@ -3126,6 +3145,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_EXPR] = { .align = __alignof__(struct nft_expr), }, + [NFT_SET_EXT_OBJREF] = { + .len = sizeof(struct nft_object *), + .align = __alignof__(struct nft_object *), + }, [NFT_SET_EXT_FLAGS] = { .len = sizeof(u8), .align = __alignof__(u8), @@ -3214,6 +3237,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0) goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && + nla_put_string(skb, NFTA_SET_ELEM_OBJREF, + (*nft_set_ext_obj(ext))->name) < 0) + goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && nla_put_be32(skb, NFTA_SET_ELEM_FLAGS, htonl(*nft_set_ext_flags(ext)))) @@ -3508,7 +3536,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_uninit(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); - + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use--; kfree(elem); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -3533,11 +3562,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + u8 genmask = nft_genmask_next(ctx->net); struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; + struct nft_object *obj = NULL; struct nft_userdata *udata; struct nft_data data; enum nft_registers dreg; @@ -3600,6 +3631,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } + if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { + if (!(set->flags & NFT_SET_OBJECT)) { + err = -EINVAL; + goto err2; + } + obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto err2; + } + nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); + } + if (nla[NFTA_SET_ELEM_DATA] != NULL) { err = nft_data_init(ctx, &data, sizeof(data), &d2, nla[NFTA_SET_ELEM_DATA]); @@ -3658,6 +3703,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, udata->len = ulen - 1; nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen); } + if (obj) { + *nft_set_ext_obj(ext) = obj; + obj->use++; + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -3667,10 +3716,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { - if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && - nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && - memcmp(nft_set_ext_data(ext), - nft_set_ext_data(ext2), set->dlen) != 0) + if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && + memcmp(nft_set_ext_data(ext), + nft_set_ext_data(ext2), set->dlen) != 0) || + (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && + *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) err = -EBUSY; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; -- cgit v1.2.3-71-gd317 From 63aea29060025fd2732680aa48a6b97687b93af8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:06:03 +0100 Subject: netfilter: nft_objref: support for stateful object maps This patch allows us to refer to stateful object dictionaries, the source register indicates the key data to be used to look up for the corresponding state object. We can refer to these maps through names or, alternatively, the map transaction id. This allows us to refer to both anonymous and named maps. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++ net/netfilter/nf_tables_api.c | 4 ++ net/netfilter/nft_objref.c | 116 ++++++++++++++++++++++++++++++- 3 files changed, 125 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a6b52dbff08c..881d49e94569 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1153,11 +1153,17 @@ enum nft_fwd_attributes { * * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: nft_register) * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING) + * @NFTA_OBJREF_SET_SREG: source register of the data to look for (NLA_U32: nft_registers) + * @NFTA_OBJREF_SET_NAME: name of the set where to look for (NLA_STRING) + * @NFTA_OBJREF_SET_ID: id of the set where to look for in this transaction (NLA_U32) */ enum nft_objref_attributes { NFTA_OBJREF_UNSPEC, NFTA_OBJREF_IMM_TYPE, NFTA_OBJREF_IMM_NAME, + NFTA_OBJREF_SET_SREG, + NFTA_OBJREF_SET_NAME, + NFTA_OBJREF_SET_ID, __NFTA_OBJREF_MAX }; #define NFTA_OBJREF_MAX (__NFTA_OBJREF_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8228714c42d5..b4db5bf4c135 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2504,6 +2504,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } +EXPORT_SYMBOL_GPL(nf_tables_set_lookup); struct nft_set *nf_tables_set_lookup_byid(const struct net *net, const struct nlattr *nla, @@ -2522,6 +2523,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net, } return ERR_PTR(-ENOENT); } +EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) @@ -3124,6 +3126,7 @@ bind: list_add_tail_rcu(&binding->list, &set->bindings); return 0; } +EXPORT_SYMBOL_GPL(nf_tables_bind_set); void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) @@ -3134,6 +3137,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, nft_is_active(ctx->net, set)) nf_tables_set_destroy(ctx, set); } +EXPORT_SYMBOL_GPL(nf_tables_unbind_set); const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 23820f796aad..415a65ba2b85 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -81,14 +81,128 @@ static const struct nft_expr_ops nft_objref_ops = { .dump = nft_objref_dump, }; +struct nft_objref_map { + struct nft_set *set; + enum nft_registers sreg:8; + struct nft_set_binding binding; +}; + +static void nft_objref_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + const struct nft_set *set = priv->set; + const struct nft_set_ext *ext; + struct nft_object *obj; + bool found; + + found = set->ops->lookup(nft_net(pkt), set, ®s->data[priv->sreg], + &ext); + if (!found) { + regs->verdict.code = NFT_BREAK; + return; + } + obj = *nft_set_ext_obj(ext); + obj->type->eval(obj, regs, pkt); +} + +static int nft_objref_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set *set; + int err; + + set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask); + if (IS_ERR(set)) { + if (tb[NFTA_OBJREF_SET_ID]) { + set = nf_tables_set_lookup_byid(ctx->net, + tb[NFTA_OBJREF_SET_ID], + genmask); + } + if (IS_ERR(set)) + return PTR_ERR(set); + } + + if (!(set->flags & NFT_SET_OBJECT)) + return -EINVAL; + + priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]); + err = nft_validate_register_load(priv->sreg, set->klen); + if (err < 0) + return err; + + priv->binding.flags = set->flags & NFT_SET_OBJECT; + + err = nf_tables_bind_set(ctx, set, &priv->binding); + if (err < 0) + return err; + + priv->set = set; + return 0; +} + +static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) || + nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static void nft_objref_map_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + +static struct nft_expr_type nft_objref_type; +static const struct nft_expr_ops nft_objref_map_ops = { + .type = &nft_objref_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), + .eval = nft_objref_map_eval, + .init = nft_objref_map_init, + .destroy = nft_objref_map_destroy, + .dump = nft_objref_map_dump, +}; + +static const struct nft_expr_ops * +nft_objref_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_OBJREF_SET_SREG] && + (tb[NFTA_OBJREF_SET_NAME] || + tb[NFTA_OBJREF_SET_ID])) + return &nft_objref_map_ops; + else if (tb[NFTA_OBJREF_IMM_NAME] && + tb[NFTA_OBJREF_IMM_TYPE]) + return &nft_objref_ops; + + return ERR_PTR(-EOPNOTSUPP); +} + static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, + [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, + [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; static struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", - .ops = &nft_objref_ops, + .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, -- cgit v1.2.3-71-gd317 From 2c16d60332643e90d4fa244f4a706c454b8c7569 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 6 Dec 2016 16:25:02 -0500 Subject: netfilter: xt_bpf: support ebpf Add support for attaching an eBPF object by file descriptor. The iptables binary can be called with a path to an elf object or a pinned bpf object. Also pass the mode and path to the kernel to be able to return it later for iptables dump and save. Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_bpf.h | 21 ++++++++ net/netfilter/xt_bpf.c | 96 +++++++++++++++++++++++++++++------ 2 files changed, 101 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index 1fad2c27ac32..b97725af2ac0 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -2,9 +2,11 @@ #define _XT_BPF_H #include +#include #include #define XT_BPF_MAX_NUM_INSTR 64 +#define XT_BPF_PATH_MAX (XT_BPF_MAX_NUM_INSTR * sizeof(struct sock_filter)) struct bpf_prog; @@ -16,4 +18,23 @@ struct xt_bpf_info { struct bpf_prog *filter __attribute__((aligned(8))); }; +enum xt_bpf_modes { + XT_BPF_MODE_BYTECODE, + XT_BPF_MODE_FD_PINNED, + XT_BPF_MODE_FD_ELF, +}; + +struct xt_bpf_info_v1 { + __u16 mode; + __u16 bpf_program_num_elem; + __s32 fd; + union { + struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR]; + char path[XT_BPF_PATH_MAX]; + }; + + /* only used in the kernel */ + struct bpf_prog *filter __attribute__((aligned(8))); +}; + #endif /*_XT_BPF_H */ diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index dffee9d47ec4..2dedaa23ab0a 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -20,15 +21,15 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_bpf"); MODULE_ALIAS("ip6t_bpf"); -static int bpf_mt_check(const struct xt_mtchk_param *par) +static int __bpf_mt_check_bytecode(struct sock_filter *insns, __u16 len, + struct bpf_prog **ret) { - struct xt_bpf_info *info = par->matchinfo; struct sock_fprog_kern program; - program.len = info->bpf_program_num_elem; - program.filter = info->bpf_program; + program.len = len; + program.filter = insns; - if (bpf_prog_create(&info->filter, &program)) { + if (bpf_prog_create(ret, &program)) { pr_info("bpf: check failed: parse error\n"); return -EINVAL; } @@ -36,6 +37,42 @@ static int bpf_mt_check(const struct xt_mtchk_param *par) return 0; } +static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + *ret = prog; + return 0; +} + +static int bpf_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info *info = par->matchinfo; + + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); +} + +static int bpf_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_bpf_info_v1 *info = par->matchinfo; + + if (info->mode == XT_BPF_MODE_BYTECODE) + return __bpf_mt_check_bytecode(info->bpf_program, + info->bpf_program_num_elem, + &info->filter); + else if (info->mode == XT_BPF_MODE_FD_PINNED || + info->mode == XT_BPF_MODE_FD_ELF) + return __bpf_mt_check_fd(info->fd, &info->filter); + else + return -EINVAL; +} + static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; @@ -43,31 +80,58 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) return BPF_PROG_RUN(info->filter, skb); } +static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + + return !!bpf_prog_run_save_cb(info->filter, (struct sk_buff *) skb); +} + static void bpf_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_bpf_info *info = par->matchinfo; + + bpf_prog_destroy(info->filter); +} + +static void bpf_mt_destroy_v1(const struct xt_mtdtor_param *par) +{ + const struct xt_bpf_info_v1 *info = par->matchinfo; + bpf_prog_destroy(info->filter); } -static struct xt_match bpf_mt_reg __read_mostly = { - .name = "bpf", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = bpf_mt_check, - .match = bpf_mt, - .destroy = bpf_mt_destroy, - .matchsize = sizeof(struct xt_bpf_info), - .me = THIS_MODULE, +static struct xt_match bpf_mt_reg[] __read_mostly = { + { + .name = "bpf", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check, + .match = bpf_mt, + .destroy = bpf_mt_destroy, + .matchsize = sizeof(struct xt_bpf_info), + .me = THIS_MODULE, + }, + { + .name = "bpf", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = bpf_mt_check_v1, + .match = bpf_mt_v1, + .destroy = bpf_mt_destroy_v1, + .matchsize = sizeof(struct xt_bpf_info_v1), + .me = THIS_MODULE, + }, }; static int __init bpf_mt_init(void) { - return xt_register_match(&bpf_mt_reg); + return xt_register_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } static void __exit bpf_mt_exit(void) { - xt_unregister_match(&bpf_mt_reg); + xt_unregister_matches(bpf_mt_reg, ARRAY_SIZE(bpf_mt_reg)); } module_init(bpf_mt_init); -- cgit v1.2.3-71-gd317 From faa3ffce78298b2b782297765cffd05f52fed9d4 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 7 Dec 2016 14:03:10 +0200 Subject: net/sched: cls_flower: Add support for matching on flags Add UAPI to provide set of flags for matching, where the flags provided from user-space are mapped to flow-dissector flags. The 1st flag allows to match on whether the packet is an IP fragment and corresponds to the FLOW_DIS_IS_FRAGMENT flag. Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 7 ++++ net/sched/cls_flower.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 1adc0b654996..0ad9f0bce043 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -458,11 +458,18 @@ enum { TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ __TCA_FLOWER_MAX, }; #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), +}; + /* Match-all classifier */ enum { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 29a9e6d9f274..6c7e1f5052c9 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -386,6 +386,8 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -420,6 +422,39 @@ static void fl_set_key_vlan(struct nlattr **tb, } } +static void fl_set_key_flag(u32 flower_key, u32 flower_mask, + u32 *dissector_key, u32 *dissector_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (flower_mask & flower_flag_bit) { + *dissector_mask |= dissector_flag_bit; + if (flower_key & flower_flag_bit) + *dissector_key |= dissector_flag_bit; + } +} + +static void fl_set_key_flags(struct nlattr **tb, + u32 *flags_key, u32 *flags_mask) +{ + u32 key, mask; + + if (!tb[TCA_FLOWER_KEY_FLAGS]) + return; + + key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); + + if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + mask = ~0; + else + mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + + *flags_key = 0; + *flags_mask = 0; + + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -546,6 +581,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + return 0; } @@ -880,6 +917,42 @@ static int fl_dump_key_vlan(struct sk_buff *skb, return 0; } +static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask, + u32 *flower_key, u32 *flower_mask, + u32 flower_flag_bit, u32 dissector_flag_bit) +{ + if (dissector_mask & dissector_flag_bit) { + *flower_mask |= flower_flag_bit; + if (dissector_key & dissector_flag_bit) + *flower_key |= flower_flag_bit; + } +} + +static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) +{ + u32 key, mask; + __be32 _key, _mask; + int err; + + if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask))) + return 0; + + key = 0; + mask = 0; + + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + + _key = cpu_to_be32(key); + _mask = cpu_to_be32(mask); + + err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key); + if (err) + return err; + + return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -1015,6 +1088,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->enc_tp.dst))) goto nla_put_failure; + if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) -- cgit v1.2.3-71-gd317 From 7b684884fbfab33251115fa5054fb821c34b93be Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 7 Dec 2016 13:48:28 +0100 Subject: net/sched: cls_flower: Support matching on ICMP type and code Support matching on ICMP type and code. Example usage: tc qdisc add dev eth0 ingress tc filter add dev eth0 protocol ip parent ffff: flower \ indev eth0 ip_proto icmp type 8 code 0 action drop tc filter add dev eth0 protocol ipv6 parent ffff: flower \ indev eth0 ip_proto icmpv6 type 128 code 0 action drop Signed-off-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 10 +++++++++ net/sched/cls_flower.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 0ad9f0bce043..cb4bcdc58543 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -461,6 +461,16 @@ enum { TCA_FLOWER_KEY_FLAGS, /* be32 */ TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6c7e1f5052c9..e040c5140f61 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -39,6 +39,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_icmp icmp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -388,6 +389,14 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -537,6 +546,26 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)); + } else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6) { + fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE, + &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + sizeof(key->icmp.type)); + fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, + &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -648,6 +677,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1050,6 +1081,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IP) && + key->basic.ip_proto == IPPROTO_ICMP && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, + sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV4_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; + else if (key->basic.n_proto == htons(ETH_P_IPV6) && + key->basic.ip_proto == IPPROTO_ICMPV6 && + (fl_dump_key_val(skb, &key->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type, + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, + sizeof(key->icmp.type)) || + fl_dump_key_val(skb, &key->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, + TCA_FLOWER_KEY_ICMPV6_CODE_MASK, + sizeof(key->icmp.code)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, -- cgit v1.2.3-71-gd317 From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 7 Dec 2016 15:53:11 -0800 Subject: bpf: xdp: Allow head adjustment in XDP prog This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. This patch adds one "xdp_adjust_head" bit to bpf_prog for the XDP-capable driver to check if the XDP prog requires bpf_xdp_adjust_head() support. The driver can then decide to error out during XDP_SETUP_PROG. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/s390/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++++ drivers/net/ethernet/qlogic/qede/qede_main.c | 5 ++++ include/linux/filter.h | 6 +++-- include/uapi/linux/bpf.h | 11 ++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 2 +- net/core/filter.c | 28 ++++++++++++++++++++-- 13 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 49a81f1fc1d6..f441eda63bec 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) int err; int i; + if (prog && prog->xdp_adjust_head) { + en_err(priv, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + xdp_ring_num = prog ? priv->rx_ring_num : 0; /* No need to reconfigure buffers when simply swapping the diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07020276fe73..cbfa38fc72c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) bool reset, was_opened; int i; + if (prog && prog->xdp_adjust_head) { + netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + mutex_lock(&priv->state_lock); if ((netdev->features & NETIF_F_LRO) && prog) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00d9a03be31d..e8d448109e03 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog) }; int err; + if (prog && prog->xdp_adjust_head) { + nn_err(nn, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } if (!prog && !nn->xdp_prog) return 0; if (prog && nn->xdp_prog) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cf1dd1436d93..aecdd1c5c0ea 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) { struct qede_reload_args args; + if (prog && prog->xdp_adjust_head) { + DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + /* If we're called, there was already a bpf reference increment */ args.func = &qede_xdp_reload_func; args.u.new_prog = prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index f078d2b1cff6..6a1658308612 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -406,7 +406,8 @@ struct bpf_prog { u16 jited:1, /* Is our filter JIT'ed? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + xdp_adjust_head:1; /* Adjusting pkt head? */ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ @@ -440,6 +441,7 @@ struct bpf_skb_data_end { struct xdp_buff { void *data; void *data_end; + void *data_hard_start; }; /* compute the linear packet data range [data, data_end) which @@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update), \ FN(set_hash_invalid), \ FN(get_numa_node_id), \ - FN(skb_change_head), + FN(skb_change_head), \ + FN(xdp_adjust_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -576,6 +583,8 @@ struct bpf_sock { __u32 protocol; }; +#define XDP_PACKET_HEADROOM 256 + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bdcc9f4ba767..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88f609f1c0c3..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b14f85f45c6..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index b751202e12f8..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || @@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func) func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3-71-gd317 From 2fa436b3a2a7009c11a3bc03fe0ff4c26e80fd87 Mon Sep 17 00:00:00 2001 From: Vamsi Krishna Date: Fri, 2 Dec 2016 23:59:08 +0200 Subject: nl80211: Use different attrs for BSSID and random MAC addr in scan req NL80211_ATTR_MAC was used to set both the specific BSSID to be scanned and the random MAC address to be used when privacy is enabled. When both the features are enabled, both the BSSID and the local MAC address were getting same value causing Probe Request frames to go with unintended DA. Hence, this has been fixed by using a different NL80211_ATTR_BSSID attribute to set the specific BSSID (which was the more recent addition in cfg80211) for a scan. Backwards compatibility with old userspace software is maintained to some extent by allowing NL80211_ATTR_MAC to be used to set the specific BSSID when scanning without enabling random MAC address use. Scanning with random source MAC address was introduced by commit ad2b26abc157 ("cfg80211: allow drivers to support random MAC addresses for scan") and the issue was introduced with the addition of the second user for the same attribute in commit 818965d39177 ("cfg80211: Allow a scan request for a specific BSSID"). Fixes: 818965d39177 ("cfg80211: Allow a scan request for a specific BSSID") Signed-off-by: Vamsi Krishna Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 ++++++- net/wireless/nl80211.c | 16 +++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 259c9c77fdc1..6b76e3b0c18e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -323,7 +323,7 @@ * @NL80211_CMD_GET_SCAN: get scan results * @NL80211_CMD_TRIGGER_SCAN: trigger a new scan with the given parameters * %NL80211_ATTR_TX_NO_CCK_RATE is used to decide whether to send the - * probe requests at CCK rate or not. %NL80211_ATTR_MAC can be used to + * probe requests at CCK rate or not. %NL80211_ATTR_BSSID can be used to * specify a BSSID to scan for; if not included, the wildcard BSSID will * be used. * @NL80211_CMD_NEW_SCAN_RESULTS: scan notification (as a reply to @@ -1977,6 +1977,9 @@ enum nl80211_commands { * @NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED: Indicates whether or not multicast * packets should be send out as unicast to all stations (flag attribute). * + * @NL80211_ATTR_BSSID: The BSSID of the AP. Note that %NL80211_ATTR_MAC is also + * used in various commands/events for specifying the BSSID. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2381,6 +2384,8 @@ enum nl80211_attrs { NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED, + NL80211_ATTR_BSSID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 765bd41b1623..3df85a751a85 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -404,6 +404,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = FILS_MAX_KEK_LEN }, [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, + [NL80211_ATTR_BSSID] = { .len = ETH_ALEN }, }; /* policy for the key attributes */ @@ -6703,7 +6704,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); - if (info->attrs[NL80211_ATTR_MAC]) + /* Initial implementation used NL80211_ATTR_MAC to set the specific + * BSSID to scan for. This was problematic because that same attribute + * was already used for another purpose (local random MAC address). The + * NL80211_ATTR_BSSID attribute was added to fix this. For backwards + * compatibility with older userspace components, also use the + * NL80211_ATTR_MAC value here if it can be determined to be used for + * the specific BSSID use case instead of the random MAC address + * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use). + */ + if (info->attrs[NL80211_ATTR_BSSID]) + memcpy(request->bssid, + nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN); + else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) && + info->attrs[NL80211_ATTR_MAC]) memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]), ETH_ALEN); else -- cgit v1.2.3-71-gd317 From 41c43fbee68f4f9a2a9675d83bca91c77862d7f0 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Sun, 11 Dec 2016 00:18:57 +0000 Subject: net: l2tp: export debug flags to UAPI Move the L2TP_MSG_* definitions to UAPI, as it is part of the netlink API. Signed-off-by: Asbjoern Sloth Toennesen Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 17 ++++++++++++++++- net/l2tp/l2tp_core.h | 10 ---------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 5daa48e2571e..85ddb74fcd1c 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -108,7 +108,7 @@ enum { L2TP_ATTR_VLAN_ID, /* u16 */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ - L2TP_ATTR_DEBUG, /* u32 */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ L2TP_ATTR_RECV_SEQ, /* u8 */ L2TP_ATTR_SEND_SEQ, /* u8 */ L2TP_ATTR_LNS_MODE, /* u8 */ @@ -175,6 +175,21 @@ enum l2tp_seqmode { L2TP_SEQ_ALL = 2, }; +/** + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions + * + * @L2TP_MSG_DEBUG: verbose debug (if compiled in) + * @L2TP_MSG_CONTROL: userspace - kernel interface + * @L2TP_MSG_SEQ: sequence numbers + * @L2TP_MSG_DATA: data packets + */ +enum l2tp_debug_flags { + L2TP_MSG_DEBUG = (1 << 0), + L2TP_MSG_CONTROL = (1 << 1), + L2TP_MSG_SEQ = (1 << 2), + L2TP_MSG_DATA = (1 << 3), +}; + /* * NETLINK_GENERIC related info */ diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 2599af6378e4..8f560f7140a0 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -23,16 +23,6 @@ #define L2TP_HASH_BITS_2 8 #define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2) -/* Debug message categories for the DEBUG socket option */ -enum { - L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if - * compiled in) */ - L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel - * interface */ - L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */ - L2TP_MSG_DATA = (1 << 3), /* data packets */ -}; - struct sk_buff; struct l2tp_stats { -- cgit v1.2.3-71-gd317 From 47c3e7783be4e142b861d34b5c2e223330b05d8a Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Sun, 11 Dec 2016 00:18:58 +0000 Subject: net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_* PPPOL2TP_MSG_* and L2TP_MSG_* are duplicates, and are being used interchangeably in the kernel, so let's standardize on L2TP_MSG_* internally, and keep PPPOL2TP_MSG_* defined in UAPI for compatibility. Signed-off-by: Asbjoern Sloth Toennesen Signed-off-by: David S. Miller --- Documentation/networking/l2tp.txt | 8 ++++---- include/uapi/linux/if_pppol2tp.h | 13 ++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt index 4650a00ed012..9bc271cdc9a8 100644 --- a/Documentation/networking/l2tp.txt +++ b/Documentation/networking/l2tp.txt @@ -177,10 +177,10 @@ setsockopt on the PPPoX socket to set a debug mask. The following debug mask bits are available: -PPPOL2TP_MSG_DEBUG verbose debug (if compiled in) -PPPOL2TP_MSG_CONTROL userspace - kernel interface -PPPOL2TP_MSG_SEQ sequence numbers handling -PPPOL2TP_MSG_DATA data packets +L2TP_MSG_DEBUG verbose debug (if compiled in) +L2TP_MSG_CONTROL userspace - kernel interface +L2TP_MSG_SEQ sequence numbers handling +L2TP_MSG_DATA data packets If enabled, files under a l2tp debugfs directory can be used to dump kernel state about L2TP tunnels and sessions. To access it, the diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h index 4bd1f55d6377..6418c4d10241 100644 --- a/include/uapi/linux/if_pppol2tp.h +++ b/include/uapi/linux/if_pppol2tp.h @@ -18,6 +18,7 @@ #include #include #include +#include /* Structure used to connect() the socket to a particular tunnel UDP * socket over IPv4. @@ -90,14 +91,12 @@ enum { PPPOL2TP_SO_REORDERTO = 5, }; -/* Debug message categories for the DEBUG socket option */ +/* Debug message categories for the DEBUG socket option (deprecated) */ enum { - PPPOL2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if - * compiled in) */ - PPPOL2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel - * interface */ - PPPOL2TP_MSG_SEQ = (1 << 2), /* sequence numbers */ - PPPOL2TP_MSG_DATA = (1 << 3), /* data packets */ + PPPOL2TP_MSG_DEBUG = L2TP_MSG_DEBUG, + PPPOL2TP_MSG_CONTROL = L2TP_MSG_CONTROL, + PPPOL2TP_MSG_SEQ = L2TP_MSG_SEQ, + PPPOL2TP_MSG_DATA = L2TP_MSG_DATA, }; -- cgit v1.2.3-71-gd317