From 119363c7dc2bcc0c33c255a7b4979c8c0fdc1896 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 22 Apr 2013 16:29:30 +0200 Subject: cfg80211: add support for per-chain signal strength reporting Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1e48b5e348f..cbf1e228650f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1991,6 +1991,10 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_PEER_PM: peer mesh STA link-specific power mode * @NL80211_STA_INFO_NONPEER_PM: neighbor mesh STA power save mode towards * non-peer STA + * @NL80211_STA_INFO_CHAIN_SIGNAL: per-chain signal strength of last PPDU + * Contains a nested array of signal strength attributes (u8, dBm) + * @NL80211_STA_INFO_CHAIN_SIGNAL_AVG: per-chain signal strength average + * Same format as NL80211_STA_INFO_CHAIN_SIGNAL. * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -2020,6 +2024,8 @@ enum nl80211_sta_info { NL80211_STA_INFO_NONPEER_PM, NL80211_STA_INFO_RX_BYTES64, NL80211_STA_INFO_TX_BYTES64, + NL80211_STA_INFO_CHAIN_SIGNAL, + NL80211_STA_INFO_CHAIN_SIGNAL_AVG, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, -- cgit v1.2.3-71-gd317 From fb4e156886ce6e8309e912d8b370d192330d19d3 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 28 Apr 2013 16:22:06 -0700 Subject: nl80211: Add generic netlink module alias for cfg80211/nl80211 To support auto-loading of wireless modules from netlink users, add module alias for nl80211 family. This also adds NL80211_GENL_NAME constant to define the "nl80211" netlink family name as part of uapi. Signed-off-by: Marcel Holtmann Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.c | 1 + net/wireless/nl80211.c | 8 ++++---- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cbf1e228650f..b48430769eda 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -27,6 +27,8 @@ #include +#define NL80211_GENL_NAME "nl80211" + /** * DOC: Station handling * diff --git a/net/wireless/core.c b/net/wireless/core.c index 84c9ad7e1dca..68f0c96c0565 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -34,6 +34,7 @@ MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); +MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME); /* RCU-protected (and cfg80211_mutex for writers) */ LIST_HEAD(cfg80211_rdev_list); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f687a8d0d026..9cdcd9ec3317 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -37,10 +37,10 @@ static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl80211_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ - .name = "nl80211", /* have users key off the name instead */ - .hdrsize = 0, /* no private header */ - .version = 1, /* no particular meaning now */ + .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ + .name = NL80211_GENL_NAME, /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ .maxattr = NL80211_ATTR_MAX, .netnsok = true, .pre_doit = nl80211_pre_doit, -- cgit v1.2.3-71-gd317 From 6e16d90b5218307db805e6b3e0b06d3946eb8c4c Mon Sep 17 00:00:00 2001 From: Colleen Twitty Date: Wed, 8 May 2013 11:45:59 -0700 Subject: cfg80211: Userspace may inform kernel of mesh auth method. Authentication takes place in userspace, but the beacon is generated in the kernel. Allow userspace to inform the kernel of the authentication method so the appropriate mesh config IE can be set prior to beacon generation when joining the MBSS. Signed-off-by: Colleen Twitty Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/mesh.c | 1 + net/wireless/nl80211.c | 8 ++++++++ 4 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 26e91138a2c6..32a2f1b20861 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1161,6 +1161,7 @@ struct mesh_config { * @sync_method: which synchronization method to use * @path_sel_proto: which path selection protocol to use * @path_metric: which metric to use + * @auth_id: which authentication method this mesh is using * @ie: vendor information elements (optional) * @ie_len: length of vendor information elements * @is_authenticated: this mesh requires authentication @@ -1179,6 +1180,7 @@ struct mesh_setup { u8 sync_method; u8 path_sel_proto; u8 path_metric; + u8 auth_id; const u8 *ie; u8 ie_len; bool is_authenticated; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b48430769eda..06320713e9c9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2645,6 +2645,10 @@ enum nl80211_meshconf_params { * @NL80211_MESH_SETUP_USERSPACE_MPM: Enable this option if userspace will * implement an MPM which handles peer allocation and state. * + * @NL80211_MESH_SETUP_AUTH_PROTOCOL: Inform the kernel of the authentication + * method (u8, as defined in IEEE 8.4.2.100.6, e.g. 0x1 for SAE). + * Default is no authentication method required. + * * @NL80211_MESH_SETUP_ATTR_MAX: highest possible mesh setup attribute number * * @__NL80211_MESH_SETUP_ATTR_AFTER_LAST: Internal use @@ -2658,6 +2662,7 @@ enum nl80211_mesh_setup_params { NL80211_MESH_SETUP_USERSPACE_AMPE, NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC, NL80211_MESH_SETUP_USERSPACE_MPM, + NL80211_MESH_SETUP_AUTH_PROTOCOL, /* keep last */ __NL80211_MESH_SETUP_ATTR_AFTER_LAST, diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 0bb93f3061a4..9546ad210550 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -82,6 +82,7 @@ const struct mesh_setup default_mesh_setup = { .sync_method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, .path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP, .path_metric = IEEE80211_PATH_METRIC_AIRTIME, + .auth_id = 0, /* open */ .ie = NULL, .ie_len = 0, .is_secure = false, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9cdcd9ec3317..5f10f7acfa06 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4672,6 +4672,7 @@ static const struct nla_policy [NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG }, + [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 }, [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG }, [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, @@ -4857,6 +4858,13 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, if (setup->is_secure) setup->user_mpm = true; + if (tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]) { + if (!setup->user_mpm) + return -EINVAL; + setup->auth_id = + nla_get_u8(tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]); + } + return 0; } -- cgit v1.2.3-71-gd317 From 8b21ab140ec581caf3340a51cdbe91a76c36d9d8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 20 May 2013 10:27:00 +0930 Subject: virtio_pci: better macro exported in uapi Macro VIRTIO_PCI_CONFIG assumes that userspace actually has a structure with a field named msix_enabled. Add VIRTIO_PCI_CONFIG_OFF that gets the msix_enabled by value instead, to make it useful for userspace. We still keep VIRTIO_PCI_CONFIG around for now, in case some userspace uses it. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_pci.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index ea66f3f60d63..e5ec1caab82a 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -80,7 +80,9 @@ /* The remaining space is defined by each driver as the per-driver * configuration space */ -#define VIRTIO_PCI_CONFIG(dev) ((dev)->msix_enabled ? 24 : 20) +#define VIRTIO_PCI_CONFIG_OFF(msix_enabled) ((msix_enabled) ? 24 : 20) +/* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */ +#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled) /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 -- cgit v1.2.3-71-gd317 From 1c6264cc67ac8712d84392fbc9293289011407c0 Mon Sep 17 00:00:00 2001 From: Pranavkumar Sawargaonkar Date: Mon, 20 May 2013 10:25:38 +0930 Subject: virtio: console: Add emergency writeonly register to config space This patch adds an emerg_wr register (writeonly) in config space of virtio console device which can be used for debugging. Signed-off-by: Pranavkumar Sawargaonkar Signed-off-by: Anup Patel Acked-by: Amit Shah Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_console.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_console.h b/include/uapi/linux/virtio_console.h index ee13ab6c3614..3fd0dce3cb25 100644 --- a/include/uapi/linux/virtio_console.h +++ b/include/uapi/linux/virtio_console.h @@ -38,6 +38,7 @@ /* Feature bits */ #define VIRTIO_CONSOLE_F_SIZE 0 /* Does host provide console size? */ #define VIRTIO_CONSOLE_F_MULTIPORT 1 /* Does host provide multiple ports? */ +#define VIRTIO_CONSOLE_F_EMERG_WRITE 2 /* Does host support emergency write? */ #define VIRTIO_CONSOLE_BAD_ID (~(u32)0) @@ -48,6 +49,8 @@ struct virtio_console_config { __u16 rows; /* max. number of ports this device can hold */ __u32 max_nr_ports; + /* emergency write register */ + __u32 emerg_wr; } __attribute__((packed)); /* -- cgit v1.2.3-71-gd317 From adf6271281a811dbcf2cb090b2a99a8520f7852b Mon Sep 17 00:00:00 2001 From: Ismael Luceno Date: Thu, 18 Apr 2013 08:16:08 -0300 Subject: [media] videodev2.h: Make V4L2_PIX_FMT_MPEG4 comment more specific about its usage Signed-off-by: Ismael Luceno Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index f40b41c7e108..16c3cf71eb94 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -395,7 +395,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_H263 v4l2_fourcc('H', '2', '6', '3') /* H263 */ #define V4L2_PIX_FMT_MPEG1 v4l2_fourcc('M', 'P', 'G', '1') /* MPEG-1 ES */ #define V4L2_PIX_FMT_MPEG2 v4l2_fourcc('M', 'P', 'G', '2') /* MPEG-2 ES */ -#define V4L2_PIX_FMT_MPEG4 v4l2_fourcc('M', 'P', 'G', '4') /* MPEG-4 ES */ +#define V4L2_PIX_FMT_MPEG4 v4l2_fourcc('M', 'P', 'G', '4') /* MPEG-4 part 2 ES */ #define V4L2_PIX_FMT_XVID v4l2_fourcc('X', 'V', 'I', 'D') /* Xvid */ #define V4L2_PIX_FMT_VC1_ANNEX_G v4l2_fourcc('V', 'C', '1', 'G') /* SMPTE 421M Annex G compliant stream */ #define V4L2_PIX_FMT_VC1_ANNEX_L v4l2_fourcc('V', 'C', '1', 'L') /* SMPTE 421M Annex L compliant stream */ -- cgit v1.2.3-71-gd317 From 5e4b6f5698421d94226cc2f80eae6d613c9acef8 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 16 May 2013 20:11:08 +0300 Subject: cfg80211: Allow TDLS peer AID to be configured for VHT VHT uses peer AID in the PARTIAL_AID field in TDLS frames. The current design for TDLS is to first add a dummy STA entry before completing TDLS Setup and then update information on this STA entry based on what was received from the peer during the setup exchange. In theory, this could use NL80211_ATTR_STA_AID to set the peer AID just like this is used in AP mode to set the AID of an association station. However, existing cfg80211 validation rules prevent this attribute from being used with set_station operation. To avoid interoperability issues between different kernel and user space version combinations, introduce a new nl80211 attribute for the purpose of setting TDLS peer AID. This attribute can be used in both the new_station and set_station operations. It is not supposed to be allowed to change the AID value during the lifetime of the STA entry, but that validation is left for drivers to do in the change_station callback. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 06320713e9c9..32b060ea5266 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1431,6 +1431,11 @@ enum nl80211_commands { * @NL80211_ATTR_MAX_CRIT_PROT_DURATION: duration in milliseconds in which * the connection should have increased reliability (u16). * + * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). + * This is similar to @NL80211_ATTR_STA_AID but with a difference of being + * allowed to be used with the first @NL80211_CMD_SET_STATION command to + * update a TDLS peer STA entry. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1729,6 +1734,8 @@ enum nl80211_attrs { NL80211_ATTR_CRIT_PROT_ID, NL80211_ATTR_MAX_CRIT_PROT_DURATION, + NL80211_ATTR_PEER_AID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5f10f7acfa06..14276af7964b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -378,6 +378,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_MDID] = { .type = NLA_U16 }, [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, + [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -3872,6 +3873,8 @@ static int nl80211_set_station_tdls(struct genl_info *info, struct station_parameters *params) { /* Dummy STA entry gets updated once the peer capabilities are known */ + if (info->attrs[NL80211_ATTR_PEER_AID]) + params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) params->ht_capa = nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]); @@ -4012,7 +4015,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) return -EINVAL; - if (!info->attrs[NL80211_ATTR_STA_AID]) + if (!info->attrs[NL80211_ATTR_STA_AID] && + !info->attrs[NL80211_ATTR_PEER_AID]) return -EINVAL; mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); @@ -4023,7 +4027,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); - params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + if (info->attrs[NL80211_ATTR_STA_AID]) + params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]); + else + params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); if (!params.aid || params.aid > IEEE80211_MAX_AID) return -EINVAL; -- cgit v1.2.3-71-gd317 From ff29feb9146d1c0020f2ccbb25369582c6a16681 Mon Sep 17 00:00:00 2001 From: "Lad, Prabhakar" Date: Mon, 20 May 2013 06:02:40 -0300 Subject: [media] videodev2.h: fix typos This patch fixes several typos in videodev2.h file Signed-off-by: Lad, Prabhakar Acked-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 16c3cf71eb94..2c5e67a45436 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -555,7 +555,7 @@ struct v4l2_jpegcompression { __u32 jpeg_markers; /* Which markers should go into the JPEG * output. Unless you exactly know what * you do, leave them untouched. - * Inluding less markers will make the + * Including less markers will make the * resulting code smaller, but there will * be fewer applications which can read it. * The presence of the APP and COM marker @@ -567,7 +567,7 @@ struct v4l2_jpegcompression { #define V4L2_JPEG_MARKER_DRI (1<<5) /* Define Restart Interval */ #define V4L2_JPEG_MARKER_COM (1<<6) /* Comment segment */ #define V4L2_JPEG_MARKER_APP (1<<7) /* App segment, driver will - * allways use APP0 */ + * always use APP0 */ }; /* @@ -900,7 +900,7 @@ typedef __u64 v4l2_std_id; /* * "Common" PAL - This macro is there to be compatible with the old * V4L1 concept of "PAL": /BGDKHI. - * Several PAL standards are mising here: /M, /N and /Nc + * Several PAL standards are missing here: /M, /N and /Nc */ #define V4L2_STD_PAL (V4L2_STD_PAL_BG |\ V4L2_STD_PAL_DK |\ @@ -1790,7 +1790,7 @@ struct v4l2_event_subscription { #define V4L2_CHIP_MATCH_HOST V4L2_CHIP_MATCH_BRIDGE #define V4L2_CHIP_MATCH_I2C_DRIVER 1 /* Match against I2C driver name */ #define V4L2_CHIP_MATCH_I2C_ADDR 2 /* Match against I2C 7-bit address */ -#define V4L2_CHIP_MATCH_AC97 3 /* Match against anciliary AC97 chip */ +#define V4L2_CHIP_MATCH_AC97 3 /* Match against ancillary AC97 chip */ #define V4L2_CHIP_MATCH_SUBDEV 4 /* Match against subdev index */ struct v4l2_dbg_match { -- cgit v1.2.3-71-gd317 From 7ec872114251b618adf5a2688de0ca00de63635d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 23 May 2013 01:11:11 +0000 Subject: net: ethtool: disambiguate XCVR_* meaning Add a comment which explains the real meaning of XCVR_INTERNAL (PHY and Ethernet MAC in the same package/die) and XCVR_EXTERNAL (PHY and Ethernet MAC in a different package/die). Most if not all of the drivers setting their transceiver type already do it the way the comment describes it. Signed-off-by: Florian Fainelli Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 0c9b44871df0..38dbafaa5341 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -993,8 +993,8 @@ enum ethtool_sfeatures_retval_bits { #define PORT_OTHER 0xff /* Which transceiver to use. */ -#define XCVR_INTERNAL 0x00 -#define XCVR_EXTERNAL 0x01 +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ #define XCVR_DUMMY1 0x02 #define XCVR_DUMMY2 0x03 #define XCVR_DUMMY3 0x04 -- cgit v1.2.3-71-gd317 From e057d3c31bdf87616b415c4b2cbf7310f54b9219 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 28 May 2013 13:01:52 +0200 Subject: cfg80211: support an active monitor interface flag An active monitor interface is one that is used for communication (via injection). It is expected to ACK incoming unicast packets. This is useful for running various 802.11 testing utilities that associate to an AP via injection and manage the state in user space. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 10 ++++++++++ 3 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b3b076a46d50..13b247d26544 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -961,6 +961,7 @@ struct station_info { * @MONITOR_FLAG_CONTROL: pass control frames * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering * @MONITOR_FLAG_COOK_FRAMES: report frames after processing + * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address */ enum monitor_flags { MONITOR_FLAG_FCSFAIL = 1<wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) + return -EOPNOTSUPP; + if (change) err = cfg80211_change_iface(rdev, dev, ntype, flags, ¶ms); else @@ -2395,6 +2400,11 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL, &flags); + + if (!err && (flags & NL80211_MNTR_FLAG_ACTIVE) && + !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) + return -EOPNOTSUPP; + wdev = rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL80211_ATTR_IFNAME]), type, err ? NULL : &flags, ¶ms); -- cgit v1.2.3-71-gd317 From cb93b1864088eb833ea9cef2c20f07d1961241b0 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Wed, 29 May 2013 17:01:52 +0800 Subject: PCI: Fix comment typo for PCI_EXP_LNKCAP_CLKPM Fix trivial typo for PCI_EXP_LNKCAP_CLKPM comment. Signed-off-by: Yijing Wang Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 864e324da80d..c3cc01d474b0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -468,7 +468,7 @@ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ #define PCI_EXP_LNKCAP_L1EL 0x00038000 /* L1 Exit Latency */ -#define PCI_EXP_LNKCAP_CLKPM 0x00040000 /* L1 Clock Power Management */ +#define PCI_EXP_LNKCAP_CLKPM 0x00040000 /* Clock Power Management */ #define PCI_EXP_LNKCAP_SDERC 0x00080000 /* Surprise Down Error Reporting Capable */ #define PCI_EXP_LNKCAP_DLLLARC 0x00100000 /* Data Link Layer Link Active Reporting Capable */ #define PCI_EXP_LNKCAP_LBNC 0x00200000 /* Link Bandwidth Notification Capability */ -- cgit v1.2.3-71-gd317 From 4c4d41f200db375b2d2cc6d0a1de0606c8266398 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Thu, 6 Jun 2013 10:15:54 +0800 Subject: xfrm: add LINUX_MIB_XFRMACQUIREERROR statistic counter When host ping its peer, ICMP echo request packet triggers IPsec policy, then host negotiates SA secret with its peer. After IKE installed SA for OUT direction, but before SA for IN direction installed, host get ICMP echo reply from its peer. At the time being, the SA state for IN direction could be XFRM_STATE_ACQ, then the received packet will be dropped after adding LINUX_MIB_XFRMINSTATEINVALID statistic. Adding a LINUX_MIB_XFRMACQUIREERROR statistic counter for such scenario when SA in larval state is much clearer for user than LINUX_MIB_XFRMINSTATEINVALID which indicates the SA is totally bad. Signed-off-by: Fan Du Signed-off-by: Steffen Klassert --- include/uapi/linux/snmp.h | 1 + net/xfrm/xfrm_input.c | 5 +++++ net/xfrm/xfrm_proc.c | 1 + 3 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df2e8b4f9c03..ea17542ff321 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -287,6 +287,7 @@ enum LINUX_MIB_XFRMOUTPOLERROR, /* XfrmOutPolError */ LINUX_MIB_XFRMFWDHDRERROR, /* XfrmFwdHdrError*/ LINUX_MIB_XFRMOUTSTATEINVALID, /* XfrmOutStateInvalid */ + LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ __LINUX_MIB_XFRMMAX }; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index ab2bb42fe094..88843996f935 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -163,6 +163,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->sp->xvec[skb->sp->len++] = x; spin_lock(&x->lock); + if (unlikely(x->km.state == XFRM_STATE_ACQ)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); + goto drop_unlock; + } + if (unlikely(x->km.state != XFRM_STATE_VALID)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index c721b0d9ab8b..80cd1e55b834 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -44,6 +44,7 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR), SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR), SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID), + SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), SNMP_MIB_SENTINEL }; -- cgit v1.2.3-71-gd317 From 7c8c5e6a9101ea57a1c2c9faff0917e79251a21e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 10 Dec 2012 16:15:34 +0000 Subject: arm64: KVM: system register handling Provide 64bit system register handling, modeled after the cp15 handling for ARM. Reviewed-by: Christopher Covington Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_coproc.h | 51 +++ arch/arm64/include/uapi/asm/kvm.h | 29 ++ arch/arm64/kvm/sys_regs.c | 883 ++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.h | 138 ++++++ include/uapi/linux/kvm.h | 1 + 5 files changed, 1102 insertions(+) create mode 100644 arch/arm64/include/asm/kvm_coproc.h create mode 100644 arch/arm64/kvm/sys_regs.c create mode 100644 arch/arm64/kvm/sys_regs.h (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h new file mode 100644 index 000000000000..9b4477acb554 --- /dev/null +++ b/arch/arm64/include/asm/kvm_coproc.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * Derived from arch/arm/include/asm/kvm_coproc.h + * Copyright (C) 2012 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM64_KVM_COPROC_H__ +#define __ARM64_KVM_COPROC_H__ + +#include + +void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); + +struct kvm_sys_reg_table { + const struct sys_reg_desc *table; + size_t num; +}; + +struct kvm_sys_reg_target_table { + struct kvm_sys_reg_table table64; +}; + +void kvm_register_target_sys_reg_table(unsigned int target, + struct kvm_sys_reg_target_table *table); + +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run); + +#define kvm_coproc_table_init kvm_sys_reg_table_init +void kvm_sys_reg_table_init(void); + +struct kvm_one_reg; +int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); + +#endif /* __ARM64_KVM_COPROC_H__ */ diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 4e64570a20c9..ebac919dc0ca 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -92,6 +92,35 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* If you need to interpret the index values, here is the key: */ +#define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 +#define KVM_REG_ARM_COPROC_SHIFT 16 + +/* Normal registers are mapped as coprocessor 16. */ +#define KVM_REG_ARM_CORE (0x0010 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / sizeof(__u32)) + +/* Some registers need more space to represent values. */ +#define KVM_REG_ARM_DEMUX (0x0011 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_DEMUX_ID_MASK 0x000000000000FF00 +#define KVM_REG_ARM_DEMUX_ID_SHIFT 8 +#define KVM_REG_ARM_DEMUX_ID_CCSIDR (0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT) +#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00000000000000FF +#define KVM_REG_ARM_DEMUX_VAL_SHIFT 0 + +/* AArch64 system registers */ +#define KVM_REG_ARM64_SYSREG (0x0013 << KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM64_SYSREG_OP0_MASK 0x000000000000c000 +#define KVM_REG_ARM64_SYSREG_OP0_SHIFT 14 +#define KVM_REG_ARM64_SYSREG_OP1_MASK 0x0000000000003800 +#define KVM_REG_ARM64_SYSREG_OP1_SHIFT 11 +#define KVM_REG_ARM64_SYSREG_CRN_MASK 0x0000000000000780 +#define KVM_REG_ARM64_SYSREG_CRN_SHIFT 7 +#define KVM_REG_ARM64_SYSREG_CRM_MASK 0x0000000000000078 +#define KVM_REG_ARM64_SYSREG_CRM_SHIFT 3 +#define KVM_REG_ARM64_SYSREG_OP2_MASK 0x0000000000000007 +#define KVM_REG_ARM64_SYSREG_OP2_SHIFT 0 + /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 #define KVM_ARM_IRQ_TYPE_MASK 0xff diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c new file mode 100644 index 000000000000..52fff0ae3442 --- /dev/null +++ b/arch/arm64/kvm/sys_regs.c @@ -0,0 +1,883 @@ +/* + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * Derived from arch/arm/kvm/coproc.c: + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Authors: Rusty Russell + * Christoffer Dall + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sys_regs.h" + +/* + * All of this file is extremly similar to the ARM coproc.c, but the + * types are different. My gut feeling is that it should be pretty + * easy to merge, but that would be an ABI breakage -- again. VFP + * would also need to be abstracted. + */ + +/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ +static u32 cache_levels; + +/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ +#define CSSELR_MAX 12 + +/* Which cache CCSIDR represents depends on CSSELR value. */ +static u32 get_ccsidr(u32 csselr) +{ + u32 ccsidr; + + /* Make sure noone else changes CSSELR during this! */ + local_irq_disable(); + /* Put value into CSSELR */ + asm volatile("msr csselr_el1, %x0" : : "r" (csselr)); + isb(); + /* Read result out of CCSIDR */ + asm volatile("mrs %0, ccsidr_el1" : "=r" (ccsidr)); + local_irq_enable(); + + return ccsidr; +} + +static void do_dc_cisw(u32 val) +{ + asm volatile("dc cisw, %x0" : : "r" (val)); + dsb(); +} + +static void do_dc_csw(u32 val) +{ + asm volatile("dc csw, %x0" : : "r" (val)); + dsb(); +} + +/* See note at ARM ARM B1.14.4 */ +static bool access_dcsw(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + unsigned long val; + int cpu; + + if (!p->is_write) + return read_from_write_only(vcpu, p); + + cpu = get_cpu(); + + cpumask_setall(&vcpu->arch.require_dcache_flush); + cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush); + + /* If we were already preempted, take the long way around */ + if (cpu != vcpu->arch.last_pcpu) { + flush_cache_all(); + goto done; + } + + val = *vcpu_reg(vcpu, p->Rt); + + switch (p->CRm) { + case 6: /* Upgrade DCISW to DCCISW, as per HCR.SWIO */ + case 14: /* DCCISW */ + do_dc_cisw(val); + break; + + case 10: /* DCCSW */ + do_dc_csw(val); + break; + } + +done: + put_cpu(); + + return true; +} + +/* + * We could trap ID_DFR0 and tell the guest we don't support performance + * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was + * NAKed, so it will read the PMCR anyway. + * + * Therefore we tell the guest we have 0 counters. Unfortunately, we + * must always support PMCCNTR (the cycle counter): we just RAZ/WI for + * all PM registers, which doesn't crash the guest kernel at least. + */ +static bool pm_fake(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + else + return read_zero(vcpu, p); +} + +static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 amair; + + asm volatile("mrs %0, amair_el1\n" : "=r" (amair)); + vcpu_sys_reg(vcpu, AMAIR_EL1) = amair; +} + +static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + /* + * Simply map the vcpu_id into the Aff0 field of the MPIDR. + */ + vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff); +} + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + */ +static const struct sys_reg_desc sys_reg_descs[] = { + /* DC ISW */ + { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b0110), Op2(0b010), + access_dcsw }, + /* DC CSW */ + { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1010), Op2(0b010), + access_dcsw }, + /* DC CISW */ + { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b010), + access_dcsw }, + + /* MPIDR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101), + NULL, reset_mpidr, MPIDR_EL1 }, + /* SCTLR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), + NULL, reset_val, SCTLR_EL1, 0x00C50078 }, + /* CPACR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010), + NULL, reset_val, CPACR_EL1, 0 }, + /* TTBR0_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000), + NULL, reset_unknown, TTBR0_EL1 }, + /* TTBR1_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001), + NULL, reset_unknown, TTBR1_EL1 }, + /* TCR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010), + NULL, reset_val, TCR_EL1, 0 }, + + /* AFSR0_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000), + NULL, reset_unknown, AFSR0_EL1 }, + /* AFSR1_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001), + NULL, reset_unknown, AFSR1_EL1 }, + /* ESR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000), + NULL, reset_unknown, ESR_EL1 }, + /* FAR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), + NULL, reset_unknown, FAR_EL1 }, + + /* PMINTENSET_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001), + pm_fake }, + /* PMINTENCLR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010), + pm_fake }, + + /* MAIR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), + NULL, reset_unknown, MAIR_EL1 }, + /* AMAIR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000), + NULL, reset_amair_el1, AMAIR_EL1 }, + + /* VBAR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), + NULL, reset_val, VBAR_EL1, 0 }, + /* CONTEXTIDR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), + NULL, reset_val, CONTEXTIDR_EL1, 0 }, + /* TPIDR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100), + NULL, reset_unknown, TPIDR_EL1 }, + + /* CNTKCTL_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1110), CRm(0b0001), Op2(0b000), + NULL, reset_val, CNTKCTL_EL1, 0}, + + /* CSSELR_EL1 */ + { Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), + NULL, reset_unknown, CSSELR_EL1 }, + + /* PMCR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000), + pm_fake }, + /* PMCNTENSET_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001), + pm_fake }, + /* PMCNTENCLR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010), + pm_fake }, + /* PMOVSCLR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011), + pm_fake }, + /* PMSWINC_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100), + pm_fake }, + /* PMSELR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101), + pm_fake }, + /* PMCEID0_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110), + pm_fake }, + /* PMCEID1_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111), + pm_fake }, + /* PMCCNTR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000), + pm_fake }, + /* PMXEVTYPER_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001), + pm_fake }, + /* PMXEVCNTR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010), + pm_fake }, + /* PMUSERENR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000), + pm_fake }, + /* PMOVSSET_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011), + pm_fake }, + + /* TPIDR_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010), + NULL, reset_unknown, TPIDR_EL0 }, + /* TPIDRRO_EL0 */ + { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), + NULL, reset_unknown, TPIDRRO_EL0 }, +}; + +/* Target specific emulation tables */ +static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS]; + +void kvm_register_target_sys_reg_table(unsigned int target, + struct kvm_sys_reg_target_table *table) +{ + target_tables[target] = table; +} + +/* Get specific register table for this target. */ +static const struct sys_reg_desc *get_target_table(unsigned target, size_t *num) +{ + struct kvm_sys_reg_target_table *table; + + table = target_tables[target]; + *num = table->table64.num; + return table->table64.table; +} + +static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, + const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + const struct sys_reg_desc *r = &table[i]; + + if (params->Op0 != r->Op0) + continue; + if (params->Op1 != r->Op1) + continue; + if (params->CRn != r->CRn) + continue; + if (params->CRm != r->CRm) + continue; + if (params->Op2 != r->Op2) + continue; + + return r; + } + return NULL; +} + +static int emulate_sys_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_params *params) +{ + size_t num; + const struct sys_reg_desc *table, *r; + + table = get_target_table(vcpu->arch.target, &num); + + /* Search target-specific then generic table. */ + r = find_reg(params, table, num); + if (!r) + r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + + if (likely(r)) { + /* + * Not having an accessor means that we have + * configured a trap that we don't know how to + * handle. This certainly qualifies as a gross bug + * that should be fixed right away. + */ + BUG_ON(!r->access); + + if (likely(r->access(vcpu, params, r))) { + /* Skip instruction, since it was emulated */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; + } + /* If access function fails, it should complain. */ + } else { + kvm_err("Unsupported guest sys_reg access at: %lx\n", + *vcpu_pc(vcpu)); + print_sys_reg_instr(params); + } + kvm_inject_undefined(vcpu); + return 1; +} + +static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *table, size_t num) +{ + unsigned long i; + + for (i = 0; i < num; i++) + if (table[i].reset) + table[i].reset(vcpu, &table[i]); +} + +/** + * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct sys_reg_params params; + unsigned long esr = kvm_vcpu_get_hsr(vcpu); + + params.Op0 = (esr >> 20) & 3; + params.Op1 = (esr >> 14) & 0x7; + params.CRn = (esr >> 10) & 0xf; + params.CRm = (esr >> 1) & 0xf; + params.Op2 = (esr >> 17) & 0x7; + params.Rt = (esr >> 5) & 0x1f; + params.is_write = !(esr & 1); + + return emulate_sys_reg(vcpu, ¶ms); +} + +/****************************************************************************** + * Userspace API + *****************************************************************************/ + +static bool index_to_params(u64 id, struct sys_reg_params *params) +{ + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U64: + /* Any unused index bits means it's not valid. */ + if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK + | KVM_REG_ARM_COPROC_MASK + | KVM_REG_ARM64_SYSREG_OP0_MASK + | KVM_REG_ARM64_SYSREG_OP1_MASK + | KVM_REG_ARM64_SYSREG_CRN_MASK + | KVM_REG_ARM64_SYSREG_CRM_MASK + | KVM_REG_ARM64_SYSREG_OP2_MASK)) + return false; + params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK) + >> KVM_REG_ARM64_SYSREG_OP0_SHIFT); + params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK) + >> KVM_REG_ARM64_SYSREG_OP1_SHIFT); + params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK) + >> KVM_REG_ARM64_SYSREG_CRN_SHIFT); + params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK) + >> KVM_REG_ARM64_SYSREG_CRM_SHIFT); + params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK) + >> KVM_REG_ARM64_SYSREG_OP2_SHIFT); + return true; + default: + return false; + } +} + +/* Decode an index value, and find the sys_reg_desc entry. */ +static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, + u64 id) +{ + size_t num; + const struct sys_reg_desc *table, *r; + struct sys_reg_params params; + + /* We only do sys_reg for now. */ + if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) + return NULL; + + if (!index_to_params(id, ¶ms)) + return NULL; + + table = get_target_table(vcpu->arch.target, &num); + r = find_reg(¶ms, table, num); + if (!r) + r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + + /* Not saved in the sys_reg array? */ + if (r && !r->reg) + r = NULL; + + return r; +} + +/* + * These are the invariant sys_reg registers: we let the guest see the + * host versions of these, so they're part of the guest state. + * + * A future CPU may provide a mechanism to present different values to + * the guest, or a future kvm may trap them. + */ + +#define FUNCTION_INVARIANT(reg) \ + static void get_##reg(struct kvm_vcpu *v, \ + const struct sys_reg_desc *r) \ + { \ + u64 val; \ + \ + asm volatile("mrs %0, " __stringify(reg) "\n" \ + : "=r" (val)); \ + ((struct sys_reg_desc *)r)->val = val; \ + } + +FUNCTION_INVARIANT(midr_el1) +FUNCTION_INVARIANT(ctr_el0) +FUNCTION_INVARIANT(revidr_el1) +FUNCTION_INVARIANT(id_pfr0_el1) +FUNCTION_INVARIANT(id_pfr1_el1) +FUNCTION_INVARIANT(id_dfr0_el1) +FUNCTION_INVARIANT(id_afr0_el1) +FUNCTION_INVARIANT(id_mmfr0_el1) +FUNCTION_INVARIANT(id_mmfr1_el1) +FUNCTION_INVARIANT(id_mmfr2_el1) +FUNCTION_INVARIANT(id_mmfr3_el1) +FUNCTION_INVARIANT(id_isar0_el1) +FUNCTION_INVARIANT(id_isar1_el1) +FUNCTION_INVARIANT(id_isar2_el1) +FUNCTION_INVARIANT(id_isar3_el1) +FUNCTION_INVARIANT(id_isar4_el1) +FUNCTION_INVARIANT(id_isar5_el1) +FUNCTION_INVARIANT(clidr_el1) +FUNCTION_INVARIANT(aidr_el1) + +/* ->val is filled in by kvm_sys_reg_table_init() */ +static struct sys_reg_desc invariant_sys_regs[] = { + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b000), + NULL, get_midr_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b110), + NULL, get_revidr_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b000), + NULL, get_id_pfr0_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b001), + NULL, get_id_pfr1_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b010), + NULL, get_id_dfr0_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b011), + NULL, get_id_afr0_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b100), + NULL, get_id_mmfr0_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b101), + NULL, get_id_mmfr1_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b110), + NULL, get_id_mmfr2_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b111), + NULL, get_id_mmfr3_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), + NULL, get_id_isar0_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b001), + NULL, get_id_isar1_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), + NULL, get_id_isar2_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b011), + NULL, get_id_isar3_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b100), + NULL, get_id_isar4_el1 }, + { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b101), + NULL, get_id_isar5_el1 }, + { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b001), + NULL, get_clidr_el1 }, + { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b111), + NULL, get_aidr_el1 }, + { Op0(0b11), Op1(0b011), CRn(0b0000), CRm(0b0000), Op2(0b001), + NULL, get_ctr_el0 }, +}; + +static int reg_from_user(void *val, const void __user *uaddr, u64 id) +{ + /* This Just Works because we are little endian. */ + if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) + return -EFAULT; + return 0; +} + +static int reg_to_user(void __user *uaddr, const void *val, u64 id) +{ + /* This Just Works because we are little endian. */ + if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) + return -EFAULT; + return 0; +} + +static int get_invariant_sys_reg(u64 id, void __user *uaddr) +{ + struct sys_reg_params params; + const struct sys_reg_desc *r; + + if (!index_to_params(id, ¶ms)) + return -ENOENT; + + r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); + if (!r) + return -ENOENT; + + return reg_to_user(uaddr, &r->val, id); +} + +static int set_invariant_sys_reg(u64 id, void __user *uaddr) +{ + struct sys_reg_params params; + const struct sys_reg_desc *r; + int err; + u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ + + if (!index_to_params(id, ¶ms)) + return -ENOENT; + r = find_reg(¶ms, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); + if (!r) + return -ENOENT; + + err = reg_from_user(&val, uaddr, id); + if (err) + return err; + + /* This is what we mean by invariant: you can't change it. */ + if (r->val != val) + return -EINVAL; + + return 0; +} + +static bool is_valid_cache(u32 val) +{ + u32 level, ctype; + + if (val >= CSSELR_MAX) + return -ENOENT; + + /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ + level = (val >> 1); + ctype = (cache_levels >> (level * 3)) & 7; + + switch (ctype) { + case 0: /* No cache */ + return false; + case 1: /* Instruction cache only */ + return (val & 1); + case 2: /* Data cache only */ + case 4: /* Unified cache */ + return !(val & 1); + case 3: /* Separate instruction and data caches */ + return true; + default: /* Reserved: we can't know instruction or data. */ + return false; + } +} + +static int demux_c15_get(u64 id, void __user *uaddr) +{ + u32 val; + u32 __user *uval = uaddr; + + /* Fail if we have unknown bits set. */ + if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK + | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) + return -ENOENT; + + switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { + case KVM_REG_ARM_DEMUX_ID_CCSIDR: + if (KVM_REG_SIZE(id) != 4) + return -ENOENT; + val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) + >> KVM_REG_ARM_DEMUX_VAL_SHIFT; + if (!is_valid_cache(val)) + return -ENOENT; + + return put_user(get_ccsidr(val), uval); + default: + return -ENOENT; + } +} + +static int demux_c15_set(u64 id, void __user *uaddr) +{ + u32 val, newval; + u32 __user *uval = uaddr; + + /* Fail if we have unknown bits set. */ + if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK + | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) + return -ENOENT; + + switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { + case KVM_REG_ARM_DEMUX_ID_CCSIDR: + if (KVM_REG_SIZE(id) != 4) + return -ENOENT; + val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) + >> KVM_REG_ARM_DEMUX_VAL_SHIFT; + if (!is_valid_cache(val)) + return -ENOENT; + + if (get_user(newval, uval)) + return -EFAULT; + + /* This is also invariant: you can't change it. */ + if (newval != get_ccsidr(val)) + return -EINVAL; + return 0; + default: + return -ENOENT; + } +} + +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + const struct sys_reg_desc *r; + void __user *uaddr = (void __user *)(unsigned long)reg->addr; + + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return demux_c15_get(reg->id, uaddr); + + if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) + return -ENOENT; + + r = index_to_sys_reg_desc(vcpu, reg->id); + if (!r) + return get_invariant_sys_reg(reg->id, uaddr); + + return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id); +} + +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ + const struct sys_reg_desc *r; + void __user *uaddr = (void __user *)(unsigned long)reg->addr; + + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return demux_c15_set(reg->id, uaddr); + + if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) + return -ENOENT; + + r = index_to_sys_reg_desc(vcpu, reg->id); + if (!r) + return set_invariant_sys_reg(reg->id, uaddr); + + return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); +} + +static unsigned int num_demux_regs(void) +{ + unsigned int i, count = 0; + + for (i = 0; i < CSSELR_MAX; i++) + if (is_valid_cache(i)) + count++; + + return count; +} + +static int write_demux_regids(u64 __user *uindices) +{ + u64 val = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; + unsigned int i; + + val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; + for (i = 0; i < CSSELR_MAX; i++) { + if (!is_valid_cache(i)) + continue; + if (put_user(val | i, uindices)) + return -EFAULT; + uindices++; + } + return 0; +} + +static u64 sys_reg_to_index(const struct sys_reg_desc *reg) +{ + return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | + KVM_REG_ARM64_SYSREG | + (reg->Op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) | + (reg->Op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) | + (reg->CRn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) | + (reg->CRm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) | + (reg->Op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT)); +} + +static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) +{ + if (!*uind) + return true; + + if (put_user(sys_reg_to_index(reg), *uind)) + return false; + + (*uind)++; + return true; +} + +/* Assumed ordered tables, see kvm_sys_reg_table_init. */ +static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) +{ + const struct sys_reg_desc *i1, *i2, *end1, *end2; + unsigned int total = 0; + size_t num; + + /* We check for duplicates here, to allow arch-specific overrides. */ + i1 = get_target_table(vcpu->arch.target, &num); + end1 = i1 + num; + i2 = sys_reg_descs; + end2 = sys_reg_descs + ARRAY_SIZE(sys_reg_descs); + + BUG_ON(i1 == end1 || i2 == end2); + + /* Walk carefully, as both tables may refer to the same register. */ + while (i1 || i2) { + int cmp = cmp_sys_reg(i1, i2); + /* target-specific overrides generic entry. */ + if (cmp <= 0) { + /* Ignore registers we trap but don't save. */ + if (i1->reg) { + if (!copy_reg_to_user(i1, &uind)) + return -EFAULT; + total++; + } + } else { + /* Ignore registers we trap but don't save. */ + if (i2->reg) { + if (!copy_reg_to_user(i2, &uind)) + return -EFAULT; + total++; + } + } + + if (cmp <= 0 && ++i1 == end1) + i1 = NULL; + if (cmp >= 0 && ++i2 == end2) + i2 = NULL; + } + return total; +} + +unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) +{ + return ARRAY_SIZE(invariant_sys_regs) + + num_demux_regs() + + walk_sys_regs(vcpu, (u64 __user *)NULL); +} + +int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + unsigned int i; + int err; + + /* Then give them all the invariant registers' indices. */ + for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { + if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) + return -EFAULT; + uindices++; + } + + err = walk_sys_regs(vcpu, uindices); + if (err < 0) + return err; + uindices += err; + + return write_demux_regids(uindices); +} + +void kvm_sys_reg_table_init(void) +{ + unsigned int i; + struct sys_reg_desc clidr; + + /* Make sure tables are unique and in order. */ + for (i = 1; i < ARRAY_SIZE(sys_reg_descs); i++) + BUG_ON(cmp_sys_reg(&sys_reg_descs[i-1], &sys_reg_descs[i]) >= 0); + + /* We abuse the reset function to overwrite the table itself. */ + for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) + invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); + + /* + * CLIDR format is awkward, so clean it up. See ARM B4.1.20: + * + * If software reads the Cache Type fields from Ctype1 + * upwards, once it has seen a value of 0b000, no caches + * exist at further-out levels of the hierarchy. So, for + * example, if Ctype3 is the first Cache Type field with a + * value of 0b000, the values of Ctype4 to Ctype7 must be + * ignored. + */ + get_clidr_el1(NULL, &clidr); /* Ugly... */ + cache_levels = clidr.val; + for (i = 0; i < 7; i++) + if (((cache_levels >> (i*3)) & 7) == 0) + break; + /* Clear all higher bits. */ + cache_levels &= (1 << (i*3))-1; +} + +/** + * kvm_reset_sys_regs - sets system registers to reset value + * @vcpu: The VCPU pointer + * + * This function finds the right table above and sets the registers on the + * virtual CPU struct to their architecturally defined reset values. + */ +void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) +{ + size_t num; + const struct sys_reg_desc *table; + + /* Catch someone adding a register without putting in reset entry. */ + memset(&vcpu->arch.ctxt.sys_regs, 0x42, sizeof(vcpu->arch.ctxt.sys_regs)); + + /* Generic chip reset first (so target could override). */ + reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + + table = get_target_table(vcpu->arch.target, &num); + reset_sys_reg_descs(vcpu, table, num); + + for (num = 1; num < NR_SYS_REGS; num++) + if (vcpu_sys_reg(vcpu, num) == 0x4242424242424242) + panic("Didn't reset vcpu_sys_reg(%zi)", num); +} diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h new file mode 100644 index 000000000000..d50d3722998e --- /dev/null +++ b/arch/arm64/kvm/sys_regs.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2012,2013 - ARM Ltd + * Author: Marc Zyngier + * + * Derived from arch/arm/kvm/coproc.h + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Authors: Christoffer Dall + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ +#define __ARM64_KVM_SYS_REGS_LOCAL_H__ + +struct sys_reg_params { + u8 Op0; + u8 Op1; + u8 CRn; + u8 CRm; + u8 Op2; + u8 Rt; + bool is_write; +}; + +struct sys_reg_desc { + /* MRS/MSR instruction which accesses it. */ + u8 Op0; + u8 Op1; + u8 CRn; + u8 CRm; + u8 Op2; + + /* Trapped access from guest, if non-NULL. */ + bool (*access)(struct kvm_vcpu *, + const struct sys_reg_params *, + const struct sys_reg_desc *); + + /* Initialization for vcpu. */ + void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + + /* Index into sys_reg[], or 0 if we don't need to save it. */ + int reg; + + /* Value (usually reset value) */ + u64 val; +}; + +static inline void print_sys_reg_instr(const struct sys_reg_params *p) +{ + /* Look, we even formatted it for you to paste into the table! */ + kvm_pr_unimpl(" { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", + p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read"); +} + +static inline bool ignore_write(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p) +{ + return true; +} + +static inline bool read_zero(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p) +{ + *vcpu_reg(vcpu, p->Rt) = 0; + return true; +} + +static inline bool write_to_read_only(struct kvm_vcpu *vcpu, + const struct sys_reg_params *params) +{ + kvm_debug("sys_reg write to read-only register at: %lx\n", + *vcpu_pc(vcpu)); + print_sys_reg_instr(params); + return false; +} + +static inline bool read_from_write_only(struct kvm_vcpu *vcpu, + const struct sys_reg_params *params) +{ + kvm_debug("sys_reg read to write-only register at: %lx\n", + *vcpu_pc(vcpu)); + print_sys_reg_instr(params); + return false; +} + +/* Reset functions */ +static inline void reset_unknown(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + BUG_ON(!r->reg); + BUG_ON(r->reg >= NR_SYS_REGS); + vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; +} + +static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + BUG_ON(!r->reg); + BUG_ON(r->reg >= NR_SYS_REGS); + vcpu_sys_reg(vcpu, r->reg) = r->val; +} + +static inline int cmp_sys_reg(const struct sys_reg_desc *i1, + const struct sys_reg_desc *i2) +{ + BUG_ON(i1 == i2); + if (!i1) + return 1; + else if (!i2) + return -1; + if (i1->Op0 != i2->Op0) + return i1->Op0 - i2->Op0; + if (i1->Op1 != i2->Op1) + return i1->Op1 - i2->Op1; + if (i1->CRn != i2->CRn) + return i1->CRn - i2->CRn; + if (i1->CRm != i2->CRm) + return i1->CRm - i2->CRm; + return i1->Op2 - i2->Op2; +} + + +#define Op0(_x) .Op0 = _x +#define Op1(_x) .Op1 = _x +#define CRn(_x) .CRn = _x +#define CRm(_x) .CRm = _x +#define Op2(_x) .Op2 = _x + +#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a5c86fc34a37..2d1bcb891468 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -783,6 +783,7 @@ struct kvm_dirty_tlb { #define KVM_REG_IA64 0x3000000000000000ULL #define KVM_REG_ARM 0x4000000000000000ULL #define KVM_REG_S390 0x5000000000000000ULL +#define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL -- cgit v1.2.3-71-gd317 From c9e2e946fb0ba5d2398feb89558f98c5c28e23e3 Mon Sep 17 00:00:00 2001 From: Jingchang Lu Date: Fri, 7 Jun 2013 09:20:40 +0800 Subject: tty: serial: add Freescale lpuart driver support Add Freescale lpuart driver support. The lpuart device can be found on Vybrid VF610 and Layerscape LS-1 SoCs. Signed-off-by: Jingchang Lu Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/tty/serial/fsl-lpuart.txt | 14 + drivers/tty/serial/Kconfig | 14 + drivers/tty/serial/Makefile | 1 + drivers/tty/serial/fsl_lpuart.c | 874 +++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 5 files changed, 906 insertions(+) create mode 100644 Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt create mode 100644 drivers/tty/serial/fsl_lpuart.c (limited to 'include/uapi/linux') diff --git a/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt b/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt new file mode 100644 index 000000000000..6fd1dd1638dd --- /dev/null +++ b/Documentation/devicetree/bindings/tty/serial/fsl-lpuart.txt @@ -0,0 +1,14 @@ +* Freescale low power universal asynchronous receiver/transmitter (lpuart) + +Required properties: +- compatible : Should be "fsl,-lpuart" +- reg : Address and length of the register set for the device +- interrupts : Should contain uart interrupt + +Example: + +uart0: serial@40027000 { + compatible = "fsl,vf610-lpuart"; + reg = <0x40027000 0x1000>; + interrupts = <0 61 0x00>; + }; diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 95e66acca165..46dd1c72feda 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1483,6 +1483,20 @@ config SERIAL_RP2_NR_UARTS If multiple cards are present, the default limit of 32 ports may need to be increased. +config SERIAL_FSL_LPUART + tristate "Freescale lpuart serial port support" + select SERIAL_CORE + help + Support for the on-chip lpuart on some Freescale SOCs. + +config SERIAL_FSL_LPUART_CONSOLE + bool "Console on Freescale lpuart serial port" + depends on SERIAL_FSL_LPUART=y + select SERIAL_CORE_CONSOLE + help + If you have enabled the lpuart serial port on the Freescale SoCs, + you can make it the console by answering Y to this option. + endmenu endif # TTY diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index eedfec40e3dd..cf650f0cd6e4 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -85,3 +85,4 @@ obj-$(CONFIG_SERIAL_AR933X) += ar933x_uart.o obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o obj-$(CONFIG_SERIAL_ARC) += arc_uart.o obj-$(CONFIG_SERIAL_RP2) += rp2.o +obj-$(CONFIG_SERIAL_FSL_LPUART) += fsl_lpuart.o diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c new file mode 100644 index 000000000000..263cfaabe9e2 --- /dev/null +++ b/drivers/tty/serial/fsl_lpuart.c @@ -0,0 +1,874 @@ +/* + * Freescale lpuart serial port driver + * + * Copyright 2012-2013 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#if defined(CONFIG_SERIAL_FSL_LPUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +#define SUPPORT_SYSRQ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* All registers are 8-bit width */ +#define UARTBDH 0x00 +#define UARTBDL 0x01 +#define UARTCR1 0x02 +#define UARTCR2 0x03 +#define UARTSR1 0x04 +#define UARTCR3 0x06 +#define UARTDR 0x07 +#define UARTCR4 0x0a +#define UARTCR5 0x0b +#define UARTMODEM 0x0d +#define UARTPFIFO 0x10 +#define UARTCFIFO 0x11 +#define UARTSFIFO 0x12 +#define UARTTWFIFO 0x13 +#define UARTTCFIFO 0x14 +#define UARTRWFIFO 0x15 + +#define UARTBDH_LBKDIE 0x80 +#define UARTBDH_RXEDGIE 0x40 +#define UARTBDH_SBR_MASK 0x1f + +#define UARTCR1_LOOPS 0x80 +#define UARTCR1_RSRC 0x20 +#define UARTCR1_M 0x10 +#define UARTCR1_WAKE 0x08 +#define UARTCR1_ILT 0x04 +#define UARTCR1_PE 0x02 +#define UARTCR1_PT 0x01 + +#define UARTCR2_TIE 0x80 +#define UARTCR2_TCIE 0x40 +#define UARTCR2_RIE 0x20 +#define UARTCR2_ILIE 0x10 +#define UARTCR2_TE 0x08 +#define UARTCR2_RE 0x04 +#define UARTCR2_RWU 0x02 +#define UARTCR2_SBK 0x01 + +#define UARTSR1_TDRE 0x80 +#define UARTSR1_TC 0x40 +#define UARTSR1_RDRF 0x20 +#define UARTSR1_IDLE 0x10 +#define UARTSR1_OR 0x08 +#define UARTSR1_NF 0x04 +#define UARTSR1_FE 0x02 +#define UARTSR1_PE 0x01 + +#define UARTCR3_R8 0x80 +#define UARTCR3_T8 0x40 +#define UARTCR3_TXDIR 0x20 +#define UARTCR3_TXINV 0x10 +#define UARTCR3_ORIE 0x08 +#define UARTCR3_NEIE 0x04 +#define UARTCR3_FEIE 0x02 +#define UARTCR3_PEIE 0x01 + +#define UARTCR4_MAEN1 0x80 +#define UARTCR4_MAEN2 0x40 +#define UARTCR4_M10 0x20 +#define UARTCR4_BRFA_MASK 0x1f +#define UARTCR4_BRFA_OFF 0 + +#define UARTCR5_TDMAS 0x80 +#define UARTCR5_RDMAS 0x20 + +#define UARTMODEM_RXRTSE 0x08 +#define UARTMODEM_TXRTSPOL 0x04 +#define UARTMODEM_TXRTSE 0x02 +#define UARTMODEM_TXCTSE 0x01 + +#define UARTPFIFO_TXFE 0x80 +#define UARTPFIFO_FIFOSIZE_MASK 0x7 +#define UARTPFIFO_TXSIZE_OFF 4 +#define UARTPFIFO_RXFE 0x08 +#define UARTPFIFO_RXSIZE_OFF 0 + +#define UARTCFIFO_TXFLUSH 0x80 +#define UARTCFIFO_RXFLUSH 0x40 +#define UARTCFIFO_RXOFE 0x04 +#define UARTCFIFO_TXOFE 0x02 +#define UARTCFIFO_RXUFE 0x01 + +#define UARTSFIFO_TXEMPT 0x80 +#define UARTSFIFO_RXEMPT 0x40 +#define UARTSFIFO_RXOF 0x04 +#define UARTSFIFO_TXOF 0x02 +#define UARTSFIFO_RXUF 0x01 + +#define DRIVER_NAME "fsl-lpuart" +#define DEV_NAME "ttyLP" +#define UART_NR 6 + +struct lpuart_port { + struct uart_port port; + struct clk *clk; + unsigned int txfifo_size; + unsigned int rxfifo_size; +}; + +static struct of_device_id lpuart_dt_ids[] = { + { + .compatible = "fsl,vf610-lpuart", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, lpuart_dt_ids); + +static void lpuart_stop_tx(struct uart_port *port) +{ + unsigned char temp; + + temp = readb(port->membase + UARTCR2); + temp &= ~(UARTCR2_TIE | UARTCR2_TCIE); + writeb(temp, port->membase + UARTCR2); +} + +static void lpuart_stop_rx(struct uart_port *port) +{ + unsigned char temp; + + temp = readb(port->membase + UARTCR2); + writeb(temp & ~UARTCR2_RE, port->membase + UARTCR2); +} + +static void lpuart_enable_ms(struct uart_port *port) +{ +} + +static inline void lpuart_transmit_buffer(struct lpuart_port *sport) +{ + struct circ_buf *xmit = &sport->port.state->xmit; + + while (!uart_circ_empty(xmit) && + (readb(sport->port.membase + UARTTCFIFO) < sport->txfifo_size)) { + writeb(xmit->buf[xmit->tail], sport->port.membase + UARTDR); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + sport->port.icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(&sport->port); + + if (uart_circ_empty(xmit)) + lpuart_stop_tx(&sport->port); +} + +static void lpuart_start_tx(struct uart_port *port) +{ + struct lpuart_port *sport = container_of(port, struct lpuart_port, port); + unsigned char temp; + + temp = readb(port->membase + UARTCR2); + writeb(temp | UARTCR2_TIE, port->membase + UARTCR2); + + if (readb(port->membase + UARTSR1) & UARTSR1_TDRE) + lpuart_transmit_buffer(sport); +} + +static irqreturn_t lpuart_txint(int irq, void *dev_id) +{ + struct lpuart_port *sport = dev_id; + struct circ_buf *xmit = &sport->port.state->xmit; + unsigned long flags; + + spin_lock_irqsave(&sport->port.lock, flags); + if (sport->port.x_char) { + writeb(sport->port.x_char, sport->port.membase + UARTDR); + goto out; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port)) { + lpuart_stop_tx(&sport->port); + goto out; + } + + lpuart_transmit_buffer(sport); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(&sport->port); + +out: + spin_unlock_irqrestore(&sport->port.lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t lpuart_rxint(int irq, void *dev_id) +{ + struct lpuart_port *sport = dev_id; + unsigned int flg, ignored = 0; + struct tty_port *port = &sport->port.state->port; + unsigned long flags; + unsigned char rx, sr; + + spin_lock_irqsave(&sport->port.lock, flags); + + while (!(readb(sport->port.membase + UARTSFIFO) & UARTSFIFO_RXEMPT)) { + flg = TTY_NORMAL; + sport->port.icount.rx++; + /* + * to clear the FE, OR, NF, FE, PE flags, + * read SR1 then read DR + */ + sr = readb(sport->port.membase + UARTSR1); + rx = readb(sport->port.membase + UARTDR); + + if (uart_handle_sysrq_char(&sport->port, (unsigned char)rx)) + continue; + + if (sr & (UARTSR1_PE | UARTSR1_OR | UARTSR1_FE)) { + if (sr & UARTSR1_PE) + sport->port.icount.parity++; + else if (sr & UARTSR1_FE) + sport->port.icount.frame++; + + if (sr & UARTSR1_OR) + sport->port.icount.overrun++; + + if (sr & sport->port.ignore_status_mask) { + if (++ignored > 100) + goto out; + continue; + } + + sr &= sport->port.read_status_mask; + + if (sr & UARTSR1_PE) + flg = TTY_PARITY; + else if (sr & UARTSR1_FE) + flg = TTY_FRAME; + + if (sr & UARTSR1_OR) + flg = TTY_OVERRUN; + +#ifdef SUPPORT_SYSRQ + sport->port.sysrq = 0; +#endif + } + + tty_insert_flip_char(port, rx, flg); + } + +out: + spin_unlock_irqrestore(&sport->port.lock, flags); + + tty_flip_buffer_push(port); + return IRQ_HANDLED; +} + +static irqreturn_t lpuart_int(int irq, void *dev_id) +{ + struct lpuart_port *sport = dev_id; + unsigned char sts; + + sts = readb(sport->port.membase + UARTSR1); + + if (sts & UARTSR1_RDRF) + lpuart_rxint(irq, dev_id); + + if (sts & UARTSR1_TDRE && + !(readb(sport->port.membase + UARTCR5) & UARTCR5_TDMAS)) + lpuart_txint(irq, dev_id); + + return IRQ_HANDLED; +} + +/* return TIOCSER_TEMT when transmitter is not busy */ +static unsigned int lpuart_tx_empty(struct uart_port *port) +{ + return (readb(port->membase + UARTSR1) & UARTSR1_TC) ? + TIOCSER_TEMT : 0; +} + +static unsigned int lpuart_get_mctrl(struct uart_port *port) +{ + unsigned int temp = 0; + unsigned char reg; + + reg = readb(port->membase + UARTMODEM); + if (reg & UARTMODEM_TXCTSE) + temp |= TIOCM_CTS; + + if (reg & UARTMODEM_RXRTSE) + temp |= TIOCM_RTS; + + return temp; +} + +static void lpuart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + unsigned char temp; + + temp = readb(port->membase + UARTMODEM) & + ~(UARTMODEM_RXRTSE | UARTMODEM_TXCTSE); + + if (mctrl & TIOCM_RTS) + temp |= UARTMODEM_RXRTSE; + + if (mctrl & TIOCM_CTS) + temp |= UARTMODEM_TXCTSE; + + writeb(temp, port->membase + UARTMODEM); +} + +static void lpuart_break_ctl(struct uart_port *port, int break_state) +{ + unsigned char temp; + + temp = readb(port->membase + UARTCR2) & ~UARTCR2_SBK; + + if (break_state != 0) + temp |= UARTCR2_SBK; + + writeb(temp, port->membase + UARTCR2); +} + +static void lpuart_setup_watermark(struct lpuart_port *sport) +{ + unsigned char val, cr2; + + cr2 = readb(sport->port.membase + UARTCR2); + cr2 &= ~(UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_TE | + UARTCR2_RIE | UARTCR2_RE); + writeb(cr2, sport->port.membase + UARTCR2); + + /* determine FIFO size and enable FIFO mode */ + val = readb(sport->port.membase + UARTPFIFO); + + sport->txfifo_size = 0x1 << (((val >> UARTPFIFO_TXSIZE_OFF) & + UARTPFIFO_FIFOSIZE_MASK) + 1); + + sport->rxfifo_size = 0x1 << (((val >> UARTPFIFO_RXSIZE_OFF) & + UARTPFIFO_FIFOSIZE_MASK) + 1); + + writeb(val | UARTPFIFO_TXFE | UARTPFIFO_RXFE, + sport->port.membase + UARTPFIFO); + + /* flush Tx and Rx FIFO */ + writeb(UARTCFIFO_TXFLUSH | UARTCFIFO_RXFLUSH, + sport->port.membase + UARTCFIFO); + + writeb(2, sport->port.membase + UARTTWFIFO); + writeb(1, sport->port.membase + UARTRWFIFO); +} + +static int lpuart_startup(struct uart_port *port) +{ + struct lpuart_port *sport = container_of(port, struct lpuart_port, port); + int ret; + unsigned long flags; + unsigned char temp; + + ret = devm_request_irq(port->dev, port->irq, lpuart_int, 0, + DRIVER_NAME, sport); + if (ret) + return ret; + + spin_lock_irqsave(&sport->port.lock, flags); + + lpuart_setup_watermark(sport); + + temp = readb(sport->port.membase + UARTCR2); + temp |= (UARTCR2_RIE | UARTCR2_TIE | UARTCR2_RE | UARTCR2_TE); + writeb(temp, sport->port.membase + UARTCR2); + + spin_unlock_irqrestore(&sport->port.lock, flags); + return 0; +} + +static void lpuart_shutdown(struct uart_port *port) +{ + struct lpuart_port *sport = container_of(port, struct lpuart_port, port); + unsigned char temp; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + + /* disable Rx/Tx and interrupts */ + temp = readb(port->membase + UARTCR2); + temp &= ~(UARTCR2_TE | UARTCR2_RE | + UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_RIE); + writeb(temp, port->membase + UARTCR2); + + spin_unlock_irqrestore(&port->lock, flags); + + devm_free_irq(port->dev, port->irq, sport); +} + +static void +lpuart_set_termios(struct uart_port *port, struct ktermios *termios, + struct ktermios *old) +{ + struct lpuart_port *sport = container_of(port, struct lpuart_port, port); + unsigned long flags; + unsigned char cr1, old_cr1, old_cr2, cr4, bdh, modem; + unsigned int baud; + unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8; + unsigned int sbr, brfa; + + cr1 = old_cr1 = readb(sport->port.membase + UARTCR1); + old_cr2 = readb(sport->port.membase + UARTCR2); + cr4 = readb(sport->port.membase + UARTCR4); + bdh = readb(sport->port.membase + UARTBDH); + modem = readb(sport->port.membase + UARTMODEM); + /* + * only support CS8 and CS7, and for CS7 must enable PE. + * supported mode: + * - (7,e/o,1) + * - (8,n,1) + * - (8,m/s,1) + * - (8,e/o,1) + */ + while ((termios->c_cflag & CSIZE) != CS8 && + (termios->c_cflag & CSIZE) != CS7) { + termios->c_cflag &= ~CSIZE; + termios->c_cflag |= old_csize; + old_csize = CS8; + } + + if ((termios->c_cflag & CSIZE) == CS8 || + (termios->c_cflag & CSIZE) == CS7) + cr1 = old_cr1 & ~UARTCR1_M; + + if (termios->c_cflag & CMSPAR) { + if ((termios->c_cflag & CSIZE) != CS8) { + termios->c_cflag &= ~CSIZE; + termios->c_cflag |= CS8; + } + cr1 |= UARTCR1_M; + } + + if (termios->c_cflag & CRTSCTS) { + modem |= (UARTMODEM_RXRTSE | UARTMODEM_TXCTSE); + } else { + termios->c_cflag &= ~CRTSCTS; + modem &= ~(UARTMODEM_RXRTSE | UARTMODEM_TXCTSE); + } + + if (termios->c_cflag & CSTOPB) + termios->c_cflag &= ~CSTOPB; + + /* parity must be enabled when CS7 to match 8-bits format */ + if ((termios->c_cflag & CSIZE) == CS7) + termios->c_cflag |= PARENB; + + if ((termios->c_cflag & PARENB)) { + if (termios->c_cflag & CMSPAR) { + cr1 &= ~UARTCR1_PE; + cr1 |= UARTCR1_M; + } else { + cr1 |= UARTCR1_PE; + if ((termios->c_cflag & CSIZE) == CS8) + cr1 |= UARTCR1_M; + if (termios->c_cflag & PARODD) + cr1 |= UARTCR1_PT; + else + cr1 &= ~UARTCR1_PT; + } + } + + /* ask the core to calculate the divisor */ + baud = uart_get_baud_rate(port, termios, old, 50, port->uartclk / 16); + + spin_lock_irqsave(&sport->port.lock, flags); + + sport->port.read_status_mask = 0; + if (termios->c_iflag & INPCK) + sport->port.read_status_mask |= (UARTSR1_FE | UARTSR1_PE); + if (termios->c_iflag & (BRKINT | PARMRK)) + sport->port.read_status_mask |= UARTSR1_FE; + + /* characters to ignore */ + sport->port.ignore_status_mask = 0; + if (termios->c_iflag & IGNPAR) + sport->port.ignore_status_mask |= UARTSR1_PE; + if (termios->c_iflag & IGNBRK) { + sport->port.ignore_status_mask |= UARTSR1_FE; + /* + * if we're ignoring parity and break indicators, + * ignore overruns too (for real raw support). + */ + if (termios->c_iflag & IGNPAR) + sport->port.ignore_status_mask |= UARTSR1_OR; + } + + /* update the per-port timeout */ + uart_update_timeout(port, termios->c_cflag, baud); + + /* wait transmit engin complete */ + while (!(readb(sport->port.membase + UARTSR1) & UARTSR1_TC)) + barrier(); + + /* disable transmit and receive */ + writeb(old_cr2 & ~(UARTCR2_TE | UARTCR2_RE), + sport->port.membase + UARTCR2); + + sbr = sport->port.uartclk / (16 * baud); + brfa = ((sport->port.uartclk - (16 * sbr * baud)) * 2) / baud; + bdh &= ~UARTBDH_SBR_MASK; + bdh |= (sbr >> 8) & 0x1F; + cr4 &= ~UARTCR4_BRFA_MASK; + brfa &= UARTCR4_BRFA_MASK; + writeb(cr4 | brfa, sport->port.membase + UARTCR4); + writeb(bdh, sport->port.membase + UARTBDH); + writeb(sbr & 0xFF, sport->port.membase + UARTBDL); + writeb(cr1, sport->port.membase + UARTCR1); + writeb(modem, sport->port.membase + UARTMODEM); + + /* restore control register */ + writeb(old_cr2, sport->port.membase + UARTCR2); + + spin_unlock_irqrestore(&sport->port.lock, flags); +} + +static const char *lpuart_type(struct uart_port *port) +{ + return "FSL_LPUART"; +} + +static void lpuart_release_port(struct uart_port *port) +{ + /* nothing to do */ +} + +static int lpuart_request_port(struct uart_port *port) +{ + return 0; +} + +/* configure/autoconfigure the port */ +static void lpuart_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) + port->type = PORT_LPUART; +} + +static int lpuart_verify_port(struct uart_port *port, struct serial_struct *ser) +{ + int ret = 0; + + if (ser->type != PORT_UNKNOWN && ser->type != PORT_LPUART) + ret = -EINVAL; + if (port->irq != ser->irq) + ret = -EINVAL; + if (ser->io_type != UPIO_MEM) + ret = -EINVAL; + if (port->uartclk / 16 != ser->baud_base) + ret = -EINVAL; + if (port->iobase != ser->port) + ret = -EINVAL; + if (ser->hub6 != 0) + ret = -EINVAL; + return ret; +} + +static struct uart_ops lpuart_pops = { + .tx_empty = lpuart_tx_empty, + .set_mctrl = lpuart_set_mctrl, + .get_mctrl = lpuart_get_mctrl, + .stop_tx = lpuart_stop_tx, + .start_tx = lpuart_start_tx, + .stop_rx = lpuart_stop_rx, + .enable_ms = lpuart_enable_ms, + .break_ctl = lpuart_break_ctl, + .startup = lpuart_startup, + .shutdown = lpuart_shutdown, + .set_termios = lpuart_set_termios, + .type = lpuart_type, + .request_port = lpuart_request_port, + .release_port = lpuart_release_port, + .config_port = lpuart_config_port, + .verify_port = lpuart_verify_port, +}; + +static struct lpuart_port *lpuart_ports[UART_NR]; + +#ifdef CONFIG_SERIAL_FSL_LPUART_CONSOLE +static void lpuart_console_putchar(struct uart_port *port, int ch) +{ + while (!(readb(port->membase + UARTSR1) & UARTSR1_TDRE)) + barrier(); + + writeb(ch, port->membase + UARTDR); +} + +static void +lpuart_console_write(struct console *co, const char *s, unsigned int count) +{ + struct lpuart_port *sport = lpuart_ports[co->index]; + unsigned char old_cr2, cr2; + + /* first save CR2 and then disable interrupts */ + cr2 = old_cr2 = readb(sport->port.membase + UARTCR2); + cr2 |= (UARTCR2_TE | UARTCR2_RE); + cr2 &= ~(UARTCR2_TIE | UARTCR2_TCIE | UARTCR2_RIE); + writeb(cr2, sport->port.membase + UARTCR2); + + uart_console_write(&sport->port, s, count, lpuart_console_putchar); + + /* wait for transmitter finish complete and restore CR2 */ + while (!(readb(sport->port.membase + UARTSR1) & UARTSR1_TC)) + barrier(); + + writeb(old_cr2, sport->port.membase + UARTCR2); +} + +/* + * if the port was already initialised (eg, by a boot loader), + * try to determine the current setup. + */ +static void __init +lpuart_console_get_options(struct lpuart_port *sport, int *baud, + int *parity, int *bits) +{ + unsigned char cr, bdh, bdl, brfa; + unsigned int sbr, uartclk, baud_raw; + + cr = readb(sport->port.membase + UARTCR2); + cr &= UARTCR2_TE | UARTCR2_RE; + if (!cr) + return; + + /* ok, the port was enabled */ + + cr = readb(sport->port.membase + UARTCR1); + + *parity = 'n'; + if (cr & UARTCR1_PE) { + if (cr & UARTCR1_PT) + *parity = 'o'; + else + *parity = 'e'; + } + + if (cr & UARTCR1_M) + *bits = 9; + else + *bits = 8; + + bdh = readb(sport->port.membase + UARTBDH); + bdh &= UARTBDH_SBR_MASK; + bdl = readb(sport->port.membase + UARTBDL); + sbr = bdh; + sbr <<= 8; + sbr |= bdl; + brfa = readb(sport->port.membase + UARTCR4); + brfa &= UARTCR4_BRFA_MASK; + + uartclk = clk_get_rate(sport->clk); + /* + * baud = mod_clk/(16*(sbr[13]+(brfa)/32) + */ + baud_raw = uartclk / (16 * (sbr + brfa / 32)); + + if (*baud != baud_raw) + printk(KERN_INFO "Serial: Console lpuart rounded baud rate" + "from %d to %d\n", baud_raw, *baud); +} + +static int __init lpuart_console_setup(struct console *co, char *options) +{ + struct lpuart_port *sport; + int baud = 115200; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + /* + * check whether an invalid uart number has been specified, and + * if so, search for the first available port that does have + * console support. + */ + if (co->index == -1 || co->index >= ARRAY_SIZE(lpuart_ports)) + co->index = 0; + + sport = lpuart_ports[co->index]; + if (sport == NULL) + return -ENODEV; + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else + lpuart_console_get_options(sport, &baud, &parity, &bits); + + lpuart_setup_watermark(sport); + + return uart_set_options(&sport->port, co, baud, parity, bits, flow); +} + +static struct uart_driver lpuart_reg; +static struct console lpuart_console = { + .name = DEV_NAME, + .write = lpuart_console_write, + .device = uart_console_device, + .setup = lpuart_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &lpuart_reg, +}; + +#define LPUART_CONSOLE (&lpuart_console) +#else +#define LPUART_CONSOLE NULL +#endif + +static struct uart_driver lpuart_reg = { + .owner = THIS_MODULE, + .driver_name = DRIVER_NAME, + .dev_name = DEV_NAME, + .nr = ARRAY_SIZE(lpuart_ports), + .cons = LPUART_CONSOLE, +}; + +static int lpuart_probe(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + struct lpuart_port *sport; + struct resource *res; + int ret; + + sport = devm_kzalloc(&pdev->dev, sizeof(*sport), GFP_KERNEL); + if (!sport) + return -ENOMEM; + + pdev->dev.coherent_dma_mask = 0; + + ret = of_alias_get_id(np, "serial"); + if (ret < 0) { + dev_err(&pdev->dev, "failed to get alias id, errno %d\n", ret); + return ret; + } + sport->port.line = ret; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + sport->port.mapbase = res->start; + sport->port.membase = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(sport->port.membase)) + return PTR_ERR(sport->port.membase); + + sport->port.dev = &pdev->dev; + sport->port.type = PORT_LPUART; + sport->port.iotype = UPIO_MEM; + sport->port.irq = platform_get_irq(pdev, 0); + sport->port.ops = &lpuart_pops; + sport->port.flags = UPF_BOOT_AUTOCONF; + + sport->clk = devm_clk_get(&pdev->dev, "ipg"); + if (IS_ERR(sport->clk)) { + ret = PTR_ERR(sport->clk); + dev_err(&pdev->dev, "failed to get uart clk: %d\n", ret); + return ret; + } + + ret = clk_prepare_enable(sport->clk); + if (ret) { + dev_err(&pdev->dev, "failed to enable uart clk: %d\n", ret); + return ret; + } + + sport->port.uartclk = clk_get_rate(sport->clk); + + lpuart_ports[sport->port.line] = sport; + + platform_set_drvdata(pdev, &sport->port); + + ret = uart_add_one_port(&lpuart_reg, &sport->port); + if (ret) { + clk_disable_unprepare(sport->clk); + return ret; + } + + return 0; +} + +static int lpuart_remove(struct platform_device *pdev) +{ + struct lpuart_port *sport = platform_get_drvdata(pdev); + + uart_remove_one_port(&lpuart_reg, &sport->port); + + clk_disable_unprepare(sport->clk); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int lpuart_suspend(struct device *dev) +{ + struct lpuart_port *sport = dev_get_drvdata(dev); + + uart_suspend_port(&lpuart_reg, &sport->port); + + return 0; +} + +static int lpuart_resume(struct device *dev) +{ + struct lpuart_port *sport = dev_get_drvdata(dev); + + uart_resume_port(&lpuart_reg, &sport->port); + + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(lpuart_pm_ops, lpuart_suspend, lpuart_resume); + +static struct platform_driver lpuart_driver = { + .probe = lpuart_probe, + .remove = lpuart_remove, + .driver = { + .name = "fsl-lpuart", + .owner = THIS_MODULE, + .of_match_table = lpuart_dt_ids, + .pm = &lpuart_pm_ops, + }, +}; + +static int __init lpuart_serial_init(void) +{ + int ret; + + pr_info("serial: Freescale lpuart driver\n"); + + ret = uart_register_driver(&lpuart_reg); + if (ret) + return ret; + + ret = platform_driver_register(&lpuart_driver); + if (ret) + uart_unregister_driver(&lpuart_reg); + + return 0; +} + +static void __exit lpuart_serial_exit(void) +{ + platform_driver_unregister(&lpuart_driver); + uart_unregister_driver(&lpuart_reg); +} + +module_init(lpuart_serial_init); +module_exit(lpuart_serial_exit); + +MODULE_DESCRIPTION("Freescale lpuart serial port driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 74c2bf7211f8..c8eaeb5465ef 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -226,4 +226,7 @@ /* Rocketport EXPRESS/INFINITY */ #define PORT_RP2 102 +/* Freescale lpuart */ +#define PORT_LPUART 103 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3-71-gd317 From 060212928670593fb89243640bf05cf89560b023 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:50 +0300 Subject: net: add low latency socket poll Adds an ndo_ll_poll method and the code that supports it. This method can be used by low latency applications to busy-poll Ethernet device queues directly from the socket code. sysctl_net_ll_poll controls how many microseconds to poll. Default is zero (disabled). Individual protocol support will be added by subsequent patches. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 7 ++ include/linux/netdevice.h | 3 + include/linux/skbuff.h | 8 ++- include/net/ll_poll.h | 148 +++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 4 ++ include/uapi/linux/snmp.h | 1 + net/Kconfig | 12 ++++ net/core/skbuff.c | 4 ++ net/core/sock.c | 6 ++ net/core/sysctl_net_core.c | 10 +++ net/ipv4/proc.c | 1 + net/socket.c | 6 ++ 12 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 include/net/ll_poll.h (limited to 'include/uapi/linux') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index c1f8640c2fc8..85ab72dcdc3c 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 +low_latency_poll +---------------- +Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value is 50. May increase power usage. +Default: 0 (off) + rmem_default ------------ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39bbd462d68e..2ecb96d9a1e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -971,6 +971,9 @@ struct net_device_ops { struct netpoll_info *info, gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); +#endif +#ifdef CONFIG_NET_LL_RX_POLL + int (*ndo_ll_poll)(struct napi_struct *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9995834d2cb6..400d82ae2b03 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions + * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark * @dropcount: total number of sk_receive_queue overflows @@ -500,8 +501,11 @@ struct sk_buff { /* 7/9 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#ifdef CONFIG_NET_DMA - dma_cookie_t dma_cookie; +#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL + union { + unsigned int napi_id; + dma_cookie_t dma_cookie; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h new file mode 100644 index 000000000000..bc262f88173f --- /dev/null +++ b/include/net/ll_poll.h @@ -0,0 +1,148 @@ +/* + * Low Latency Sockets + * Copyright(c) 2013 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Author: Eliezer Tamir + * + * Contact Information: + * e1000-devel Mailing List + */ + +/* + * For now this depends on CONFIG_X86_TSC + */ + +#ifndef _LINUX_NET_LL_POLL_H +#define _LINUX_NET_LL_POLL_H + +#include +#include + +#ifdef CONFIG_NET_LL_RX_POLL + +struct napi_struct; +extern unsigned long sysctl_net_ll_poll __read_mostly; + +/* return values from ndo_ll_poll */ +#define LL_FLUSH_FAILED -1 +#define LL_FLUSH_BUSY -2 + +/* we don't mind a ~2.5% imprecision */ +#define TSC_MHZ (tsc_khz >> 10) + +static inline cycles_t ll_end_time(void) +{ + return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return sysctl_net_ll_poll && sk->sk_napi_id && + !need_resched() && !signal_pending(current); +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return !time_after((unsigned long)get_cycles(), + (unsigned long)end_time); +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + cycles_t end_time = ll_end_time(); + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_ll_poll) + goto out; + + do { + + rc = ops->ndo_ll_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_LOWLATENCYRXPACKETS, rc); + + } while (skb_queue_empty(&sk->sk_receive_queue) + && can_poll_ll(end_time) && !nonblock); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} + +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ + skb->napi_id = napi->napi_id; +} + +/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_napi_id = skb->napi_id; +} + +#else /* CONFIG_NET_LL_RX_POLL */ + +static inline cycles_t ll_end_time(void) +{ + return 0; +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return false; +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + return false; +} + +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ +} + +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return false; +} + +#endif /* CONFIG_NET_LL_RX_POLL */ +#endif /* _LINUX_NET_LL_POLL_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 66772cf8c3c5..ac8e1818380c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -229,6 +229,7 @@ struct cg_proto; * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward + * @sk_napi_id: id of the last napi context to receive data for sk * @sk_allocation: allocation mode * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, @@ -324,6 +325,9 @@ struct sock { int sk_forward_alloc; #ifdef CONFIG_RPS __u32 sk_rxhash; +#endif +#ifdef CONFIG_NET_LL_RX_POLL + unsigned int sk_napi_id; #endif atomic_t sk_drops; int sk_rcvbuf; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df2e8b4f9c03..26cbf76f8058 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,6 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ + LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/Kconfig b/net/Kconfig index 523e43e6da1b..d6a9ce6e1800 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -243,6 +243,18 @@ config NETPRIO_CGROUP Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis +config NET_LL_RX_POLL + bool "Low Latency Receive Poll" + depends on X86_TSC + default n + ---help--- + Support Low Latency Receive Queue Poll. + (For network card drivers which support this option.) + When waiting for data in read or poll call directly into the the device driver + to flush packets which may be pending on the device queues into the stack. + + If unsure, say N. + config BQL boolean depends on SYSFS diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73f57a0e1523..4a4181e16c1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_LL_RX_POLL + new->napi_id = old->napi_id; +#endif } /* diff --git a/net/core/sock.c b/net/core/sock.c index 88868a9d21da..788c0da5eed1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include #endif +#include + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_LL_RX_POLL + sk->sk_napi_id = 0; +#endif + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 741db5fc7806..4b48f39582b0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,6 +19,7 @@ #include #include #include +#include static int one = 1; @@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_LL_RX_POLL + { + .procname = "low_latency_poll", + .data = &sysctl_net_ll_poll, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d2415..6577a1149a47 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), SNMP_MIB_SENTINEL }; diff --git a/net/socket.c b/net/socket.c index 3ebdcb805c51..21fd29f63ed2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,12 @@ #include #include #include +#include + +#ifdef CONFIG_NET_LL_RX_POLL +unsigned long sysctl_net_ll_poll __read_mostly; +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); +#endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, -- cgit v1.2.3-71-gd317 From 2a8fedd0c142d4328ab4667847e05afe17c3295c Mon Sep 17 00:00:00 2001 From: David Daney Date: Mon, 10 Jun 2013 12:33:47 -0700 Subject: kvm: Add definition of KVM_REG_MIPS We use 0x7000000000000000ULL as 0x6000000000000000ULL is reserved for ARM64. Signed-off-by: David Daney Signed-off-by: Gleb Natapov --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a5c86fc34a37..d88c8ee00c8b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -783,6 +783,7 @@ struct kvm_dirty_tlb { #define KVM_REG_IA64 0x3000000000000000ULL #define KVM_REG_ARM 0x4000000000000000ULL #define KVM_REG_S390 0x5000000000000000ULL +#define KVM_REG_MIPS 0x7000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL -- cgit v1.2.3-71-gd317 From 9ba18891f75535eca3ef53138b48970eb60f5255 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 5 Jun 2013 10:08:00 -0400 Subject: bridge: Add flag to control mac learning. Allow user to control whether mac learning is enabled on the port. By default, mac learning is enabled. Disabling mac learning will cause new dynamic FDB entries to not be created for a particular port. Signed-off-by: Vlad Yasevich Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_if.c | 2 +- net/bridge/br_input.c | 6 ++++-- net/bridge/br_netlink.c | 6 +++++- net/bridge/br_private.h | 1 + net/bridge/br_sysfs_if.c | 2 ++ 6 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b05823cae784..8643809d8417 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -221,6 +221,7 @@ enum { IFLA_BRPORT_GUARD, /* bpdu guard */ IFLA_BRPORT_PROTECT, /* root port protection */ IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4cdba60926ff..2c08911df578 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -221,7 +221,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = 0; + p->flags = BR_LEARNING; br_init_port(p); p->state = BR_STATE_DISABLED; br_stp_port_timer_init(p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 828e2bcc1f52..7e993667d4bf 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -75,7 +75,8 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; - br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); + if (p->flags & BR_LEARNING) + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid); if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb)) @@ -142,7 +143,8 @@ static int br_handle_local_finish(struct sk_buff *skb) u16 vid = 0; br_vlan_get_tag(skb, &vid); - br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); + if (p->flags & BR_LEARNING) + br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid); return 0; /* process further */ } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8e3abf564798..ce902bf8a618 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -30,6 +30,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_GUARD */ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + 0; } @@ -56,7 +57,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || - nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE))) + nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING))) return -EMSGSIZE; return 0; @@ -281,6 +283,7 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -328,6 +331,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD); br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); + br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1b0ac95a5c37..04d7f43508f7 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -158,6 +158,7 @@ struct net_bridge_port #define BR_ROOT_BLOCK 0x00000004 #define BR_MULTICAST_FAST_LEAVE 0x00000008 #define BR_ADMIN_COST 0x00000010 +#define BR_LEARNING 0x00000020 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u32 multicast_startup_queries_sent; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index a1ef1b6e14dc..707f3628e9cd 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -158,6 +158,7 @@ static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); +BRPORT_ATTR_FLAG(learning, BR_LEARNING); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -195,6 +196,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_hairpin_mode, &brport_attr_bpdu_guard, &brport_attr_root_block, + &brport_attr_learning, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, -- cgit v1.2.3-71-gd317 From 867a59436fc35593ae0e0efcd56cc6d2f8506586 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 5 Jun 2013 10:08:01 -0400 Subject: bridge: Add a flag to control unicast packet flood. Add a flag to control flood of unicast traffic. By default, flood is on and the bridge will flood unicast traffic if it doesn't know the destination. When the flag is turned off, unicast traffic without an FDB will not be forwarded to the specified port. Signed-off-by: Vlad Yasevich Reviewed-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_device.c | 8 ++++---- net/bridge/br_forward.c | 14 +++++++++----- net/bridge/br_if.c | 2 +- net/bridge/br_input.c | 9 ++++++--- net/bridge/br_netlink.c | 6 +++++- net/bridge/br_private.h | 6 ++++-- net/bridge/br_sysfs_if.c | 2 ++ 8 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8643809d8417..da05a2698cb5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -222,6 +222,7 @@ enum { IFLA_BRPORT_PROTECT, /* root port protection */ IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 75f3239130f8..2ef66781fedb 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -58,10 +58,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_pull(skb, ETH_HLEN); if (is_broadcast_ether_addr(dest)) - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); goto out; } if (br_multicast_rcv(br, NULL, skb)) { @@ -73,11 +73,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) br_multicast_deliver(mdst, skb); else - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, false); } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) br_deliver(dst->dst, skb); else - br_flood_deliver(br, skb); + br_flood_deliver(br, skb, true); out: rcu_read_unlock(); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 092b20e4ee4c..4b81b1471789 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -174,7 +174,8 @@ out: static void br_flood(struct net_bridge *br, struct sk_buff *skb, struct sk_buff *skb0, void (*__packet_hook)(const struct net_bridge_port *p, - struct sk_buff *skb)) + struct sk_buff *skb), + bool unicast) { struct net_bridge_port *p; struct net_bridge_port *prev; @@ -182,6 +183,9 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, prev = NULL; list_for_each_entry_rcu(p, &br->port_list, list) { + /* Do not flood unicast traffic to ports that turn it off */ + if (unicast && !(p->flags & BR_FLOOD)) + continue; prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) goto out; @@ -203,16 +207,16 @@ out: /* called with rcu_read_lock */ -void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) +void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast) { - br_flood(br, skb, NULL, __br_deliver); + br_flood(br, skb, NULL, __br_deliver, unicast); } /* called under bridge lock */ void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2) + struct sk_buff *skb2, bool unicast) { - br_flood(br, skb, skb2, __br_forward); + br_flood(br, skb, skb2, __br_forward, unicast); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 2c08911df578..5623be6b9ecd 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -221,7 +221,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING; + p->flags = BR_LEARNING | BR_FLOOD; br_init_port(p); p->state = BR_STATE_DISABLED; br_stp_port_timer_init(p); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7e993667d4bf..1b8b8b824cd7 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -65,6 +65,7 @@ int br_handle_frame_finish(struct sk_buff *skb) struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct sk_buff *skb2; + bool unicast = true; u16 vid = 0; if (!p || p->state == BR_STATE_DISABLED) @@ -95,9 +96,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_broadcast_ether_addr(dest)) + if (is_broadcast_ether_addr(dest)) { skb2 = skb; - else if (is_multicast_ether_addr(dest)) { + unicast = false; + } else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb, vid); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { if ((mdst && mdst->mglist) || @@ -110,6 +112,7 @@ int br_handle_frame_finish(struct sk_buff *skb) } else skb2 = skb; + unicast = false; br->dev->stats.multicast++; } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { @@ -123,7 +126,7 @@ int br_handle_frame_finish(struct sk_buff *skb) dst->used = jiffies; br_forward(dst->dst, skb, skb2); } else - br_flood_forward(br, skb, skb2); + br_flood_forward(br, skb, skb2, unicast); } if (skb2) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ce902bf8a618..1fc30abd3a52 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -31,6 +31,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROTECT */ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + 0; } @@ -58,7 +59,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || - nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING))) + nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD))) return -EMSGSIZE; return 0; @@ -284,6 +286,7 @@ static const struct nla_policy ifla_brport_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, + [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -332,6 +335,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE); br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); + br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 04d7f43508f7..3be89b3ce17b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -159,6 +159,7 @@ struct net_bridge_port #define BR_MULTICAST_FAST_LEAVE 0x00000008 #define BR_ADMIN_COST 0x00000010 #define BR_LEARNING 0x00000020 +#define BR_FLOOD 0x00000040 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u32 multicast_startup_queries_sent; @@ -414,9 +415,10 @@ extern int br_dev_queue_push_xmit(struct sk_buff *skb); extern void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0); extern int br_forward_finish(struct sk_buff *skb); -extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); +extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, + bool unicast); extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, - struct sk_buff *skb2); + struct sk_buff *skb2, bool unicast); /* br_if.c */ extern void br_port_carrier_check(struct net_bridge_port *p); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 707f3628e9cd..2a2cdb756d51 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -159,6 +159,7 @@ BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); +BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -197,6 +198,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_bpdu_guard, &brport_attr_root_block, &brport_attr_learning, + &brport_attr_unicast_flood, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, -- cgit v1.2.3-71-gd317 From 45203a3b380cee28f570475c0d28c169f908c209 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Jun 2013 08:43:22 -0700 Subject: net_sched: add 64bit rate estimators struct gnet_stats_rate_est contains u32 fields, so the bytes per second field can wrap at 34360Mbit. Add a new gnet_stats_rate_est64 structure to get 64bit bps/pps fields, and switch the kernel to use this structure natively. This structure is dumped to user space as a new attribute : TCA_STATS_RATE_EST64 Old tc command will now display the capped bps (to 34360Mbit), instead of wrapped values, and updated tc command will display correct information. Old tc command output, after patch : eric:~# tc -s -d qd sh dev lo qdisc pfifo 8001: root refcnt 2 limit 1000p Sent 80868245400 bytes 1978837 pkt (dropped 0, overlimits 0 requeues 0) rate 34360Mbit 189696pps backlog 0b 0p requeues 0 This patch carefully reorganizes "struct Qdisc" layout to get optimal performance on SMP. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/gen_stats.h | 10 +++++----- include/net/netfilter/xt_rateest.h | 2 +- include/net/sch_generic.h | 13 +++++++------ include/uapi/linux/gen_stats.h | 11 +++++++++++ net/core/gen_estimator.c | 12 ++++++------ net/core/gen_stats.c | 22 +++++++++++++++++----- net/netfilter/xt_rateest.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_qfq.c | 2 +- 13 files changed, 54 insertions(+), 30 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/act_api.h b/include/net/act_api.h index 06ef7e926a66..b8ffac7b6bab 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -18,7 +18,7 @@ struct tcf_common { struct tcf_t tcfc_tm; struct gnet_stats_basic_packed tcfc_bstats; struct gnet_stats_queue tcfc_qstats; - struct gnet_stats_rate_est tcfc_rate_est; + struct gnet_stats_rate_est64 tcfc_rate_est; spinlock_t tcfc_lock; struct rcu_head tcfc_rcu; }; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index a79b6cfb02a8..cf8439ba4d11 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -30,7 +30,7 @@ extern int gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b); extern int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est *r); + struct gnet_stats_rate_est64 *r); extern int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q); extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); @@ -38,13 +38,13 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); extern int gnet_stats_finish_copy(struct gnet_dump *d); extern int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); extern void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est); + struct gnet_stats_rate_est64 *rate_est); extern int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); extern bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est); + const struct gnet_stats_rate_est64 *rate_est); #endif diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 5a2978d1cb22..495c71f66e7e 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -6,7 +6,7 @@ struct xt_rateest { struct gnet_stats_basic_packed bstats; spinlock_t lock; /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */ - struct gnet_stats_rate_est rstats; + struct gnet_stats_rate_est64 rstats; /* following fields not accessed in hot path */ struct hlist_node list; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e7f4e21cc3e1..df5676029827 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -58,14 +58,12 @@ struct Qdisc { * multiqueue device. */ #define TCQ_F_WARN_NONWC (1 << 16) - int padded; + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct list_head list; u32 handle; u32 parent; - atomic_t refcnt; - struct gnet_stats_rate_est rate_est; int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q); @@ -76,8 +74,9 @@ struct Qdisc { */ struct Qdisc *__parent; struct netdev_queue *dev_queue; - struct Qdisc *next_sched; + struct gnet_stats_rate_est64 rate_est; + struct Qdisc *next_sched; struct sk_buff *gso_skb; /* * For performance sake on SMP, we put highly modified fields at the end @@ -88,8 +87,10 @@ struct Qdisc { unsigned int __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; - spinlock_t busylock; - u32 limit; + int padded; + atomic_t refcnt; + + spinlock_t busylock ____cacheline_aligned_in_smp; }; static inline bool qdisc_is_running(const struct Qdisc *qdisc) diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 552c8a0a12d1..6487317ea619 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -9,6 +9,7 @@ enum { TCA_STATS_RATE_EST, TCA_STATS_QUEUE, TCA_STATS_APP, + TCA_STATS_RATE_EST64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) @@ -37,6 +38,16 @@ struct gnet_stats_rate_est { __u32 pps; }; +/** + * struct gnet_stats_rate_est64 - rate estimator + * @bps: current byte rate + * @pps: current packet rate + */ +struct gnet_stats_rate_est64 { + __u64 bps; + __u64 pps; +}; + /** * struct gnet_stats_queue - queuing statistics * @qlen: queue length diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index d9d198aa9fed..6b5b6e7013ca 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -82,7 +82,7 @@ struct gen_estimator { struct list_head list; struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est *rate_est; + struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; u64 last_bytes; @@ -167,7 +167,7 @@ static void gen_add_node(struct gen_estimator *est) static struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { struct rb_node *p = est_root.rb_node; @@ -203,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -258,7 +258,7 @@ EXPORT_SYMBOL(gen_new_estimator); * Note : Caller should respect an RCU grace period before freeing stats_lock */ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est) + struct gnet_stats_rate_est64 *rate_est) { struct gen_estimator *e; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); @@ -306,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator); * Returns true if estimator is active, and false if not. */ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { bool res; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf211e588..9d3d9e78397b 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -143,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est *r) + struct gnet_stats_rate_est64 *r) { + struct gnet_stats_rate_est est; + int res; + if (b && !gen_estimator_active(b, r)) return 0; + est.bps = min_t(u64, UINT_MAX, r->bps); + /* we have some time before reaching 2^32 packets per second */ + est.pps = r->pps; + if (d->compat_tc_stats) { - d->tc_stats.bps = r->bps; - d->tc_stats.pps = r->pps; + d->tc_stats.bps = est.bps; + d->tc_stats.pps = est.pps; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + if (d->tail) { + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + if (res < 0 || est.bps == r->bps) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + } return 0; } diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ed0db15ab00e..7720b036d76a 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,7 +18,7 @@ static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; - struct gnet_stats_rate_est *r; + struct gnet_stats_rate_est64 *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bc210ffcba2..71a568862557 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 759b308d1a8d..8302717ea303 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9facea03faeb..c4075610502c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index adaedd79389c..162fb800754c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -78,7 +78,7 @@ struct htb_class { /* general class parameters */ struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_htb_xstats xstats; /* our special stats */ int refcnt; /* usage count of this class */ diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d51852bba01c..7c195d972bf0 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -138,7 +138,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ -- cgit v1.2.3-71-gd317 From a0a2af765543a96bdf188ec57dd201e708195302 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 3 Jun 2013 18:10:45 +0200 Subject: nl80211: add kernel-doc for NL80211_FEATURE_ACTIVE_MONITOR Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5920715278c2..1f0019d4cce6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3579,6 +3579,10 @@ enum nl80211_ap_sme_features { * Peering Management entity which may be implemented by registering for * beacons or NL80211_CMD_NEW_PEER_CANDIDATE events. The mesh beacon is * still generated by the driver. + * @NL80211_FEATURE_ACTIVE_MONITOR: This driver supports an active monitor + * interface. An active monitor interface behaves like a normal monitor + * interface, but gets added to the driver. It ensures that incoming + * unicast packets directed at the configured interface address get ACKed. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, -- cgit v1.2.3-71-gd317 From 8e7c053853b7d299e8a2b8733659b0df8eee51f7 Mon Sep 17 00:00:00 2001 From: Colleen Twitty Date: Mon, 3 Jun 2013 09:53:39 -0700 Subject: {nl,cfg}80211: make peer link expiration time configurable If a STA has a peer that it hasn't seen any tx activity from for a certain length of time, the peer link is expired. This means the inactive STA is removed from the list of peers and that STA is not considered a peer again unless it re-peers. Previously, this inactivity time was always 30 minutes. Now, add it to the mesh configuration and allow it to be configured. Retain 30 minutes as a default value. Signed-off-by: Colleen Twitty Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 5 +++++ net/wireless/mesh.c | 2 ++ net/wireless/nl80211.c | 8 +++++++- 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 855c76ccff38..31ca11672ca8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1124,6 +1124,9 @@ struct bss_parameters { * setting for new peer links. * @dot11MeshAwakeWindowDuration: The duration in TUs the STA will remain awake * after transmitting its beacon. + * @plink_timeout: If no tx activity is seen from a STA we've established + * peering with for longer than this time (in seconds), then remove it + * from the STA's list of peers. Default is 30 minutes. */ struct mesh_config { u16 dot11MeshRetryTimeout; @@ -1153,6 +1156,7 @@ struct mesh_config { u16 dot11MeshHWMPconfirmationInterval; enum nl80211_mesh_power_mode power_mode; u16 dot11MeshAwakeWindowDuration; + u32 plink_timeout; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f0019d4cce6..ca6facf4df0c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2577,6 +2577,10 @@ enum nl80211_mesh_power_mode { * * @NL80211_MESHCONF_AWAKE_WINDOW: awake window duration (in TUs) * + * @NL80211_MESHCONF_PLINK_TIMEOUT: If no tx activity is seen from a STA we've + * established peering with for longer than this time (in seconds), then + * remove it from the STA's list of peers. Default is 30 minutes. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -2608,6 +2612,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, NL80211_MESHCONF_POWER_MODE, NL80211_MESHCONF_AWAKE_WINDOW, + NL80211_MESHCONF_PLINK_TIMEOUT, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 5dfb289ab761..0daaf72e1b81 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -18,6 +18,7 @@ #define MESH_PATH_TO_ROOT_TIMEOUT 6000 #define MESH_ROOT_INTERVAL 5000 #define MESH_ROOT_CONFIRMATION_INTERVAL 2000 +#define MESH_DEFAULT_PLINK_TIMEOUT 1800 /* timeout in seconds */ /* * Minimum interval between two consecutive PREQs originated by the same @@ -75,6 +76,7 @@ const struct mesh_config default_mesh_config = { .dot11MeshHWMPconfirmationInterval = MESH_ROOT_CONFIRMATION_INTERVAL, .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, + .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 88e820b73674..8aa83c04d4eb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4575,7 +4575,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_POWER_MODE, cur_params.power_mode) || nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW, - cur_params.dot11MeshAwakeWindowDuration)) + cur_params.dot11MeshAwakeWindowDuration) || + nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, + cur_params.plink_timeout)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -4616,6 +4618,7 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 }, [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 }, [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, + [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -4753,6 +4756,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 1, 0xffffffff, + mask, NL80211_MESHCONF_PLINK_TIMEOUT, + nla_get_u32); if (mask_out) *mask_out = mask; -- cgit v1.2.3-71-gd317 From 0d854a60b1d7d39a37b25dd28f63cfa0df637b91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 7 Feb 2013 10:46:46 +0000 Subject: arm64: KVM: enable initialization of a 32bit vcpu Wire the init of a 32bit vcpu by allowing 32bit modes in pstate, and providing sensible defaults out of reset state. This feature is of course conditioned by the presence of 32bit capability on the physical CPU, and is checked by the KVM_CAP_ARM_EL1_32BIT capability. Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/include/uapi/asm/kvm.h | 1 + arch/arm64/kvm/guest.c | 6 ++++++ arch/arm64/kvm/reset.c | 26 +++++++++++++++++++++++++- include/uapi/linux/kvm.h | 1 + 5 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3f5830b3ca3f..644d73956864 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -34,7 +34,7 @@ #include #include -#define KVM_VCPU_MAX_FEATURES 1 +#define KVM_VCPU_MAX_FEATURES 2 /* We don't currently support large pages. */ #define KVM_HPAGE_GFN_SHIFT(x) 0 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 5b1110c49df5..5031f4263937 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -75,6 +75,7 @@ struct kvm_regs { #define KVM_VGIC_V2_CPU_SIZE 0x2000 #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ +#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ struct kvm_vcpu_init { __u32 target; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3d7518a7ebaa..2c3ff67a8ecb 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -99,6 +99,12 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { u32 mode = (*(u32 *)valp) & COMPAT_PSR_MODE_MASK; switch (mode) { + case COMPAT_PSR_MODE_USR: + case COMPAT_PSR_MODE_FIQ: + case COMPAT_PSR_MODE_IRQ: + case COMPAT_PSR_MODE_SVC: + case COMPAT_PSR_MODE_ABT: + case COMPAT_PSR_MODE_UND: case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 766150ac76ed..70a7816535cd 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -38,16 +38,32 @@ static const struct kvm_regs default_regs_reset = { PSR_F_BIT | PSR_D_BIT), }; +static const struct kvm_regs default_regs_reset32 = { + .regs.pstate = (COMPAT_PSR_MODE_SVC | COMPAT_PSR_A_BIT | + COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), +}; + static const struct kvm_irq_level default_vtimer_irq = { .irq = 27, .level = 1, }; +static bool cpu_has_32bit_el1(void) +{ + u64 pfr0; + + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + return !!(pfr0 & 0x20); +} + int kvm_arch_dev_ioctl_check_extension(long ext) { int r; switch (ext) { + case KVM_CAP_ARM_EL1_32BIT: + r = cpu_has_32bit_el1(); + break; default: r = 0; } @@ -70,7 +86,15 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) switch (vcpu->arch.target) { default: - cpu_reset = &default_regs_reset; + if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { + if (!cpu_has_32bit_el1()) + return -EINVAL; + cpu_reset = &default_regs_reset32; + vcpu->arch.hcr_el2 &= ~HCR_RW; + } else { + cpu_reset = &default_regs_reset; + } + cpu_vtimer_irq = &default_vtimer_irq; break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2d1bcb891468..aac27640bec2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_MPIC 90 #define KVM_CAP_PPC_RTAS 91 #define KVM_CAP_IRQ_XICS 92 +#define KVM_CAP_ARM_EL1_32BIT 93 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From 274038f8c94c493e2977983e2aecb5f5f0778479 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 11 Jun 2013 14:41:24 +0400 Subject: tun: Report "persist" flag to userspace The TUN_PERSIST flag is not reported at all -- both TUNGETIFF, and sysfs "flags" attribute skip one. Knowing whether a device is persistent or not is critical for checkpoint-restore, thus I propose to add the read-only IFF_PERSIST one for this. Setting this new IFF_PERSIST is hardly possible, as TUNSETIFF doesn't check for unknown flags being zero and thus there can be trash. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 +++ include/uapi/linux/if_tun.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 89776c592151..b90a73165d30 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1530,6 +1530,9 @@ static int tun_flags(struct tun_struct *tun) if (tun->flags & TUN_TAP_MQ) flags |= IFF_MULTI_QUEUE; + if (tun->flags & TUN_PERSIST) + flags |= IFF_PERSIST; + return flags; } diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 2835b85fd46d..82334f88967e 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -68,6 +68,8 @@ #define IFF_MULTI_QUEUE 0x0100 #define IFF_ATTACH_QUEUE 0x0200 #define IFF_DETACH_QUEUE 0x0400 +/* read-only flag */ +#define IFF_PERSIST 0x0800 /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ -- cgit v1.2.3-71-gd317 From 9674da8759df0d6c0d24e1ede6e2a1acdef91e3c Mon Sep 17 00:00:00 2001 From: Eric Lapuyade Date: Mon, 29 Apr 2013 17:13:27 +0200 Subject: NFC: Add firmware upload netlink command As several NFC chipsets can have their firmwares upgraded and reflashed, this patchset adds a new netlink command to trigger that the driver loads or flashes a new firmware. This will allows userspace triggered firmware upgrade through netlink. The firmware name or hint is passed as a parameter, and the driver will eventually fetch the firmware binary through the request_firmware API. The cmd can only be executed when the nfc dev is not in use. Actual firmware loading/flashing is an asynchronous operation. Result of the operation shall send a new event up to user space through the nfc dev multicast socket. During operation, the nfc dev is not openable and thus not usable. Signed-off-by: Eric Lapuyade Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 2 ++ include/uapi/linux/nfc.h | 6 +++++ net/nfc/core.c | 46 +++++++++++++++++++++++++++++++++++ net/nfc/netlink.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ net/nfc/nfc.h | 5 ++++ 5 files changed, 122 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 5eb80bb3cbb2..3563dbdcaaf2 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -70,6 +70,7 @@ struct nfc_ops { int (*check_presence)(struct nfc_dev *dev, struct nfc_target *target); int (*enable_se)(struct nfc_dev *dev, u32 secure_element); int (*disable_se)(struct nfc_dev *dev, u32 secure_element); + int (*fw_upload)(struct nfc_dev *dev, const char *firmware_name); }; #define NFC_TARGET_IDX_ANY -1 @@ -104,6 +105,7 @@ struct nfc_dev { int targets_generation; struct device dev; bool dev_up; + bool fw_upload_in_progress; u8 rf_mode; bool polling; struct nfc_target *active_target; diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 7c6f627a717d..b6cbd164f146 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -69,6 +69,8 @@ * starting a poll from a device which has a secure element enabled means * we want to do SE based card emulation. * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element. + * @NFC_CMD_FW_UPLOAD: Request to Load/flash firmware, or event to inform that + * some firmware was loaded */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -92,6 +94,7 @@ enum nfc_commands { NFC_CMD_DISABLE_SE, NFC_CMD_LLC_SDREQ, NFC_EVENT_LLC_SDRES, + NFC_CMD_FW_UPLOAD, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; @@ -121,6 +124,7 @@ enum nfc_commands { * @NFC_ATTR_LLC_PARAM_RW: Receive Window size parameter * @NFC_ATTR_LLC_PARAM_MIUX: MIU eXtension parameter * @NFC_ATTR_SE: Available Secure Elements + * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -143,6 +147,7 @@ enum nfc_attrs { NFC_ATTR_LLC_PARAM_MIUX, NFC_ATTR_SE, NFC_ATTR_LLC_SDP, + NFC_ATTR_FIRMWARE_NAME, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; @@ -162,6 +167,7 @@ enum nfc_sdp_attr { #define NFC_SENSB_RES_MAXSIZE 12 #define NFC_SENSF_RES_MAXSIZE 18 #define NFC_GB_MAXSIZE 48 +#define NFC_FIRMWARE_NAME_MAXSIZE 32 /* NFC protocols */ #define NFC_PROTO_JEWEL 1 diff --git a/net/nfc/core.c b/net/nfc/core.c index 40d2527693da..eb3cecf1764e 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -44,6 +44,47 @@ DEFINE_MUTEX(nfc_devlist_mutex); /* NFC device ID bitmap */ static DEFINE_IDA(nfc_index_ida); +int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name) +{ + int rc = 0; + + pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name); + + device_lock(&dev->dev); + + if (!device_is_registered(&dev->dev)) { + rc = -ENODEV; + goto error; + } + + if (dev->dev_up) { + rc = -EBUSY; + goto error; + } + + if (!dev->ops->fw_upload) { + rc = -EOPNOTSUPP; + goto error; + } + + dev->fw_upload_in_progress = true; + rc = dev->ops->fw_upload(dev, firmware_name); + if (rc) + dev->fw_upload_in_progress = false; + +error: + device_unlock(&dev->dev); + return rc; +} + +int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name) +{ + dev->fw_upload_in_progress = false; + + return nfc_genl_fw_upload_done(dev, firmware_name); +} +EXPORT_SYMBOL(nfc_fw_upload_done); + /** * nfc_dev_up - turn on the NFC device * @@ -69,6 +110,11 @@ int nfc_dev_up(struct nfc_dev *dev) goto error; } + if (dev->fw_upload_in_progress) { + rc = -EBUSY; + goto error; + } + if (dev->dev_up) { rc = -EALREADY; goto error; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f0c4d61f37c0..1deadad9a285 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -56,6 +56,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, + [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, + .len = NFC_FIRMWARE_NAME_MAXSIZE }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { @@ -1025,6 +1027,62 @@ exit: return rc; } +static int nfc_genl_fw_upload(struct sk_buff *skb, struct genl_info *info) +{ + struct nfc_dev *dev; + int rc; + u32 idx; + char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + return -EINVAL; + + idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(idx); + if (!dev) + return -ENODEV; + + nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], + sizeof(firmware_name)); + + rc = nfc_fw_upload(dev, firmware_name); + + nfc_put_device(dev); + return rc; +} + +int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, + NFC_CMD_FW_UPLOAD); + if (!hdr) + goto free_msg; + + if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || + nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + static struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1084,6 +1142,11 @@ static struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_llc_sdreq, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_FW_UPLOAD, + .doit = nfc_genl_fw_upload, + .policy = nfc_genl_policy, + }, }; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index afa1f84ba040..cf0c48165996 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -120,6 +120,11 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter) class_dev_iter_exit(iter); } +int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name); +int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name); + +int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name); + int nfc_dev_up(struct nfc_dev *dev); int nfc_dev_down(struct nfc_dev *dev); -- cgit v1.2.3-71-gd317 From 1d8faf48c74b8329a0322dc4b2a2030ae5003c86 Mon Sep 17 00:00:00 2001 From: Rony Efraim Date: Thu, 13 Jun 2013 13:19:10 +0300 Subject: net/core: Add VF link state control Add netlink directives and ndo entry to allow for controling VF link, which can be in one of three states: Auto - VF link state reflects the PF link state (default) Up - VF link state is up, traffic from VF to VF works even if the actual PF link is down Down - VF link state is down, no traffic from/to this VF, can be of use while configuring the VF Signed-off-by: Rony Efraim Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 3 +++ include/uapi/linux/if_link.h | 13 +++++++++++++ net/core/rtnetlink.c | 22 ++++++++++++++++++++-- 4 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index c3f817c3eb45..a86784dec3d3 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -12,5 +12,6 @@ struct ifla_vf_info { __u32 qos; __u32 tx_rate; __u32 spoofchk; + __u32 linkstate; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8c9fcc42502a..09b4188c1ea7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -829,6 +829,7 @@ struct netdev_fcoe_hbainfo { * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); + * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); @@ -986,6 +987,8 @@ struct net_device_ops { int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); + int (*ndo_set_vf_link_state)(struct net_device *dev, + int vf, int link_state); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index da05a2698cb5..03f6170ab337 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -338,6 +338,7 @@ enum { IFLA_VF_VLAN, IFLA_VF_TX_RATE, /* TX Bandwidth Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ __IFLA_VF_MAX, }; @@ -364,6 +365,18 @@ struct ifla_vf_spoofchk { __u32 setting; }; +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + /* VF ports management section * * Nested layout of set/get msg is: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49c14451d8ab..9007533867f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -947,6 +947,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_vlan vf_vlan; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_link_state vf_linkstate; /* * Not all SR-IOV capable drivers support the @@ -956,18 +957,24 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, */ ivi.spoofchk = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = - vf_spoofchk.vf = ivi.vf; + vf_spoofchk.vf = + vf_linkstate.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_tx_rate.rate = ivi.tx_rate; vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -978,7 +985,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk)) + &vf_spoofchk) || + nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate)) goto nla_put_failure; nla_nest_end(skb, vf); } @@ -1238,6 +1247,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivs->setting); break; } + case IFLA_VF_LINK_STATE: { + struct ifla_vf_link_state *ivl; + ivl = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + break; + } default: err = -EINVAL; break; -- cgit v1.2.3-71-gd317 From 322bce957e9b0e30ef7147dae0414ad8f3f558c8 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Mon, 27 May 2013 15:29:11 +0200 Subject: NFC: pn533: Copy NFCID2 through ATR_REQ When using NFC-F we should copy the NFCID2 buffer that we got from SENSF_RES through the ATR_REQ NFCID3 buffer. Not doing so violates NFC Forum digital requirement #189. Signed-off-by: Samuel Ortiz --- drivers/nfc/pn533.c | 14 +++++++++++++- include/net/nfc/nfc.h | 2 ++ include/uapi/linux/nfc.h | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/nfc/pn533.c b/drivers/nfc/pn533.c index 6bd4f598b3e1..e196bdfcfc30 100644 --- a/drivers/nfc/pn533.c +++ b/drivers/nfc/pn533.c @@ -1235,7 +1235,7 @@ static int pn533_target_found_type_a(struct nfc_target *nfc_tgt, u8 *tgt_data, struct pn533_target_felica { u8 pol_res; u8 opcode; - u8 nfcid2[8]; + u8 nfcid2[NFC_NFCID2_MAXSIZE]; u8 pad[8]; /* optional */ u8 syst_code[]; @@ -1275,6 +1275,9 @@ static int pn533_target_found_felica(struct nfc_target *nfc_tgt, u8 *tgt_data, memcpy(nfc_tgt->sensf_res, &tgt_felica->opcode, 9); nfc_tgt->sensf_res_len = 9; + memcpy(nfc_tgt->nfcid2, tgt_felica->nfcid2, NFC_NFCID2_MAXSIZE); + nfc_tgt->nfcid2_len = NFC_NFCID2_MAXSIZE; + return 0; } @@ -2084,6 +2087,9 @@ static int pn533_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target, if (comm_mode == NFC_COMM_PASSIVE) skb_len += PASSIVE_DATA_LEN; + if (target && target->nfcid2_len) + skb_len += NFC_NFCID3_MAXSIZE; + skb = pn533_alloc_skb(dev, skb_len); if (!skb) return -ENOMEM; @@ -2100,6 +2106,12 @@ static int pn533_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target, *next |= 1; } + if (target && target->nfcid2_len) { + memcpy(skb_put(skb, NFC_NFCID3_MAXSIZE), target->nfcid2, + target->nfcid2_len); + *next |= 2; + } + if (gb != NULL && gb_len > 0) { memcpy(skb_put(skb, gb_len), gb, gb_len); *next |= 4; /* We have some Gi */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 3563dbdcaaf2..8fc1784a264d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -84,6 +84,8 @@ struct nfc_target { u8 sel_res; u8 nfcid1_len; u8 nfcid1[NFC_NFCID1_MAXSIZE]; + u8 nfcid2_len; + u8 nfcid2[NFC_NFCID2_MAXSIZE]; u8 sensb_res_len; u8 sensb_res[NFC_SENSB_RES_MAXSIZE]; u8 sensf_res_len; diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index b6cbd164f146..fb304fb774cc 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -164,6 +164,8 @@ enum nfc_sdp_attr { #define NFC_DEVICE_NAME_MAXSIZE 8 #define NFC_NFCID1_MAXSIZE 10 +#define NFC_NFCID2_MAXSIZE 8 +#define NFC_NFCID3_MAXSIZE 10 #define NFC_SENSB_RES_MAXSIZE 12 #define NFC_SENSF_RES_MAXSIZE 18 #define NFC_GB_MAXSIZE 48 -- cgit v1.2.3-71-gd317 From fed7c25ec0d4894edfc36bbe5c5231e52f45483a Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Fri, 10 May 2013 15:28:38 +0200 Subject: NFC: Add secure elements addition and removal API This API will allow NFC drivers to add and remove the secure elements they know about or detect. Typically this should be called (asynchronously or not) from the driver or the host interface stack detect_se hook. Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 22 +++++++++++++++++++++- include/uapi/linux/nfc.h | 4 +++- net/nfc/core.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 5187ec70b66a..0e353f1658bb 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -97,6 +97,23 @@ struct nfc_target { u8 logical_idx; }; +/** + * nfc_se - A structure for NFC accessible secure elements. + * + * @idx: The secure element index. User space will enable or + * disable a secure element by its index. + * @type: The secure element type. It can be SE_UICC or + * SE_EMBEDDED. + * @state: The secure element state, either enabled or disabled. + * + */ +struct nfc_se { + struct list_head list; + u32 idx; + u16 type; + u16 state; +}; + struct nfc_genl_data { u32 poll_req_portid; struct mutex genl_data_mutex; @@ -118,7 +135,7 @@ struct nfc_dev { struct nfc_genl_data genl_data; u32 supported_protocols; - u32 active_se; + struct list_head secure_elements; int tx_headroom; int tx_tailroom; @@ -221,4 +238,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb); void nfc_driver_failure(struct nfc_dev *dev, int err); +int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type); +int nfc_remove_se(struct nfc_dev *dev, u32 se_idx); + #endif /* __NET_NFC_H */ diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index fb304fb774cc..3a57cef0b986 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -199,10 +199,12 @@ enum nfc_sdp_attr { #define NFC_PROTO_ISO14443_B_MASK (1 << NFC_PROTO_ISO14443_B) /* NFC Secure Elements */ -#define NFC_SE_NONE 0x0 #define NFC_SE_UICC 0x1 #define NFC_SE_EMBEDDED 0x2 +#define NFC_SE_DISABLED 0x0 +#define NFC_SE_ENABLED 0x1 + struct sockaddr_nfc { sa_family_t sa_family; __u32 dev_idx; diff --git a/net/nfc/core.c b/net/nfc/core.c index a43a56d7f4be..dacadfbcacea 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -760,6 +760,49 @@ inline void nfc_driver_failure(struct nfc_dev *dev, int err) } EXPORT_SYMBOL(nfc_driver_failure); +int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) +{ + struct nfc_se *se, *n; + + pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); + + list_for_each_entry_safe(se, n, &dev->secure_elements, list) + if (se->idx == se_idx) + return -EALREADY; + + se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL); + if (!se) + return -ENOMEM; + + se->idx = se_idx; + se->type = type; + se->state = NFC_SE_DISABLED; + INIT_LIST_HEAD(&se->list); + + list_add(&se->list, &dev->secure_elements); + + return 0; +} +EXPORT_SYMBOL(nfc_add_se); + +int nfc_remove_se(struct nfc_dev *dev, u32 se_idx) +{ + struct nfc_se *se, *n; + + pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); + + list_for_each_entry_safe(se, n, &dev->secure_elements, list) + if (se->idx == se_idx) { + list_del(&se->list); + kfree(se); + + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL(nfc_remove_se); + static void nfc_release(struct device *d) { struct nfc_dev *dev = to_nfc_dev(d); @@ -856,9 +899,9 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->ops = ops; dev->supported_protocols = supported_protocols; - dev->active_se = NFC_SE_NONE; dev->tx_headroom = tx_headroom; dev->tx_tailroom = tx_tailroom; + INIT_LIST_HEAD(&dev->secure_elements); nfc_genl_data_init(&dev->genl_data); -- cgit v1.2.3-71-gd317 From 2757c3723c3d2b13e3a8bfaa034826f64e9cca43 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Fri, 10 May 2013 15:47:37 +0200 Subject: NFC: Send netlink events for secure elements additions and removals When an NFC driver or host controller stack discovers a secure element, it will call nfc_add_se(). In order for userspace applications to use these secure elements, a netlink event will then be sent with the SE index and its type. With that information userspace applications can decide wether or not to enable SEs, through their indexes. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 6 +++++ net/nfc/core.c | 14 +++++++++++ net/nfc/netlink.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ net/nfc/nfc.h | 3 +++ 4 files changed, 86 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 3a57cef0b986..caed0f324d5f 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -95,6 +95,8 @@ enum nfc_commands { NFC_CMD_LLC_SDREQ, NFC_EVENT_LLC_SDRES, NFC_CMD_FW_UPLOAD, + NFC_EVENT_SE_ADDED, + NFC_EVENT_SE_REMOVED, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; @@ -125,6 +127,8 @@ enum nfc_commands { * @NFC_ATTR_LLC_PARAM_MIUX: MIU eXtension parameter * @NFC_ATTR_SE: Available Secure Elements * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version + * @NFC_ATTR_SE_INDEX: Secure element index + * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED) */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -148,6 +152,8 @@ enum nfc_attrs { NFC_ATTR_SE, NFC_ATTR_LLC_SDP, NFC_ATTR_FIRMWARE_NAME, + NFC_ATTR_SE_INDEX, + NFC_ATTR_SE_TYPE, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; diff --git a/net/nfc/core.c b/net/nfc/core.c index dacadfbcacea..bb5f16cfc201 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -763,6 +763,7 @@ EXPORT_SYMBOL(nfc_driver_failure); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) { struct nfc_se *se, *n; + int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); @@ -781,6 +782,14 @@ int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) list_add(&se->list, &dev->secure_elements); + rc = nfc_genl_se_added(dev, se_idx, type); + if (rc < 0) { + list_del(&se->list); + kfree(se); + + return rc; + } + return 0; } EXPORT_SYMBOL(nfc_add_se); @@ -788,11 +797,16 @@ EXPORT_SYMBOL(nfc_add_se); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se, *n; + int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); list_for_each_entry_safe(se, n, &dev->secure_elements, list) if (se->idx == se_idx) { + rc = nfc_genl_se_removed(dev, se_idx); + if (rc < 0) + return rc; + list_del(&se->list); kfree(se); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index fdbc662c564a..8a11a3a27e69 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -426,6 +426,69 @@ free_msg: return rc; } +int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, + NFC_EVENT_SE_ADDED); + if (!hdr) + goto free_msg; + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || + nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + +int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, + NFC_EVENT_SE_REMOVED); + if (!hdr) + goto free_msg; + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(msg, 0, nfc_genl_event_mcgrp.id, GFP_KERNEL); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; +} + static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index cf0c48165996..a6aeee094aa4 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -94,6 +94,9 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev); int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list); +int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type); +int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx); + struct nfc_dev *nfc_get_device(unsigned int idx); static inline void nfc_put_device(struct nfc_dev *dev) -- cgit v1.2.3-71-gd317 From 57254b6ebce4ceca02d9c8b615f6059c56c19238 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 6 May 2013 19:14:17 +0000 Subject: Btrfs: add ioctl to wait for qgroup rescan completion btrfs_qgroup_wait_for_completion waits until the currently running qgroup operation completes. It returns immediately when no rescan process is in progress. This is useful to automate things around the rescan process (e.g. testing). Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/ioctl.c | 12 ++++++++++++ fs/btrfs/qgroup.c | 21 +++++++++++++++++++++ include/uapi/linux/btrfs.h | 1 + 4 files changed, 36 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a365400e38da..e36e97b473a8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1613,6 +1613,7 @@ struct btrfs_fs_info { struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; struct btrfs_workers qgroup_rescan_workers; + struct completion qgroup_rescan_completion; /* filesystem state */ unsigned long fs_state; @@ -3820,6 +3821,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f81d67cdc8d..1e0dda1feefe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3937,6 +3937,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) return ret; } +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { @@ -4179,6 +4189,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); case BTRFS_IOC_GET_FSLABEL: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7f38cce2528d..d059d86c3131 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2070,6 +2070,8 @@ out: } else { pr_err("btrfs: qgroup scan failed with %d\n", err); } + + complete_all(&fs_info->qgroup_rescan_completion); } static void @@ -2110,6 +2112,7 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + init_completion(&fs_info->qgroup_rescan_completion); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { @@ -2126,3 +2129,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ef0df545a2a..5b683b5f63cd 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -530,6 +530,7 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_quota_rescan_args) #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \ struct btrfs_ioctl_quota_rescan_args) +#define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46) #define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \ char[BTRFS_LABEL_SIZE]) #define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \ -- cgit v1.2.3-71-gd317 From 183860f6a0646b876645ecce0553a7ef2dd71254 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 17 May 2013 10:52:45 +0000 Subject: btrfs: device delete to get errors from the kernel when user runs command btrfs dev del the raid requisite error if any goes to the /var/log/messages, its not good idea to clutter messages with these user (knowledge) errors, further user don't have to review the system messages to know problem with the cli it should be dropped to the user as part of the cli return. to bring this feature created a set of the ERROR defined BTRFS_ERROR_DEV* error codes and created their error string. I expect this enum to be added with other error which we might want to communicate to the user land v3: moved the code with in the file no logical change v1->v2: introduce error codes for the device mgmt usage v1: adds a parameter in the ioctl arg struct to carry the error string Signed-off-by: Anand Jain Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 22 +++++++++++----------- fs/btrfs/volumes.c | 26 +++++++------------------- include/uapi/linux/btrfs.h | 41 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1f50fe2e62c7..015689a158a4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2360,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - - mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2375,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name); - kfree(vol_args); -out: + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + +out: + kfree(vol_args); mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 440de708f9eb..b1446c0da154 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1462,31 +1462,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) btrfs_dev_replace_unlock(&root->fs_info->dev_replace); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && root->fs_info->fs_devices->rw_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid5\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && root->fs_info->fs_devices->rw_devices <= 3) { - printk(KERN_ERR "btrfs: unable to go below three " - "devices on raid6\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; goto out; } @@ -1512,8 +1504,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bh = NULL; disk_super = NULL; if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; goto out; } } else { @@ -1535,15 +1526,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (device->is_tgtdev_for_dev_replace) { - pr_err("btrfs: unable to remove the dev_replace target dev\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto error_brelse; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto error_brelse; } diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5b683b5f63cd..05aed70627e2 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -447,6 +447,46 @@ struct btrfs_ioctl_send_args { __u64 reserved[4]; /* in */ }; +/* Error codes as returned by the kernel */ +enum btrfs_err_code { + notused, + BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, + BTRFS_ERROR_DEV_TGT_REPLACE, + BTRFS_ERROR_DEV_MISSING_NOT_FOUND, + BTRFS_ERROR_DEV_ONLY_WRITABLE, + BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS +}; +/* An error code to error string mapping for the kernel +* error codes +*/ +static inline char *btrfs_err_str(enum btrfs_err_code err_code) +{ + switch (err_code) { + case BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET: + return "unable to go below two devices on raid1"; + case BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET: + return "unable to go below four devices on raid10"; + case BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET: + return "unable to go below two devices on raid5"; + case BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET: + return "unable to go below three devices on raid6"; + case BTRFS_ERROR_DEV_TGT_REPLACE: + return "unable to remove the dev_replace target dev"; + case BTRFS_ERROR_DEV_MISSING_NOT_FOUND: + return "no missing devices found to remove"; + case BTRFS_ERROR_DEV_ONLY_WRITABLE: + return "unable to remove the only writeable device"; + case BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS: + return "add/delete/balance/replace/resize operation "\ + "in progress"; + default: + return NULL; + } +} + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -539,5 +579,4 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ struct btrfs_ioctl_dev_replace_args) - #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3-71-gd317 From 45bfa52e36ef016b789eca73c7847db3f7b4742d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 12 Jun 2013 15:57:10 -0700 Subject: openvswitch: Fix struct comment. Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross --- include/uapi/linux/openvswitch.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 405918dd7b3f..424672db7f12 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -192,7 +192,6 @@ enum ovs_vport_type { * optional; if not specified a free port number is automatically selected. * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type * of vport. - * and other attributes are ignored. * * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to * look up the vport to operate on; otherwise dp_idx from the &struct -- cgit v1.2.3-71-gd317 From f303b364b41d3fc5bf879799128958400b7859aa Mon Sep 17 00:00:00 2001 From: Ulrich Hecht Date: Fri, 31 May 2013 17:57:01 +0200 Subject: serial: sh-sci: HSCIF support Adds support for "High Speed Serial Communications Interface with FIFO", essentially a SCIF with 128-byte FIFOs and more accurate baud rate generator. Signed-off-by: Ulrich Hecht Acked-by: Paul Mundt Acked-by: Greg Kroah-Hartman Signed-off-by: Simon Horman --- drivers/tty/serial/sh-sci.c | 102 ++++++++++++++++++++++++++++++++++++--- include/linux/serial_sci.h | 12 +++-- include/uapi/linux/serial_core.h | 3 ++ 3 files changed, 106 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 156418619949..931d6c3a792c 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -146,6 +146,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, /* @@ -165,6 +166,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, /* @@ -183,6 +185,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, /* @@ -201,6 +204,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = { 0x3c, 16 }, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, /* @@ -220,6 +224,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = { 0x20, 16 }, [SCLSR] = { 0x24, 16 }, + [HSSRR] = sci_reg_invalid, }, /* @@ -238,6 +243,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, /* @@ -256,6 +262,26 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = { 0x20, 16 }, [SCLSR] = { 0x24, 16 }, + [HSSRR] = sci_reg_invalid, + }, + + /* + * Common HSCIF definitions. + */ + [SCIx_HSCIF_REGTYPE] = { + [SCSMR] = { 0x00, 16 }, + [SCBRR] = { 0x04, 8 }, + [SCSCR] = { 0x08, 16 }, + [SCxTDR] = { 0x0c, 8 }, + [SCxSR] = { 0x10, 16 }, + [SCxRDR] = { 0x14, 8 }, + [SCFCR] = { 0x18, 16 }, + [SCFDR] = { 0x1c, 16 }, + [SCTFDR] = sci_reg_invalid, + [SCRFDR] = sci_reg_invalid, + [SCSPTR] = { 0x20, 16 }, + [SCLSR] = { 0x24, 16 }, + [HSSRR] = { 0x40, 16 }, }, /* @@ -275,6 +301,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = { 0x24, 16 }, + [HSSRR] = sci_reg_invalid, }, /* @@ -294,6 +321,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = { 0x20, 16 }, [SCSPTR] = { 0x24, 16 }, [SCLSR] = { 0x28, 16 }, + [HSSRR] = sci_reg_invalid, }, /* @@ -313,6 +341,7 @@ static struct plat_sci_reg sci_regmap[SCIx_NR_REGTYPES][SCIx_NR_REGS] = { [SCRFDR] = sci_reg_invalid, [SCSPTR] = sci_reg_invalid, [SCLSR] = sci_reg_invalid, + [HSSRR] = sci_reg_invalid, }, }; @@ -374,6 +403,9 @@ static int sci_probe_regmap(struct plat_sci_port *cfg) */ cfg->regtype = SCIx_SH4_SCIF_REGTYPE; break; + case PORT_HSCIF: + cfg->regtype = SCIx_HSCIF_REGTYPE; + break; default: printk(KERN_ERR "Can't probe register map for given port\n"); return -EINVAL; @@ -1798,6 +1830,42 @@ static unsigned int sci_scbrr_calc(unsigned int algo_id, unsigned int bps, return ((freq + 16 * bps) / (32 * bps) - 1); } +/* calculate sample rate, BRR, and clock select for HSCIF */ +static void sci_baud_calc_hscif(unsigned int bps, unsigned long freq, + int *brr, unsigned int *srr, + unsigned int *cks) +{ + int sr, c, br, err; + int min_err = 1000; /* 100% */ + + /* Find the combination of sample rate and clock select with the + smallest deviation from the desired baud rate. */ + for (sr = 8; sr <= 32; sr++) { + for (c = 0; c <= 3; c++) { + /* integerized formulas from HSCIF documentation */ + br = freq / (sr * (1 << (2 * c + 1)) * bps) - 1; + if (br < 0 || br > 255) + continue; + err = freq / ((br + 1) * bps * sr * + (1 << (2 * c + 1)) / 1000) - 1000; + if (min_err > err) { + min_err = err; + *brr = br; + *srr = sr - 1; + *cks = c; + } + } + } + + if (min_err == 1000) { + WARN_ON(1); + /* use defaults */ + *brr = 255; + *srr = 15; + *cks = 0; + } +} + static void sci_reset(struct uart_port *port) { struct plat_sci_reg *reg; @@ -1821,6 +1889,7 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios, struct plat_sci_reg *reg; unsigned int baud, smr_val, max_baud, cks; int t = -1; + unsigned int srr; /* * earlyprintk comes here early on with port->uartclk set to zero. @@ -1833,8 +1902,17 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios, max_baud = port->uartclk ? port->uartclk / 16 : 115200; baud = uart_get_baud_rate(port, termios, old, 0, max_baud); - if (likely(baud && port->uartclk)) - t = sci_scbrr_calc(s->cfg->scbrr_algo_id, baud, port->uartclk); + if (likely(baud && port->uartclk)) { + if (s->cfg->scbrr_algo_id == SCBRR_ALGO_6) { + sci_baud_calc_hscif(baud, port->uartclk, &t, &srr, + &cks); + } else { + t = sci_scbrr_calc(s->cfg->scbrr_algo_id, baud, + port->uartclk); + for (cks = 0; t >= 256 && cks <= 3; cks++) + t >>= 2; + } + } sci_port_enable(s); @@ -1853,15 +1931,15 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios, uart_update_timeout(port, termios->c_cflag, baud); - for (cks = 0; t >= 256 && cks <= 3; cks++) - t >>= 2; - dev_dbg(port->dev, "%s: SMR %x, cks %x, t %x, SCSCR %x\n", __func__, smr_val, cks, t, s->cfg->scscr); if (t >= 0) { serial_port_out(port, SCSMR, (smr_val & ~3) | cks); serial_port_out(port, SCBRR, t); + reg = sci_getreg(port, HSSRR); + if (reg->size) + serial_port_out(port, HSSRR, srr | HSCIF_SRE); udelay((1000000+(baud-1)) / baud); /* Wait one bit interval */ } else serial_port_out(port, SCSMR, smr_val); @@ -1947,6 +2025,8 @@ static const char *sci_type(struct uart_port *port) return "scifa"; case PORT_SCIFB: return "scifb"; + case PORT_HSCIF: + return "hscif"; } return NULL; @@ -1960,7 +2040,10 @@ static inline unsigned long sci_port_size(struct uart_port *port) * from platform resource data at such a time that ports begin to * behave more erratically. */ - return 64; + if (port->type == PORT_HSCIF) + return 96; + else + return 64; } static int sci_remap_port(struct uart_port *port) @@ -2085,6 +2168,9 @@ static int sci_init_single(struct platform_device *dev, case PORT_SCIFB: port->fifosize = 256; break; + case PORT_HSCIF: + port->fifosize = 128; + break; case PORT_SCIFA: port->fifosize = 64; break; @@ -2325,7 +2411,7 @@ static inline int sci_probe_earlyprintk(struct platform_device *pdev) #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */ static char banner[] __initdata = - KERN_INFO "SuperH SCI(F) driver initialized\n"; + KERN_INFO "SuperH (H)SCI(F) driver initialized\n"; static struct uart_driver sci_uart_driver = { .owner = THIS_MODULE, @@ -2484,4 +2570,4 @@ module_exit(sci_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:sh-sci"); MODULE_AUTHOR("Paul Mundt"); -MODULE_DESCRIPTION("SuperH SCI(F) serial driver"); +MODULE_DESCRIPTION("SuperH (H)SCI(F) serial driver"); diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index eb763adf9815..d34049712a4d 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -5,7 +5,7 @@ #include /* - * Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts) + * Generic header for SuperH (H)SCI(F) (used by sh/sh64/h8300 and related parts) */ #define SCIx_NOT_SUPPORTED (-1) @@ -16,6 +16,7 @@ enum { SCBRR_ALGO_3, /* (((clk * 2) + 16 * bps) / (16 * bps) - 1) */ SCBRR_ALGO_4, /* (((clk * 2) + 16 * bps) / (32 * bps) - 1) */ SCBRR_ALGO_5, /* (((clk * 1000 / 32) / bps) - 1) */ + SCBRR_ALGO_6, /* HSCIF variable sample rate algorithm */ }; #define SCSCR_TIE (1 << 7) @@ -37,7 +38,7 @@ enum { #define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER) -/* SCxSR SCIF */ +/* SCxSR SCIF, HSCIF */ #define SCIF_ER 0x0080 #define SCIF_TEND 0x0040 #define SCIF_TDFE 0x0020 @@ -55,6 +56,9 @@ enum { #define SCSPTR_SPB2IO (1 << 1) #define SCSPTR_SPB2DT (1 << 0) +/* HSSRR HSCIF */ +#define HSCIF_SRE 0x8000 + /* Offsets into the sci_port->irqs array */ enum { SCIx_ERI_IRQ, @@ -90,6 +94,7 @@ enum { SCIx_SH4_SCIF_NO_SCSPTR_REGTYPE, SCIx_SH4_SCIF_FIFODATA_REGTYPE, SCIx_SH7705_SCIF_REGTYPE, + SCIx_HSCIF_REGTYPE, SCIx_NR_REGTYPES, }; @@ -115,6 +120,7 @@ enum { SCSMR, SCBRR, SCSCR, SCxSR, SCFCR, SCFDR, SCxTDR, SCxRDR, SCLSR, SCTFDR, SCRFDR, SCSPTR, + HSSRR, SCIx_NR_REGS, }; @@ -137,7 +143,7 @@ struct plat_sci_port { unsigned long mapbase; /* resource base */ unsigned int irqs[SCIx_NR_IRQS]; /* ERI, RXI, TXI, BRI */ unsigned int gpios[SCIx_NR_FNS]; /* SCK, RXD, TXD, CTS, RTS */ - unsigned int type; /* SCI / SCIF / IRDA */ + unsigned int type; /* SCI / SCIF / IRDA / HSCIF */ upf_t flags; /* UPF_* flags */ unsigned long capabilities; /* Port features/capabilities */ diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 74c2bf7211f8..26eee07eeb24 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -226,4 +226,7 @@ /* Rocketport EXPRESS/INFINITY */ #define PORT_RP2 102 +/* SH-SCI */ +#define PORT_HSCIF 103 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3-71-gd317 From 8941bbcd572a8860ad03c76e2f3d1dafa820b842 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 17 Jun 2013 10:54:36 -0400 Subject: tipc: update code comments to reflect new uapi header path Files tipc.h and tipc_config.h were moved to uapi directory, but the corresponding comments were not updated at the same time. Signed-off-by: Ying Xue Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 2 +- include/uapi/linux/tipc_config.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index f2d90091cc20..852373d27dbb 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -1,5 +1,5 @@ /* - * include/linux/tipc.h: Include file for TIPC socket interface + * include/uapi/linux/tipc.h: Header for TIPC socket interface * * Copyright (c) 2003-2006, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 0b1e3f218a36..6b0bff09b3a7 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -1,5 +1,5 @@ /* - * include/linux/tipc_config.h: Include file for TIPC configuration interface + * include/uapi/linux/tipc_config.h: Header for TIPC configuration interface * * Copyright (c) 2003-2006, Ericsson AB * Copyright (c) 2005-2007, 2010-2011, Wind River Systems -- cgit v1.2.3-71-gd317 From 2f301ab29e4656af824592363039d8f6bd5a9f68 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 16 May 2013 13:00:28 +0200 Subject: nl80211/cfg80211: add 5 and 10 MHz defines and wiphy flag Add defines for 5 and 10 MHz channel width and fix channel handling functions accordingly. Also check for and report the WIPHY_FLAG_SUPPORTS_5_10_MHZ capability. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer [fix spelling in comment] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/chan.c | 57 +++++++++++++++++++++++++++++++++++++------- net/wireless/nl80211.c | 21 ++++++++++++---- 4 files changed, 72 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6a43c34ce96f..316f34bb8e6e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2342,6 +2342,7 @@ struct cfg80211_ops { * responds to probe-requests in hardware. * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX. * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call. + * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. */ enum wiphy_flags { WIPHY_FLAG_CUSTOM_REGULATORY = BIT(0), @@ -2365,6 +2366,7 @@ enum wiphy_flags { WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD = BIT(19), WIPHY_FLAG_OFFCHAN_TX = BIT(20), WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL = BIT(21), + WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ca6facf4df0c..861e5eba3953 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2758,6 +2758,8 @@ enum nl80211_channel_type { * and %NL80211_ATTR_CENTER_FREQ2 attributes must be provided as well * @NL80211_CHAN_WIDTH_160: 160 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 * attribute must be provided as well + * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel + * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -2766,6 +2768,8 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_80, NL80211_CHAN_WIDTH_80P80, NL80211_CHAN_WIDTH_160, + NL80211_CHAN_WIDTH_5, + NL80211_CHAN_WIDTH_10, }; /** diff --git a/net/wireless/chan.c b/net/wireless/chan.c index fd556ac05fdb..50f6195c8b70 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -54,6 +54,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: if (chandef->center_freq1 != control_freq) @@ -152,6 +154,12 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_5: + width = 5; + break; + case NL80211_CHAN_WIDTH_10: + width = 10; + break; case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: width = 20; @@ -194,6 +202,16 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1, if (c1->width == c2->width) return NULL; + /* + * can't be compatible if one of them is 5 or 10 MHz, + * but they don't have the same width. + */ + if (c1->width == NL80211_CHAN_WIDTH_5 || + c1->width == NL80211_CHAN_WIDTH_10 || + c2->width == NL80211_CHAN_WIDTH_5 || + c2->width == NL80211_CHAN_WIDTH_10) + return NULL; + if (c1->width == NL80211_CHAN_WIDTH_20_NOHT || c1->width == NL80211_CHAN_WIDTH_20) return c2; @@ -264,11 +282,17 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, u32 bandwidth) { struct ieee80211_channel *c; - u32 freq; + u32 freq, start_freq, end_freq; + + if (bandwidth <= 20) { + start_freq = center_freq; + end_freq = center_freq; + } else { + start_freq = center_freq - bandwidth/2 + 10; + end_freq = center_freq + bandwidth/2 - 10; + } - for (freq = center_freq - bandwidth/2 + 10; - freq <= center_freq + bandwidth/2 - 10; - freq += 20) { + for (freq = start_freq; freq <= end_freq; freq += 20) { c = ieee80211_get_channel(wiphy, freq); if (!c) return -EINVAL; @@ -310,11 +334,17 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, u32 prohibited_flags) { struct ieee80211_channel *c; - u32 freq; + u32 freq, start_freq, end_freq; + + if (bandwidth <= 20) { + start_freq = center_freq; + end_freq = center_freq; + } else { + start_freq = center_freq - bandwidth/2 + 10; + end_freq = center_freq + bandwidth/2 - 10; + } - for (freq = center_freq - bandwidth/2 + 10; - freq <= center_freq + bandwidth/2 - 10; - freq += 20) { + for (freq = start_freq; freq <= end_freq; freq += 20) { c = ieee80211_get_channel(wiphy, freq); if (!c) return false; @@ -349,6 +379,12 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + width = 5; + break; + case NL80211_CHAN_WIDTH_10: + width = 10; + break; case NL80211_CHAN_WIDTH_20: if (!ht_cap->ht_supported) return false; @@ -405,6 +441,11 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, if (width > 20) prohibited_flags |= IEEE80211_CHAN_NO_OFDM; + /* 5 and 10 MHz are only defined for the OFDM PHY */ + if (width < 20) + prohibited_flags |= IEEE80211_CHAN_NO_OFDM; + + if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1, width, prohibited_flags)) return false; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1c4f7daea6c7..4ab1ffa9df11 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1188,6 +1188,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if ((dev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) && nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP)) goto nla_put_failure; + if ((dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) && + nla_put_flag(msg, WIPHY_FLAG_SUPPORTS_5_10_MHZ)) + goto nla_put_failure; (*split_start)++; if (split) @@ -1731,6 +1734,11 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, IEEE80211_CHAN_DISABLED)) return -EINVAL; + if ((chandef->width == NL80211_CHAN_WIDTH_5 || + chandef->width == NL80211_CHAN_WIDTH_10) && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) + return -EINVAL; + return 0; } @@ -6280,11 +6288,16 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef)) return -EINVAL; - if (ibss.chandef.width > NL80211_CHAN_WIDTH_40) - return -EINVAL; - if (ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT && - !(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS)) + switch (ibss.chandef.width) { + case NL80211_CHAN_WIDTH_20_NOHT: + break; + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + if (rdev->wiphy.features & NL80211_FEATURE_HT_IBSS) + break; + default: return -EINVAL; + } ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED]; ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; -- cgit v1.2.3-71-gd317 From 135c5612c460f89657c4698fe2ea753f6f667963 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 17 Jun 2013 17:36:51 -0700 Subject: perf/x86/intel: Support Haswell/v4 LBR format Haswell has two additional LBR from flags for TSX: in_tx and abort_tx, implemented as a new "v4" version of the LBR format. Handle those in and adjust the sign extension code to still correctly extend. The flags are exported similarly in the LBR record to the existing misprediction flag Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1371515812-9646-6-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 56 +++++++++++++++++++++++++++--- include/linux/perf_event.h | 7 +++- include/uapi/linux/perf_event.h | 5 ++- 3 files changed, 61 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index de341d4ec92a..d5be06a5005e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -12,6 +12,16 @@ enum { LBR_FORMAT_LIP = 0x01, LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, }; /* @@ -56,6 +66,8 @@ enum { LBR_FAR) #define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) #define for_each_branch_sample_type(x) \ for ((x) = PERF_SAMPLE_BRANCH_USER; \ @@ -81,9 +93,13 @@ enum { X86_BR_JMP = 1 << 9, /* jump */ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) #define X86_BR_ANY \ (X86_BR_CALL |\ @@ -95,6 +111,7 @@ enum { X86_BR_JCC |\ X86_BR_JMP |\ X86_BR_IRQ |\ + X86_BR_ABORT |\ X86_BR_IND_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) @@ -270,21 +287,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, mis = 0, pred = 0; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - if (lbr_format == LBR_FORMAT_EIP_FLAGS) { + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; - from = (u64)((((s64)from) << 1) >> 1); + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; } + from = (u64)((((s64)from) << skip) >> skip); cpuc->lbr_entries[i].from = from; cpuc->lbr_entries[i].to = to; cpuc->lbr_entries[i].mispred = mis; cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].in_tx = in_tx; + cpuc->lbr_entries[i].abort = abort; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -334,6 +361,16 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -408,7 +445,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) * decoded (e.g., text page not present), then X86_BR_NONE is * returned. */ -static int branch_type(unsigned long from, unsigned long to) +static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; @@ -428,6 +465,9 @@ static int branch_type(unsigned long from, unsigned long to) if (from == 0 || to == 0) return X86_BR_NONE; + if (abort) + return X86_BR_ABORT | to_plm; + if (from_plm == X86_BR_USER) { /* * can happen if measuring at the user level only @@ -574,7 +614,13 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; - type = branch_type(from, to); + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } /* if type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 33e8d65836d6..056f93a7990f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -73,13 +73,18 @@ struct perf_raw_record { * * support for mispred, predicted is optional. In case it * is not supported mispred = predicted = 0. + * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction */ struct perf_branch_entry { __u64 from; __u64 to; __u64 mispred:1, /* target mispredicted */ predicted:1,/* target predicted */ - reserved:62; + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + reserved:60; }; /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fb104e51496e..0b1df41691e8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -157,8 +157,11 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX = 1U << 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX = 1U << 9, /* not in transaction */ - PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ + PERF_SAMPLE_BRANCH_MAX = 1U << 10, /* non-ABI */ }; #define PERF_SAMPLE_BRANCH_PLM_ALL \ -- cgit v1.2.3-71-gd317 From 7d5437c709ded4f152cb8b305d17972d6707f20c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:18 -0700 Subject: openvswitch: Add tunneling interface. Add ovs tunnel interface for set tunnel action for userspace. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 18 +++++ net/openvswitch/actions.c | 4 ++ net/openvswitch/datapath.c | 78 +++++++++++++++++++++- net/openvswitch/datapath.h | 3 + net/openvswitch/flow.c | 125 +++++++++++++++++++++++++++++++++++ net/openvswitch/flow.h | 19 ++++++ net/openvswitch/vport-internal_dev.c | 2 +- net/openvswitch/vport-netdev.c | 2 +- net/openvswitch/vport.c | 4 +- net/openvswitch/vport.h | 3 +- 10 files changed, 251 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 424672db7f12..b15a445927d6 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -246,11 +246,29 @@ enum ovs_key_attr { OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ + OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ + +#ifdef __KERNEL__ + OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ +#endif __OVS_KEY_ATTR_MAX }; #define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) +enum ovs_tunnel_key_attr { + OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + OVS_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + __OVS_TUNNEL_KEY_ATTR_MAX +}; + +#define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1) + /** * enum ovs_frag_type - IPv4 and IPv6 fragment type * @OVS_FRAG_TYPE_NONE: Packet is not a fragment. diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 596d6373399d..22c5f399f1cf 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -436,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; + case OVS_KEY_ATTR_IPV4_TUNNEL: + OVS_CB(skb)->tun_key = nla_data(nested_attr); + break; + case OVS_KEY_ATTR_ETHERNET: err = set_eth_addr(skb, nla_data(nested_attr)); break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f14816b80b80..bbd310646bc8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -362,6 +362,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex, static size_t key_attr_size(void) { return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -600,8 +608,30 @@ static int validate_tp_port(const struct sw_flow_key *flow_key) return -EINVAL; } +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa) +{ + struct ovs_key_ipv4_tunnel tun_key; + int err, start; + + err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + if (err) + return err; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + if (start < 0) + return start; + + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); + add_nested_action_end(*sfa, start); + + return err; +} + static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key) + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *set_tun) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -611,18 +641,27 @@ static int validate_set(const struct nlattr *a, return -EINVAL; if (key_type > OVS_KEY_ATTR_MAX || - nla_len(ovs_key) != ovs_key_lens[key_type]) + (ovs_key_lens[key_type] != nla_len(ovs_key) && + ovs_key_lens[key_type] != -1)) return -EINVAL; switch (key_type) { const struct ovs_key_ipv4 *ipv4_key; const struct ovs_key_ipv6 *ipv6_key; + int err; case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_ETHERNET: break; + case OVS_KEY_ATTR_TUNNEL: + *set_tun = true; + err = validate_and_copy_set_tun(a, sfa); + if (err) + return err; + break; + case OVS_KEY_ATTR_IPV4: if (flow_key->eth.type != htons(ETH_P_IP)) return -EINVAL; @@ -771,7 +810,7 @@ static int validate_and_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SET: - err = validate_set(a, key); + err = validate_set(a, key, sfa, &skip_copy); if (err) return err; break; @@ -993,6 +1032,33 @@ static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) return err; } +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_IPV4_TUNNEL: + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); + if (err) + return err; + nla_nest_end(skb, start); + break; + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) { const struct nlattr *a; @@ -1002,6 +1068,12 @@ static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *s int type = nla_type(a); switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + case OVS_ACTION_ATTR_SAMPLE: err = sample_action_to_attr(a, skb); if (err) diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 16b840695216..e88ebc2f1c54 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -88,9 +88,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. + * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the + * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 940d4b803ff5..976a8b766a6a 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -603,6 +604,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memset(key, 0, sizeof(*key)); key->phy.priority = skb->priority; + if (OVS_CB(skb)->tun_key) + memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); key->phy.in_port = in_port; key->phy.skb_mark = skb->mark; @@ -818,6 +821,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_TUNNEL] = -1, }; static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, @@ -955,6 +959,105 @@ static int parse_flow_nlattrs(const struct nlattr *attr, return 0; } +int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, + struct ovs_key_ipv4_tunnel *tun_key) +{ + struct nlattr *a; + int rem; + bool ttl = false; + + memset(tun_key, 0, sizeof(*tun_key)); + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), + [OVS_TUNNEL_KEY_ATTR_TOS] = 1, + [OVS_TUNNEL_KEY_ATTR_TTL] = 1, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, + [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + }; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX || + ovs_tunnel_key_lens[type] != nla_len(a)) + return -EINVAL; + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + tun_key->tun_id = nla_get_be64(a); + tun_key->tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + tun_key->ipv4_src = nla_get_be32(a); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + tun_key->ipv4_dst = nla_get_be32(a); + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + tun_key->ipv4_tos = nla_get_u8(a); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + tun_key->ipv4_ttl = nla_get_u8(a); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_key->tun_flags |= TUNNEL_CSUM; + break; + default: + return -EINVAL; + + } + } + if (rem > 0) + return -EINVAL; + + if (!tun_key->ipv4_dst) + return -EINVAL; + + if (!ttl) + return -EINVAL; + + return 0; +} + +int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key) +{ + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) + return -EMSGSIZE; + if (tun_key->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) + return -EMSGSIZE; + if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) + return -EMSGSIZE; + if (tun_key->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + /** * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. * @swkey: receives the extracted flow key. @@ -997,6 +1100,14 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } + if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); + if (err) + return err; + + attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + /* Data attributes. */ if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) return -EINVAL; @@ -1135,17 +1246,21 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, const struct nlattr *attr) { + struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; const struct nlattr *nla; int rem; flow->key.phy.in_port = DP_MAX_PORTS; flow->key.phy.priority = 0; flow->key.phy.skb_mark = 0; + memset(tun_key, 0, sizeof(flow->key.tun_key)); nla_for_each_nested(nla, attr, rem) { int type = nla_type(nla); if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { + int err; + if (nla_len(nla) != ovs_key_lens[type]) return -EINVAL; @@ -1154,6 +1269,12 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, flow->key.phy.priority = nla_get_u32(nla); break; + case OVS_KEY_ATTR_TUNNEL: + err = ovs_ipv4_tun_from_nlattr(nla, tun_key); + if (err) + return err; + break; + case OVS_KEY_ATTR_IN_PORT: if (nla_get_u32(nla) >= DP_MAX_PORTS) return -EINVAL; @@ -1180,6 +1301,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) goto nla_put_failure; + if (swkey->tun_key.ipv4_dst && + ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) + goto nla_put_failure; + if (swkey->phy.in_port != DP_MAX_PORTS && nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) goto nla_put_failure; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index e370f6246ee9..aec5e43f690b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -40,7 +40,22 @@ struct sw_flow_actions { struct nlattr actions[]; }; +/* Used to memset ovs_key_ipv4_tunnel padding. */ +#define OVS_TUNNEL_KEY_SIZE \ + (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \ + FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) + +struct ovs_key_ipv4_tunnel { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + u16 tun_flags; + u8 ipv4_tos; + u8 ipv4_ttl; +}; + struct sw_flow_key { + struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -179,5 +194,9 @@ u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len); struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; +int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, + struct ovs_key_ipv4_tunnel *tun_key); +int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *tun_key); #endif /* flow.h */ diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index e284c7e1fec4..98d3edbbc235 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -67,7 +67,7 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb); + ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); return 0; } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 40de815b4213..5982f3f62835 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -51,7 +51,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb); + ovs_vport_receive(vport, skb, NULL); return; error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 176d449351eb..413287a1877f 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -325,7 +325,8 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + struct ovs_key_ipv4_tunnel *tun_key) { struct pcpu_tstats *stats; @@ -335,6 +336,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); + OVS_CB(skb)->tun_key = tun_key; ovs_dp_process_received_packet(vport, skb); } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 293278c4c2df..2d961aedd71d 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -184,7 +184,8 @@ static inline struct vport *vport_from_priv(const void *priv) return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *); +void ovs_vport_receive(struct vport *, struct sk_buff *, + struct ovs_key_ipv4_tunnel *); void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); /* List of statically compiled vport implementations. Don't forget to also -- cgit v1.2.3-71-gd317 From aa310701e787087dbfbccf1409982a96e16c57a6 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 17 Jun 2013 17:50:33 -0700 Subject: openvswitch: Add gre tunnel support. Add gre vport implementation. Most of gre protocol processing is pushed to gre module. It make use of gre demultiplexer therefore it can co-exist with linux device based gre tunnels. Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/Kconfig | 2 + net/openvswitch/Makefile | 3 +- net/openvswitch/datapath.h | 1 + net/openvswitch/flow.h | 18 ++- net/openvswitch/vport-gre.c | 274 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.c | 19 +++ net/openvswitch/vport.h | 7 + 8 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 net/openvswitch/vport-gre.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index b15a445927d6..c55efaaa9bb4 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -164,6 +164,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_UNSPEC, OVS_VPORT_TYPE_NETDEV, /* network device */ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ + OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ __OVS_VPORT_TYPE_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index d9ea33c361be..9fbc04a31ed6 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -19,6 +19,8 @@ config OPENVSWITCH which is able to accept configuration from a variety of sources and translate it into packet processing rules. + Open vSwitch GRE support depends on CONFIG_NET_IPGRE_DEMUX. + See http://openvswitch.org for more information and userspace utilities. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 15e7384745c1..01bddb2991e3 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -10,5 +10,6 @@ openvswitch-y := \ dp_notify.o \ flow.o \ vport.o \ + vport-gre.o \ vport-internal_dev.o \ - vport-netdev.o \ + vport-netdev.o diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e88ebc2f1c54..a91486484916 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -122,6 +122,7 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; + struct vport_net vport_net; }; extern int ovs_net_id; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 999842f247a0..66ef7220293e 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,11 +49,27 @@ struct ovs_key_ipv4_tunnel { __be64 tun_id; __be32 ipv4_src; __be32 ipv4_dst; - u16 tun_flags; + __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; }; +static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, + const struct iphdr *iph, __be64 tun_id, + __be16 tun_flags) +{ + tun_key->tun_id = tun_id; + tun_key->ipv4_src = iph->saddr; + tun_key->ipv4_dst = iph->daddr; + tun_key->ipv4_tos = iph->tos; + tun_key->ipv4_ttl = iph->ttl; + tun_key->tun_flags = tun_flags; + + /* clear struct padding. */ + memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); +} + struct sw_flow_key { struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c new file mode 100644 index 000000000000..3a8d1900aa78 --- /dev/null +++ b/net/openvswitch/vport-gre.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifdef CONFIG_NET_IPGRE_DEMUX +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 be64_get_low32(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + +static __be16 filter_tnl_flags(__be16 flags) +{ + return flags & (TUNNEL_CSUM | TUNNEL_KEY); +} + +static struct sk_buff *__build_header(struct sk_buff *skb, + int tunnel_hlen) +{ + const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; + struct tnl_ptk_info tpi; + + skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) + return NULL; + + tpi.flags = filter_tnl_flags(tun_key->tun_flags); + tpi.proto = htons(ETH_P_TEB); + tpi.key = be64_get_low32(tun_key->tun_id); + tpi.seq = 0; + gre_build_header(skb, &tpi, tunnel_hlen); + + return skb; +} + +static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u64)seq << 32 | (__force u32)key); +#else + return (__force __be64)((__force u64)key << 32 | (__force u32)seq); +#endif +} + +/* Called with rcu_read_lock and BH disabled. */ +static int gre_rcv(struct sk_buff *skb, + const struct tnl_ptk_info *tpi) +{ + struct ovs_key_ipv4_tunnel tun_key; + struct ovs_net *ovs_net; + struct vport *vport; + __be64 key; + + ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); + vport = rcu_dereference(ovs_net->vport_net.gre_vport); + if (unlikely(!vport)) + return PACKET_REJECT; + + key = key_to_tunnel_id(tpi->key, tpi->seq); + ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); + + ovs_vport_receive(vport, skb, &tun_key); + return PACKET_RCVD; +} + +static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->tun_key)) { + err = -EINVAL; + goto error; + } + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; + fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + if (vlan_tx_tag_present(skb)) { + if (unlikely(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) { + err = -ENOMEM; + goto err_free_rt; + } + skb->vlan_tci = 0; + } + + /* Push Tunnel header. */ + skb = __build_header(skb, tunnel_hlen); + if (unlikely(!skb)) { + err = 0; + goto err_free_rt; + } + + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->local_df = 1; + + return iptunnel_xmit(net, rt, skb, fl.saddr, + OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, + OVS_CB(skb)->tun_key->ipv4_tos, + OVS_CB(skb)->tun_key->ipv4_ttl, df); +err_free_rt: + ip_rt_put(rt); +error: + return err; +} + +static struct gre_cisco_protocol gre_protocol = { + .handler = gre_rcv, + .priority = 1, +}; + +static int gre_ports; +static int gre_init(void) +{ + int err; + + gre_ports++; + if (gre_ports > 1) + return 0; + + err = gre_cisco_register(&gre_protocol); + if (err) + pr_warn("cannot register gre protocol handler\n"); + + return err; +} + +static void gre_exit(void) +{ + gre_ports--; + if (gre_ports > 0) + return; + + gre_cisco_unregister(&gre_protocol); +} + +static const char *gre_get_name(const struct vport *vport) +{ + return vport_priv(vport); +} + +static struct vport *gre_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct ovs_net *ovs_net; + struct vport *vport; + int err; + + err = gre_init(); + if (err) + return ERR_PTR(err); + + ovs_net = net_generic(net, ovs_net_id); + if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { + vport = ERR_PTR(-EEXIST); + goto error; + } + + vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + goto error; + + strncpy(vport_priv(vport), parms->name, IFNAMSIZ); + rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); + return vport; + +error: + gre_exit(); + return vport; +} + +static void gre_tnl_destroy(struct vport *vport) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_net *ovs_net; + + ovs_net = net_generic(net, ovs_net_id); + + rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL); + ovs_vport_deferred_free(vport); + gre_exit(); +} + +const struct vport_ops ovs_gre_vport_ops = { + .type = OVS_VPORT_TYPE_GRE, + .create = gre_create, + .destroy = gre_tnl_destroy, + .get_name = gre_get_name, + .send = gre_tnl_send, +}; +#endif diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 413287a1877f..f52dfb9cb5a7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -38,6 +38,10 @@ static const struct vport_ops *vport_ops_list[] = { &ovs_netdev_vport_ops, &ovs_internal_vport_ops, + +#ifdef CONFIG_NET_IPGRE_DEMUX + &ovs_gre_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ @@ -404,3 +408,18 @@ void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) spin_unlock(&vport->stats_lock); } + +static void free_vport_rcu(struct rcu_head *rcu) +{ + struct vport *vport = container_of(rcu, struct vport, rcu); + + ovs_vport_free(vport); +} + +void ovs_vport_deferred_free(struct vport *vport) +{ + if (!vport) + return; + + call_rcu(&vport->rcu, free_vport_rcu); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 2d961aedd71d..376045c42f8b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -34,6 +34,11 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ +/* The following definitions are for users of the vport subsytem: */ +struct vport_net { + struct vport __rcu *gre_vport; +}; + int ovs_vport_init(void); void ovs_vport_exit(void); @@ -152,6 +157,7 @@ enum vport_err_type { struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); +void ovs_vport_deferred_free(struct vport *vport); #define VPORT_ALIGN 8 @@ -192,6 +198,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); * add yours to the list at the top of vport.c. */ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; +extern const struct vport_ops ovs_gre_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3-71-gd317 From 2bd470fc08cbbfd4f2e53a620362806620d217ed Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 19 Jun 2013 18:06:54 -0700 Subject: netlink: export netlink_diag.h header The netlink_diag.h is in include/uapi/linux but not in the Kbuild necessary to cause it to be exported by make headers_install. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ab5d4992e568..bdc6e87ff3eb 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -261,6 +261,7 @@ header-y += net_dropmon.h header-y += net_tstamp.h header-y += netconf.h header-y += netdevice.h +header-y += netlink_diag.h header-y += netfilter.h header-y += netfilter_arp.h header-y += netfilter_bridge.h -- cgit v1.2.3-71-gd317 From bcefe17cffd06efdda3e7ad679ea743236e6271a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 15 Jun 2013 09:39:18 +0800 Subject: tcp: introduce a per-route knob for quick ack In previous discussions, I tried to find some reasonable heuristics for delayed ACK, however this seems not possible, according to Eric: "ACKS might also be delayed because of bidirectional traffic, and is more controlled by the application response time. TCP stack can not easily estimate it." "ACK can be incredibly useful to recover from losses in a short time. The vast majority of TCP sessions are small lived, and we send one ACK per received segment anyway at beginning or retransmits to let the sender smoothly increase its cwnd, so an auto-tuning facility wont help them that much." and according to David: "ACKs are the only information we have to detect loss. And, for the same reasons that TCP VEGAS is fundamentally broken, we cannot measure the pipe or some other receiver-side-visible piece of information to determine when it's "safe" to stretch ACK. And even if it's "safe", we should not do it so that losses are accurately detected and we don't spuriously retransmit. The only way to know when the bandwidth increases is to "test" it, by sending more and more packets until drops happen. That's why all successful congestion control algorithms must operate on explicited tested pieces of information. Similarly, it's not really possible to universally know if it's safe to stretch ACK or not." It still makes sense to enable or disable quick ack mode like what TCP_QUICK_ACK does. Similar to TCP_QUICK_ACK option, but for people who can't modify the source code and still wants to control TCP delayed ACK behavior. As David suggested, this should belong to per-path scope, since different pathes may want different behaviors. Cc: Eric Dumazet Cc: Rick Jones Cc: Stephen Hemminger Cc: "David S. Miller" Cc: Thomas Graf CC: David Laight Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ net/ipv4/tcp_input.c | 5 ++++- net/ipv4/tcp_output.c | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7a2144e1afae..eb0f1a554d7b 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -386,6 +386,8 @@ enum { #define RTAX_RTO_MIN RTAX_RTO_MIN RTAX_INITRWND, #define RTAX_INITRWND RTAX_INITRWND + RTAX_QUICKACK, +#define RTAX_QUICKACK RTAX_QUICKACK __RTAX_MAX }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 46271cdcf088..28af45abe062 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3717,6 +3717,7 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3728,7 +3729,9 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; + dst = __sk_dst_get(sk); + if (!dst || !dst_metric(dst, RTAX_QUICKACK)) + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2c1333ee318..3d609490f118 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -160,6 +160,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -170,8 +171,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && + (!dst || !dst_metric(dst, RTAX_QUICKACK))) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ -- cgit v1.2.3-71-gd317 From 5ffd229c02731a91d08ca21e76b503c5bbb5c095 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 May 2013 13:33:10 +1000 Subject: powerpc/vfio: Implement IOMMU driver for VFIO VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWER guest). Cc: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul Mackerras Acked-by: Alex Williamson Signed-off-by: Benjamin Herrenschmidt --- Documentation/vfio.txt | 63 ++++++ drivers/vfio/Kconfig | 6 + drivers/vfio/Makefile | 1 + drivers/vfio/vfio.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 377 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 34 ++++ 6 files changed, 482 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c (limited to 'include/uapi/linux') diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda3635a17d..c55533c0adb3 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap interfaces implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + +PPC64 sPAPR implementation note +------------------------------------------------------------------------------- + +This implementation has some specifics: + +1) Only one IOMMU group per container is supported as an IOMMU group +represents the minimal entity which isolation can be guaranteed for and +groups are allocated statically, one per a Partitionable Endpoint (PE) +(PE is often a PCI domain but not always). + +2) The hardware supports so called DMA windows - the PCI address range +within which DMA transfer is allowed, any attempt to access address space +out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and +currently there is no way to reduce the number of calls. In order to make things +faster, the map/unmap handling has been implemented in real mode which provides +an excellent performance which has limitations such as inability to do +locked pages accounting in real time. + +So 3 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start + of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE - disables the container. + + +The code flow from the example above should be slightly changed: + + ..... + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get addition sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ + + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map); + ..... + ------------------------------------------------------------------------------- [1] VFIO was originally an acronym for "Virtual Function I/O" in its diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec0abd1..b464687f6e14 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO && SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 2398d4a0e38b..72bfabc8629e 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6d78736563de..259ad282ae5d 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1415,6 +1415,7 @@ static int __init vfio_init(void) * drivers. */ request_module_nowait("vfio_iommu_type1"); + request_module_nowait("vfio_iommu_spapr_tce"); return 0; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c new file mode 100644 index 000000000000..bdae7a04af75 --- /dev/null +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -0,0 +1,377 @@ +/* + * VFIO: IOMMU DMA mapping support for TCE on POWER + * + * Copyright (C) 2013 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio_iommu_type1.c: + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "aik@ozlabs.ru" +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE" + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group); + +/* + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation + * + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +/* + * The container descriptor supports only a single group per container. + * Required by the API as the container is not supplied with the IOMMU group + * at the moment of initialization. + */ +struct tce_container { + struct mutex lock; + struct iommu_table *tbl; + bool enabled; +}; + +static int tce_iommu_enable(struct tce_container *container) +{ + int ret = 0; + unsigned long locked, lock_limit, npages; + struct iommu_table *tbl = container->tbl; + + if (!container->tbl) + return -ENXIO; + + if (!current->mm) + return -ESRCH; /* process exited */ + + if (container->enabled) + return -EBUSY; + + /* + * When userspace pages are mapped into the IOMMU, they are effectively + * locked memory, so, theoretically, we need to update the accounting + * of locked pages on each map and unmap. For powerpc, the map unmap + * paths can be very hot, though, and the accounting would kill + * performance, especially since it would be difficult to impossible + * to handle the accounting in real mode only. + * + * To address that, rather than precisely accounting every page, we + * instead account for a worst case on locked memory when the iommu is + * enabled and disabled. The worst case upper bound on locked memory + * is the size of the whole iommu window, which is usually relatively + * small (compared to total memory sizes) on POWER hardware. + * + * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, + * that would effectively kill the guest at random points, much better + * enforcing the limit based on the max that the guest can map. + */ + down_write(¤t->mm->mmap_sem); + npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + + current->mm->locked_vm += npages; + container->enabled = true; + } + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void tce_iommu_disable(struct tce_container *container) +{ + if (!container->enabled) + return; + + container->enabled = false; + + if (!container->tbl || !current->mm) + return; + + down_write(¤t->mm->mmap_sem); + current->mm->locked_vm -= (container->tbl->it_size << + IOMMU_PAGE_SHIFT) >> PAGE_SHIFT; + up_write(¤t->mm->mmap_sem); +} + +static void *tce_iommu_open(unsigned long arg) +{ + struct tce_container *container; + + if (arg != VFIO_SPAPR_TCE_IOMMU) { + pr_err("tce_vfio: Wrong IOMMU type\n"); + return ERR_PTR(-EINVAL); + } + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return ERR_PTR(-ENOMEM); + + mutex_init(&container->lock); + + return container; +} + +static void tce_iommu_release(void *iommu_data) +{ + struct tce_container *container = iommu_data; + + WARN_ON(container->tbl && !container->tbl->it_group); + tce_iommu_disable(container); + + if (container->tbl && container->tbl->it_group) + tce_iommu_detach_group(iommu_data, container->tbl->it_group); + + mutex_destroy(&container->lock); + + kfree(container); +} + +static long tce_iommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct tce_container *container = iommu_data; + unsigned long minsz; + long ret; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0; + + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { + struct vfio_iommu_spapr_tce_info info; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_info, + dma32_window_size); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT; + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT; + info.flags = 0; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_MAP_DMA: { + struct vfio_iommu_type1_dma_map param; + struct iommu_table *tbl = container->tbl; + unsigned long tce, i; + + if (!tbl) + return -ENXIO; + + BUG_ON(!tbl->it_group); + + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE)) + return -EINVAL; + + if ((param.size & ~IOMMU_PAGE_MASK) || + (param.vaddr & ~IOMMU_PAGE_MASK)) + return -EINVAL; + + /* iova is checked by the IOMMU API */ + tce = param.vaddr; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) + tce |= TCE_PCI_READ; + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + tce |= TCE_PCI_WRITE; + + ret = iommu_tce_put_param_check(tbl, param.iova, tce); + if (ret) + return ret; + + for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) { + ret = iommu_put_tce_user_mode(tbl, + (param.iova >> IOMMU_PAGE_SHIFT) + i, + tce); + if (ret) + break; + tce += IOMMU_PAGE_SIZE; + } + if (ret) + iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, i); + + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_UNMAP_DMA: { + struct vfio_iommu_type1_dma_unmap param; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + if (param.size & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ret = iommu_tce_clear_param_check(tbl, param.iova, 0, + param.size >> IOMMU_PAGE_SHIFT); + if (ret) + return ret; + + ret = iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT, + param.size >> IOMMU_PAGE_SHIFT); + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_ENABLE: + mutex_lock(&container->lock); + ret = tce_iommu_enable(container); + mutex_unlock(&container->lock); + return ret; + + + case VFIO_IOMMU_DISABLE: + mutex_lock(&container->lock); + tce_iommu_disable(container); + mutex_unlock(&container->lock); + return 0; + } + + return -ENOTTY; +} + +static int tce_iommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + int ret; + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + + /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + if (container->tbl) { + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", + iommu_group_id(container->tbl->it_group), + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else if (container->enabled) { + pr_err("tce_vfio: attaching group #%u to enabled container\n", + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else { + ret = iommu_take_ownership(tbl); + if (!ret) + container->tbl = tbl; + } + + mutex_unlock(&container->lock); + + return ret; +} + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + if (tbl != container->tbl) { + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", + iommu_group_id(iommu_group), + iommu_group_id(tbl->it_group)); + } else { + if (container->enabled) { + pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", + iommu_group_id(tbl->it_group)); + tce_iommu_disable(container); + } + + /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + container->tbl = NULL; + iommu_release_ownership(tbl); + } + mutex_unlock(&container->lock); +} + +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { + .name = "iommu-vfio-powerpc", + .owner = THIS_MODULE, + .open = tce_iommu_open, + .release = tce_iommu_release, + .ioctl = tce_iommu_ioctl, + .attach_group = tce_iommu_attach_group, + .detach_group = tce_iommu_detach_group, +}; + +static int __init tce_iommu_init(void) +{ + return vfio_register_iommu_driver(&tce_iommu_driver_ops); +} + +static void __exit tce_iommu_cleanup(void) +{ + vfio_unregister_iommu_driver(&tce_iommu_driver_ops); +} + +module_init(tce_iommu_init); +module_exit(tce_iommu_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..87ee4f4cff25 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -22,6 +22,7 @@ /* Extensions */ #define VFIO_TYPE1_IOMMU 1 +#define VFIO_SPAPR_TCE_IOMMU 2 /* * The IOCTL interface is designed for extensibility by embedding the @@ -375,4 +376,37 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) +/* + * IOCTLs to enable/disable IOMMU container usage. + * No parameters are supported. + */ +#define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) +#define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) + +/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ + +/* + * The SPAPR TCE info struct provides the information about the PCI bus + * address ranges available for DMA, these values are programmed into + * the hardware so the guest has to know that information. + * + * The DMA 32 bit window start is an absolute PCI bus address. + * The IOVA address passed via map/unmap ioctls are absolute PCI bus + * addresses too so the window works as a filter rather than an offset + * for IOVA addresses. + * + * A flag will need to be added if other page sizes are supported, + * so as defined here, it is always 4k. + */ +struct vfio_iommu_spapr_tce_info { + __u32 argsz; + __u32 flags; /* reserved for future use */ + __u32 dma32_window_start; /* 32 bit window start (bytes) */ + __u32 dma32_window_size; /* 32 bit window size (bytes) */ +}; + +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/* ***************************************************************** */ + #endif /* _UAPIVFIO_H */ -- cgit v1.2.3-71-gd317 From 681f130f39e10087475383e6771b9366e26bab0c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jun 2013 05:52:22 -0700 Subject: netfilter: xt_socket: add XT_SOCKET_NOWILDCARD flag xt_socket module can be a nice replacement to conntrack module in some cases (SYN filtering for example) But it lacks the ability to match the 3rd packet of TCP handshake (ACK coming from the client). Add a XT_SOCKET_NOWILDCARD flag to disable the wildcard mechanism. The wildcard is the legacy socket match behavior, that ignores LISTEN sockets bound to INADDR_ANY (or ipv6 equivalent) iptables -I INPUT -p tcp --syn -j SYN_CHAIN iptables -I INPUT -m socket --nowildcard -j ACCEPT Signed-off-by: Eric Dumazet Cc: Patrick McHardy Cc: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_socket.h | 7 ++++ net/netfilter/xt_socket.c | 70 ++++++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h index 26d7217bd4f1..6315e2ac3474 100644 --- a/include/uapi/linux/netfilter/xt_socket.h +++ b/include/uapi/linux/netfilter/xt_socket.h @@ -5,10 +5,17 @@ enum { XT_SOCKET_TRANSPARENT = 1 << 0, + XT_SOCKET_NOWILDCARD = 1 << 1, }; struct xt_socket_mtinfo1 { __u8 flags; }; +#define XT_SOCKET_FLAGS_V1 XT_SOCKET_TRANSPARENT + +struct xt_socket_mtinfo2 { + __u8 flags; +}; +#define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD) #endif /* _XT_SOCKET_H */ diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 02704245710e..f8b71911037a 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -163,8 +163,11 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, bool wildcard; bool transparent = true; - /* Ignore sockets listening on INADDR_ANY */ - wildcard = (sk->sk_state != TCP_TIME_WAIT && + /* Ignore sockets listening on INADDR_ANY, + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && inet_sk(sk)->inet_rcv_saddr == 0); /* Ignore non-transparent sockets, @@ -197,7 +200,7 @@ socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) } static bool -socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } @@ -259,7 +262,7 @@ extract_icmp6_fields(const struct sk_buff *skb, } static bool -socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6hdr *iph = ipv6_hdr(skb); struct udphdr _hdr, *hp = NULL; @@ -302,8 +305,11 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) bool wildcard; bool transparent = true; - /* Ignore sockets listening on INADDR_ANY */ - wildcard = (sk->sk_state != TCP_TIME_WAIT && + /* Ignore sockets listening on INADDR_ANY + * unless XT_SOCKET_NOWILDCARD is set + */ + wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) && + sk->sk_state != TCP_TIME_WAIT && ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); /* Ignore non-transparent sockets, @@ -331,6 +337,28 @@ socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) } #endif +static int socket_mt_v1_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V1) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1); + return -EINVAL; + } + return 0; +} + +static int socket_mt_v2_check(const struct xt_mtchk_param *par) +{ + const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo; + + if (info->flags & ~XT_SOCKET_FLAGS_V2) { + pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2); + return -EINVAL; + } + return 0; +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -345,7 +373,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt4_v1, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -356,7 +385,32 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV6, - .match = socket_mt6_v1, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v1_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1_v2, + .checkentry = socket_mt_v2_check, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 2, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1_v2, + .checkentry = socket_mt_v2_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), -- cgit v1.2.3-71-gd317 From b71c99801e18eb172ae34851daf25044a3bf644a Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 26 May 2013 10:03:01 -0300 Subject: [media] v4l2-core: remove support for obsolete VIDIOC_DBG_G_CHIP_IDENT This has been replaced by the new and much better VIDIOC_DBG_G_CHIP_INFO. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 1 - drivers/media/v4l2-core/v4l2-dev.c | 1 - drivers/media/v4l2-core/v4l2-ioctl.c | 28 -- include/media/v4l2-chip-ident.h | 354 -------------------------- include/media/v4l2-ioctl.h | 2 - include/media/v4l2-subdev.h | 4 +- include/uapi/linux/videodev2.h | 17 +- 7 files changed, 4 insertions(+), 403 deletions(-) delete mode 100644 include/media/v4l2-chip-ident.h (limited to 'include/uapi/linux') diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index f1295519f285..8f7a6a454a4c 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -1074,7 +1074,6 @@ long v4l2_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) case VIDIOC_TRY_DECODER_CMD: case VIDIOC_DBG_S_REGISTER: case VIDIOC_DBG_G_REGISTER: - case VIDIOC_DBG_G_CHIP_IDENT: case VIDIOC_S_HW_FREQ_SEEK: case VIDIOC_S_DV_TIMINGS: case VIDIOC_G_DV_TIMINGS: diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 2f3fac5345db..0c061e16502e 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -596,7 +596,6 @@ static void determine_valid_ioctls(struct video_device *vdev) set_bit(_IOC_NR(VIDIOC_DBG_G_REGISTER), valid_ioctls); set_bit(_IOC_NR(VIDIOC_DBG_S_REGISTER), valid_ioctls); #endif - SET_VALID_IOCTL(ops, VIDIOC_DBG_G_CHIP_IDENT, vidioc_g_chip_ident); /* yes, really vidioc_subscribe_event */ SET_VALID_IOCTL(ops, VIDIOC_DQEVENT, vidioc_subscribe_event); SET_VALID_IOCTL(ops, VIDIOC_SUBSCRIBE_EVENT, vidioc_subscribe_event); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 423cbf948600..c9d9f01d21bc 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -26,7 +26,6 @@ #include #include #include -#include #include /* Zero out the end of the struct pointed to by p. Everything after, but @@ -619,20 +618,6 @@ static void v4l_print_decoder_cmd(const void *arg, bool write_only) pr_info("pts=%llu\n", p->stop.pts); } -static void v4l_print_dbg_chip_ident(const void *arg, bool write_only) -{ - const struct v4l2_dbg_chip_ident *p = arg; - - pr_cont("type=%u, ", p->match.type); - if (p->match.type == V4L2_CHIP_MATCH_I2C_DRIVER) - pr_cont("name=%.*s, ", - (int)sizeof(p->match.name), p->match.name); - else - pr_cont("addr=%u, ", p->match.addr); - pr_cont("chip_ident=%u, revision=0x%x\n", - p->ident, p->revision); -} - static void v4l_print_dbg_chip_info(const void *arg, bool write_only) { const struct v4l2_dbg_chip_info *p = arg; @@ -1813,18 +1798,6 @@ static int v4l_dbg_s_register(const struct v4l2_ioctl_ops *ops, #endif } -static int v4l_dbg_g_chip_ident(const struct v4l2_ioctl_ops *ops, - struct file *file, void *fh, void *arg) -{ - struct v4l2_dbg_chip_ident *p = arg; - - p->ident = V4L2_IDENT_NONE; - p->revision = 0; - if (p->match.type == V4L2_CHIP_MATCH_SUBDEV) - return -EINVAL; - return ops->vidioc_g_chip_ident(file, fh, p); -} - static int v4l_dbg_g_chip_info(const struct v4l2_ioctl_ops *ops, struct file *file, void *fh, void *arg) { @@ -2074,7 +2047,6 @@ static struct v4l2_ioctl_info v4l2_ioctls[] = { IOCTL_INFO_STD(VIDIOC_TRY_DECODER_CMD, vidioc_try_decoder_cmd, v4l_print_decoder_cmd, 0), IOCTL_INFO_FNC(VIDIOC_DBG_S_REGISTER, v4l_dbg_s_register, v4l_print_dbg_register, 0), IOCTL_INFO_FNC(VIDIOC_DBG_G_REGISTER, v4l_dbg_g_register, v4l_print_dbg_register, 0), - IOCTL_INFO_FNC(VIDIOC_DBG_G_CHIP_IDENT, v4l_dbg_g_chip_ident, v4l_print_dbg_chip_ident, 0), IOCTL_INFO_FNC(VIDIOC_S_HW_FREQ_SEEK, v4l_s_hw_freq_seek, v4l_print_hw_freq_seek, INFO_FL_PRIO), IOCTL_INFO_STD(VIDIOC_S_DV_TIMINGS, vidioc_s_dv_timings, v4l_print_dv_timings, INFO_FL_PRIO), IOCTL_INFO_STD(VIDIOC_G_DV_TIMINGS, vidioc_g_dv_timings, v4l_print_dv_timings, 0), diff --git a/include/media/v4l2-chip-ident.h b/include/media/v4l2-chip-ident.h deleted file mode 100644 index 543f89c18896..000000000000 --- a/include/media/v4l2-chip-ident.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - v4l2 chip identifiers header - - This header provides a list of chip identifiers that can be returned - through the VIDIOC_DBG_G_CHIP_IDENT ioctl. - - Copyright (C) 2007 Hans Verkuil - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef V4L2_CHIP_IDENT_H_ -#define V4L2_CHIP_IDENT_H_ - -/* VIDIOC_DBG_G_CHIP_IDENT: identifies the actual chip installed on the board */ - -/* KEEP THIS LIST ORDERED BY ID! - Otherwise it will be hard to see which ranges are already in use when - adding support to a new chip family. */ -enum { - /* general idents: reserved range 0-49 */ - V4L2_IDENT_NONE = 0, /* No chip matched */ - V4L2_IDENT_AMBIGUOUS = 1, /* Match too general, multiple chips matched */ - V4L2_IDENT_UNKNOWN = 2, /* Chip found, but cannot identify */ - - /* module tvaudio: reserved range 50-99 */ - V4L2_IDENT_TVAUDIO = 50, /* A tvaudio chip, unknown which it is exactly */ - - /* Sony IMX074 */ - V4L2_IDENT_IMX074 = 74, - - /* module saa7110: just ident 100 */ - V4L2_IDENT_SAA7110 = 100, - - /* module saa7115: reserved range 101-149 */ - V4L2_IDENT_SAA7111 = 101, - V4L2_IDENT_SAA7111A = 102, - V4L2_IDENT_SAA7113 = 103, - V4L2_IDENT_SAA7114 = 104, - V4L2_IDENT_SAA7115 = 105, - V4L2_IDENT_SAA7118 = 108, - - V4L2_IDENT_GM7113C = 140, - - /* module saa7127: reserved range 150-199 */ - V4L2_IDENT_SAA7127 = 157, - V4L2_IDENT_SAA7129 = 159, - - /* module cx25840: reserved range 200-249 */ - V4L2_IDENT_CX25836 = 236, - V4L2_IDENT_CX25837 = 237, - V4L2_IDENT_CX25840 = 240, - V4L2_IDENT_CX25841 = 241, - V4L2_IDENT_CX25842 = 242, - V4L2_IDENT_CX25843 = 243, - - /* OmniVision sensors: reserved range 250-299 */ - V4L2_IDENT_OV7670 = 250, - V4L2_IDENT_OV7720 = 251, - V4L2_IDENT_OV7725 = 252, - V4L2_IDENT_OV7660 = 253, - V4L2_IDENT_OV9650 = 254, - V4L2_IDENT_OV9655 = 255, - V4L2_IDENT_SOI968 = 256, - V4L2_IDENT_OV9640 = 257, - V4L2_IDENT_OV6650 = 258, - V4L2_IDENT_OV2640 = 259, - V4L2_IDENT_OV9740 = 260, - V4L2_IDENT_OV5642 = 261, - - /* module saa7146: reserved range 300-309 */ - V4L2_IDENT_SAA7146 = 300, - - /* Conexant MPEG encoder/decoders: reserved range 400-420 */ - V4L2_IDENT_CX23418_843 = 403, /* Integrated A/V Decoder on the '418 */ - V4L2_IDENT_CX23415 = 415, - V4L2_IDENT_CX23416 = 416, - V4L2_IDENT_CX23417 = 417, - V4L2_IDENT_CX23418 = 418, - - /* module bt819: reserved range 810-819 */ - V4L2_IDENT_BT815A = 815, - V4L2_IDENT_BT817A = 817, - V4L2_IDENT_BT819A = 819, - - /* module au0828 */ - V4L2_IDENT_AU0828 = 828, - - /* module bttv: ident 848 + 849 */ - V4L2_IDENT_BT848 = 848, - V4L2_IDENT_BT849 = 849, - - /* module bt856: just ident 856 */ - V4L2_IDENT_BT856 = 856, - - /* module bt866: just ident 866 */ - V4L2_IDENT_BT866 = 866, - - /* module bttv: ident 878 + 879 */ - V4L2_IDENT_BT878 = 878, - V4L2_IDENT_BT879 = 879, - - /* module ks0127: reserved range 1120-1129 */ - V4L2_IDENT_KS0122S = 1122, - V4L2_IDENT_KS0127 = 1127, - V4L2_IDENT_KS0127B = 1128, - - /* module indycam: just ident 2000 */ - V4L2_IDENT_INDYCAM = 2000, - - /* module vp27smpx: just ident 2700 */ - V4L2_IDENT_VP27SMPX = 2700, - - /* module vpx3220: reserved range: 3210-3229 */ - V4L2_IDENT_VPX3214C = 3214, - V4L2_IDENT_VPX3216B = 3216, - V4L2_IDENT_VPX3220A = 3220, - - /* VX855 just ident 3409 */ - /* Other via devs could use 3314, 3324, 3327, 3336, 3364, 3353 */ - V4L2_IDENT_VIA_VX855 = 3409, - - /* module tvp5150 */ - V4L2_IDENT_TVP5150 = 5150, - - /* module saa5246a: just ident 5246 */ - V4L2_IDENT_SAA5246A = 5246, - - /* module saa5249: just ident 5249 */ - V4L2_IDENT_SAA5249 = 5249, - - /* module cs5345: just ident 5345 */ - V4L2_IDENT_CS5345 = 5345, - - /* module tea6415c: just ident 6415 */ - V4L2_IDENT_TEA6415C = 6415, - - /* module tea6420: just ident 6420 */ - V4L2_IDENT_TEA6420 = 6420, - - /* module saa6588: just ident 6588 */ - V4L2_IDENT_SAA6588 = 6588, - - /* module vs6624: just ident 6624 */ - V4L2_IDENT_VS6624 = 6624, - - /* module saa6752hs: reserved range 6750-6759 */ - V4L2_IDENT_SAA6752HS = 6752, - V4L2_IDENT_SAA6752HS_AC3 = 6753, - - /* modules tef6862: just ident 6862 */ - V4L2_IDENT_TEF6862 = 6862, - - /* module tvp7002: just ident 7002 */ - V4L2_IDENT_TVP7002 = 7002, - - /* module adv7170: just ident 7170 */ - V4L2_IDENT_ADV7170 = 7170, - - /* module adv7175: just ident 7175 */ - V4L2_IDENT_ADV7175 = 7175, - - /* module adv7180: just ident 7180 */ - V4L2_IDENT_ADV7180 = 7180, - - /* module adv7183: just ident 7183 */ - V4L2_IDENT_ADV7183 = 7183, - - /* module saa7185: just ident 7185 */ - V4L2_IDENT_SAA7185 = 7185, - - /* module saa7191: just ident 7191 */ - V4L2_IDENT_SAA7191 = 7191, - - /* module ths7303: just ident 7303 */ - V4L2_IDENT_THS7303 = 7303, - - /* module adv7343: just ident 7343 */ - V4L2_IDENT_ADV7343 = 7343, - - /* module ths7353: just ident 7353 */ - V4L2_IDENT_THS7353 = 7353, - - /* module adv7393: just ident 7393 */ - V4L2_IDENT_ADV7393 = 7393, - - /* module adv7604: just ident 7604 */ - V4L2_IDENT_ADV7604 = 7604, - - /* module saa7706h: just ident 7706 */ - V4L2_IDENT_SAA7706H = 7706, - - /* module mt9v011, just ident 8243 */ - V4L2_IDENT_MT9V011 = 8243, - - /* module wm8739: just ident 8739 */ - V4L2_IDENT_WM8739 = 8739, - - /* module wm8775: just ident 8775 */ - V4L2_IDENT_WM8775 = 8775, - - /* Marvell controllers starting at 8801 */ - V4L2_IDENT_CAFE = 8801, - V4L2_IDENT_ARMADA610 = 8802, - - /* AKM AK8813/AK8814 */ - V4L2_IDENT_AK8813 = 8813, - V4L2_IDENT_AK8814 = 8814, - - /* module cx23885 and cx25840 */ - V4L2_IDENT_CX23885 = 8850, - V4L2_IDENT_CX23885_AV = 8851, /* Integrated A/V decoder */ - V4L2_IDENT_CX23887 = 8870, - V4L2_IDENT_CX23887_AV = 8871, /* Integrated A/V decoder */ - V4L2_IDENT_CX23888 = 8880, - V4L2_IDENT_CX23888_AV = 8881, /* Integrated A/V decoder */ - V4L2_IDENT_CX23888_IR = 8882, /* Integrated infrared controller */ - - /* module ad9389b: just ident 9389 */ - V4L2_IDENT_AD9389B = 9389, - - /* module tda9840: just ident 9840 */ - V4L2_IDENT_TDA9840 = 9840, - - /* module tw9910: just ident 9910 */ - V4L2_IDENT_TW9910 = 9910, - - /* module sn9c20x: just ident 10000 */ - V4L2_IDENT_SN9C20X = 10000, - - /* module cx231xx and cx25840 */ - V4L2_IDENT_CX2310X_AV = 23099, /* Integrated A/V decoder; not in '100 */ - V4L2_IDENT_CX23100 = 23100, - V4L2_IDENT_CX23101 = 23101, - V4L2_IDENT_CX23102 = 23102, - - /* module msp3400: reserved range 34000-34999 for msp34xx */ - V4L2_IDENT_MSPX4XX = 34000, /* generic MSPX4XX identifier, only - use internally (tveeprom.c). */ - - V4L2_IDENT_MSP3400B = 34002, - V4L2_IDENT_MSP3400C = 34003, - V4L2_IDENT_MSP3400D = 34004, - V4L2_IDENT_MSP3400G = 34007, - V4L2_IDENT_MSP3401G = 34017, - V4L2_IDENT_MSP3402G = 34027, - V4L2_IDENT_MSP3405D = 34054, - V4L2_IDENT_MSP3405G = 34057, - V4L2_IDENT_MSP3407D = 34074, - V4L2_IDENT_MSP3407G = 34077, - - V4L2_IDENT_MSP3410B = 34102, - V4L2_IDENT_MSP3410C = 34103, - V4L2_IDENT_MSP3410D = 34104, - V4L2_IDENT_MSP3410G = 34107, - V4L2_IDENT_MSP3411G = 34117, - V4L2_IDENT_MSP3412G = 34127, - V4L2_IDENT_MSP3415D = 34154, - V4L2_IDENT_MSP3415G = 34157, - V4L2_IDENT_MSP3417D = 34174, - V4L2_IDENT_MSP3417G = 34177, - - V4L2_IDENT_MSP3420G = 34207, - V4L2_IDENT_MSP3421G = 34217, - V4L2_IDENT_MSP3422G = 34227, - V4L2_IDENT_MSP3425G = 34257, - V4L2_IDENT_MSP3427G = 34277, - - V4L2_IDENT_MSP3430G = 34307, - V4L2_IDENT_MSP3431G = 34317, - V4L2_IDENT_MSP3435G = 34357, - V4L2_IDENT_MSP3437G = 34377, - - V4L2_IDENT_MSP3440G = 34407, - V4L2_IDENT_MSP3441G = 34417, - V4L2_IDENT_MSP3442G = 34427, - V4L2_IDENT_MSP3445G = 34457, - V4L2_IDENT_MSP3447G = 34477, - - V4L2_IDENT_MSP3450G = 34507, - V4L2_IDENT_MSP3451G = 34517, - V4L2_IDENT_MSP3452G = 34527, - V4L2_IDENT_MSP3455G = 34557, - V4L2_IDENT_MSP3457G = 34577, - - V4L2_IDENT_MSP3460G = 34607, - V4L2_IDENT_MSP3461G = 34617, - V4L2_IDENT_MSP3465G = 34657, - V4L2_IDENT_MSP3467G = 34677, - - /* module msp3400: reserved range 44000-44999 for msp44xx */ - V4L2_IDENT_MSP4400G = 44007, - V4L2_IDENT_MSP4408G = 44087, - V4L2_IDENT_MSP4410G = 44107, - V4L2_IDENT_MSP4418G = 44187, - V4L2_IDENT_MSP4420G = 44207, - V4L2_IDENT_MSP4428G = 44287, - V4L2_IDENT_MSP4440G = 44407, - V4L2_IDENT_MSP4448G = 44487, - V4L2_IDENT_MSP4450G = 44507, - V4L2_IDENT_MSP4458G = 44587, - - /* Micron CMOS sensor chips: 45000-45099 */ - V4L2_IDENT_MT9M001C12ST = 45000, - V4L2_IDENT_MT9M001C12STM = 45005, - V4L2_IDENT_MT9M111 = 45007, - V4L2_IDENT_MT9M112 = 45008, - V4L2_IDENT_MT9V022IX7ATC = 45010, /* No way to detect "normal" I77ATx */ - V4L2_IDENT_MT9V022IX7ATM = 45015, /* and "lead free" IA7ATx chips */ - V4L2_IDENT_MT9T031 = 45020, - V4L2_IDENT_MT9T111 = 45021, - V4L2_IDENT_MT9T112 = 45022, - V4L2_IDENT_MT9V111 = 45031, - V4L2_IDENT_MT9V112 = 45032, - - /* HV7131R CMOS sensor: just ident 46000 */ - V4L2_IDENT_HV7131R = 46000, - - /* Sharp RJ54N1CB0C, 0xCB0C = 51980 */ - V4L2_IDENT_RJ54N1CB0C = 51980, - - /* module m52790: just ident 52790 */ - V4L2_IDENT_M52790 = 52790, - - /* module cs53132a: just ident 53132 */ - V4L2_IDENT_CS53l32A = 53132, - - /* modules upd61151 MPEG2 encoder: just ident 54000 */ - V4L2_IDENT_UPD61161 = 54000, - /* modules upd61152 MPEG2 encoder with AC3: just ident 54001 */ - V4L2_IDENT_UPD61162 = 54001, - - /* module upd64031a: just ident 64031 */ - V4L2_IDENT_UPD64031A = 64031, - - /* module upd64083: just ident 64083 */ - V4L2_IDENT_UPD64083 = 64083, - - /* Don't just add new IDs at the end: KEEP THIS LIST ORDERED BY ID! */ -}; - -#endif diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index 931652f0e2af..e0b74a430b3a 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -247,8 +247,6 @@ struct v4l2_ioctl_ops { int (*vidioc_g_chip_info) (struct file *file, void *fh, struct v4l2_dbg_chip_info *chip); #endif - int (*vidioc_g_chip_ident) (struct file *file, void *fh, - struct v4l2_dbg_chip_ident *chip); int (*vidioc_enum_framesizes) (struct file *file, void *fh, struct v4l2_frmsizeenum *fsize); diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 5298d678d0f3..21fc9e16d7be 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -88,7 +88,6 @@ struct v4l2_decode_vbi_line { /* Core ops: it is highly recommended to implement at least these ops: - g_chip_ident log_status g_register s_register @@ -145,7 +144,6 @@ struct v4l2_subdev_io_pin_config { performed later. It must not sleep. *Called from an IRQ context*. */ struct v4l2_subdev_core_ops { - int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip); int (*log_status)(struct v4l2_subdev *sd); int (*s_io_pin_config)(struct v4l2_subdev *sd, size_t n, struct v4l2_subdev_io_pin_config *pincfg); @@ -660,7 +658,7 @@ void v4l2_subdev_init(struct v4l2_subdev *sd, /* Call an ops of a v4l2_subdev, doing the right checks against NULL pointers. - Example: err = v4l2_subdev_call(sd, core, g_chip_ident, &chip); + Example: err = v4l2_subdev_call(sd, core, s_std, norm); */ #define v4l2_subdev_call(sd, o, f, args...) \ (!(sd) ? -ENODEV : (((sd)->ops->o && (sd)->ops->o->f) ? \ diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 2c5e67a45436..95ef4551edc1 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1787,11 +1787,13 @@ struct v4l2_event_subscription { /* VIDIOC_DBG_G_REGISTER and VIDIOC_DBG_S_REGISTER */ #define V4L2_CHIP_MATCH_BRIDGE 0 /* Match against chip ID on the bridge (0 for the bridge) */ +#define V4L2_CHIP_MATCH_SUBDEV 4 /* Match against subdev index */ + +/* The following four defines are no longer in use */ #define V4L2_CHIP_MATCH_HOST V4L2_CHIP_MATCH_BRIDGE #define V4L2_CHIP_MATCH_I2C_DRIVER 1 /* Match against I2C driver name */ #define V4L2_CHIP_MATCH_I2C_ADDR 2 /* Match against I2C 7-bit address */ #define V4L2_CHIP_MATCH_AC97 3 /* Match against ancillary AC97 chip */ -#define V4L2_CHIP_MATCH_SUBDEV 4 /* Match against subdev index */ struct v4l2_dbg_match { __u32 type; /* Match type */ @@ -1808,13 +1810,6 @@ struct v4l2_dbg_register { __u64 val; } __attribute__ ((packed)); -/* VIDIOC_DBG_G_CHIP_IDENT */ -struct v4l2_dbg_chip_ident { - struct v4l2_dbg_match match; - __u32 ident; /* chip identifier as specified in */ - __u32 revision; /* chip revision, chip specific */ -} __attribute__ ((packed)); - #define V4L2_CHIP_FL_READABLE (1 << 0) #define V4L2_CHIP_FL_WRITABLE (1 << 1) @@ -1915,12 +1910,6 @@ struct v4l2_create_buffers { #define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) #define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) -/* Experimental, meant for debugging, testing and internal use. - Never use this ioctl in applications! - Note: this ioctl is deprecated in favor of VIDIOC_DBG_G_CHIP_INFO and - will go away in the future. */ -#define VIDIOC_DBG_G_CHIP_IDENT _IOWR('V', 81, struct v4l2_dbg_chip_ident) - #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) #define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) -- cgit v1.2.3-71-gd317 From 166fd7d94afdac040b28c473e45241820ca522a2 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:38:02 -0600 Subject: vfio: hugepage support for vfio_iommu_type1 We currently send all mappings to the iommu in PAGE_SIZE chunks, which prevents the iommu from enabling support for larger page sizes. We still need to pin pages, which means we step through them in PAGE_SIZE chunks, but we can batch up contiguous physical memory chunks to allow the iommu the opportunity to use larger pages. The approach here is a bit different that the one currently used for legacy KVM device assignment. Rather than looking at the vma page size and using that as the maximum size to pass to the iommu, we instead simply look at whether the next page is physically contiguous. This means we might ask the iommu to map a 4MB region, while legacy KVM might limit itself to a maximum of 2MB. Splitting our mapping path also allows us to be smarter about locked memory because we can more easily unwind if the user attempts to exceed the limit. Therefore, rather than assuming that a mapping will result in locked memory, we test each page as it is pinned to determine whether it locks RAM vs an mmap'd MMIO region. This should result in better locking granularity and less locked page fudge factors in userspace. The unmap path uses the same algorithm as legacy KVM. We don't want to track the pfn for each mapping ourselves, but we need the pfn in order to unpin pages. We therefore ask the iommu for the iova to physical address translation, ask it to unpin a page, and see how many pages were actually unpinned. iommus supporting large pages will often return something bigger than a page here, which we know will be physically contiguous and we can unpin a batch of pfns. iommus not supporting large mappings won't see an improvement in batching here as they only unmap a page at a time. With this change, we also make a clarification to the API for mapping and unmapping DMA. We can only guarantee unmaps at the same granularity as used for the original mapping. In other words, unmapping a subregion of a previous mapping is not guaranteed and may result in a larger or smaller unmapping than requested. The size field in the unmapping structure is updated to reflect this. Previously this was unmodified on mapping, always returning the the requested unmap size. This is now updated to return the actual unmap size on success, allowing userspace to appropriately track mappings. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 523 ++++++++++++++++++++++++++-------------- include/uapi/linux/vfio.h | 8 +- 2 files changed, 344 insertions(+), 187 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e863b3ddcab..6654a7eb42d3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -60,7 +60,7 @@ struct vfio_dma { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - long npage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -74,8 +74,6 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) - static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, dma_addr_t start, size_t size) { @@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, if (start + size <= dma->iova) node = node->rb_left; - else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + else if (start >= dma->iova + dma->size) node = node->rb_right; else return dma; @@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) parent = *link; dma = rb_entry(parent, struct vfio_dma, node); - if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + if (new->iova + new->size <= dma->iova) link = &(*link)->rb_left; else link = &(*link)->rb_right; @@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - unsigned long pfn; - - pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; - if (pfn) { - iommu_unmap(iommu->domain, iova, PAGE_SIZE); - unlocked += put_pfn(pfn, prot); - } - } - return unlocked; -} - -static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long unlocked; - - unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); - vfio_lock_acct(-unlocked); -} - static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) return ret; } -/* Map DMA region */ -static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, - unsigned long vaddr, long npage, int prot) +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. + */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { - dma_addr_t start = iova; - long i, locked = 0; - int ret; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; - /* Verify that pages are not already mapped */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE) - if (iommu_iova_to_phys(iommu->domain, iova)) - return -EBUSY; + if (!current->mm) + return -ENODEV; - iova = start; + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; - if (iommu->cache) - prot |= IOMMU_CACHE; + if (is_invalid_reserved_pfn(*pfn_base)) + return 1; - /* - * XXX We break mappings into pages and use get_user_pages_fast to - * pin the pages in memory. It's been suggested that mlock might - * provide a more efficient mechanism, but nothing prevents the - * user from munlocking the pages, which could then allow the user - * access to random host memory. We also have no guarantee from the - * IOMMU API that the iommu driver can unmap sub-pages of previous - * mappings. This means we might lose an entire range if a single - * page within it is unmapped. Single page mappings are inefficient, - * but provide the most flexibility for now. - */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) { - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + if (ret) + break; + + if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; } + if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { + put_pfn(pfn, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; + } + } + + vfio_lock_acct(i); + + return i; +} + +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) +{ + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; +} + +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + dma_addr_t iova, size_t *size) +{ + dma_addr_t start = iova, end = iova + *size; + long unlocked = 0; + + while (iova < end) { + size_t unmapped; + phys_addr_t phys; + /* - * Only add actual locked pages to accounting - * XXX We're effectively marking a page locked for every - * IOVA page even though it's possible the user could be - * backing multiple IOVAs with the same vaddr. This over- - * penalizes the user process, but we currently have no - * easy way to do this properly. + * We use the IOMMU to track the physical address. This + * saves us from having a lot more entries in our mapping + * tree. The downside is that we don't track the size + * used to do the mapping. We request unmap of a single + * page, but expect IOMMUs that support large pages to + * unmap a larger chunk. */ - if (!is_invalid_reserved_pfn(pfn)) - locked++; - - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); - if (ret) { - /* Back out mappings on error */ - put_pfn(pfn, prot); - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + phys = iommu_iova_to_phys(iommu->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; } + + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); + if (!unmapped) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; } - vfio_lock_acct(locked); + + vfio_lock_acct(-unlocked); + + *size = iova - start; + return 0; } static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) + size_t *size, struct vfio_dma *dma) { + size_t offset, overlap, tmp; struct vfio_dma *split; - long npage_lo, npage_hi; + int ret; + + /* + * Existing dma region is completely covered, unmap all. This is + * the likely case since userspace tends to map and unmap buffers + * in one shot rather than multiple mappings within a buffer. + */ + if (likely(start <= dma->iova && + start + *size >= dma->iova + dma->size)) { + *size = dma->size; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); + if (ret) + return ret; + + /* + * Did we remove more than we have? Should never happen + * since a vfio_dma is contiguous in iova and vaddr. + */ + WARN_ON(*size != dma->size); - /* Existing dma region is completely covered, unmap all */ - if (start <= dma->iova && - start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); vfio_remove_dma(iommu, dma); kfree(dma); return 0; @@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, /* Overlap low address of existing range */ if (start <= dma->iova) { - size_t overlap; + overlap = start + *size - dma->iova; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); + if (ret) + return ret; - overlap = start + size - dma->iova; - npage_lo = overlap >> PAGE_SHIFT; + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); - dma->iova += overlap; - dma->vaddr += overlap; - dma->npage -= npage_lo; + /* + * Check, we may have removed to whole vfio_dma. If not + * fixup and re-insert. + */ + if (overlap < dma->size) { + dma->iova += overlap; + dma->vaddr += overlap; + dma->size -= overlap; + vfio_insert_dma(iommu, dma); + } + *size = overlap; return 0; } /* Overlap high address of existing range */ - if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - size_t overlap; + if (start + *size >= dma->iova + dma->size) { + offset = start - dma->iova; + overlap = dma->size - offset; - overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; - npage_hi = overlap >> PAGE_SHIFT; + ret = vfio_unmap_unpin(iommu, dma, start, &overlap); + if (ret) + return ret; + + /* + * We may have unmapped the entire vfio_dma if the user is + * trying to unmap a sub-region of what was originally + * mapped. If anything left, we can resize in place since + * iova is unchanged. + */ + if (overlap < dma->size) + dma->size -= overlap; + else + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, start, npage_hi, dma->prot); - dma->npage -= npage_hi; + *size = overlap; return 0; } /* Split existing */ - npage_lo = (start - dma->iova) >> PAGE_SHIFT; - npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; + offset = start - dma->iova; - split = kzalloc(sizeof *split, GFP_KERNEL); - if (!split) - return -ENOMEM; + ret = vfio_unmap_unpin(iommu, dma, start, size); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); + WARN_ON(!*size); + tmp = dma->size; - dma->npage = npage_lo; + /* + * Resize the lower vfio_dma in place, insert new for remaining + * upper segment. + */ + dma->size = offset; + + if (offset + *size < tmp) { + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + + split->size = tmp - offset - *size; + split->iova = dma->iova + offset + *size; + split->vaddr = dma->vaddr + offset + *size; + split->prot = dma->prot; + vfio_insert_dma(iommu, split); + } - split->npage = npage_hi; - split->iova = start + size; - split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; - split->prot = dma->prot; - vfio_insert_dma(iommu, split); return 0; } @@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { uint64_t mask; struct vfio_dma *dma; + size_t unmapped = 0, size; int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); mutex_lock(&iommu->lock); - while (!ret && (dma = vfio_find_dma(iommu, - unmap->iova, unmap->size))) - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + size = unmap->size; + ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); + if (ret) + break; + unmapped += size; + } mutex_unlock(&iommu->lock); + + /* + * We may unmap more than requested, update the unmap struct so + * userspace can know. + */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(iommu->domain, iova, PAGE_SIZE); + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma; - dma_addr_t iova = map->iova; - unsigned long locked, lock_limit, vaddr = map->vaddr; + dma_addr_t end, iova; + unsigned long vaddr = map->vaddr; size_t size = map->size; + long npage; int ret = 0, prot = 0; uint64_t mask; - long npage; + + end = map->iova + map->size; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? */ + if (iommu->cache) + prot |= IOMMU_CACHE; + if (vaddr & mask) return -EINVAL; - if (iova & mask) + if (map->iova & mask) return -EINVAL; - if (size & mask) + if (!map->size || map->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); /* Don't allow IOVA wrap */ - if (iova + size && iova + size < iova) + if (end && end < map->iova) return -EINVAL; /* Don't allow virtual address wrap */ - if (vaddr + size && vaddr + size < vaddr) - return -EINVAL; - - npage = size >> PAGE_SHIFT; - if (!npage) + if (vaddr + map->size && vaddr + map->size < vaddr) return -EINVAL; - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) - return -ENOMEM; - mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { - ret = -EBUSY; - goto out_lock; + if (vfio_find_dma(iommu, map->iova, map->size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; } - /* account for locked pages */ - locked = current->mm->locked_vm + npage; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - goto out_lock; - } + for (iova = map->iova; iova < end; iova += size, vaddr += size) { + struct vfio_dma *dma = NULL; + unsigned long pfn; + long i; + + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, + prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } - ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); - if (ret) - goto out_lock; - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - - /* Check if we abut a region below - nothing below 0 */ - if (iova) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - dma->iova = iova = tmp->iova; - dma->vaddr = vaddr = tmp->vaddr; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + /* Verify pages are not already mapped */ + for (i = 0; i < npage; i++) { + if (iommu_iova_to_phys(iommu->domain, + iova + (i << PAGE_SHIFT))) { + vfio_unpin_pages(pfn, npage, prot, true); + ret = -EBUSY; + break; + } + } + + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(iommu, iova, pfn, npage, prot)) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + } + + size = npage << PAGE_SHIFT; + + /* + * Check if we abut a region below - nothing below 0. + * This is the most likely case when mapping chunks of + * physically contiguous regions within a virtual address + * range. Update the abutting entry in place since iova + * doesn't change. + */ + if (likely(iova)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + tmp->size == vaddr) { + tmp->size += size; + + iova = tmp->iova; + size = tmp->size; + vaddr = tmp->vaddr; + dma = tmp; + } + } + + /* Check if we abut a region above - nothing above ~0 + 1 */ + if (likely(iova + size)) { + struct vfio_dma *tmp; + + tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + if (dma) + dma->size += tmp->size; + else + size += tmp->size; + kfree(tmp); + } } - } - /* Check if we abut a region above - nothing above ~0 + 1 */ - if (iova + size) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr == vaddr + size) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + if (!dma) { + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + iommu_unmap(iommu->domain, iova, size); + vfio_unpin_pages(pfn, npage, prot, true); + ret = -ENOMEM; + break; + } + + dma->size = size; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + vfio_insert_dma(iommu, dma); } } - vfio_insert_dma(iommu, dma); + if (ret) { + struct vfio_dma *tmp; + iova = map->iova; + size = map->size; + while ((tmp = vfio_find_dma(iommu, iova, size))) { + if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { + pr_warn("%s: Error rolling back failed map\n", + __func__); + break; + } + } + } -out_lock: mutex_unlock(&iommu->lock); - if (ret) - kfree(dma); return ret; } @@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data) while ((node = rb_first(&iommu->dma_list))) { struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - vfio_remove_dma(iommu, dma); - kfree(dma); + size_t size = dma->size; + vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); } iommu_domain_free(iommu->domain); @@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; + long ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (unmap.argsz < minsz || unmap.flags) return -EINVAL; - return vfio_dma_do_unmap(iommu, &unmap); + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..513600612995 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) /** - * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap) + * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_dma_unmap) * * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. - * Caller sets argsz. + * Caller sets argsz. The actual unmapped size is returned in the size + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; -- cgit v1.2.3-71-gd317 From 77e2af0312b12dccd5043a7cc9cd49ab6a212996 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jun 2013 19:38:06 +0200 Subject: net: if_arp: add ARPHRD_NETLINK type This small patch adds the definition of ARPHRD_NETLINK which can for example be used by netlink monitoring devices as device type. So that sockaddr_ll can pick it up and based on that choose the correct packet dissector. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index 82c7d1bdadeb..d7fea3496f32 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -93,6 +93,7 @@ #define ARPHRD_PHONET_PIPE 821 /* PhoNet pipe header */ #define ARPHRD_CAIF 822 /* CAIF media type */ #define ARPHRD_IP6GRE 823 /* GRE over IPv6 */ +#define ARPHRD_NETLINK 824 /* Netlink header */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ -- cgit v1.2.3-71-gd317 From 2fc016c5bd8aad2e201cdf71b9fb4573f94775bd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 27 Apr 2013 16:07:49 -0700 Subject: linux/const.h: Add _BITUL() and _BITULL() Add macros for single bit definitions of a specific type. These are similar to the BIT() macro that already exists, but with a few exceptions: 1. The namespace is such that they can be used in uapi definitions. 2. The type is set with the _AC() macro to allow it to be used in assembly. 3. The type is explicitly specified to be UL or ULL. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-nbca8p7cg6jyjoit7klh3o91@git.kernel.org --- include/uapi/linux/const.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index c22c707c455d..c872bfd25e13 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -21,4 +21,7 @@ #define _AT(T,X) ((T)(X)) #endif +#define _BITUL(x) (_AC(1,UL) << (x)) +#define _BITULL(x) (_AC(1,ULL) << (x)) + #endif /* !(_LINUX_CONST_H) */ -- cgit v1.2.3-71-gd317 From eba3b5a78799d21dea05118b294524958f0ab592 Mon Sep 17 00:00:00 2001 From: Alexander Frolkin Date: Wed, 19 Jun 2013 10:54:25 +0100 Subject: ipvs: SH fallback and L4 hashing By default the SH scheduler rejects connections that are hashed onto a realserver of weight 0. This patch adds a flag to make SH choose a different realserver in this case, instead of rejecting the connection. The patch also adds a flag to make SH include the source port (TCP, UDP, SCTP) in the hash as well as the source address. This basically allows for deterministic round-robin load balancing (i.e., where any director in a cluster of directors with identical config will send the same packet the same way). The flags are service flags (IP_VS_SVC_F_SCHED*) so that these options can be set per service. They are set using a new option to ipvsadm. Signed-off-by: Alexander Frolkin Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/uapi/linux/ip_vs.h | 6 +++ net/netfilter/ipvs/ip_vs_sh.c | 100 +++++++++++++++++++++++++++++++++++------- 2 files changed, 91 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index a24537725e80..29458223d044 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -20,6 +20,12 @@ #define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ #define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ #define IP_VS_SVC_F_ONEPACKET 0x0004 /* one-packet scheduling */ +#define IP_VS_SVC_F_SCHED1 0x0008 /* scheduler flag 1 */ +#define IP_VS_SVC_F_SCHED2 0x0010 /* scheduler flag 2 */ +#define IP_VS_SVC_F_SCHED3 0x0020 /* scheduler flag 3 */ + +#define IP_VS_SVC_F_SCHED_SH_FALLBACK IP_VS_SVC_F_SCHED1 /* SH fallback */ +#define IP_VS_SVC_F_SCHED_SH_PORT IP_VS_SVC_F_SCHED2 /* SH use port */ /* * Destination Server Flags diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e0d5d1653566..f16c027df15b 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -48,6 +48,10 @@ #include +#include +#include +#include + /* * IPVS SH bucket @@ -71,10 +75,19 @@ struct ip_vs_sh_state { struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; }; +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + /* * Returns hash value for IPVS SH entry */ -static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +static inline unsigned int +ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, unsigned int offset) { __be32 addr_fold = addr->ip; @@ -83,7 +96,8 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; + return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + IP_VS_SH_TAB_MASK; } @@ -91,12 +105,42 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * -ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr) +ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) { - return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest); + unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0); + struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest); + + return (!dest || is_unavailable(dest)) ? NULL : dest; } +/* As ip_vs_sh_get, but with fallback if selected server is unavailable */ +static inline struct ip_vs_dest * +ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset; + unsigned int hash; + struct ip_vs_dest *dest; + + for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) { + hash = ip_vs_sh_hashkey(svc->af, addr, port, offset); + dest = rcu_dereference(s->buckets[hash].dest); + if (!dest) + break; + if (is_unavailable(dest)) + IP_VS_DBG_BUF(6, "SH: selected unavailable server " + "%s:%d (offset %d)", + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), offset); + else + return dest; + } + + return NULL; +} + /* * Assign all the hash buckets of the specified table with the service. */ @@ -213,13 +257,33 @@ static int ip_vs_sh_dest_changed(struct ip_vs_service *svc, } -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) +/* Helper function to get port number */ +static inline __be16 +ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) { - return dest->flags & IP_VS_DEST_F_OVERLOAD; + __be16 port; + struct tcphdr _tcph, *th; + struct udphdr _udph, *uh; + sctp_sctphdr_t _sctph, *sh; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + port = th->source; + break; + case IPPROTO_UDP: + uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph); + port = uh->source; + break; + case IPPROTO_SCTP: + sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph); + port = sh->source; + break; + default: + port = 0; + } + + return port; } @@ -232,15 +296,21 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, { struct ip_vs_dest *dest; struct ip_vs_sh_state *s; + __be16 port = 0; IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT) + port = ip_vs_sh_get_port(skb, iph); + s = (struct ip_vs_sh_state *) svc->sched_data; - dest = ip_vs_sh_get(svc->af, s, &iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { + + if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK) + dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port); + else + dest = ip_vs_sh_get(svc, s, &iph->saddr, port); + + if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } -- cgit v1.2.3-71-gd317 From d09bbfd2a8408a995419dff0d2ba906013cf4cc9 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sat, 15 Jun 2013 15:32:44 +0200 Subject: input: document gamepad API and add extra keycodes Until today all gamepad input drivers report their data differently. It is nearly impossible to write applications for more than one device in a generic way. Therefore, this patch introduces a uniform gamepad API which will be used for all new drivers. Instead of mapping buttons by their labels, we now map them by position. This allows applications to work with any gamepad regardless of the labels on the buttons. Furthermore, we standardize the ABS_* codes for analog triggers and sticks. For D-Pads the long overdue BTN_DPAD_* codes are introduced. They should be fairly obvious how to use. To avoid confusion, the action buttons now have BTN_EAST/SOUTH/WEST/NORTH aliases. Reported-by: Todd Showalter Acked-by: Dmitry Torokhov Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- include/uapi/linux/input.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 4649ee35b605..8f82ad29fc30 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -507,10 +507,14 @@ struct input_keymap_entry { #define BTN_GAMEPAD 0x130 #define BTN_A 0x130 +#define BTN_SOUTH 0x130 #define BTN_B 0x131 +#define BTN_EAST 0x131 #define BTN_C 0x132 #define BTN_X 0x133 +#define BTN_NORTH 0x133 #define BTN_Y 0x134 +#define BTN_WEST 0x134 #define BTN_Z 0x135 #define BTN_TL 0x136 #define BTN_TR 0x137 @@ -707,6 +711,11 @@ struct input_keymap_entry { #define KEY_ATTENDANT_TOGGLE 0x21d /* Attendant call on or off */ #define KEY_LIGHTS_TOGGLE 0x21e /* Reading light on or off */ +#define BTN_DPAD_UP 0x220 +#define BTN_DPAD_DOWN 0x221 +#define BTN_DPAD_LEFT 0x222 +#define BTN_DPAD_RIGHT 0x223 + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 -- cgit v1.2.3-71-gd317 From 701ba533f6587aaf402a9bb732548503263d2b23 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 27 Jun 2013 11:54:51 +0200 Subject: Input: make gamepad API keycodes more clear Shuffle the defines around so that it is clear that BTN_A, BTN_B, etc are legacy definitions and not an accidental typos that need their own key codes. Suggested-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- include/uapi/linux/input.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 8f82ad29fc30..d584047b072b 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -506,15 +506,15 @@ struct input_keymap_entry { #define BTN_DEAD 0x12f #define BTN_GAMEPAD 0x130 -#define BTN_A 0x130 #define BTN_SOUTH 0x130 -#define BTN_B 0x131 +#define BTN_A BTN_SOUTH #define BTN_EAST 0x131 +#define BTN_B BTN_EAST #define BTN_C 0x132 -#define BTN_X 0x133 #define BTN_NORTH 0x133 -#define BTN_Y 0x134 +#define BTN_X BTN_NORTH #define BTN_WEST 0x134 +#define BTN_Y BTN_WEST #define BTN_Z 0x135 #define BTN_TL 0x136 #define BTN_TR 0x137 -- cgit v1.2.3-71-gd317 From 5a1d3e9f18ceb8d791bdc9a78d8ee10ddc80a6e8 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 21 Jun 2013 02:05:34 -0300 Subject: [media] v4l2-controls.h: fix copy-and-paste error in comment The comment for the FM_RX class was copied from the DV class unchanged. Fixed. Also made the FM_TX comment consistent with the others. Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-controls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 69bd5bb0d5af..e90a88a8708f 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -53,13 +53,13 @@ #define V4L2_CTRL_CLASS_USER 0x00980000 /* Old-style 'user' controls */ #define V4L2_CTRL_CLASS_MPEG 0x00990000 /* MPEG-compression controls */ #define V4L2_CTRL_CLASS_CAMERA 0x009a0000 /* Camera class controls */ -#define V4L2_CTRL_CLASS_FM_TX 0x009b0000 /* FM Modulator control class */ +#define V4L2_CTRL_CLASS_FM_TX 0x009b0000 /* FM Modulator controls */ #define V4L2_CTRL_CLASS_FLASH 0x009c0000 /* Camera flash controls */ #define V4L2_CTRL_CLASS_JPEG 0x009d0000 /* JPEG-compression controls */ #define V4L2_CTRL_CLASS_IMAGE_SOURCE 0x009e0000 /* Image source controls */ #define V4L2_CTRL_CLASS_IMAGE_PROC 0x009f0000 /* Image processing controls */ #define V4L2_CTRL_CLASS_DV 0x00a00000 /* Digital Video controls */ -#define V4L2_CTRL_CLASS_FM_RX 0x00a10000 /* Digital Video controls */ +#define V4L2_CTRL_CLASS_FM_RX 0x00a10000 /* FM Receiver controls */ /* User-class control IDs */ -- cgit v1.2.3-71-gd317 From 496e4ae7dc944faa1721bfda7e9d834d5611a874 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 29 Jun 2013 14:15:47 +0200 Subject: netfilter: nf_queue: add NFQA_SKB_CSUM_NOTVERIFIED info flag The common case is that TCP/IP checksums have already been verified, e.g. by hardware (rx checksum offload), or conntrack. Userspace can use this flag to determine when the checksum has not been validated yet. If the flag is set, this doesn't necessarily mean that the packet has an invalid checksum, e.g. if NIC doesn't support rx checksum. Userspace that sucessfully enabled NFQA_CFG_F_GSO queue feature flag can infer that IP/TCP checksum has already been validated if either the SKB_INFO attribute is not present or the NFQA_SKB_CSUM_NOTVERIFIED flag is unset. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 2 ++ net/netfilter/nfnetlink_queue_core.c | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index a2308ae5a73d..3a9b92147339 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -105,5 +105,7 @@ enum nfqnl_attr_config { #define NFQA_SKB_CSUMNOTREADY (1 << 0) /* packet is GSO (i.e., exceeds device mtu) */ #define NFQA_SKB_GSO (1 << 1) +/* csum not validated (incoming device doesn't support hw checksum, etc.) */ +#define NFQA_SKB_CSUM_NOTVERIFIED (1 << 2) #endif /* _NFNETLINK_QUEUE_H */ diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 299a48ae5dc9..971ea145ab3e 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -280,12 +280,17 @@ nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen) skb_shinfo(to)->nr_frags = j; } -static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet) +static int +nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, + bool csum_verify) { __u32 flags = 0; if (packet->ip_summed == CHECKSUM_PARTIAL) flags = NFQA_SKB_CSUMNOTREADY; + else if (csum_verify) + flags = NFQA_SKB_CSUM_NOTVERIFIED; + if (skb_is_gso(packet)) flags |= NFQA_SKB_GSO; @@ -310,6 +315,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info uninitialized_var(ctinfo); + bool csum_verify; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) @@ -327,6 +333,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (entskb->tstamp.tv64) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + if (entry->hook <= NF_INET_FORWARD || + (entry->hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) + csum_verify = !skb_csum_unnecessary(entskb); + else + csum_verify = false; + outdev = entry->outdev; switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { @@ -476,7 +488,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; - if (nfqnl_put_packet_info(skb, entskb)) + if (nfqnl_put_packet_info(skb, entskb, csum_verify)) goto nla_put_failure; if (data_len) { -- cgit v1.2.3-71-gd317 From d753601a2a5ff67bbc96ce6512a16305bfd293e7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 18 Jun 2013 16:55:41 +0000 Subject: MIPS: BCM63XX: recognize Cable Modem firmware format Add the firmware header format which is used by Broadcom Cable Modem SoCs such as the BCM3368 SoC. We export the bcm_hcs firmware format structure because it is used by user-land tools to create firmware images for these SoCs and will later be used by a corresponding MTD parser. Signed-off-by: Florian Fainelli Cc: linux-mips@linux-mips.org Cc: cernekee@gmail.com Cc: jogo@openwrt.org Patchwork: https://patchwork.linux-mips.org/patch/5496/ Signed-off-by: Ralf Baechle --- arch/mips/bcm63xx/boards/board_bcm963xx.c | 14 ++++++++++++-- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bcm933xx_hcs.h | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 include/uapi/linux/bcm933xx_hcs.h (limited to 'include/uapi/linux') diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c index a9505c4867e8..46eabb9aff51 100644 --- a/arch/mips/bcm63xx/boards/board_bcm963xx.c +++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c @@ -28,8 +28,12 @@ #include #include +#include + #define PFX "board_bcm963xx: " +#define HCS_OFFSET_128K 0x20000 + static struct board_info board; /* @@ -722,8 +726,9 @@ void __init board_prom_init(void) unsigned int i; u8 *boot_addr, *cfe; char cfe_version[32]; - char *board_name; + char *board_name = NULL; u32 val; + struct bcm_hcs *hcs; /* read base address of boot chip select (0) * 6328/6362 do not have MPI but boot from a fixed address @@ -747,7 +752,12 @@ void __init board_prom_init(void) bcm63xx_nvram_init(boot_addr + BCM963XX_NVRAM_OFFSET); - board_name = bcm63xx_nvram_get_name(); + if (BCMCPU_IS_3368()) { + hcs = (struct bcm_hcs *)boot_addr; + board_name = hcs->filename; + } else { + board_name = bcm63xx_nvram_get_name(); + } /* find board by name */ for (i = 0; i < ARRAY_SIZE(bcm963xx_boards); i++) { if (strncmp(board_name, bcm963xx_boards[i]->name, 16)) diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ab5d4992e568..ba1c11ab0d11 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -62,6 +62,7 @@ header-y += auxvec.h header-y += ax25.h header-y += b1lli.h header-y += baycom.h +header-y += bcm933xx_hcs.h header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h diff --git a/include/uapi/linux/bcm933xx_hcs.h b/include/uapi/linux/bcm933xx_hcs.h new file mode 100644 index 000000000000..d22821831549 --- /dev/null +++ b/include/uapi/linux/bcm933xx_hcs.h @@ -0,0 +1,24 @@ +/* + * Broadcom Cable Modem firmware format + */ + +#ifndef __BCM933XX_HCS_H +#define __BCM933XX_HCS_H + +#include + +struct bcm_hcs { + __u16 magic; + __u16 control; + __u16 rev_maj; + __u16 rev_min; + __u32 build_date; + __u32 filelen; + __u32 ldaddress; + char filename[64]; + __u16 hcs; + __u16 her_znaet_chto; + __u32 crc; +}; + +#endif /* __BCM933XX_HCS */ -- cgit v1.2.3-71-gd317 From b1a5a34bd0b8767ea689e68f8ea513e9710b671e Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 29 Jun 2013 00:15:51 +0800 Subject: net: Swap ver and type in pppoe_hdr Ver and type in pppoe_hdr should be swapped as defined by RFC2516 section-4. Signed-off-by: David S. Miller --- include/uapi/linux/if_pppox.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index 0b46fd57c8f6..e36a4aecd311 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -135,11 +135,11 @@ struct pppoe_tag { struct pppoe_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 ver : 4; __u8 type : 4; + __u8 ver : 4; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 type : 4; __u8 ver : 4; + __u8 type : 4; #else #error "Please fix " #endif -- cgit v1.2.3-71-gd317 From 29000caecbe87b6b66f144f72111f0d02fbbf0c1 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 3 Jul 2013 15:08:12 -0700 Subject: ptrace: add ability to get/set signal-blocked mask crtools uses a parasite code for dumping processes. The parasite code is injected into a process with help PTRACE_SEIZE. Currently crtools blocks signals from a parasite code. If a process has pending signals, crtools wait while a process handles these signals. This method is not suitable for stopped tasks. A stopped task can have a few pending signals, when we will try to execute a parasite code, we will need to drop SIGSTOP, but all other signals must remain pending, because a state of processes must not be changed during checkpointing. This patch adds two ptrace commands to set/get signal-blocked mask. I think gdb can use this commands too. [akpm@linux-foundation.org: be consistent with brace layout] Signed-off-by: Andrey Vagin Reviewed-by: Oleg Nesterov Cc: Roland McGrath Cc: Michael Kerrisk Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/ptrace.h | 3 +++ kernel/ptrace.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 52ebcc89f306..cf1019e15f5b 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -61,6 +61,9 @@ struct ptrace_peeksiginfo_args { __s32 nr; /* how may siginfos to take */ }; +#define PTRACE_GETSIGMASK 0x420a +#define PTRACE_SETSIGMASK 0x420b + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 335a7ae697f5..ba5e6cea181a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -844,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_GETSIGMASK: + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + ret = -EFAULT; + else + ret = 0; + + break; + + case PTRACE_SETSIGMASK: { + sigset_t new_set; + + if (addr != sizeof(sigset_t)) { + ret = -EINVAL; + break; + } + + if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { + ret = -EFAULT; + break; + } + + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + /* + * Every thread does recalc_sigpending() after resume, so + * retarget_shared_pending() and recalc_sigpending() are not + * called here. + */ + spin_lock_irq(&child->sighand->siglock); + child->blocked = new_set; + spin_unlock_irq(&child->sighand->siglock); + + ret = 0; + break; + } + case PTRACE_INTERRUPT: /* * Stop tracee without any side-effect on signal or job @@ -948,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request, #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: - case PTRACE_SETREGSET: - { + case PTRACE_SETREGSET: { struct iovec kiov; struct iovec __user *uiov = datavp; -- cgit v1.2.3-71-gd317 From 62525a00b87cc967bce9779d63fcc84fb9199130 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 8 Jul 2013 10:33:37 +0930 Subject: virtio: VIRTIO_F_ANY_LAYOUT feature Also known as the "no really, I read the spec" bit. Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_config.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index b7cda390fd00..3ce768c6910d 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -51,4 +51,7 @@ * suppressed them? */ #define VIRTIO_F_NOTIFY_ON_EMPTY 24 +/* Can the device handle any descriptor layout? */ +#define VIRTIO_F_ANY_LAYOUT 27 + #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3-71-gd317 From 6e5b93ee55d401f1619092fb675b57c28c9ed7ec Mon Sep 17 00:00:00 2001 From: Mike Lockwood Date: Mon, 8 Jul 2013 16:00:46 -0700 Subject: fatfs: add FAT_IOCTL_GET_VOLUME_ID This patch, originally from Android kernel, adds vfat ioctl command FAT_IOCTL_GET_VOLUME_ID, with this command we can get the vfat volume ID using following code: ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &volume_ID) This patch is a modified version of the patch by Mike Lockwood, with changes from Dmitry Pervushin, who noticed the original patch makes some volume IDs abiguous with error returns: for example, if volume id is 0xFFFFFDAD, that matches -ENOIOCTLCMD, we get "FFFFFFFF" from the user space. So add a parameter to ioctl to get the correct volume ID. Android uses vfat volume ID to identify different sd card, when a new sd card is inserted to device, android can scan the media on it and pop up new contents. Signed-off-by: Bintian Wang Cc: dmitry pervushin Cc: Mike Lockwood Cc: Colin Cross Acked-by: OGAWA Hirofumi Cc: John Stultz Cc: Sean McNeil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fat.h | 1 + fs/fat/file.c | 8 ++++++++ fs/fat/inode.c | 12 ++++++++++++ include/uapi/linux/msdos_fs.h | 10 ++++++++++ 4 files changed, 31 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 21664fcf3616..4241e6f39e86 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -86,6 +86,7 @@ struct msdos_sb_info { const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ int fatent_shift; struct fatent_operations *fatent_ops; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e50ddb..9b104f543056 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -114,6 +114,12 @@ out: return err; } +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5d4513cb1b3c..11b51bb55b42 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, brelse(fsinfo_bh); } + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = (((u32)b->fat32.vol_id[0]) | + ((u32)b->fat32.vol_id[1] << 8) | + ((u32)b->fat32.vol_id[2] << 16) | + ((u32)b->fat32.vol_id[3] << 24)); + else /* fat 16 or 12 */ + sbi->vol_id = (((u32)b->fat16.vol_id[0]) | + ((u32)b->fat16.vol_id[1] << 8) | + ((u32)b->fat16.vol_id[2] << 16) | + ((u32)b->fat16.vol_id[3] << 24)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index f055e58b3147..e284ff919d6e 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -104,6 +104,8 @@ struct __fat_dirent { /* has used 0x72 ('r') in collision, so skip a few */ #define FAT_IOCTL_GET_ATTRIBUTES _IOR('r', 0x10, __u32) #define FAT_IOCTL_SET_ATTRIBUTES _IOW('r', 0x11, __u32) +/*Android kernel has used 0x12, so we use 0x13*/ +#define FAT_IOCTL_GET_VOLUME_ID _IOR('r', 0x13, __u32) struct fat_boot_sector { __u8 ignored[3]; /* Boot strap short or near jump */ @@ -128,6 +130,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. */ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat16; @@ -147,6 +153,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. */ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat32; }; -- cgit v1.2.3-71-gd317 From 83d5e5b0af907d46d241a86d9e44003b3f0accbd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 10 Jul 2013 23:41:18 +0100 Subject: dm: optimize use SRCU and RCU This patch removes "io_lock" and "map_lock" in struct mapped_device and "holders" in struct dm_table and replaces these mechanisms with sleepable-rcu. Previously, the code would call "dm_get_live_table" and "dm_table_put" to get and release table. Now, the code is changed to call "dm_get_live_table" and "dm_put_live_table". dm_get_live_table locks sleepable-rcu and dm_put_live_table unlocks it. dm_get_live_table_fast/dm_put_live_table_fast can be used instead of dm_get_live_table/dm_put_live_table. These *_fast functions use non-sleepable RCU, so the caller must not block between them. If the code changes active or inactive dm table, it must call dm_sync_table before destroying the old table. Signed-off-by: Mikulas Patocka Signed-off-by: Jun'ichi Nomura Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 122 +++++++++++++++++++++----------- drivers/md/dm-table.c | 35 --------- drivers/md/dm.c | 160 ++++++++++++++++++++++++------------------ include/linux/device-mapper.h | 6 +- include/uapi/linux/dm-ioctl.h | 4 +- 5 files changed, 180 insertions(+), 147 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 12f04868e899..f1b758675ec7 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -36,6 +36,14 @@ struct hash_cell { struct dm_table *new_map; }; +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; + struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; @@ -242,9 +250,10 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi return -EBUSY; } -static void __hash_remove(struct hash_cell *hc) +static struct dm_table *__hash_remove(struct hash_cell *hc) { struct dm_table *table; + int srcu_idx; /* remove from the dev hash */ list_del(&hc->uuid_list); @@ -253,16 +262,18 @@ static void __hash_remove(struct hash_cell *hc) dm_set_mdptr(hc->md, NULL); mutex_unlock(&dm_hash_cells_mutex); - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); + table = NULL; if (hc->new_map) - dm_table_destroy(hc->new_map); + table = hc->new_map; dm_put(hc->md); free_cell(hc); + + return table; } static void dm_hash_remove_all(int keep_open_devices) @@ -270,6 +281,7 @@ static void dm_hash_remove_all(int keep_open_devices) int i, dev_skipped; struct hash_cell *hc; struct mapped_device *md; + struct dm_table *t; retry: dev_skipped = 0; @@ -287,10 +299,14 @@ retry: continue; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } dm_put(md); if (likely(keep_open_devices)) dm_destroy(md); @@ -356,6 +372,7 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, struct dm_table *table; struct mapped_device *md; unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + int srcu_idx; /* * duplicate new. @@ -418,11 +435,10 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Wake up any dm event waiters. */ - table = dm_get_live_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -620,11 +636,14 @@ static int check_name(const char *name) * _hash_lock without first calling dm_table_put, because dm_table_destroy * waits for this dm_table_put and could be called under this lock. */ -static struct dm_table *dm_get_inactive_table(struct mapped_device *md) +static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) { struct hash_cell *hc; struct dm_table *table = NULL; + /* increment rcu count, we don't care about the table pointer */ + dm_get_live_table(md, srcu_idx); + down_read(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { @@ -633,8 +652,6 @@ static struct dm_table *dm_get_inactive_table(struct mapped_device *md) } table = hc->new_map; - if (table) - dm_table_get(table); out: up_read(&_hash_lock); @@ -643,10 +660,11 @@ out: } static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, - struct dm_ioctl *param) + struct dm_ioctl *param, + int *srcu_idx) { return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? - dm_get_inactive_table(md) : dm_get_live_table(md); + dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx); } /* @@ -657,6 +675,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; + int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); @@ -676,26 +695,27 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) param->event_nr = dm_get_event_nr(md); param->target_count = 0; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if (table) { if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { if (get_disk_ro(disk)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); } - dm_table_put(table); param->flags |= DM_ACTIVE_PRESENT_FLAG; } + dm_put_live_table(md, srcu_idx); if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { - table = dm_get_inactive_table(md); + int srcu_idx; + table = dm_get_inactive_table(md, &srcu_idx); if (table) { if (!(dm_table_get_mode(table) & FMODE_WRITE)) param->flags |= DM_READONLY_FLAG; param->target_count = dm_table_get_num_targets(table); - dm_table_put(table); } + dm_put_live_table(md, srcu_idx); } } @@ -796,6 +816,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) struct hash_cell *hc; struct mapped_device *md; int r; + struct dm_table *t; down_write(&_hash_lock); hc = __find_device_hash_cell(param); @@ -819,9 +840,14 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) return r; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -986,6 +1012,7 @@ static int do_resume(struct dm_ioctl *param) old_map = dm_swap_table(md, new_map); if (IS_ERR(old_map)) { + dm_sync_table(md); dm_table_destroy(new_map); dm_put(md); return PTR_ERR(old_map); @@ -1003,6 +1030,10 @@ static int do_resume(struct dm_ioctl *param) param->flags |= DM_UEVENT_GENERATED_FLAG; } + /* + * Since dm_swap_table synchronizes RCU, nobody should be in + * read-side critical section already. + */ if (old_map) dm_table_destroy(old_map); @@ -1125,6 +1156,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) int r = 0; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1145,11 +1177,10 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) */ __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); out: dm_put(md); @@ -1221,7 +1252,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; - struct dm_table *t; + struct dm_table *t, *old_map = NULL; struct mapped_device *md; struct target_type *immutable_target_type; @@ -1277,14 +1308,14 @@ static int table_load(struct dm_ioctl *param, size_t param_size) hc = dm_get_mdptr(md); if (!hc || hc->md != md) { DMWARN("device has been removed from the dev hash table."); - dm_table_destroy(t); up_write(&_hash_lock); + dm_table_destroy(t); r = -ENXIO; goto out; } if (hc->new_map) - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = t; up_write(&_hash_lock); @@ -1292,6 +1323,11 @@ static int table_load(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); out: + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } + dm_put(md); return r; @@ -1301,6 +1337,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) { struct hash_cell *hc; struct mapped_device *md; + struct dm_table *old_map = NULL; down_write(&_hash_lock); @@ -1312,7 +1349,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) } if (hc->new_map) { - dm_table_destroy(hc->new_map); + old_map = hc->new_map; hc->new_map = NULL; } @@ -1321,6 +1358,10 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } dm_put(md); return 0; @@ -1370,6 +1411,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1377,11 +1419,10 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_deps(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1396,6 +1437,7 @@ static int table_status(struct dm_ioctl *param, size_t param_size) { struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -1403,11 +1445,10 @@ static int table_status(struct dm_ioctl *param, size_t param_size) __dev_status(md, param); - table = dm_get_live_or_inactive_table(md, param); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); dm_put(md); @@ -1443,6 +1484,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) struct dm_target_msg *tmsg = (void *) param + param->data_start; size_t maxlen; char *result = get_result_buffer(param, param_size, &maxlen); + int srcu_idx; md = find_device(param); if (!md) @@ -1470,9 +1512,9 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (r <= 1) goto out_argv; - table = dm_get_live_table(md); + table = dm_get_live_table(md, &srcu_idx); if (!table) - goto out_argv; + goto out_table; if (dm_deleting_md(md)) { r = -ENXIO; @@ -1491,7 +1533,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) } out_table: - dm_table_put(table); + dm_put_live_table(md, srcu_idx); out_argv: kfree(argv); out: diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1ff252ab7d46..f221812b7dbc 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -26,22 +26,8 @@ #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) -/* - * The table has always exactly one reference from either mapped_device->map - * or hash_cell->new_map. This reference is not counted in table->holders. - * A pair of dm_create_table/dm_destroy_table functions is used for table - * creation/destruction. - * - * Temporary references from the other code increase table->holders. A pair - * of dm_table_get/dm_table_put functions is used to manipulate it. - * - * When the table is about to be destroyed, we wait for table->holders to - * drop to zero. - */ - struct dm_table { struct mapped_device *md; - atomic_t holders; unsigned type; /* btree table */ @@ -208,7 +194,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, INIT_LIST_HEAD(&t->devices); INIT_LIST_HEAD(&t->target_callbacks); - atomic_set(&t->holders, 0); if (!num_targets) num_targets = KEYS_PER_NODE; @@ -246,10 +231,6 @@ void dm_table_destroy(struct dm_table *t) if (!t) return; - while (atomic_read(&t->holders)) - msleep(1); - smp_mb(); - /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -274,22 +255,6 @@ void dm_table_destroy(struct dm_table *t) kfree(t); } -void dm_table_get(struct dm_table *t) -{ - atomic_inc(&t->holders); -} -EXPORT_SYMBOL(dm_table_get); - -void dm_table_put(struct dm_table *t) -{ - if (!t) - return; - - smp_mb__before_atomic_dec(); - atomic_dec(&t->holders); -} -EXPORT_SYMBOL(dm_table_put); - /* * Checks to see if we need to extend highs or targets. */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 33f20103d8d5..ecff83f5b53a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -116,13 +116,20 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; + /* * Work processed by per-device workqueue. */ struct mapped_device { - struct rw_semaphore io_lock; + struct srcu_struct io_barrier; struct mutex suspend_lock; - rwlock_t map_lock; atomic_t holders; atomic_t open_count; @@ -156,6 +163,8 @@ struct mapped_device { /* * The current mapping. + * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. */ struct dm_table *map; @@ -386,12 +395,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; + int srcu_idx; struct dm_table *map; struct dm_target *tgt; int r = -ENOTTY; retry: - map = dm_get_live_table(md); + map = dm_get_live_table(md, &srcu_idx); + if (!map || !dm_table_get_size(map)) goto out; @@ -410,7 +421,7 @@ retry: r = tgt->type->ioctl(tgt, cmd, arg); out: - dm_table_put(map); + dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN) { msleep(10); @@ -509,20 +520,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio) /* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call - * dm_table_put() when finished. + * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) { - struct dm_table *t; - unsigned long flags; + *srcu_idx = srcu_read_lock(&md->io_barrier); + + return srcu_dereference(md->map, &md->io_barrier); +} - read_lock_irqsave(&md->map_lock, flags); - t = md->map; - if (t) - dm_table_get(t); - read_unlock_irqrestore(&md->map_lock, flags); +void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +{ + srcu_read_unlock(&md->io_barrier, srcu_idx); +} + +void dm_sync_table(struct mapped_device *md) +{ + synchronize_srcu(&md->io_barrier); + synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. + */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(md->map); +} - return t; +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ + rcu_read_unlock(); } /* @@ -1356,17 +1386,18 @@ static int __split_and_process_non_flush(struct clone_info *ci) /* * Entry point to split a bio into clones and submit them to the targets. */ -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; int error = 0; - ci.map = dm_get_live_table(md); - if (unlikely(!ci.map)) { + if (unlikely(!map)) { bio_io_error(bio); return; } + ci.map = map; ci.md = md; ci.io = alloc_io(md); ci.io->error = 0; @@ -1393,7 +1424,6 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) /* drop the extra reference count */ dec_pending(ci.io, error); - dm_table_put(ci.map); } /*----------------------------------------------------------------- * CRUD END @@ -1404,7 +1434,7 @@ static int dm_merge_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + struct dm_table *map = dm_get_live_table_fast(md); struct dm_target *ti; sector_t max_sectors; int max_size = 0; @@ -1414,7 +1444,7 @@ static int dm_merge_bvec(struct request_queue *q, ti = dm_table_find_target(map, bvm->bi_sector); if (!dm_target_is_valid(ti)) - goto out_table; + goto out; /* * Find maximum amount of I/O that won't need splitting @@ -1443,10 +1473,8 @@ static int dm_merge_bvec(struct request_queue *q, max_size = 0; -out_table: - dm_table_put(map); - out: + dm_put_live_table_fast(md); /* * Always allow an entire first page */ @@ -1465,8 +1493,10 @@ static void _dm_request(struct request_queue *q, struct bio *bio) int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; int cpu; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); cpu = part_stat_lock(); part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); @@ -1475,7 +1505,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio) /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); if (bio_rw(bio) != READA) queue_io(md, bio); @@ -1484,8 +1514,8 @@ static void _dm_request(struct request_queue *q, struct bio *bio) return; } - __split_and_process_bio(md, bio); - up_read(&md->io_lock); + __split_and_process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); return; } @@ -1671,7 +1701,8 @@ static struct request *dm_start_request(struct mapped_device *md, struct request static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); struct dm_target *ti; struct request *rq, *clone; sector_t pos; @@ -1726,7 +1757,7 @@ requeued: delay_and_out: blk_delay_queue(q, HZ / 10); out: - dm_table_put(map); + dm_put_live_table(md, srcu_idx); } int dm_underlying_device_busy(struct request_queue *q) @@ -1739,14 +1770,14 @@ static int dm_lld_busy(struct request_queue *q) { int r; struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + struct dm_table *map = dm_get_live_table_fast(md); if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) r = 1; else r = dm_table_any_busy_target(map); - dm_table_put(map); + dm_put_live_table_fast(md); return r; } @@ -1758,7 +1789,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table(md); + map = dm_get_live_table_fast(md); if (map) { /* * Request-based dm cares about only own queue for @@ -1769,9 +1800,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) bdi_bits; else r = dm_table_any_congested(map, bdi_bits); - - dm_table_put(map); } + dm_put_live_table_fast(md); } return r; @@ -1876,12 +1906,14 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; + r = init_srcu_struct(&md->io_barrier); + if (r < 0) + goto bad_io_barrier; + md->type = DM_TYPE_NONE; - init_rwsem(&md->io_lock); mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); spin_lock_init(&md->deferred_lock); - rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); @@ -1944,6 +1976,8 @@ bad_thread: bad_disk: blk_cleanup_queue(md->queue); bad_queue: + cleanup_srcu_struct(&md->io_barrier); +bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); @@ -1967,6 +2001,7 @@ static void free_dev(struct mapped_device *md) bioset_free(md->bs); blk_integrity_unregister(md->disk); del_gendisk(md->disk); + cleanup_srcu_struct(&md->io_barrier); free_minor(minor); spin_lock(&_minor_lock); @@ -2109,7 +2144,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; - unsigned long flags; int merge_is_optional; size = dm_table_get_size(t); @@ -2138,9 +2172,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - write_lock_irqsave(&md->map_lock, flags); old_map = md->map; - md->map = t; + rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2148,7 +2181,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - write_unlock_irqrestore(&md->map_lock, flags); + dm_sync_table(md); return old_map; } @@ -2159,15 +2192,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, static struct dm_table *__unbind(struct mapped_device *md) { struct dm_table *map = md->map; - unsigned long flags; if (!map) return NULL; dm_table_event_callback(map, NULL, NULL); - write_lock_irqsave(&md->map_lock, flags); - md->map = NULL; - write_unlock_irqrestore(&md->map_lock, flags); + rcu_assign_pointer(md->map, NULL); + dm_sync_table(md); return map; } @@ -2319,11 +2350,12 @@ EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; + int srcu_idx; might_sleep(); spin_lock(&_minor_lock); - map = dm_get_live_table(md); + map = dm_get_live_table(md, &srcu_idx); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); @@ -2333,6 +2365,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait) dm_table_postsuspend_targets(map); } + /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ + dm_put_live_table(md, srcu_idx); + /* * Rare, but there may be I/O requests still going to complete, * for example. Wait for all references to disappear. @@ -2347,7 +2382,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) dm_device_name(md), atomic_read(&md->holders)); dm_sysfs_exit(md); - dm_table_put(map); dm_table_destroy(__unbind(md)); free_dev(md); } @@ -2404,8 +2438,10 @@ static void dm_wq_work(struct work_struct *work) struct mapped_device *md = container_of(work, struct mapped_device, work); struct bio *c; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); @@ -2415,17 +2451,13 @@ static void dm_wq_work(struct work_struct *work) if (!c) break; - up_read(&md->io_lock); - if (dm_request_based(md)) generic_make_request(c); else - __split_and_process_bio(md, c); - - down_read(&md->io_lock); + __split_and_process_bio(md, map, c); } - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); } static void dm_queue_flush(struct mapped_device *md) @@ -2457,10 +2489,10 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) * reappear. */ if (dm_table_has_no_data_devices(table)) { - live_map = dm_get_live_table(md); + live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; - dm_table_put(live_map); + dm_put_live_table_fast(md); } if (!live_map) { @@ -2540,7 +2572,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) goto out_unlock; } - map = dm_get_live_table(md); + map = md->map; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2561,7 +2593,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (!noflush && do_lockfs) { r = lock_fs(md); if (r) - goto out; + goto out_unlock; } /* @@ -2576,9 +2608,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call * flush_workqueue(md->wq). */ - down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - up_write(&md->io_lock); + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2596,10 +2627,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); - down_write(&md->io_lock); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - up_write(&md->io_lock); + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2609,7 +2639,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); - goto out; /* pushback list is already flushed, so skip flush */ + goto out_unlock; /* pushback list is already flushed, so skip flush */ } /* @@ -2622,9 +2652,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) dm_table_postsuspend_targets(map); -out: - dm_table_put(map); - out_unlock: mutex_unlock(&md->suspend_lock); return r; @@ -2639,7 +2666,7 @@ int dm_resume(struct mapped_device *md) if (!dm_suspended_md(md)) goto out; - map = dm_get_live_table(md); + map = md->map; if (!map || !dm_table_get_size(map)) goto out; @@ -2663,7 +2690,6 @@ int dm_resume(struct mapped_device *md) r = 0; out: - dm_table_put(map); mutex_unlock(&md->suspend_lock); return r; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3cd32478f2fd..e151d4c9298d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -446,9 +446,9 @@ int __must_check dm_set_target_max_io_len(struct dm_target *ti, sector_t len); /* * Table reference counting. */ -struct dm_table *dm_get_live_table(struct mapped_device *md); -void dm_table_get(struct dm_table *t); -void dm_table_put(struct dm_table *t); +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx); +void dm_put_live_table(struct mapped_device *md, int srcu_idx); +void dm_sync_table(struct mapped_device *md); /* * Queries diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 7e75b6fd8d45..afd0cbd52edb 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 24 +#define DM_VERSION_MINOR 25 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2013-01-15)" +#define DM_VERSION_EXTRA "-ioctl (2013-06-26)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-71-gd317 From 50ac6607845755e594c8a39b9c6a00d1c9b48ea4 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Tue, 25 Jun 2013 19:03:56 -0700 Subject: cfg80211/nl80211: rename packet pattern related structures and enums Currently packet patterns and it's enum/structures are used only for WoWLAN feature. As we intend to reuse them for new feature packet coalesce, they are renamed in this patch. Older names are kept for backward compatibility purpose. Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath9k/main.c | 2 +- drivers/net/wireless/mwifiex/cfg80211.c | 3 +-- drivers/net/wireless/ti/wlcore/main.c | 10 ++++---- include/net/cfg80211.h | 6 ++--- include/uapi/linux/nl80211.h | 45 ++++++++++++++++++++------------- net/wireless/nl80211.c | 34 ++++++++++++------------- 6 files changed, 53 insertions(+), 47 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 1737a3e33685..1adb803a9d86 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2094,7 +2094,7 @@ static void ath9k_wow_add_pattern(struct ath_softc *sc, { struct ath_hw *ah = sc->sc_ah; struct ath9k_wow_pattern *wow_pattern = NULL; - struct cfg80211_wowlan_trig_pkt_pattern *patterns = wowlan->patterns; + struct cfg80211_pkt_pattern *patterns = wowlan->patterns; int mask_len; s8 i = 0; diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index ef5fa890a286..0bd631fd556c 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -2298,8 +2298,7 @@ EXPORT_SYMBOL_GPL(mwifiex_del_virtual_intf); #ifdef CONFIG_PM static bool -mwifiex_is_pattern_supported(struct cfg80211_wowlan_trig_pkt_pattern *pat, - s8 *byte_seq) +mwifiex_is_pattern_supported(struct cfg80211_pkt_pattern *pat, s8 *byte_seq) { int j, k, valid_byte_cnt = 0; bool dont_care_byte = false; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index b8db55c868c7..d1b19c38a907 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -1315,7 +1315,7 @@ static struct sk_buff *wl12xx_alloc_dummy_packet(struct wl1271 *wl) #ifdef CONFIG_PM static int -wl1271_validate_wowlan_pattern(struct cfg80211_wowlan_trig_pkt_pattern *p) +wl1271_validate_wowlan_pattern(struct cfg80211_pkt_pattern *p) { int num_fields = 0, in_field = 0, fields_size = 0; int i, pattern_len = 0; @@ -1458,9 +1458,9 @@ void wl1271_rx_filter_flatten_fields(struct wl12xx_rx_filter *filter, * Allocates an RX filter returned through f * which needs to be freed using rx_filter_free() */ -static int wl1271_convert_wowlan_pattern_to_rx_filter( - struct cfg80211_wowlan_trig_pkt_pattern *p, - struct wl12xx_rx_filter **f) +static int +wl1271_convert_wowlan_pattern_to_rx_filter(struct cfg80211_pkt_pattern *p, + struct wl12xx_rx_filter **f) { int i, j, ret = 0; struct wl12xx_rx_filter *filter; @@ -1562,7 +1562,7 @@ static int wl1271_configure_wowlan(struct wl1271 *wl, /* Translate WoWLAN patterns into filters */ for (i = 0; i < wow->n_patterns; i++) { - struct cfg80211_wowlan_trig_pkt_pattern *p; + struct cfg80211_pkt_pattern *p; struct wl12xx_rx_filter *filter = NULL; p = &wow->patterns[i]; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b0730aeb892..207b079a8fa5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1698,7 +1698,7 @@ struct cfg80211_pmksa { }; /** - * struct cfg80211_wowlan_trig_pkt_pattern - packet pattern + * struct cfg80211_pkt_pattern - packet pattern * @mask: bitmask where to match pattern and where to ignore bytes, * one bit per byte, in same format as nl80211 * @pattern: bytes to match where bitmask is 1 @@ -1708,7 +1708,7 @@ struct cfg80211_pmksa { * Internal note: @mask and @pattern are allocated in one chunk of * memory, free @mask only! */ -struct cfg80211_wowlan_trig_pkt_pattern { +struct cfg80211_pkt_pattern { u8 *mask, *pattern; int pattern_len; int pkt_offset; @@ -1770,7 +1770,7 @@ struct cfg80211_wowlan { bool any, disconnect, magic_pkt, gtk_rekey_failure, eap_identity_req, four_way_handshake, rfkill_release; - struct cfg80211_wowlan_trig_pkt_pattern *patterns; + struct cfg80211_pkt_pattern *patterns; struct cfg80211_wowlan_tcp *tcp; int n_patterns; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 861e5eba3953..de0ce809068a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3060,11 +3060,11 @@ enum nl80211_tx_power_setting { }; /** - * enum nl80211_wowlan_packet_pattern_attr - WoWLAN packet pattern attribute - * @__NL80211_WOWLAN_PKTPAT_INVALID: invalid number for nested attribute - * @NL80211_WOWLAN_PKTPAT_PATTERN: the pattern, values where the mask has + * enum nl80211_packet_pattern_attr - packet pattern attribute + * @__NL80211_PKTPAT_INVALID: invalid number for nested attribute + * @NL80211_PKTPAT_PATTERN: the pattern, values where the mask has * a zero bit are ignored - * @NL80211_WOWLAN_PKTPAT_MASK: pattern mask, must be long enough to have + * @NL80211_PKTPAT_MASK: pattern mask, must be long enough to have * a bit for each byte in the pattern. The lowest-order bit corresponds * to the first byte of the pattern, but the bytes of the pattern are * in a little-endian-like format, i.e. the 9th byte of the pattern @@ -3075,23 +3075,23 @@ enum nl80211_tx_power_setting { * Note that the pattern matching is done as though frames were not * 802.11 frames but 802.3 frames, i.e. the frame is fully unpacked * first (including SNAP header unpacking) and then matched. - * @NL80211_WOWLAN_PKTPAT_OFFSET: packet offset, pattern is matched after + * @NL80211_PKTPAT_OFFSET: packet offset, pattern is matched after * these fixed number of bytes of received packet - * @NUM_NL80211_WOWLAN_PKTPAT: number of attributes - * @MAX_NL80211_WOWLAN_PKTPAT: max attribute number + * @NUM_NL80211_PKTPAT: number of attributes + * @MAX_NL80211_PKTPAT: max attribute number */ -enum nl80211_wowlan_packet_pattern_attr { - __NL80211_WOWLAN_PKTPAT_INVALID, - NL80211_WOWLAN_PKTPAT_MASK, - NL80211_WOWLAN_PKTPAT_PATTERN, - NL80211_WOWLAN_PKTPAT_OFFSET, +enum nl80211_packet_pattern_attr { + __NL80211_PKTPAT_INVALID, + NL80211_PKTPAT_MASK, + NL80211_PKTPAT_PATTERN, + NL80211_PKTPAT_OFFSET, - NUM_NL80211_WOWLAN_PKTPAT, - MAX_NL80211_WOWLAN_PKTPAT = NUM_NL80211_WOWLAN_PKTPAT - 1, + NUM_NL80211_PKTPAT, + MAX_NL80211_PKTPAT = NUM_NL80211_PKTPAT - 1, }; /** - * struct nl80211_wowlan_pattern_support - pattern support information + * struct nl80211_pattern_support - packet pattern support information * @max_patterns: maximum number of patterns supported * @min_pattern_len: minimum length of each pattern * @max_pattern_len: maximum length of each pattern @@ -3101,13 +3101,22 @@ enum nl80211_wowlan_packet_pattern_attr { * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the * capability information given by the kernel to userspace. */ -struct nl80211_wowlan_pattern_support { +struct nl80211_pattern_support { __u32 max_patterns; __u32 min_pattern_len; __u32 max_pattern_len; __u32 max_pkt_offset; } __attribute__((packed)); +/* only for backward compatibility */ +#define __NL80211_WOWLAN_PKTPAT_INVALID __NL80211_PKTPAT_INVALID +#define NL80211_WOWLAN_PKTPAT_MASK NL80211_PKTPAT_MASK +#define NL80211_WOWLAN_PKTPAT_PATTERN NL80211_PKTPAT_PATTERN +#define NL80211_WOWLAN_PKTPAT_OFFSET NL80211_PKTPAT_OFFSET +#define NUM_NL80211_WOWLAN_PKTPAT NUM_NL80211_PKTPAT +#define MAX_NL80211_WOWLAN_PKTPAT MAX_NL80211_PKTPAT +#define nl80211_wowlan_pattern_support nl80211_pattern_support + /** * enum nl80211_wowlan_triggers - WoWLAN trigger definitions * @__NL80211_WOWLAN_TRIG_INVALID: invalid number for nested attributes @@ -3127,7 +3136,7 @@ struct nl80211_wowlan_pattern_support { * pattern matching is done after the packet is converted to the MSDU. * * In %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED, it is a binary attribute - * carrying a &struct nl80211_wowlan_pattern_support. + * carrying a &struct nl80211_pattern_support. * * When reporting wakeup. it is a u32 attribute containing the 0-based * index of the pattern that caused the wakeup, in the patterns passed @@ -3284,7 +3293,7 @@ struct nl80211_wowlan_tcp_data_token_feature { * @NL80211_WOWLAN_TCP_WAKE_PAYLOAD: wake packet payload, for advertising a * u32 attribute holding the maximum length * @NL80211_WOWLAN_TCP_WAKE_MASK: Wake packet payload mask, not used for - * feature advertising. The mask works like @NL80211_WOWLAN_PKTPAT_MASK + * feature advertising. The mask works like @NL80211_PKTPAT_MASK * but on the TCP payload only. * @NUM_NL80211_WOWLAN_TCP: number of TCP attributes * @MAX_NL80211_WOWLAN_TCP: highest attribute number diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1cc47aca7f05..a044762f5ea3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -974,7 +974,7 @@ static int nl80211_send_wowlan(struct sk_buff *msg, return -ENOBUFS; if (dev->wiphy.wowlan->n_patterns) { - struct nl80211_wowlan_pattern_support pat = { + struct nl80211_pattern_support pat = { .max_patterns = dev->wiphy.wowlan->n_patterns, .min_pattern_len = dev->wiphy.wowlan->pattern_min_len, .max_pattern_len = dev->wiphy.wowlan->pattern_max_len, @@ -7591,12 +7591,11 @@ static int nl80211_send_wowlan_patterns(struct sk_buff *msg, if (!nl_pat) return -ENOBUFS; pat_len = wowlan->patterns[i].pattern_len; - if (nla_put(msg, NL80211_WOWLAN_PKTPAT_MASK, - DIV_ROUND_UP(pat_len, 8), + if (nla_put(msg, NL80211_PKTPAT_MASK, DIV_ROUND_UP(pat_len, 8), wowlan->patterns[i].mask) || - nla_put(msg, NL80211_WOWLAN_PKTPAT_PATTERN, - pat_len, wowlan->patterns[i].pattern) || - nla_put_u32(msg, NL80211_WOWLAN_PKTPAT_OFFSET, + nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len, + wowlan->patterns[i].pattern) || + nla_put_u32(msg, NL80211_PKTPAT_OFFSET, wowlan->patterns[i].pkt_offset)) return -ENOBUFS; nla_nest_end(msg, nl_pat); @@ -7937,7 +7936,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) struct nlattr *pat; int n_patterns = 0; int rem, pat_len, mask_len, pkt_offset; - struct nlattr *pat_tb[NUM_NL80211_WOWLAN_PKTPAT]; + struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) @@ -7956,26 +7955,25 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN], rem) { - nla_parse(pat_tb, MAX_NL80211_WOWLAN_PKTPAT, - nla_data(pat), nla_len(pat), NULL); + nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat), + nla_len(pat), NULL); err = -EINVAL; - if (!pat_tb[NL80211_WOWLAN_PKTPAT_MASK] || - !pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]) + if (!pat_tb[NL80211_PKTPAT_MASK] || + !pat_tb[NL80211_PKTPAT_PATTERN]) goto error; - pat_len = nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]); + pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]); mask_len = DIV_ROUND_UP(pat_len, 8); - if (nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]) != - mask_len) + if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len) goto error; if (pat_len > wowlan->pattern_max_len || pat_len < wowlan->pattern_min_len) goto error; - if (!pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET]) + if (!pat_tb[NL80211_PKTPAT_OFFSET]) pkt_offset = 0; else pkt_offset = nla_get_u32( - pat_tb[NL80211_WOWLAN_PKTPAT_OFFSET]); + pat_tb[NL80211_PKTPAT_OFFSET]); if (pkt_offset > wowlan->max_pkt_offset) goto error; new_triggers.patterns[i].pkt_offset = pkt_offset; @@ -7989,11 +7987,11 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) new_triggers.patterns[i].pattern = new_triggers.patterns[i].mask + mask_len; memcpy(new_triggers.patterns[i].mask, - nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]), + nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len); new_triggers.patterns[i].pattern_len = pat_len; memcpy(new_triggers.patterns[i].pattern, - nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]), + nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len); i++; } -- cgit v1.2.3-71-gd317 From be29b99a9b51b0338eea3c66a58de53bbd01de24 Mon Sep 17 00:00:00 2001 From: Amitkumar Karwar Date: Fri, 28 Jun 2013 11:51:26 -0700 Subject: cfg80211/nl80211: Add packet coalesce support In most cases, host that receives IPv4 and IPv6 multicast/broadcast packets does not do anything with these packets. Therefore the reception of these unwanted packets causes unnecessary processing and power consumption. Packet coalesce feature helps to reduce number of received interrupts to host by buffering these packets in firmware/hardware for some predefined time. Received interrupt will be generated when one of the following events occur. a) Expiration of hardware timer whose expiration time is set to maximum coalescing delay of matching coalesce rule. b) Coalescing buffer in hardware reaches it's limit. c) Packet doesn't match any of the configured coalesce rules. This patch adds set/get configuration support for packet coalesce. User needs to configure following parameters for creating a coalesce rule. a) Maximum coalescing delay b) List of packet patterns which needs to be matched c) Condition for coalescence. pattern 'match' or 'no match' Multiple such rules can be created. This feature needs to be advertised during driver initialization. Drivers are supposed to do required firmware/hardware settings based on user configuration. Signed-off-by: Amitkumar Karwar Signed-off-by: Bing Zhao [fix kernel-doc, change free function, fix copy/paste error] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 ++++++++ include/uapi/linux/nl80211.h | 90 ++++++++++++- net/wireless/core.c | 9 ++ net/wireless/core.h | 2 + net/wireless/nl80211.c | 308 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.h | 2 + 6 files changed, 463 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 49409602fe3d..071ed2395c9a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1780,6 +1780,35 @@ struct cfg80211_wowlan { int n_patterns; }; +/** + * struct cfg80211_coalesce_rules - Coalesce rule parameters + * + * This structure defines coalesce rule for the device. + * @delay: maximum coalescing delay in msecs. + * @condition: condition for packet coalescence. + * see &enum nl80211_coalesce_condition. + * @patterns: array of packet patterns + * @n_patterns: number of patterns + */ +struct cfg80211_coalesce_rules { + int delay; + enum nl80211_coalesce_condition condition; + struct cfg80211_pkt_pattern *patterns; + int n_patterns; +}; + +/** + * struct cfg80211_coalesce - Packet coalescing settings + * + * This structure defines coalescing settings. + * @rules: array of coalesce rules + * @n_rules: number of rules + */ +struct cfg80211_coalesce { + struct cfg80211_coalesce_rules *rules; + int n_rules; +}; + /** * struct cfg80211_wowlan_wakeup - wakeup report * @disconnect: woke up by getting disconnected @@ -2076,6 +2105,7 @@ struct cfg80211_update_ft_ies_params { * driver can take the most appropriate actions. * @crit_proto_stop: Indicates critical protocol no longer needs increased link * reliability. This operation can not fail. + * @set_coalesce: Set coalesce parameters. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2311,6 +2341,8 @@ struct cfg80211_ops { u16 duration); void (*crit_proto_stop)(struct wiphy *wiphy, struct wireless_dev *wdev); + int (*set_coalesce)(struct wiphy *wiphy, + struct cfg80211_coalesce *coalesce); }; /* @@ -2536,6 +2568,25 @@ struct wiphy_wowlan_support { const struct wiphy_wowlan_tcp_support *tcp; }; +/** + * struct wiphy_coalesce_support - coalesce support data + * @n_rules: maximum number of coalesce rules + * @max_delay: maximum supported coalescing delay in msecs + * @n_patterns: number of supported patterns in a rule + * (see nl80211.h for the pattern definition) + * @pattern_max_len: maximum length of each pattern + * @pattern_min_len: minimum length of each pattern + * @max_pkt_offset: maximum Rx packet offset + */ +struct wiphy_coalesce_support { + int n_rules; + int max_delay; + int n_patterns; + int pattern_max_len; + int pattern_min_len; + int max_pkt_offset; +}; + /** * struct wiphy - wireless hardware description * @reg_notifier: the driver's regulatory notification callback, @@ -2646,6 +2697,7 @@ struct wiphy_wowlan_support { * 802.11-2012 8.4.2.29 for the defined fields. * @extended_capabilities_mask: mask of the valid values * @extended_capabilities_len: length of the extended capabilities + * @coalesce: packet coalescing support information */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -2755,6 +2807,8 @@ struct wiphy { const struct iw_handler_def *wext; #endif + const struct wiphy_coalesce_support *coalesce; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index de0ce809068a..5abc54d14d4d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -125,6 +125,31 @@ * interfaces that a given device supports. */ +/** + * DOC: packet coalesce support + * + * In most cases, host that receives IPv4 and IPv6 multicast/broadcast + * packets does not do anything with these packets. Therefore the + * reception of these unwanted packets causes unnecessary processing + * and power consumption. + * + * Packet coalesce feature helps to reduce number of received interrupts + * to host by buffering these packets in firmware/hardware for some + * predefined time. Received interrupt will be generated when one of the + * following events occur. + * a) Expiration of hardware timer whose expiration time is set to maximum + * coalescing delay of matching coalesce rule. + * b) Coalescing buffer in hardware reaches it's limit. + * c) Packet doesn't match any of the configured coalesce rules. + * + * User needs to configure following parameters for creating a coalesce + * rule. + * a) Maximum coalescing delay + * b) List of packet patterns which needs to be matched + * c) Condition for coalescence. pattern 'match' or 'no match' + * Multiple such rules can be created. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -648,6 +673,9 @@ * @NL80211_CMD_CRIT_PROTOCOL_STOP: Indicates the connection reliability can * return back to normal. * + * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules. + * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -810,6 +838,9 @@ enum nl80211_commands { NL80211_CMD_CRIT_PROTOCOL_START, NL80211_CMD_CRIT_PROTOCOL_STOP, + NL80211_CMD_GET_COALESCE, + NL80211_CMD_SET_COALESCE, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1436,6 +1467,8 @@ enum nl80211_commands { * allowed to be used with the first @NL80211_CMD_SET_STATION command to * update a TDLS peer STA entry. * + * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1736,6 +1769,8 @@ enum nl80211_attrs { NL80211_ATTR_PEER_AID, + NL80211_ATTR_COALESCE_RULE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3098,8 +3133,10 @@ enum nl80211_packet_pattern_attr { * @max_pkt_offset: maximum Rx packet offset * * This struct is carried in %NL80211_WOWLAN_TRIG_PKT_PATTERN when - * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the - * capability information given by the kernel to userspace. + * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED or in + * %NL80211_ATTR_COALESCE_RULE_PKT_PATTERN when that is part of + * %NL80211_ATTR_COALESCE_RULE in the capability information given + * by the kernel to userspace. */ struct nl80211_pattern_support { __u32 max_patterns; @@ -3317,6 +3354,55 @@ enum nl80211_wowlan_tcp_attrs { MAX_NL80211_WOWLAN_TCP = NUM_NL80211_WOWLAN_TCP - 1 }; +/** + * struct nl80211_coalesce_rule_support - coalesce rule support information + * @max_rules: maximum number of rules supported + * @pat: packet pattern support information + * @max_delay: maximum supported coalescing delay in msecs + * + * This struct is carried in %NL80211_ATTR_COALESCE_RULE in the + * capability information given by the kernel to userspace. + */ +struct nl80211_coalesce_rule_support { + __u32 max_rules; + struct nl80211_pattern_support pat; + __u32 max_delay; +} __attribute__((packed)); + +/** + * enum nl80211_attr_coalesce_rule - coalesce rule attribute + * @__NL80211_COALESCE_RULE_INVALID: invalid number for nested attribute + * @NL80211_ATTR_COALESCE_RULE_DELAY: delay in msecs used for packet coalescing + * @NL80211_ATTR_COALESCE_RULE_CONDITION: condition for packet coalescence, + * see &enum nl80211_coalesce_condition. + * @NL80211_ATTR_COALESCE_RULE_PKT_PATTERN: packet offset, pattern is matched + * after these fixed number of bytes of received packet + * @NUM_NL80211_ATTR_COALESCE_RULE: number of attributes + * @NL80211_ATTR_COALESCE_RULE_MAX: max attribute number + */ +enum nl80211_attr_coalesce_rule { + __NL80211_COALESCE_RULE_INVALID, + NL80211_ATTR_COALESCE_RULE_DELAY, + NL80211_ATTR_COALESCE_RULE_CONDITION, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN, + + /* keep last */ + NUM_NL80211_ATTR_COALESCE_RULE, + NL80211_ATTR_COALESCE_RULE_MAX = NUM_NL80211_ATTR_COALESCE_RULE - 1 +}; + +/** + * enum nl80211_coalesce_condition - coalesce rule conditions + * @NL80211_COALESCE_CONDITION_MATCH: coalaesce Rx packets when patterns + * in a rule are matched. + * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns + * in a rule are not matched. + */ +enum nl80211_coalesce_condition { + NL80211_COALESCE_CONDITION_MATCH, + NL80211_COALESCE_CONDITION_NO_MATCH +}; + /** * enum nl80211_iface_limit_attrs - limit attributes * @NL80211_IFACE_LIMIT_UNSPEC: (reserved) diff --git a/net/wireless/core.c b/net/wireless/core.c index 4f9f216665e9..389a3f2ee464 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -462,6 +462,14 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; #endif + if (WARN_ON(wiphy->coalesce && + (!wiphy->coalesce->n_rules || + !wiphy->coalesce->n_patterns) && + (!wiphy->coalesce->pattern_min_len || + wiphy->coalesce->pattern_min_len > + wiphy->coalesce->pattern_max_len))) + return -EINVAL; + if (WARN_ON(wiphy->ap_sme_capa && !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME))) return -EINVAL; @@ -668,6 +676,7 @@ void wiphy_unregister(struct wiphy *wiphy) rdev_set_wakeup(rdev, false); #endif cfg80211_rdev_free_wowlan(rdev); + cfg80211_rdev_free_coalesce(rdev); } EXPORT_SYMBOL(wiphy_unregister); diff --git a/net/wireless/core.h b/net/wireless/core.h index a6b45bf00f33..9ad43c619c54 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -79,6 +79,8 @@ struct cfg80211_registered_device { /* netlink port which started critical protocol (0 means not started) */ u32 crit_proto_nlportid; + struct cfg80211_coalesce *coalesce; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0492478ab74e..6dca5a700174 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -403,6 +403,14 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 }, }; +/* policy for coalesce rule attributes */ +static const struct nla_policy +nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { + [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 }, + [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 }, + [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED }, +}; + /* policy for GTK rekey offload attributes */ static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { @@ -995,6 +1003,27 @@ static int nl80211_send_wowlan(struct sk_buff *msg, } #endif +static int nl80211_send_coalesce(struct sk_buff *msg, + struct cfg80211_registered_device *dev) +{ + struct nl80211_coalesce_rule_support rule; + + if (!dev->wiphy.coalesce) + return 0; + + rule.max_rules = dev->wiphy.coalesce->n_rules; + rule.max_delay = dev->wiphy.coalesce->max_delay; + rule.pat.max_patterns = dev->wiphy.coalesce->n_patterns; + rule.pat.min_pattern_len = dev->wiphy.coalesce->pattern_min_len; + rule.pat.max_pattern_len = dev->wiphy.coalesce->pattern_max_len; + rule.pat.max_pkt_offset = dev->wiphy.coalesce->max_pkt_offset; + + if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule)) + return -ENOBUFS; + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1513,6 +1542,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, dev->wiphy.vht_capa_mod_mask)) goto nla_put_failure; + state->split_start++; + break; + case 10: + if (nl80211_send_coalesce(msg, dev)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -8043,6 +8078,264 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) } #endif +static int nl80211_send_coalesce_rules(struct sk_buff *msg, + struct cfg80211_registered_device *rdev) +{ + struct nlattr *nl_pats, *nl_pat, *nl_rule, *nl_rules; + int i, j, pat_len; + struct cfg80211_coalesce_rules *rule; + + if (!rdev->coalesce->n_rules) + return 0; + + nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE); + if (!nl_rules) + return -ENOBUFS; + + for (i = 0; i < rdev->coalesce->n_rules; i++) { + nl_rule = nla_nest_start(msg, i + 1); + if (!nl_rule) + return -ENOBUFS; + + rule = &rdev->coalesce->rules[i]; + if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY, + rule->delay)) + return -ENOBUFS; + + if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION, + rule->condition)) + return -ENOBUFS; + + nl_pats = nla_nest_start(msg, + NL80211_ATTR_COALESCE_RULE_PKT_PATTERN); + if (!nl_pats) + return -ENOBUFS; + + for (j = 0; j < rule->n_patterns; j++) { + nl_pat = nla_nest_start(msg, j + 1); + if (!nl_pat) + return -ENOBUFS; + pat_len = rule->patterns[j].pattern_len; + if (nla_put(msg, NL80211_PKTPAT_MASK, + DIV_ROUND_UP(pat_len, 8), + rule->patterns[j].mask) || + nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len, + rule->patterns[j].pattern) || + nla_put_u32(msg, NL80211_PKTPAT_OFFSET, + rule->patterns[j].pkt_offset)) + return -ENOBUFS; + nla_nest_end(msg, nl_pat); + } + nla_nest_end(msg, nl_pats); + nla_nest_end(msg, nl_rule); + } + nla_nest_end(msg, nl_rules); + + return 0; +} + +static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct sk_buff *msg; + void *hdr; + + if (!rdev->wiphy.coalesce) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_GET_COALESCE); + if (!hdr) + goto nla_put_failure; + + if (rdev->coalesce && nl80211_send_coalesce_rules(msg, rdev)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_coalesce *coalesce = rdev->coalesce; + int i, j; + struct cfg80211_coalesce_rules *rule; + + if (!coalesce) + return; + + for (i = 0; i < coalesce->n_rules; i++) { + rule = &coalesce->rules[i]; + for (j = 0; j < rule->n_patterns; j++) + kfree(rule->patterns[j].mask); + kfree(rule->patterns); + } + kfree(coalesce->rules); + kfree(coalesce); + rdev->coalesce = NULL; +} + +static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, + struct nlattr *rule, + struct cfg80211_coalesce_rules *new_rule) +{ + int err, i; + const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce; + struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat; + int rem, pat_len, mask_len, pkt_offset, n_patterns = 0; + struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; + + err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule), + nla_len(rule), nl80211_coalesce_policy); + if (err) + return err; + + if (tb[NL80211_ATTR_COALESCE_RULE_DELAY]) + new_rule->delay = + nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]); + if (new_rule->delay > coalesce->max_delay) + return -EINVAL; + + if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION]) + new_rule->condition = + nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]); + if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH && + new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH) + return -EINVAL; + + if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN]) + return -EINVAL; + + nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN], + rem) + n_patterns++; + if (n_patterns > coalesce->n_patterns) + return -EINVAL; + + new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]), + GFP_KERNEL); + if (!new_rule->patterns) + return -ENOMEM; + + new_rule->n_patterns = n_patterns; + i = 0; + + nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN], + rem) { + nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat), + nla_len(pat), NULL); + if (!pat_tb[NL80211_PKTPAT_MASK] || + !pat_tb[NL80211_PKTPAT_PATTERN]) + return -EINVAL; + pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]); + mask_len = DIV_ROUND_UP(pat_len, 8); + if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len) + return -EINVAL; + if (pat_len > coalesce->pattern_max_len || + pat_len < coalesce->pattern_min_len) + return -EINVAL; + + if (!pat_tb[NL80211_PKTPAT_OFFSET]) + pkt_offset = 0; + else + pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]); + if (pkt_offset > coalesce->max_pkt_offset) + return -EINVAL; + new_rule->patterns[i].pkt_offset = pkt_offset; + + new_rule->patterns[i].mask = + kmalloc(mask_len + pat_len, GFP_KERNEL); + if (!new_rule->patterns[i].mask) + return -ENOMEM; + new_rule->patterns[i].pattern = + new_rule->patterns[i].mask + mask_len; + memcpy(new_rule->patterns[i].mask, + nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len); + new_rule->patterns[i].pattern_len = pat_len; + memcpy(new_rule->patterns[i].pattern, + nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len); + i++; + } + + return 0; +} + +static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce; + struct cfg80211_coalesce new_coalesce = {}; + struct cfg80211_coalesce *n_coalesce; + int err, rem_rule, n_rules = 0, i, j; + struct nlattr *rule; + struct cfg80211_coalesce_rules *tmp_rule; + + if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) { + cfg80211_rdev_free_coalesce(rdev); + rdev->ops->set_coalesce(&rdev->wiphy, NULL); + return 0; + } + + nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE], + rem_rule) + n_rules++; + if (n_rules > coalesce->n_rules) + return -EINVAL; + + new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]), + GFP_KERNEL); + if (!new_coalesce.rules) + return -ENOMEM; + + new_coalesce.n_rules = n_rules; + i = 0; + + nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE], + rem_rule) { + err = nl80211_parse_coalesce_rule(rdev, rule, + &new_coalesce.rules[i]); + if (err) + goto error; + + i++; + } + + err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce); + if (err) + goto error; + + n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL); + if (!n_coalesce) { + err = -ENOMEM; + goto error; + } + cfg80211_rdev_free_coalesce(rdev); + rdev->coalesce = n_coalesce; + + return 0; +error: + for (i = 0; i < new_coalesce.n_rules; i++) { + tmp_rule = &new_coalesce.rules[i]; + for (j = 0; j < tmp_rule->n_patterns; j++) + kfree(tmp_rule->patterns[j].mask); + kfree(tmp_rule->patterns); + } + kfree(new_coalesce.rules); + + return err; +} + static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -9050,6 +9343,21 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WDEV_UP | NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_GET_COALESCE, + .doit = nl80211_get_coalesce, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_COALESCE, + .doit = nl80211_set_coalesce, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, } }; diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a4073e808c13..44341bf53cfc 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -74,4 +74,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, enum nl80211_radar_event event, struct net_device *netdev, gfp_t gfp); +void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); + #endif /* __NET_WIRELESS_NL80211_H */ -- cgit v1.2.3-71-gd317 From dcd6eac1f3b5fa1df11dfa99da0cf75b76cfef97 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 8 Jul 2013 16:55:49 +0200 Subject: nl80211: add scan width to bss and scan request structs To allow scanning and working with 5 MHz and 10 MHz BSS, extend the inform bss commands and add wrappers to take 5 and 10 MHz bss into account. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 +++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/nl80211.h | 18 +++++++++++++++ net/wireless/nl80211.c | 1 + net/wireless/scan.c | 31 +++++++++++++++---------- net/wireless/trace.h | 12 ++++++---- 5 files changed, 97 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 071ed2395c9a..bae8614a450f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1285,6 +1285,7 @@ struct cfg80211_ssid { * @n_ssids: number of SSIDs * @channels: channels to scan on. * @n_channels: total number of channels to scan + * @scan_width: channel width for scanning * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets * @flags: bit field of flags controlling operation @@ -1300,6 +1301,7 @@ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; + enum nl80211_bss_scan_width scan_width; const u8 *ie; size_t ie_len; u32 flags; @@ -1333,6 +1335,7 @@ struct cfg80211_match_set { * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans) * @n_ssids: number of SSIDs * @n_channels: total number of channels to scan + * @scan_width: channel width for scanning * @interval: interval between each scheduled scan cycle * @ie: optional information element(s) to add into Probe Request or %NULL * @ie_len: length of ie in octets @@ -1352,6 +1355,7 @@ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; int n_ssids; u32 n_channels; + enum nl80211_bss_scan_width scan_width; u32 interval; const u8 *ie; size_t ie_len; @@ -1403,6 +1407,7 @@ struct cfg80211_bss_ies { * for use in scan results and similar. * * @channel: channel this BSS is on + * @scan_width: width of the control channel * @bssid: BSSID of the BSS * @beacon_interval: the beacon interval as from the frame * @capability: the capability field in host byte order @@ -1424,6 +1429,7 @@ struct cfg80211_bss_ies { */ struct cfg80211_bss { struct ieee80211_channel *channel; + enum nl80211_bss_scan_width scan_width; const struct cfg80211_bss_ies __rcu *ies; const struct cfg80211_bss_ies __rcu *beacon_ies; @@ -3438,10 +3444,11 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy); void cfg80211_sched_scan_stopped(struct wiphy *wiphy); /** - * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame + * cfg80211_inform_bss_width_frame - inform cfg80211 of a received BSS frame * * @wiphy: the wiphy reporting the BSS * @channel: The channel the frame was received on + * @scan_width: width of the control channel * @mgmt: the management frame (probe response or beacon) * @len: length of the management frame * @signal: the signal strength, type depends on the wiphy's signal_type @@ -3454,16 +3461,29 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy); * Or %NULL on error. */ struct cfg80211_bss * __must_check +cfg80211_inform_bss_width_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp); + +static inline struct cfg80211_bss * __must_check cfg80211_inform_bss_frame(struct wiphy *wiphy, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp); + s32 signal, gfp_t gfp) +{ + return cfg80211_inform_bss_width_frame(wiphy, channel, + NL80211_BSS_CHAN_WIDTH_20, + mgmt, len, signal, gfp); +} /** * cfg80211_inform_bss - inform cfg80211 of a new BSS * * @wiphy: the wiphy reporting the BSS * @channel: The channel the frame was received on + * @scan_width: width of the control channel * @bssid: the BSSID of the BSS * @tsf: the TSF sent by the peer in the beacon/probe response (or 0) * @capability: the capability field sent by the peer @@ -3480,11 +3500,26 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, * Or %NULL on error. */ struct cfg80211_bss * __must_check +cfg80211_inform_bss_width(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + const u8 *bssid, u64 tsf, u16 capability, + u16 beacon_interval, const u8 *ie, size_t ielen, + s32 signal, gfp_t gfp); + +static inline struct cfg80211_bss * __must_check cfg80211_inform_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp); + s32 signal, gfp_t gfp) +{ + return cfg80211_inform_bss_width(wiphy, channel, + NL80211_BSS_CHAN_WIDTH_20, + bssid, tsf, capability, + beacon_interval, ie, ielen, signal, + gfp); +} struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, @@ -3530,6 +3565,19 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); */ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss); +static inline enum nl80211_bss_scan_width +cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef) +{ + switch (chandef->width) { + case NL80211_CHAN_WIDTH_5: + return NL80211_BSS_CHAN_WIDTH_5; + case NL80211_CHAN_WIDTH_10: + return NL80211_BSS_CHAN_WIDTH_10; + default: + return NL80211_BSS_CHAN_WIDTH_20; + } +} + /** * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5abc54d14d4d..eb68735b3318 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2807,6 +2807,21 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_10, }; +/** + * enum nl80211_bss_scan_width - control channel width for a BSS + * + * These values are used with the %NL80211_BSS_CHAN_WIDTH attribute. + * + * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible + * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide + * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + */ +enum nl80211_bss_scan_width { + NL80211_BSS_CHAN_WIDTH_20, + NL80211_BSS_CHAN_WIDTH_10, + NL80211_BSS_CHAN_WIDTH_5, +}; + /** * enum nl80211_bss - netlink attributes for a BSS * @@ -2831,6 +2846,8 @@ enum nl80211_chan_width { * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information * elements from a Beacon frame (bin); not present if no Beacon frame has * yet been received + * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel + * (u32, enum nl80211_bss_scan_width) * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -2847,6 +2864,7 @@ enum nl80211_bss { NL80211_BSS_STATUS, NL80211_BSS_SEEN_MS_AGO, NL80211_BSS_BEACON_IES, + NL80211_BSS_CHAN_WIDTH, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6dca5a700174..ef4c312cc92c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5674,6 +5674,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) goto nla_put_failure; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ae8c186b50d6..ad1e4068ce06 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -651,6 +651,8 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev, continue; if (bss->pub.channel != new->pub.channel) continue; + if (bss->pub.scan_width != new->pub.scan_width) + continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); @@ -870,11 +872,12 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss* -cfg80211_inform_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, u64 tsf, u16 capability, - u16 beacon_interval, const u8 *ie, size_t ielen, - s32 signal, gfp_t gfp) +cfg80211_inform_bss_width(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + const u8 *bssid, u64 tsf, u16 capability, + u16 beacon_interval, const u8 *ie, size_t ielen, + s32 signal, gfp_t gfp) { struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss tmp = {}, *res; @@ -892,6 +895,7 @@ cfg80211_inform_bss(struct wiphy *wiphy, memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.channel = channel; + tmp.pub.scan_width = scan_width; tmp.pub.signal = signal; tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; @@ -924,14 +928,15 @@ cfg80211_inform_bss(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss); +EXPORT_SYMBOL(cfg80211_inform_bss_width); /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss * -cfg80211_inform_bss_frame(struct wiphy *wiphy, - struct ieee80211_channel *channel, - struct ieee80211_mgmt *mgmt, size_t len, - s32 signal, gfp_t gfp) +cfg80211_inform_bss_width_frame(struct wiphy *wiphy, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, + struct ieee80211_mgmt *mgmt, size_t len, + s32 signal, gfp_t gfp) { struct cfg80211_internal_bss tmp = {}, *res; struct cfg80211_bss_ies *ies; @@ -941,7 +946,8 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); - trace_cfg80211_inform_bss_frame(wiphy, channel, mgmt, len, signal); + trace_cfg80211_inform_bss_width_frame(wiphy, channel, scan_width, mgmt, + len, signal); if (WARN_ON(!mgmt)) return NULL; @@ -976,6 +982,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; + tmp.pub.scan_width = scan_width; tmp.pub.signal = signal; tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); @@ -991,7 +998,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy, /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } -EXPORT_SYMBOL(cfg80211_inform_bss_frame); +EXPORT_SYMBOL(cfg80211_inform_bss_width_frame); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index e1534baf2ebb..09af6eb426a8 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2391,26 +2391,30 @@ TRACE_EVENT(cfg80211_get_bss, __entry->capa_mask, __entry->capa_val) ); -TRACE_EVENT(cfg80211_inform_bss_frame, +TRACE_EVENT(cfg80211_inform_bss_width_frame, TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width, struct ieee80211_mgmt *mgmt, size_t len, s32 signal), - TP_ARGS(wiphy, channel, mgmt, len, signal), + TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal), TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY + __field(enum nl80211_bss_scan_width, scan_width) __dynamic_array(u8, mgmt, len) __field(s32, signal) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(channel); + __entry->scan_width = scan_width; if (mgmt) memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = signal; ), - TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "signal: %d", - WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal) + TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d", + WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width, + __entry->signal) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, -- cgit v1.2.3-71-gd317 From 36ff66db3fb5642906e46e73ca9cf92f1c5974ff Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 27 Jun 2013 15:27:07 -0400 Subject: USB: move the definition of USB_MAXCHILDREN The USB_MAXCHILDREN symbol is used in include/uapi/linux/usb/ch11.h, a user-mode header, even though it is defined in include/linux/usb.h, which is kernel-only. This causes compile-time errors when user programs try to #include linux/usb/ch11.h. This patch fixes the problem by moving the definition of USB_MAXCHILDREN into ch11.h. It also gets rid of unneeded parentheses. Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 11 ----------- include/uapi/linux/usb/ch11.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index a232b7ece1f6..0eec2689b955 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -367,17 +367,6 @@ struct usb_bus { /* ----------------------------------------------------------------------- */ -/* This is arbitrary. - * From USB 2.0 spec Table 11-13, offset 7, a hub can - * have up to 255 ports. The most yet reported is 10. - * - * Current Wireless USB host hardware (Intel i1480 for example) allows - * up to 22 devices to connect. Upcoming hardware might raise that - * limit. Because the arrays need to add a bit for hub status data, we - * do 31, so plus one evens out to four bytes. - */ -#define USB_MAXCHILDREN (31) - struct usb_tt; enum usb_device_removable { diff --git a/include/uapi/linux/usb/ch11.h b/include/uapi/linux/usb/ch11.h index 7692dc69ccf7..331499d597fa 100644 --- a/include/uapi/linux/usb/ch11.h +++ b/include/uapi/linux/usb/ch11.h @@ -11,6 +11,17 @@ #include /* __u8 etc */ +/* This is arbitrary. + * From USB 2.0 spec Table 11-13, offset 7, a hub can + * have up to 255 ports. The most yet reported is 10. + * + * Current Wireless USB host hardware (Intel i1480 for example) allows + * up to 22 devices to connect. Upcoming hardware might raise that + * limit. Because the arrays need to add a bit for hub status data, we + * use 31, so plus one evens out to four bytes. + */ +#define USB_MAXCHILDREN 31 + /* * Hub request types */ -- cgit v1.2.3-71-gd317 From eda297729171fe16bf34fe5b0419dfb69060f623 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 19 Jul 2013 19:40:10 +0200 Subject: tun: Support software transmit time stamping. This patch adds transmit time stamping to the tun/tap driver. Similar support already exists for UDP, can, and raw packets. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/tun.c | 14 ++++++++++++-- include/uapi/linux/if_tun.h | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db690a372260..a72d141047cb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -739,6 +739,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) >= dev->tx_queue_len / tun->numqueues) goto drop; + if (skb->sk) { + sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags); + sw_tx_timestamp(skb); + } + /* Orphan the skb - required as we might hang on to it * for indefinite time. */ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) @@ -1476,7 +1481,6 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, return ret; } - static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t total_len, int flags) @@ -1488,10 +1492,15 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, if (!tun) return -EBADFD; - if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out; } + if (flags & MSG_ERRQUEUE) { + ret = sock_recv_errqueue(sock->sk, m, total_len, + SOL_PACKET, TUN_TX_TIMESTAMP); + goto out; + } ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { @@ -2274,6 +2283,7 @@ static const struct ethtool_ops tun_ethtool_ops = { .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, }; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 82334f88967e..1870ee29bb37 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -71,6 +71,9 @@ /* read-only flag */ #define IFF_PERSIST 0x0800 +/* Socket options */ +#define TUN_TX_TIMESTAMP 1 + /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ #define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ -- cgit v1.2.3-71-gd317 From a5cdd40c9877e9aba704c020fd65d26b5cfecf18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jul 2013 17:09:07 +0200 Subject: perf: Update perf_event_type documentation Due to a discussion with Adrian I had a good look at the perf_event_type record layout and found the documentation to be somewhat unclear. Cc: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130716150907.GL23818@dyad.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 18 +++++++++++++++++- kernel/events/core.c | 31 ++++++++++++++++--------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0b1df41691e8..00d8274730b4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -478,6 +478,16 @@ enum perf_event_type { * file will be supported by older perf tools, with these new optional * fields being ignored. * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * } && perf_event_attr::sample_id_all + */ + + /* * The MMAP events record the PROT_EXEC mappings so that we can * correlate userspace IPs to code. They have the following structure: * @@ -498,6 +508,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -508,6 +519,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -518,6 +530,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -528,6 +541,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -539,6 +553,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -549,6 +564,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -596,7 +612,7 @@ enum perf_event_type { * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { u64 weight; } && PERF_SAMPLE_WEIGHT - * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e2bce90b477..127411400116 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4462,20 +4462,6 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (!event->attr.watermark) { - int wakeup_events = event->attr.wakeup_events; - - if (wakeup_events) { - struct ring_buffer *rb = handle->rb; - int events = local_inc_return(&rb->events); - - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; @@ -4511,16 +4497,31 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (sample_type & PERF_SAMPLE_STACK_USER) + if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + } if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.2.3-71-gd317 From 860f085b74e9f0075de8140ed3a1e5b5e3e39aa8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 28 Jun 2013 16:22:17 +0300 Subject: perf: Fix broken union in 'struct perf_event_mmap_page' The capabilities bits must not be "union'ed" together. Put them in a separate struct. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1372425741-1676-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 00d8274730b4..0041aedf2297 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -375,9 +375,11 @@ struct perf_event_mmap_page { __u64 time_running; /* time event on cpu */ union { __u64 capabilities; - __u64 cap_usr_time : 1, - cap_usr_rdpmc : 1, - cap_____res : 62; + struct { + __u64 cap_usr_time : 1, + cap_usr_rdpmc : 1, + cap_____res : 62; + }; }; /* -- cgit v1.2.3-71-gd317 From c73deb6aecda2955716f31572516f09d930ef450 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 28 Jun 2013 16:22:18 +0300 Subject: perf/x86: Add ability to calculate TSC from perf sample timestamps For modern CPUs, perf clock is directly related to TSC. TSC can be calculated from perf clock and vice versa using a simple calculation. Two of the three componenets of that calculation are already exported in struct perf_event_mmap_page. This patch exports the third. Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1372425741-1676-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tsc.h | 1 + arch/x86/kernel/cpu/perf_event.c | 6 ++++++ arch/x86/kernel/tsc.c | 6 ++++++ include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++-- 4 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c91e8b9d588b..235be70d5bb4 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,6 +49,7 @@ extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); extern int tsc_clocksource_reliable; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a7c7305030cc..8355c84b9729 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1884,6 +1884,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { userpg->cap_usr_time = 0; + userpg->cap_usr_time_zero = 0; userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; @@ -1897,6 +1898,11 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + + if (sched_clock_stable && !check_tsc_disabled()) { + userpg->cap_usr_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); + } } /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6ff49247edf8..930e5d48f560 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -89,6 +89,12 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0041aedf2297..efef1d37a371 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -378,7 +378,8 @@ struct perf_event_mmap_page { struct { __u64 cap_usr_time : 1, cap_usr_rdpmc : 1, - cap_____res : 62; + cap_usr_time_zero : 1, + cap_____res : 61; }; }; @@ -420,12 +421,29 @@ struct perf_event_mmap_page { __u16 time_shift; __u32 time_mult; __u64 time_offset; + /* + * If cap_usr_time_zero, the hardware clock (e.g. TSC) can be calculated + * from sample timestamps. + * + * time = timestamp - time_zero; + * quot = time / time_mult; + * rem = time % time_mult; + * cyc = (quot << time_shift) + (rem << time_shift) / time_mult; + * + * And vice versa: + * + * quot = cyc >> time_shift; + * rem = cyc & ((1 << time_shift) - 1); + * timestamp = time_zero + quot * time_mult + + * ((rem * time_mult) >> time_shift); + */ + __u64 time_zero; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[120]; /* align to 1k */ + __u64 __reserved[119]; /* align to 1k */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3-71-gd317 From 91705c61b52029ab5da67a15a23eef08667bf40e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Jul 2013 14:51:47 +0200 Subject: net: sctp: trivial: update mailing list address The SCTP mailing list address to send patches or questions to is linux-sctp@vger.kernel.org and not lksctp-developers@lists.sourceforge.net anymore. Therefore, update all occurences. Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- Documentation/networking/sctp.txt | 5 +---- include/net/sctp/auth.h | 2 +- include/net/sctp/checksum.h | 2 +- include/net/sctp/constants.h | 2 +- include/net/sctp/sctp.h | 2 +- include/net/sctp/sm.h | 2 +- include/net/sctp/structs.h | 2 +- include/net/sctp/tsnmap.h | 2 +- include/net/sctp/ulpevent.h | 2 +- include/net/sctp/ulpqueue.h | 2 +- include/uapi/linux/sctp.h | 2 +- net/sctp/associola.c | 2 +- net/sctp/auth.c | 2 +- net/sctp/bind_addr.c | 2 +- net/sctp/chunk.c | 2 +- net/sctp/command.c | 2 +- net/sctp/debug.c | 2 +- net/sctp/endpointola.c | 2 +- net/sctp/input.c | 2 +- net/sctp/inqueue.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/objcnt.c | 2 +- net/sctp/output.c | 2 +- net/sctp/outqueue.c | 2 +- net/sctp/primitive.c | 2 +- net/sctp/proc.c | 2 +- net/sctp/protocol.c | 4 ++-- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_sideeffect.c | 2 +- net/sctp/sm_statefuns.c | 2 +- net/sctp/sm_statetable.c | 2 +- net/sctp/socket.c | 2 +- net/sctp/ssnmap.c | 2 +- net/sctp/sysctl.c | 2 +- net/sctp/transport.c | 2 +- net/sctp/tsnmap.c | 2 +- net/sctp/ulpevent.c | 2 +- net/sctp/ulpqueue.c | 2 +- 38 files changed, 39 insertions(+), 42 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt index 0c790a76910e..97b810ca9082 100644 --- a/Documentation/networking/sctp.txt +++ b/Documentation/networking/sctp.txt @@ -19,7 +19,6 @@ of SCTP that is RFC 2960 compliant and provides an programming interface referred to as the UDP-style API of the Sockets Extensions for SCTP, as proposed in IETF Internet-Drafts. - Caveats: -lksctp can be built as statically or as a module. However, be aware that @@ -33,6 +32,4 @@ For more information, please visit the lksctp project website: http://www.sf.net/projects/lksctp Or contact the lksctp developers through the mailing list: - - - + diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 49bc9577c61e..754c511f4cf9 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 0cb08e6fb6df..78b88aa8c810 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index ca50e0751e47..a1e7aa5c8bbf 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d8e37ecea691..554cf88605fc 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2a82d1384706..2aa66dda4a5a 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e745c92a1532..75c4c16601b6 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h index 2c5d2b4d5d1e..c361d6f7e0cb 100644 --- a/include/net/sctp/tsnmap.h +++ b/include/net/sctp/tsnmap.h @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index ca4693b4e09e..34f0d34d22ce 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -31,7 +31,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 00e50ba3f24b..add3237a9fdb 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email addresses: - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 66b466e4ca08..ca451e99b28b 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bce5b79662a6..e425ba0bd79e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/auth.c b/net/sctp/auth.c index ba1dfc3f8def..3aab967081be 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 64977ea0f9c5..f34ce8bc1395 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 5780565f5b7d..b50b90c16a86 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -24,7 +24,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/command.c b/net/sctp/command.c index c0044019db9e..f4bebdadf007 100644 --- a/net/sctp/command.c +++ b/net/sctp/command.c @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/debug.c b/net/sctp/debug.c index f4998780d6df..44aa4e7543a9 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 9e3d257de0e0..825b7543a436 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/input.c b/net/sctp/input.c index 3fa4d858c35a..7993495a4c0f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index cb25f040fed0..e70f60ae92ef 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 09ffcc912d23..85d688f59e6a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index fe012c44f8df..aec346cbed67 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -26,7 +26,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/output.c b/net/sctp/output.c index a46d1eb41762..5a55c55d71ad 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -26,7 +26,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index ef9e2bbc0f2f..51313233e635 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index 794bb14decde..31fa437da117 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 62526c477050..aff0cac45b64 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -22,7 +22,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4a17494d736c..b52ec2510101 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp @@ -1547,7 +1547,7 @@ module_exit(sctp_exit); */ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); -MODULE_AUTHOR("Linux Kernel SCTP developers "); +MODULE_AUTHOR("Linux Kernel SCTP developers "); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 362ae6e2fd93..780a2d4b98a2 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -29,7 +29,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 9da68852ee94..f1f3aac3f3bb 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f6b7109195a6..93271f0966c6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 84d98d8a5a74..64ea5fd87eb2 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c6670d2e3f8d..02457123bdee 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -34,7 +34,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c index da8603523808..72b5939101bb 100644 --- a/net/sctp/ssnmap.c +++ b/net/sctp/ssnmap.c @@ -24,7 +24,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9a5c4c9eddaf..190674702b20 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -25,7 +25,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/transport.c b/net/sctp/transport.c index bdbbc3fd7c14..9602c52aa617 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -30,7 +30,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c index b46019568a86..0eff8667d4c7 100644 --- a/net/sctp/tsnmap.c +++ b/net/sctp/tsnmap.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 44a45dbee4df..c07624fca7e5 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -28,7 +28,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 04e3d470f877..2cdb3010f50f 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -27,7 +27,7 @@ * * Please send any bug reports or fixes you make to the * email address(es): - * lksctp developers + * lksctp developers * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp -- cgit v1.2.3-71-gd317 From c9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jul 2013 20:27:07 -0700 Subject: tcp: TCP_NOTSENT_LOWAT socket option Idea of this patch is to add optional limitation of number of unsent bytes in TCP sockets, to reduce usage of kernel memory. TCP receiver might announce a big window, and TCP sender autotuning might allow a large amount of bytes in write queue, but this has little performance impact if a large part of this buffering is wasted : Write queue needs to be large only to deal with large BDP, not necessarily to cope with scheduling delays (incoming ACKS make room for the application to queue more bytes) For most workloads, using a value of 128 KB or less is OK to give applications enough time to react to POLLOUT events in time (or being awaken in a blocking sendmsg()) This patch adds two ways to set the limit : 1) Per socket option TCP_NOTSENT_LOWAT 2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not using TCP_NOTSENT_LOWAT socket option (or setting a zero value) Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect. This changes poll()/select()/epoll() to report POLLOUT only if number of unsent bytes is below tp->nosent_lowat Note this might increase number of sendmsg()/sendfile() calls when using non blocking sockets, and increase number of context switches for blocking sockets. Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is defined as : Specify the minimum number of bytes in the buffer until the socket layer will pass the data to the protocol) Tested: netperf sessions, and watching /proc/net/protocols "memory" column for TCP With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458) lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y Using 128KB has no bad effect on the throughput or cpu usage of a single flow, although there is an increase of context switches. A bonus is that we hold socket lock for a shorter amount of time and should improve latencies of ACK processing. lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 412,514 context-switches 200.034645535 seconds time elapsed lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 2,675,818 context-switches 200.029651391 seconds time elapsed Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-By: Yuchung Cheng Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++++ include/linux/tcp.h | 1 + include/net/sock.h | 19 +++++++++++++------ include/net/tcp.h | 14 ++++++++++++++ include/uapi/linux/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 7 +++++++ net/ipv4/tcp.c | 7 +++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 +++ net/ipv6/tcp_ipv6.c | 1 + 10 files changed, 61 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 10742902146f..53cea9bcb14c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max this value is ignored. Default: between 64K and 4MB, depending on RAM size. +tcp_notsent_lowat - UNSIGNED INTEGER + A TCP socket can control the amount of unsent bytes in its write queue, + thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll() + reports POLLOUT events if the amount of unsent bytes is below a per + socket value, and if the write queue is not full. sendmsg() will + also not add new buffers if the limit is hit. + + This global variable controls the amount of unsent data for + sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change + to the global variable has immediate effect. + + Default: UINT_MAX (0xFFFFFFFF) + tcp_workaround_signed_windows - BOOLEAN If set, assume no receipt of a window scaling option means the remote TCP is broken and treats the window as a signed quantity. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 472120b4fac5..9640803a17a7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,6 +238,7 @@ struct tcp_sock { u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ diff --git a/include/net/sock.h b/include/net/sock.h index d0b5fdee50a2..b9f2b095b1ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk) extern void sk_stream_write_space(struct sock *sk); -static inline bool sk_stream_memory_free(const struct sock *sk) -{ - return sk->sk_wmem_queued < sk->sk_sndbuf; -} - /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { @@ -950,6 +945,7 @@ struct proto { unsigned int inuse_idx; #endif + bool (*stream_memory_free)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ @@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto, } #endif +static inline bool sk_stream_memory_free(const struct sock *sk) +{ + if (sk->sk_wmem_queued >= sk->sk_sndbuf) + return false; + + return sk->sk_prot->stream_memory_free ? + sk->sk_prot->stream_memory_free(sk) : true; +} + static inline bool sk_stream_is_writeable(const struct sock *sk) { - return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk); + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && + sk_stream_memory_free(sk); } + static inline bool sk_has_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure != NULL; diff --git a/include/net/tcp.h b/include/net/tcp.h index c5868471abae..18fc999dae3c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; +extern unsigned int sysctl_tcp_notsent_lowat; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb); extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); +static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) +{ + return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat; +} + +static inline bool tcp_stream_memory_free(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = tp->write_seq - tp->snd_nxt; + + return notsent_bytes < tcp_notsent_lowat(tp); +} + #ifdef CONFIG_PROC_FS extern int tcp4_proc_init(void); extern void tcp4_proc_exit(void); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8d776ebc4829..377f1e59411d 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -111,6 +111,7 @@ enum { #define TCP_REPAIR_OPTIONS 22 #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ #define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ struct tcp_repair_opt { __u32 opt_code; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b2c123c44d69..69ed203802da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -554,6 +554,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "tcp_notsent_lowat", + .data = &sysctl_tcp_notsent_lowat, + .maxlen = sizeof(sysctl_tcp_notsent_lowat), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5eca9060bb8e..c27e81392398 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, else tp->tsoffset = val - tcp_time_stamp; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: err = -ENOPROTOOPT; break; @@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_TIMESTAMP: val = tcp_time_stamp + tp->tsoffset; break; + case TCP_NOTSENT_LOWAT: + val = tp->notsent_lowat; + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2e3f129df0eb..2a5d5c469d17 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2800,6 +2800,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 92fde8d1aa82..884efff5b531 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; +EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80fe69ef2188..b792e870686b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, -- cgit v1.2.3-71-gd317 From 9548906b2bb7ff09e12c013a55d669bef2c8e121 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 25 Jul 2013 05:44:02 +0900 Subject: xattr: Constify ->name member of "struct xattr". Since everybody sets kstrdup()ed constant string to "struct xattr"->name but nobody modifies "struct xattr"->name , we can omit kstrdup() and its failure checking by constifying ->name member of "struct xattr". Signed-off-by: Tetsuo Handa Reviewed-by: Joel Becker [ocfs2] Acked-by: Serge E. Hallyn Acked-by: Casey Schaufler Acked-by: Mimi Zohar Reviewed-by: Paul Moore Tested-by: Paul Moore Acked-by: Eric Paris Signed-off-by: James Morris --- fs/ocfs2/xattr.h | 2 +- include/linux/security.h | 8 ++++---- include/linux/xattr.h | 2 +- include/uapi/linux/reiserfs_xattr.h | 2 +- security/capability.c | 2 +- security/integrity/evm/evm_main.c | 2 +- security/security.c | 8 +++----- security/selinux/hooks.c | 17 ++++++----------- security/smack/smack_lsm.c | 9 +++------ 9 files changed, 21 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e5c7f15465b4..19f134e896a9 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -32,7 +32,7 @@ enum ocfs2_xattr_type { struct ocfs2_security_xattr_info { int enable; - char *name; + const char *name; void *value; size_t value_len; }; diff --git a/include/linux/security.h b/include/linux/security.h index 7ce53ae1266b..9d37e2b9d3ec 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1492,7 +1492,7 @@ struct security_operations { int (*inode_alloc_security) (struct inode *inode); void (*inode_free_security) (struct inode *inode); int (*inode_init_security) (struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, const char **name, void **value, size_t *len); int (*inode_create) (struct inode *dir, struct dentry *dentry, umode_t mode); @@ -1770,7 +1770,7 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, initxattrs initxattrs, void *fs_data); int security_old_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, const char **name, void **value, size_t *len); int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode); int security_inode_link(struct dentry *old_dentry, struct inode *dir, @@ -2094,8 +2094,8 @@ static inline int security_inode_init_security(struct inode *inode, static inline int security_old_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, - char **name, void **value, - size_t *len) + const char **name, + void **value, size_t *len) { return -EOPNOTSUPP; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index fdbafc6841cf..91b0a68d38dc 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -31,7 +31,7 @@ struct xattr_handler { }; struct xattr { - char *name; + const char *name; void *value; size_t value_len; }; diff --git a/include/uapi/linux/reiserfs_xattr.h b/include/uapi/linux/reiserfs_xattr.h index d8ce17c2459a..38fdd648be21 100644 --- a/include/uapi/linux/reiserfs_xattr.h +++ b/include/uapi/linux/reiserfs_xattr.h @@ -16,7 +16,7 @@ struct reiserfs_xattr_header { }; struct reiserfs_security_handle { - char *name; + const char *name; void *value; size_t length; }; diff --git a/security/capability.c b/security/capability.c index 32b515766df1..dbeb9bc27b24 100644 --- a/security/capability.c +++ b/security/capability.c @@ -129,7 +129,7 @@ static void cap_inode_free_security(struct inode *inode) } static int cap_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, const char **name, void **value, size_t *len) { return -EOPNOTSUPP; diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index df0fa451a871..af9b6852f4e1 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -418,7 +418,7 @@ int evm_inode_init_security(struct inode *inode, evm_xattr->value = xattr_data; evm_xattr->value_len = sizeof(*xattr_data); - evm_xattr->name = kstrdup(XATTR_EVM_SUFFIX, GFP_NOFS); + evm_xattr->name = XATTR_EVM_SUFFIX; return 0; out: kfree(xattr_data); diff --git a/security/security.c b/security/security.c index 94b35aef6871..4dc31f4f2700 100644 --- a/security/security.c +++ b/security/security.c @@ -348,10 +348,10 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, if (unlikely(IS_PRIVATE(inode))) return 0; - memset(new_xattrs, 0, sizeof new_xattrs); if (!initxattrs) return security_ops->inode_init_security(inode, dir, qstr, NULL, NULL, NULL); + memset(new_xattrs, 0, sizeof(new_xattrs)); lsm_xattr = new_xattrs; ret = security_ops->inode_init_security(inode, dir, qstr, &lsm_xattr->name, @@ -366,16 +366,14 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: - for (xattr = new_xattrs; xattr->name != NULL; xattr++) { - kfree(xattr->name); + for (xattr = new_xattrs; xattr->value != NULL; xattr++) kfree(xattr->value); - } return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); int security_old_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, const char **name, void **value, size_t *len) { if (unlikely(IS_PRIVATE(inode))) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c956390a9136..a5091ec06aa6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2587,7 +2587,8 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, } static int selinux_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, + const char **name, void **value, size_t *len) { const struct task_security_struct *tsec = current_security(); @@ -2595,7 +2596,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, struct superblock_security_struct *sbsec; u32 sid, newsid, clen; int rc; - char *namep = NULL, *context; + char *context; dsec = dir->i_security; sbsec = dir->i_sb->s_security; @@ -2631,19 +2632,13 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, if (!ss_initialized || !(sbsec->flags & SE_SBLABELSUPP)) return -EOPNOTSUPP; - if (name) { - namep = kstrdup(XATTR_SELINUX_SUFFIX, GFP_NOFS); - if (!namep) - return -ENOMEM; - *name = namep; - } + if (name) + *name = XATTR_SELINUX_SUFFIX; if (value && len) { rc = security_sid_to_context_force(newsid, &context, &clen); - if (rc) { - kfree(namep); + if (rc) return rc; - } *value = context; *len = clen; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 3f7682a387b7..a113a779f00c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -582,7 +582,7 @@ static void smack_inode_free_security(struct inode *inode) * Returns 0 if it all works out, -ENOMEM if there's no memory */ static int smack_inode_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, char **name, + const struct qstr *qstr, const char **name, void **value, size_t *len) { struct inode_smack *issp = inode->i_security; @@ -591,11 +591,8 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, char *dsp = smk_of_inode(dir); int may; - if (name) { - *name = kstrdup(XATTR_SMACK_SUFFIX, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - } + if (name) + *name = XATTR_SMACK_SUFFIX; if (value) { rcu_read_lock(); -- cgit v1.2.3-71-gd317 From c4415084218c68c5ee2fc583431e89a78d896b19 Mon Sep 17 00:00:00 2001 From: Dmitry Fink Date: Mon, 8 Jul 2013 13:04:44 +0300 Subject: OMAP: UART: Keep the TX fifo full when possible Current logic results in interrupt storm since the fifo is constantly below the threshold level. Change the logic to fill all the available spaces in the fifo as long as we have data to minimize the possibilty of underflow and elimiate excessive interrupts. Signed-off-by: Dmitry Fink Signed-off-by: Alexander Savchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/omap-serial.c | 3 ++- include/uapi/linux/serial_reg.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 33c758e09d86..9271a1dceec3 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -315,7 +315,8 @@ static void transmit_chars(struct uart_omap_port *up, unsigned int lsr) serial_omap_stop_tx(&up->port); return; } - count = up->port.fifosize / 4; + count = up->port.fifosize - + (serial_in(up, UART_OMAP_TXFIFO_LVL) & 0xFF); do { serial_out(up, UART_TX, xmit->buf[xmit->tail]); xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index e6322605b138..97c26beae605 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -366,6 +366,7 @@ #define UART_OMAP_MDR1_FIR_MODE 0x05 /* FIR mode */ #define UART_OMAP_MDR1_CIR_MODE 0x06 /* CIR mode */ #define UART_OMAP_MDR1_DISABLE 0x07 /* Disable (default state) */ +#define UART_OMAP_TXFIFO_LVL 0x1A /* TX FIFO fullness */ /* * These are definitions for the Exar XR17V35X and XR17(C|D)15X -- cgit v1.2.3-71-gd317 From c4b058560762ec7ffe159b668fc47a8b7e271949 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Mon, 15 Jul 2013 12:39:23 +0100 Subject: serial:st-asc: Add ST ASC driver. This patch adds support to ASC (asynchronous serial controller) driver, which is basically a standard serial driver. This IP is common across all the ST parts for settop box platforms. ASC is embedded in ST COMMS IP block. It supports Rx & Tx functionality. It support all industry standard baud rates. Signed-off-by: Srinivas Kandagatla CC: Stephen Gallimore CC: Stuart Menefy CC: Arnd Bergmann CC: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/tty/serial/st-asc.txt | 18 + drivers/tty/serial/Kconfig | 16 + drivers/tty/serial/Makefile | 1 + drivers/tty/serial/st-asc.c | 937 +++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 5 files changed, 975 insertions(+) create mode 100644 Documentation/devicetree/bindings/tty/serial/st-asc.txt create mode 100644 drivers/tty/serial/st-asc.c (limited to 'include/uapi/linux') diff --git a/Documentation/devicetree/bindings/tty/serial/st-asc.txt b/Documentation/devicetree/bindings/tty/serial/st-asc.txt new file mode 100644 index 000000000000..75d877f5968f --- /dev/null +++ b/Documentation/devicetree/bindings/tty/serial/st-asc.txt @@ -0,0 +1,18 @@ +*st-asc(Serial Port) + +Required properties: +- compatible : Should be "st,asc". +- reg, reg-names, interrupts, interrupt-names : Standard way to define device + resources with names. look in + Documentation/devicetree/bindings/resource-names.txt + +Optional properties: +- st,hw-flow-ctrl bool flag to enable hardware flow control. +- st,force-m1 bool flat to force asc to be in Mode-1 recommeded + for high bit rates (above 19.2K) +Example: +serial@fe440000{ + compatible = "st,asc"; + reg = <0xfe440000 0x2c>; + interrupts = <0 209 0>; +}; diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 6e73781bea63..bdf5c0e44b68 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1497,6 +1497,22 @@ config SERIAL_FSL_LPUART_CONSOLE If you have enabled the lpuart serial port on the Freescale SoCs, you can make it the console by answering Y to this option. +config SERIAL_ST_ASC + tristate "ST ASC serial port support" + select SERIAL_CORE + help + This driver is for the on-chip Asychronous Serial Controller on + STMicroelectronics STi SoCs. + ASC is embedded in ST COMMS IP block. It supports Rx & Tx functionality. + It support all industry standard baud rates. + + If unsure, say N. + +config SERIAL_ST_ASC_CONSOLE + bool "Support for console on ST ASC" + depends on SERIAL_ST_ASC=y + select SERIAL_CORE_CONSOLE + endmenu endif # TTY diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index cf650f0cd6e4..47b679c547e9 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_SERIAL_KGDB_NMI) += kgdb_nmi.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_SERIAL_OMAP) += omap-serial.o obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o +obj-$(CONFIG_SERIAL_ST_ASC) += st-asc.o obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o obj-$(CONFIG_SERIAL_QE) += ucc_uart.o obj-$(CONFIG_SERIAL_TIMBERDALE) += timbuart.o diff --git a/drivers/tty/serial/st-asc.c b/drivers/tty/serial/st-asc.c new file mode 100644 index 000000000000..183879825695 --- /dev/null +++ b/drivers/tty/serial/st-asc.c @@ -0,0 +1,937 @@ +/* + * st-asc.c: ST Asynchronous serial controller (ASC) driver + * + * Copyright (C) 2003-2013 STMicroelectronics (R&D) Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#if defined(CONFIG_SERIAL_ST_ASC_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +#define SUPPORT_SYSRQ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "st-asc" +#define ASC_SERIAL_NAME "ttyAS" +#define ASC_FIFO_SIZE 16 +#define ASC_MAX_PORTS 8 + +struct asc_port { + struct uart_port port; + struct clk *clk; + unsigned int hw_flow_control:1; + unsigned int force_m1:1; +}; + +static struct asc_port asc_ports[ASC_MAX_PORTS]; +static struct uart_driver asc_uart_driver; + +/*---- UART Register definitions ------------------------------*/ + +/* Register offsets */ + +#define ASC_BAUDRATE 0x00 +#define ASC_TXBUF 0x04 +#define ASC_RXBUF 0x08 +#define ASC_CTL 0x0C +#define ASC_INTEN 0x10 +#define ASC_STA 0x14 +#define ASC_GUARDTIME 0x18 +#define ASC_TIMEOUT 0x1C +#define ASC_TXRESET 0x20 +#define ASC_RXRESET 0x24 +#define ASC_RETRIES 0x28 + +/* ASC_RXBUF */ +#define ASC_RXBUF_PE 0x100 +#define ASC_RXBUF_FE 0x200 +/** + * Some of status comes from higher bits of the character and some come from + * the status register. Combining both of them in to single status using dummy + * bits. + */ +#define ASC_RXBUF_DUMMY_RX 0x10000 +#define ASC_RXBUF_DUMMY_BE 0x20000 +#define ASC_RXBUF_DUMMY_OE 0x40000 + +/* ASC_CTL */ + +#define ASC_CTL_MODE_MSK 0x0007 +#define ASC_CTL_MODE_8BIT 0x0001 +#define ASC_CTL_MODE_7BIT_PAR 0x0003 +#define ASC_CTL_MODE_9BIT 0x0004 +#define ASC_CTL_MODE_8BIT_WKUP 0x0005 +#define ASC_CTL_MODE_8BIT_PAR 0x0007 +#define ASC_CTL_STOP_MSK 0x0018 +#define ASC_CTL_STOP_HALFBIT 0x0000 +#define ASC_CTL_STOP_1BIT 0x0008 +#define ASC_CTL_STOP_1_HALFBIT 0x0010 +#define ASC_CTL_STOP_2BIT 0x0018 +#define ASC_CTL_PARITYODD 0x0020 +#define ASC_CTL_LOOPBACK 0x0040 +#define ASC_CTL_RUN 0x0080 +#define ASC_CTL_RXENABLE 0x0100 +#define ASC_CTL_SCENABLE 0x0200 +#define ASC_CTL_FIFOENABLE 0x0400 +#define ASC_CTL_CTSENABLE 0x0800 +#define ASC_CTL_BAUDMODE 0x1000 + +/* ASC_GUARDTIME */ + +#define ASC_GUARDTIME_MSK 0x00FF + +/* ASC_INTEN */ + +#define ASC_INTEN_RBE 0x0001 +#define ASC_INTEN_TE 0x0002 +#define ASC_INTEN_THE 0x0004 +#define ASC_INTEN_PE 0x0008 +#define ASC_INTEN_FE 0x0010 +#define ASC_INTEN_OE 0x0020 +#define ASC_INTEN_TNE 0x0040 +#define ASC_INTEN_TOI 0x0080 +#define ASC_INTEN_RHF 0x0100 + +/* ASC_RETRIES */ + +#define ASC_RETRIES_MSK 0x00FF + +/* ASC_RXBUF */ + +#define ASC_RXBUF_MSK 0x03FF + +/* ASC_STA */ + +#define ASC_STA_RBF 0x0001 +#define ASC_STA_TE 0x0002 +#define ASC_STA_THE 0x0004 +#define ASC_STA_PE 0x0008 +#define ASC_STA_FE 0x0010 +#define ASC_STA_OE 0x0020 +#define ASC_STA_TNE 0x0040 +#define ASC_STA_TOI 0x0080 +#define ASC_STA_RHF 0x0100 +#define ASC_STA_TF 0x0200 +#define ASC_STA_NKD 0x0400 + +/* ASC_TIMEOUT */ + +#define ASC_TIMEOUT_MSK 0x00FF + +/* ASC_TXBUF */ + +#define ASC_TXBUF_MSK 0x01FF + +/*---- Inline function definitions ---------------------------*/ + +static inline struct asc_port *to_asc_port(struct uart_port *port) +{ + return container_of(port, struct asc_port, port); +} + +static inline u32 asc_in(struct uart_port *port, u32 offset) +{ + return readl(port->membase + offset); +} + +static inline void asc_out(struct uart_port *port, u32 offset, u32 value) +{ + writel(value, port->membase + offset); +} + +/* + * Some simple utility functions to enable and disable interrupts. + * Note that these need to be called with interrupts disabled. + */ +static inline void asc_disable_tx_interrupts(struct uart_port *port) +{ + u32 intenable = asc_in(port, ASC_INTEN) & ~ASC_INTEN_THE; + asc_out(port, ASC_INTEN, intenable); + (void)asc_in(port, ASC_INTEN); /* Defeat bus write posting */ +} + +static inline void asc_enable_tx_interrupts(struct uart_port *port) +{ + u32 intenable = asc_in(port, ASC_INTEN) | ASC_INTEN_THE; + asc_out(port, ASC_INTEN, intenable); +} + +static inline void asc_disable_rx_interrupts(struct uart_port *port) +{ + u32 intenable = asc_in(port, ASC_INTEN) & ~ASC_INTEN_RBE; + asc_out(port, ASC_INTEN, intenable); + (void)asc_in(port, ASC_INTEN); /* Defeat bus write posting */ +} + +static inline void asc_enable_rx_interrupts(struct uart_port *port) +{ + u32 intenable = asc_in(port, ASC_INTEN) | ASC_INTEN_RBE; + asc_out(port, ASC_INTEN, intenable); +} + +static inline u32 asc_txfifo_is_empty(struct uart_port *port) +{ + return asc_in(port, ASC_STA) & ASC_STA_TE; +} + +static inline int asc_txfifo_is_full(struct uart_port *port) +{ + return asc_in(port, ASC_STA) & ASC_STA_TF; +} + +static inline const char *asc_port_name(struct uart_port *port) +{ + return to_platform_device(port->dev)->name; +} + +/*----------------------------------------------------------------------*/ + +/* + * This section contains code to support the use of the ASC as a + * generic serial port. + */ + +static inline unsigned asc_hw_txroom(struct uart_port *port) +{ + u32 status = asc_in(port, ASC_STA); + + if (status & ASC_STA_THE) + return port->fifosize / 2; + else if (!(status & ASC_STA_TF)) + return 1; + + return 0; +} + +/* + * Start transmitting chars. + * This is called from both interrupt and task level. + * Either way interrupts are disabled. + */ +static void asc_transmit_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + int txroom; + unsigned char c; + + txroom = asc_hw_txroom(port); + + if ((txroom != 0) && port->x_char) { + c = port->x_char; + port->x_char = 0; + asc_out(port, ASC_TXBUF, c); + port->icount.tx++; + txroom = asc_hw_txroom(port); + } + + if (uart_tx_stopped(port)) { + /* + * We should try and stop the hardware here, but I + * don't think the ASC has any way to do that. + */ + asc_disable_tx_interrupts(port); + return; + } + + if (uart_circ_empty(xmit)) { + asc_disable_tx_interrupts(port); + return; + } + + if (txroom == 0) + return; + + do { + c = xmit->buf[xmit->tail]; + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + asc_out(port, ASC_TXBUF, c); + port->icount.tx++; + txroom--; + } while ((txroom > 0) && (!uart_circ_empty(xmit))); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (uart_circ_empty(xmit)) + asc_disable_tx_interrupts(port); +} + +static void asc_receive_chars(struct uart_port *port) +{ + struct tty_port *tport = &port->state->port; + unsigned long status; + unsigned long c = 0; + char flag; + + if (port->irq_wake) + pm_wakeup_event(tport->tty->dev, 0); + + while ((status = asc_in(port, ASC_STA)) & ASC_STA_RBF) { + c = asc_in(port, ASC_RXBUF) | ASC_RXBUF_DUMMY_RX; + flag = TTY_NORMAL; + port->icount.rx++; + + if ((c & (ASC_RXBUF_FE | ASC_RXBUF_PE)) || + status & ASC_STA_OE) { + + if (c & ASC_RXBUF_FE) { + if (c == ASC_RXBUF_FE) { + port->icount.brk++; + if (uart_handle_break(port)) + continue; + c |= ASC_RXBUF_DUMMY_BE; + } else { + port->icount.frame++; + } + } else if (c & ASC_RXBUF_PE) { + port->icount.parity++; + } + /* + * Reading any data from the RX FIFO clears the + * overflow error condition. + */ + if (status & ASC_STA_OE) { + port->icount.overrun++; + c |= ASC_RXBUF_DUMMY_OE; + } + + c &= port->read_status_mask; + + if (c & ASC_RXBUF_DUMMY_BE) + flag = TTY_BREAK; + else if (c & ASC_RXBUF_PE) + flag = TTY_PARITY; + else if (c & ASC_RXBUF_FE) + flag = TTY_FRAME; + } + + if (uart_handle_sysrq_char(port, c)) + continue; + + uart_insert_char(port, c, ASC_RXBUF_DUMMY_OE, c & 0xff, flag); + } + + /* Tell the rest of the system the news. New characters! */ + tty_flip_buffer_push(tport); +} + +static irqreturn_t asc_interrupt(int irq, void *ptr) +{ + struct uart_port *port = ptr; + u32 status; + + spin_lock(&port->lock); + + status = asc_in(port, ASC_STA); + + if (status & ASC_STA_RBF) { + /* Receive FIFO not empty */ + asc_receive_chars(port); + } + + if ((status & ASC_STA_THE) && + (asc_in(port, ASC_INTEN) & ASC_INTEN_THE)) { + /* Transmitter FIFO at least half empty */ + asc_transmit_chars(port); + } + + spin_unlock(&port->lock); + + return IRQ_HANDLED; +} + +/*----------------------------------------------------------------------*/ + +/* + * UART Functions + */ + +static unsigned int asc_tx_empty(struct uart_port *port) +{ + return asc_txfifo_is_empty(port) ? TIOCSER_TEMT : 0; +} + +static void asc_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + /* + * This routine is used for seting signals of: DTR, DCD, CTS/RTS + * We use ASC's hardware for CTS/RTS, so don't need any for that. + * Some boards have DTR and DCD implemented using PIO pins, + * code to do this should be hooked in here. + */ +} + +static unsigned int asc_get_mctrl(struct uart_port *port) +{ + /* + * This routine is used for geting signals of: DTR, DCD, DSR, RI, + * and CTS/RTS + */ + return TIOCM_CAR | TIOCM_DSR | TIOCM_CTS; +} + +/* There are probably characters waiting to be transmitted. */ +static void asc_start_tx(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + + if (!uart_circ_empty(xmit)) + asc_enable_tx_interrupts(port); +} + +/* Transmit stop */ +static void asc_stop_tx(struct uart_port *port) +{ + asc_disable_tx_interrupts(port); +} + +/* Receive stop */ +static void asc_stop_rx(struct uart_port *port) +{ + asc_disable_rx_interrupts(port); +} + +/* Force modem status interrupts on */ +static void asc_enable_ms(struct uart_port *port) +{ + /* Nothing here yet .. */ +} + +/* Handle breaks - ignored by us */ +static void asc_break_ctl(struct uart_port *port, int break_state) +{ + /* Nothing here yet .. */ +} + +/* + * Enable port for reception. + */ +static int asc_startup(struct uart_port *port) +{ + if (request_irq(port->irq, asc_interrupt, IRQF_NO_SUSPEND, + asc_port_name(port), port)) { + dev_err(port->dev, "cannot allocate irq.\n"); + return -ENODEV; + } + + asc_transmit_chars(port); + asc_enable_rx_interrupts(port); + + return 0; +} + +static void asc_shutdown(struct uart_port *port) +{ + asc_disable_tx_interrupts(port); + asc_disable_rx_interrupts(port); + free_irq(port->irq, port); +} + +static void asc_pm(struct uart_port *port, unsigned int state, + unsigned int oldstate) +{ + struct asc_port *ascport = to_asc_port(port); + unsigned long flags = 0; + u32 ctl; + + switch (state) { + case UART_PM_STATE_ON: + clk_prepare_enable(ascport->clk); + break; + case UART_PM_STATE_OFF: + /* + * Disable the ASC baud rate generator, which is as close as + * we can come to turning it off. Note this is not called with + * the port spinlock held. + */ + spin_lock_irqsave(&port->lock, flags); + ctl = asc_in(port, ASC_CTL) & ~ASC_CTL_RUN; + asc_out(port, ASC_CTL, ctl); + spin_unlock_irqrestore(&port->lock, flags); + clk_disable_unprepare(ascport->clk); + break; + } +} + +static void asc_set_termios(struct uart_port *port, struct ktermios *termios, + struct ktermios *old) +{ + struct asc_port *ascport = to_asc_port(port); + unsigned int baud; + u32 ctrl_val; + tcflag_t cflag; + unsigned long flags; + + /* Update termios to reflect hardware capabilities */ + termios->c_cflag &= ~(CMSPAR | + (ascport->hw_flow_control ? 0 : CRTSCTS)); + + port->uartclk = clk_get_rate(ascport->clk); + + baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk/16); + cflag = termios->c_cflag; + + spin_lock_irqsave(&port->lock, flags); + + /* read control register */ + ctrl_val = asc_in(port, ASC_CTL); + + /* stop serial port and reset value */ + asc_out(port, ASC_CTL, (ctrl_val & ~ASC_CTL_RUN)); + ctrl_val = ASC_CTL_RXENABLE | ASC_CTL_FIFOENABLE; + + /* reset fifo rx & tx */ + asc_out(port, ASC_TXRESET, 1); + asc_out(port, ASC_RXRESET, 1); + + /* set character length */ + if ((cflag & CSIZE) == CS7) { + ctrl_val |= ASC_CTL_MODE_7BIT_PAR; + } else { + ctrl_val |= (cflag & PARENB) ? ASC_CTL_MODE_8BIT_PAR : + ASC_CTL_MODE_8BIT; + } + + /* set stop bit */ + ctrl_val |= (cflag & CSTOPB) ? ASC_CTL_STOP_2BIT : ASC_CTL_STOP_1BIT; + + /* odd parity */ + if (cflag & PARODD) + ctrl_val |= ASC_CTL_PARITYODD; + + /* hardware flow control */ + if ((cflag & CRTSCTS)) + ctrl_val |= ASC_CTL_CTSENABLE; + + if ((baud < 19200) && !ascport->force_m1) { + asc_out(port, ASC_BAUDRATE, (port->uartclk / (16 * baud))); + } else { + /* + * MODE 1: recommended for high bit rates (above 19.2K) + * + * baudrate * 16 * 2^16 + * ASCBaudRate = ------------------------ + * inputclock + * + * However to keep the maths inside 32bits we divide top and + * bottom by 64. The +1 is to avoid a divide by zero if the + * input clock rate is something unexpected. + */ + u32 counter = (baud * 16384) / ((port->uartclk / 64) + 1); + asc_out(port, ASC_BAUDRATE, counter); + ctrl_val |= ASC_CTL_BAUDMODE; + } + + uart_update_timeout(port, cflag, baud); + + ascport->port.read_status_mask = ASC_RXBUF_DUMMY_OE; + if (termios->c_iflag & INPCK) + ascport->port.read_status_mask |= ASC_RXBUF_FE | ASC_RXBUF_PE; + if (termios->c_iflag & (BRKINT | PARMRK)) + ascport->port.read_status_mask |= ASC_RXBUF_DUMMY_BE; + + /* + * Characters to ignore + */ + ascport->port.ignore_status_mask = 0; + if (termios->c_iflag & IGNPAR) + ascport->port.ignore_status_mask |= ASC_RXBUF_FE | ASC_RXBUF_PE; + if (termios->c_iflag & IGNBRK) { + ascport->port.ignore_status_mask |= ASC_RXBUF_DUMMY_BE; + /* + * If we're ignoring parity and break indicators, + * ignore overruns too (for real raw support). + */ + if (termios->c_iflag & IGNPAR) + ascport->port.ignore_status_mask |= ASC_RXBUF_DUMMY_OE; + } + + /* + * Ignore all characters if CREAD is not set. + */ + if (!(termios->c_cflag & CREAD)) + ascport->port.ignore_status_mask |= ASC_RXBUF_DUMMY_RX; + + /* Set the timeout */ + asc_out(port, ASC_TIMEOUT, 20); + + /* write final value and enable port */ + asc_out(port, ASC_CTL, (ctrl_val | ASC_CTL_RUN)); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static const char *asc_type(struct uart_port *port) +{ + return (port->type == PORT_ASC) ? DRIVER_NAME : NULL; +} + +static void asc_release_port(struct uart_port *port) +{ +} + +static int asc_request_port(struct uart_port *port) +{ + return 0; +} + +/* + * Called when the port is opened, and UPF_BOOT_AUTOCONF flag is set + * Set type field if successful + */ +static void asc_config_port(struct uart_port *port, int flags) +{ + if ((flags & UART_CONFIG_TYPE)) + port->type = PORT_ASC; +} + +static int +asc_verify_port(struct uart_port *port, struct serial_struct *ser) +{ + /* No user changeable parameters */ + return -EINVAL; +} + +#ifdef CONFIG_CONSOLE_POLL +/* + * Console polling routines for writing and reading from the uart while + * in an interrupt or debug context (i.e. kgdb). + */ + +static int asc_get_poll_char(struct uart_port *port) +{ + if (!(asc_in(port, ASC_STA) & ASC_STA_RBF)) + return NO_POLL_CHAR; + + return asc_in(port, ASC_RXBUF); +} + +static void asc_put_poll_char(struct uart_port *port, unsigned char c) +{ + while (asc_txfifo_is_full(port)) + cpu_relax(); + asc_out(port, ASC_TXBUF, c); +} + +#endif /* CONFIG_CONSOLE_POLL */ + +/*---------------------------------------------------------------------*/ + +static struct uart_ops asc_uart_ops = { + .tx_empty = asc_tx_empty, + .set_mctrl = asc_set_mctrl, + .get_mctrl = asc_get_mctrl, + .start_tx = asc_start_tx, + .stop_tx = asc_stop_tx, + .stop_rx = asc_stop_rx, + .enable_ms = asc_enable_ms, + .break_ctl = asc_break_ctl, + .startup = asc_startup, + .shutdown = asc_shutdown, + .set_termios = asc_set_termios, + .type = asc_type, + .release_port = asc_release_port, + .request_port = asc_request_port, + .config_port = asc_config_port, + .verify_port = asc_verify_port, + .pm = asc_pm, +#ifdef CONFIG_CONSOLE_POLL + .poll_get_char = asc_get_poll_char, + .poll_put_char = asc_put_poll_char, +#endif /* CONFIG_CONSOLE_POLL */ +}; + +static int asc_init_port(struct asc_port *ascport, + struct platform_device *pdev) +{ + struct uart_port *port = &ascport->port; + struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) { + dev_err(&pdev->dev, "Unable to get io resource\n"); + return -ENODEV; + } + + port->iotype = UPIO_MEM; + port->flags = UPF_BOOT_AUTOCONF; + port->ops = &asc_uart_ops; + port->fifosize = ASC_FIFO_SIZE; + port->dev = &pdev->dev; + port->mapbase = res->start; + port->irq = platform_get_irq(pdev, 0); + + port->membase = devm_request_and_ioremap(&pdev->dev, res); + if (!port->membase) { + dev_err(&pdev->dev, "Unable to request io memory\n"); + return -ENODEV; + } + + spin_lock_init(&port->lock); + + ascport->clk = devm_clk_get(&pdev->dev, NULL); + + if (WARN_ON(IS_ERR(ascport->clk))) + return -EINVAL; + /* ensure that clk rate is correct by enabling the clk */ + clk_prepare_enable(ascport->clk); + ascport->port.uartclk = clk_get_rate(ascport->clk); + WARN_ON(ascport->port.uartclk == 0); + clk_disable_unprepare(ascport->clk); + + return 0; +} + +static struct asc_port *asc_of_get_asc_port(struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + int id; + + if (!np) + return NULL; + + id = of_alias_get_id(np, ASC_SERIAL_NAME); + + if (id < 0) + id = 0; + + if (WARN_ON(id >= ASC_MAX_PORTS)) + return NULL; + + asc_ports[id].hw_flow_control = of_property_read_bool(np, + "st,hw-flow-control"); + asc_ports[id].force_m1 = of_property_read_bool(np, "st,force_m1"); + asc_ports[id].port.line = id; + return &asc_ports[id]; +} + +static struct of_device_id asc_match[] = { + { .compatible = "st,asc", }, + {}, +}; + +MODULE_DEVICE_TABLE(of, asc_match); + +static int asc_serial_probe(struct platform_device *pdev) +{ + int ret; + struct asc_port *ascport; + + ascport = asc_of_get_asc_port(pdev); + if (!ascport) + return -ENODEV; + + ret = asc_init_port(ascport, pdev); + if (ret) + return ret; + + ret = uart_add_one_port(&asc_uart_driver, &ascport->port); + if (ret) + return ret; + + platform_set_drvdata(pdev, &ascport->port); + + return 0; +} + +static int asc_serial_remove(struct platform_device *pdev) +{ + struct uart_port *port = platform_get_drvdata(pdev); + + platform_set_drvdata(pdev, NULL); + return uart_remove_one_port(&asc_uart_driver, port); +} + +#ifdef CONFIG_PM_SLEEP +static int asc_serial_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct uart_port *port = platform_get_drvdata(pdev); + + return uart_suspend_port(&asc_uart_driver, port); +} + +static int asc_serial_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct uart_port *port = platform_get_drvdata(pdev); + + return uart_resume_port(&asc_uart_driver, port); +} + +#endif /* CONFIG_PM_SLEEP */ + +/*----------------------------------------------------------------------*/ + +#ifdef CONFIG_SERIAL_ST_ASC_CONSOLE +static void asc_console_putchar(struct uart_port *port, int ch) +{ + unsigned int timeout = 1000000; + + /* Wait for upto 1 second in case flow control is stopping us. */ + while (--timeout && asc_txfifo_is_full(port)) + udelay(1); + + asc_out(port, ASC_TXBUF, ch); +} + +/* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... + */ + +static void asc_console_write(struct console *co, const char *s, unsigned count) +{ + struct uart_port *port = &asc_ports[co->index].port; + unsigned long flags; + unsigned long timeout = 1000000; + int locked = 1; + u32 intenable; + + local_irq_save(flags); + if (port->sysrq) + locked = 0; /* asc_interrupt has already claimed the lock */ + else if (oops_in_progress) + locked = spin_trylock(&port->lock); + else + spin_lock(&port->lock); + + /* + * Disable interrupts so we don't get the IRQ line bouncing + * up and down while interrupts are disabled. + */ + intenable = asc_in(port, ASC_INTEN); + asc_out(port, ASC_INTEN, 0); + (void)asc_in(port, ASC_INTEN); /* Defeat bus write posting */ + + uart_console_write(port, s, count, asc_console_putchar); + + while (--timeout && !asc_txfifo_is_empty(port)) + udelay(1); + + asc_out(port, ASC_INTEN, intenable); + + if (locked) + spin_unlock(&port->lock); + local_irq_restore(flags); +} + +static int asc_console_setup(struct console *co, char *options) +{ + struct asc_port *ascport; + int baud = 9600; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if (co->index >= ASC_MAX_PORTS) + return -ENODEV; + + ascport = &asc_ports[co->index]; + + /* + * This driver does not support early console initialization + * (use ARM early printk support instead), so we only expect + * this to be called during the uart port registration when the + * driver gets probed and the port should be mapped at that point. + */ + BUG_ON(ascport->port.mapbase == 0 || ascport->port.membase == NULL); + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(&ascport->port, co, baud, parity, bits, flow); +} + +static struct console asc_console = { + .name = ASC_SERIAL_NAME, + .device = uart_console_device, + .write = asc_console_write, + .setup = asc_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &asc_uart_driver, +}; + +#define ASC_SERIAL_CONSOLE (&asc_console) + +#else +#define ASC_SERIAL_CONSOLE NULL +#endif /* CONFIG_SERIAL_ST_ASC_CONSOLE */ + +static struct uart_driver asc_uart_driver = { + .owner = THIS_MODULE, + .driver_name = DRIVER_NAME, + .dev_name = ASC_SERIAL_NAME, + .major = 0, + .minor = 0, + .nr = ASC_MAX_PORTS, + .cons = ASC_SERIAL_CONSOLE, +}; + +static const struct dev_pm_ops asc_serial_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(asc_serial_suspend, asc_serial_resume) +}; + +static struct platform_driver asc_serial_driver = { + .probe = asc_serial_probe, + .remove = asc_serial_remove, + .driver = { + .name = DRIVER_NAME, + .pm = &asc_serial_pm_ops, + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(asc_match), + }, +}; + +static int __init asc_init(void) +{ + int ret; + static char banner[] __initdata = + KERN_INFO "STMicroelectronics ASC driver initialized\n"; + + printk(banner); + + ret = uart_register_driver(&asc_uart_driver); + if (ret) + return ret; + + ret = platform_driver_register(&asc_serial_driver); + if (ret) + uart_unregister_driver(&asc_uart_driver); + + return ret; +} + +static void __exit asc_exit(void) +{ + platform_driver_unregister(&asc_serial_driver); + uart_unregister_driver(&asc_uart_driver); +} + +module_init(asc_init); +module_exit(asc_exit); + +MODULE_ALIAS("platform:" DRIVER_NAME); +MODULE_AUTHOR("STMicroelectronics (R&D) Limited"); +MODULE_DESCRIPTION("STMicroelectronics ASC serial port driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 9119cc0977bf..e40ebe124ced 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -232,4 +232,7 @@ /* SH-SCI */ #define PORT_HSCIF 104 +/* ST ASC type numbers */ +#define PORT_ASC 105 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3-71-gd317 From 0699a73af3811b66b1ab5650575acee5eea841ab Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Mon, 22 Jul 2013 21:32:09 +0200 Subject: firewire: fix libdc1394/FlyCap2 iso event regression Commit 18d627113b83 (firewire: prevent dropping of completed iso packet header data) was intended to be an obvious bug fix, but libdc1394 and FlyCap2 depend on the old behaviour by ignoring all returned information and thus not noticing that not all packets have been received yet. The result was that the video frame buffers would be saved before they contained the correct data. Reintroduce the old behaviour for old clients. Tested-by: Stepan Salenikovich Tested-by: Josep Bosch Cc: # 3.4+ Signed-off-by: Clemens Ladisch Signed-off-by: Stefan Richter --- drivers/firewire/core-cdev.c | 3 +++ drivers/firewire/ohci.c | 10 ++++++++-- include/linux/firewire.h | 1 + include/uapi/linux/firewire-cdev.h | 4 ++-- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 7ef316fdc4d9..ac1b43a04285 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -54,6 +54,7 @@ #define FW_CDEV_KERNEL_VERSION 5 #define FW_CDEV_VERSION_EVENT_REQUEST2 4 #define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 +#define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 struct client { u32 version; @@ -1005,6 +1006,8 @@ static int ioctl_create_iso_context(struct client *client, union ioctl_arg *arg) a->channel, a->speed, a->header_size, cb, client); if (IS_ERR(context)) return PTR_ERR(context); + if (client->version < FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW) + context->drop_overflow_headers = true; /* We only support one context at this time. */ spin_lock_irq(&client->lock); diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 9e1db6490b9a..afb701ec90ca 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2749,8 +2749,11 @@ static void copy_iso_headers(struct iso_context *ctx, const u32 *dma_hdr) { u32 *ctx_hdr; - if (ctx->header_length + ctx->base.header_size > PAGE_SIZE) + if (ctx->header_length + ctx->base.header_size > PAGE_SIZE) { + if (ctx->base.drop_overflow_headers) + return; flush_iso_completions(ctx); + } ctx_hdr = ctx->header + ctx->header_length; ctx->last_timestamp = (u16)le32_to_cpu((__force __le32)dma_hdr[0]); @@ -2910,8 +2913,11 @@ static int handle_it_packet(struct context *context, sync_it_packet_for_cpu(context, d); - if (ctx->header_length + 4 > PAGE_SIZE) + if (ctx->header_length + 4 > PAGE_SIZE) { + if (ctx->base.drop_overflow_headers) + return 1; flush_iso_completions(ctx); + } ctx_hdr = ctx->header + ctx->header_length; ctx->last_timestamp = le16_to_cpu(last->res_count); diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 3b0e820375ab..5d7782e42b8f 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -436,6 +436,7 @@ struct fw_iso_context { int type; int channel; int speed; + bool drop_overflow_headers; size_t header_size; union { fw_iso_callback_t sc; diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index d50036953497..1db453e4b550 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -215,8 +215,8 @@ struct fw_cdev_event_request2 { * with the %FW_CDEV_ISO_INTERRUPT bit set, when explicitly requested with * %FW_CDEV_IOC_FLUSH_ISO, or when there have been so many completed packets * without the interrupt bit set that the kernel's internal buffer for @header - * is about to overflow. (In the last case, kernels with ABI version < 5 drop - * header data up to the next interrupt packet.) + * is about to overflow. (In the last case, ABI versions < 5 drop header data + * up to the next interrupt packet.) * * Isochronous transmit events (context type %FW_CDEV_ISO_CONTEXT_TRANSMIT): * -- cgit v1.2.3-71-gd317 From e7428e95a06fb516fac1308bd0e176e27c0b9287 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 25 Jul 2013 10:20:23 +0930 Subject: virtio-net: put virtio net header inline with data For small packets we can simplify xmit processing by linearizing buffers with the header: most packets seem to have enough head room we can use for this purpose. Since existing hypervisors require that header is the first s/g element, we need a feature bit for this. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 42 +++++++++++++++++++++++++++++++++-------- include/uapi/linux/virtio_net.h | 4 +++- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3d2a90a62649..f21600277583 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -106,6 +106,9 @@ struct virtnet_info { /* Has control virtqueue */ bool has_cvq; + /* Host can handle any s/g split between our header and packet data */ + bool any_header_sg; + /* enable config space updates */ bool config_enable; @@ -669,12 +672,28 @@ static void free_old_xmit_skbs(struct send_queue *sq) static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) { - struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); + struct skb_vnet_hdr *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; struct virtnet_info *vi = sq->vq->vdev->priv; unsigned num_sg; + unsigned hdr_len; + bool can_push; pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + if (vi->mergeable_rx_bufs) + hdr_len = sizeof hdr->mhdr; + else + hdr_len = sizeof hdr->hdr; + + can_push = vi->any_header_sg && + !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && + !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; + /* Even if we can, don't push here yet as this would skew + * csum_start offset below. */ + if (can_push) + hdr = (struct skb_vnet_hdr *)(skb->data - hdr_len); + else + hdr = skb_vnet_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; @@ -703,15 +722,18 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr->hdr.gso_size = hdr->hdr.hdr_len = 0; } - hdr->mhdr.num_buffers = 0; - - /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) - sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr); - else - sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); + hdr->mhdr.num_buffers = 0; - num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; + if (can_push) { + __skb_push(skb, hdr_len); + num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len); + /* Pull header back to avoid skew in tx bytes calculations. */ + __skb_pull(skb, hdr_len); + } else { + sg_set_buf(sq->sg, hdr, hdr_len); + num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; + } return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); } @@ -1552,6 +1574,9 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; + if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT)) + vi->any_header_sg = true; + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) vi->has_cvq = true; @@ -1727,6 +1752,7 @@ static unsigned int features[] = { VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, VIRTIO_NET_F_CTRL_MAC_ADDR, + VIRTIO_F_ANY_LAYOUT, }; static struct virtio_driver virtio_net_driver = { diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index c520203fac2f..227d4ce84e79 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -70,7 +70,9 @@ struct virtio_net_config { __u16 max_virtqueue_pairs; } __attribute__((packed)); -/* This is the first element of the scatter-gather list. If you don't +/* This header comes first in the scatter-gather list. + * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must + * be the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset -- cgit v1.2.3-71-gd317 From 9ea7187c53f63e31f2d1b2b1e474e31808565009 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 31 Jul 2013 01:19:43 +0200 Subject: NFC: netlink: Rename CMD_FW_UPLOAD to CMD_FW_DOWNLOAD Loading a firmware into a target is typically called firmware download, not firmware upload. So we rename the netlink API to NFC_CMD_FW_DOWNLOAD in order to avoid any terminology confusion from userspace. Signed-off-by: Samuel Ortiz --- include/net/nfc/hci.h | 2 +- include/net/nfc/nfc.h | 4 ++-- include/uapi/linux/nfc.h | 6 +++--- net/nfc/core.c | 20 ++++++++++---------- net/nfc/hci/core.c | 8 ++++---- net/nfc/netlink.c | 12 ++++++------ net/nfc/nfc.h | 6 +++--- 7 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h index 0af851c3b038..b64b7bce4b94 100644 --- a/include/net/nfc/hci.h +++ b/include/net/nfc/hci.h @@ -59,7 +59,7 @@ struct nfc_hci_ops { struct nfc_target *target); int (*event_received)(struct nfc_hci_dev *hdev, u8 gate, u8 event, struct sk_buff *skb); - int (*fw_upload)(struct nfc_hci_dev *hdev, const char *firmware_name); + int (*fw_download)(struct nfc_hci_dev *hdev, const char *firmware_name); int (*discover_se)(struct nfc_hci_dev *dev); int (*enable_se)(struct nfc_hci_dev *dev, u32 se_idx); int (*disable_se)(struct nfc_hci_dev *dev, u32 se_idx); diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 0e353f1658bb..5f286b726bb6 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -68,7 +68,7 @@ struct nfc_ops { void *cb_context); int (*tm_send)(struct nfc_dev *dev, struct sk_buff *skb); int (*check_presence)(struct nfc_dev *dev, struct nfc_target *target); - int (*fw_upload)(struct nfc_dev *dev, const char *firmware_name); + int (*fw_download)(struct nfc_dev *dev, const char *firmware_name); /* Secure Element API */ int (*discover_se)(struct nfc_dev *dev); @@ -127,7 +127,7 @@ struct nfc_dev { int targets_generation; struct device dev; bool dev_up; - bool fw_upload_in_progress; + bool fw_download_in_progress; u8 rf_mode; bool polling; struct nfc_target *active_target; diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index caed0f324d5f..8137dd8d2adf 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -69,8 +69,8 @@ * starting a poll from a device which has a secure element enabled means * we want to do SE based card emulation. * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element. - * @NFC_CMD_FW_UPLOAD: Request to Load/flash firmware, or event to inform that - * some firmware was loaded + * @NFC_CMD_FW_DOWNLOAD: Request to Load/flash firmware, or event to inform + * that some firmware was loaded */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -94,7 +94,7 @@ enum nfc_commands { NFC_CMD_DISABLE_SE, NFC_CMD_LLC_SDREQ, NFC_EVENT_LLC_SDRES, - NFC_CMD_FW_UPLOAD, + NFC_CMD_FW_DOWNLOAD, NFC_EVENT_SE_ADDED, NFC_EVENT_SE_REMOVED, /* private: internal use only */ diff --git a/net/nfc/core.c b/net/nfc/core.c index dc96a83aa6ab..1d074dd1650f 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -44,7 +44,7 @@ DEFINE_MUTEX(nfc_devlist_mutex); /* NFC device ID bitmap */ static DEFINE_IDA(nfc_index_ida); -int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name) +int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name) { int rc = 0; @@ -62,28 +62,28 @@ int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name) goto error; } - if (!dev->ops->fw_upload) { + if (!dev->ops->fw_download) { rc = -EOPNOTSUPP; goto error; } - dev->fw_upload_in_progress = true; - rc = dev->ops->fw_upload(dev, firmware_name); + dev->fw_download_in_progress = true; + rc = dev->ops->fw_download(dev, firmware_name); if (rc) - dev->fw_upload_in_progress = false; + dev->fw_download_in_progress = false; error: device_unlock(&dev->dev); return rc; } -int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name) +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name) { - dev->fw_upload_in_progress = false; + dev->fw_download_in_progress = false; - return nfc_genl_fw_upload_done(dev, firmware_name); + return nfc_genl_fw_download_done(dev, firmware_name); } -EXPORT_SYMBOL(nfc_fw_upload_done); +EXPORT_SYMBOL(nfc_fw_download_done); /** * nfc_dev_up - turn on the NFC device @@ -110,7 +110,7 @@ int nfc_dev_up(struct nfc_dev *dev) goto error; } - if (dev->fw_upload_in_progress) { + if (dev->fw_download_in_progress) { rc = -EBUSY; goto error; } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 7b1c186736eb..fe66908401f5 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -809,14 +809,14 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb) } } -static int hci_fw_upload(struct nfc_dev *nfc_dev, const char *firmware_name) +static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) { struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev); - if (!hdev->ops->fw_upload) + if (!hdev->ops->fw_download) return -ENOTSUPP; - return hdev->ops->fw_upload(hdev, firmware_name); + return hdev->ops->fw_download(hdev, firmware_name); } static struct nfc_ops hci_nfc_ops = { @@ -831,7 +831,7 @@ static struct nfc_ops hci_nfc_ops = { .im_transceive = hci_transceive, .tm_send = hci_tm_send, .check_presence = hci_check_presence, - .fw_upload = hci_fw_upload, + .fw_download = hci_fw_download, .discover_se = hci_discover_se, .enable_se = hci_enable_se, .disable_se = hci_disable_se, diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index b05ad909778f..f16fd59d4160 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1089,7 +1089,7 @@ exit: return rc; } -static int nfc_genl_fw_upload(struct sk_buff *skb, struct genl_info *info) +static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; @@ -1108,13 +1108,13 @@ static int nfc_genl_fw_upload(struct sk_buff *skb, struct genl_info *info) nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); - rc = nfc_fw_upload(dev, firmware_name); + rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } -int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name) +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name) { struct sk_buff *msg; void *hdr; @@ -1124,7 +1124,7 @@ int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, - NFC_CMD_FW_UPLOAD); + NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; @@ -1251,8 +1251,8 @@ static struct genl_ops nfc_genl_ops[] = { .policy = nfc_genl_policy, }, { - .cmd = NFC_CMD_FW_UPLOAD, - .doit = nfc_genl_fw_upload, + .cmd = NFC_CMD_FW_DOWNLOAD, + .doit = nfc_genl_fw_download, .policy = nfc_genl_policy, }, { diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index ee85a1fc1b24..820a7850c36a 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -123,10 +123,10 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter) class_dev_iter_exit(iter); } -int nfc_fw_upload(struct nfc_dev *dev, const char *firmware_name); -int nfc_genl_fw_upload_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name); +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name); -int nfc_fw_upload_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name); int nfc_dev_up(struct nfc_dev *dev); -- cgit v1.2.3-71-gd317 From 66cae9ed6bc46b8cc57a9693f99f69926f3cc7ef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 29 Jul 2013 18:16:50 +0200 Subject: rtnl: export physical port id via RT netlink Signed-off-by: Jiri Pirko Acked-by: Ben Hutchings Signed-off-by: Narendra K Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 03f6170ab337..04c0e7a5d484 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -143,6 +143,7 @@ enum { IFLA_NUM_TX_QUEUES, IFLA_NUM_RX_QUEUES, IFLA_CARRIER, + IFLA_PHYS_PORT_ID, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3de740834d1f..0b2972c59b97 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ - + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ + + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_port_id ppid; + + err = dev_get_phys_port_id(dev, &ppid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } + if (rtnl_phys_port_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, }; EXPORT_SYMBOL(ifla_policy); -- cgit v1.2.3-71-gd317 From bdb829e1dd710029a075b5f86d4053e7715beb06 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Mon, 15 Jul 2013 19:10:15 +0200 Subject: HID: uhid: use generic hidinput_input_event() HID core provides the same functionality and can convert the input event to a raw output report. We can thus drop UHID_OUTPUT_EV and rely on the mandatory UHID_OUTPUT. User-space wasn't able to do anything with UHID_OUTPUT_EV, anyway. They don't have access to the report fields. Signed-off-by: David Herrmann Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- Documentation/hid/uhid.txt | 4 +++- drivers/hid/uhid.c | 25 ------------------------- include/uapi/linux/uhid.h | 4 +++- 3 files changed, 6 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/hid/uhid.txt b/Documentation/hid/uhid.txt index 3c741214dfbb..dc35a2b75eee 100644 --- a/Documentation/hid/uhid.txt +++ b/Documentation/hid/uhid.txt @@ -149,11 +149,13 @@ needs. Only UHID_OUTPUT and UHID_OUTPUT_EV have payloads. is of type "struct uhid_data_req". This may be received even though you haven't received UHID_OPEN, yet. - UHID_OUTPUT_EV: + UHID_OUTPUT_EV (obsolete): Same as UHID_OUTPUT but this contains a "struct input_event" as payload. This is called for force-feedback, LED or similar events which are received through an input device by the HID subsystem. You should convert this into raw reports and send them to your device similar to events of type UHID_OUTPUT. + This is no longer sent by newer kernels. Instead, HID core converts it into a + raw output report and sends it via UHID_OUTPUT. UHID_FEATURE: This event is sent if the kernel driver wants to perform a feature request as diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index fc307e0422af..f53f2d52e677 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -116,30 +116,6 @@ static void uhid_hid_close(struct hid_device *hid) uhid_queue_event(uhid, UHID_CLOSE); } -static int uhid_hid_input(struct input_dev *input, unsigned int type, - unsigned int code, int value) -{ - struct hid_device *hid = input_get_drvdata(input); - struct uhid_device *uhid = hid->driver_data; - unsigned long flags; - struct uhid_event *ev; - - ev = kzalloc(sizeof(*ev), GFP_ATOMIC); - if (!ev) - return -ENOMEM; - - ev->type = UHID_OUTPUT_EV; - ev->u.output_ev.type = type; - ev->u.output_ev.code = code; - ev->u.output_ev.value = value; - - spin_lock_irqsave(&uhid->qlock, flags); - uhid_queue(uhid, ev); - spin_unlock_irqrestore(&uhid->qlock, flags); - - return 0; -} - static int uhid_hid_parse(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; @@ -273,7 +249,6 @@ static struct hid_ll_driver uhid_hid_driver = { .stop = uhid_hid_stop, .open = uhid_hid_open, .close = uhid_hid_close, - .hidinput_input_event = uhid_hid_input, .parse = uhid_hid_parse, }; diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index e9ed951e2b09..414b74be4da1 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -30,7 +30,7 @@ enum uhid_event_type { UHID_OPEN, UHID_CLOSE, UHID_OUTPUT, - UHID_OUTPUT_EV, + UHID_OUTPUT_EV, /* obsolete! */ UHID_INPUT, UHID_FEATURE, UHID_FEATURE_ANSWER, @@ -69,6 +69,8 @@ struct uhid_output_req { __u8 rtype; } __attribute__((__packed__)); +/* Obsolete! Newer kernels will no longer send these events but instead convert + * it into raw output reports via UHID_OUTPUT. */ struct uhid_output_ev_req { __u16 type; __u16 code; -- cgit v1.2.3-71-gd317 From 7764a45a8f1fe74d4f7d301eaca2e558e7e2831a Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Thu, 1 Aug 2013 02:17:15 +0200 Subject: fib_rules: add .suppress operation This change adds a new operation to the fib_rules_ops struct; it allows the suppression of routing decisions if certain criteria are not met by its results. The first implemented constraint is a minimum prefix length added to the structures of routing rules. If a rule is added with a minimum prefix length >0, only routes meeting this threshold will be considered. Any other (more general) routing table entries will be ignored. When configuring a system with multiple network uplinks and default routes, it is often convinient to reference the main routing table multiple times - but omitting the default route. Using this patch and a modified "ip" utility, this can be achieved by using the following command sequence: $ ip route add table secuplink default via 10.42.23.1 $ ip rule add pref 100 table main prefixlength 1 $ ip rule add pref 150 fwmark 0xA table secuplink With this setup, packets marked 0xA will be processed by the additional routing table "secuplink", but only if no suitable route in the main routing table can be found. By using a minimal prefixlength of 1, the default route (/0) of the table "main" is hidden to packets processed by rule 100; packets traveling to destinations with more specific routing entries are processed as usual. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 ++++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 8 ++++++++ net/ipv4/fib_rules.c | 14 ++++++++++++++ net/ipv6/fib6_rules.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e361f4882426..2f286dce9259 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -18,6 +18,7 @@ struct fib_rule { u32 pref; u32 flags; u32 table; + u8 table_prefixlen_min; u8 action; u32 target; struct fib_rule __rcu *ctarget; @@ -46,6 +47,8 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); + bool (*suppress)(struct fib_rule *, + struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, @@ -80,6 +83,7 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ + [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ [FRA_GOTO] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 51da65b68b85..59cd31b3455e 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -45,7 +45,7 @@ enum { FRA_FLOW, /* flow/class id */ FRA_UNUSED6, FRA_UNUSED7, - FRA_UNUSED8, + FRA_TABLE_PREFIXLEN_MIN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21735440c44a..2ef5040c99c8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -226,6 +226,9 @@ jumped: else err = ops->action(rule, fl, flags, arg); + if (!err && ops->suppress && ops->suppress(rule, arg)) + continue; + if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(atomic_inc_not_zero(&rule->refcnt))) { @@ -337,6 +340,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); + if (tb[FRA_TABLE_PREFIXLEN_MIN]) + rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -523,6 +528,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -548,6 +554,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; + if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 26aa65d1fce4..9f2906679d1f 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -101,6 +101,19 @@ errout: return err; } +static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + /* do not accept result if the route does + * not meet the required prefix length + */ + struct fib_result *result = (struct fib_result *) arg->result; + if (result->prefixlen < rule->table_prefixlen_min) { + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; + } + return false; +} static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -267,6 +280,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, + .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2e1a432867c0..e64e6a55fc4a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -111,6 +111,18 @@ out: return rt == NULL ? -EAGAIN : 0; } +static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) +{ + struct rt6_info *rt = (struct rt6_info *) arg->result; + /* do not accept result if the route does + * not meet the required prefix length + */ + if (rt->rt6i_dst.plen < rule->table_prefixlen_min) { + ip6_rt_put(rt); + return true; + } + return false; +} static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { @@ -244,6 +256,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, + .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, -- cgit v1.2.3-71-gd317 From 16ef1fe272332b2f7fd99236017b891db48d9cd6 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 11 Jul 2013 16:09:05 +0200 Subject: nl80211/cfg80211: add channel switch command To allow channel switch announcements within beacons, add the channel switch command to nl80211/cfg80211. This is implementation is intended for AP and (later) IBSS mode. Signed-off-by: Simon Wunderlich Signed-off-by: Mathias Kretschmer Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 33 ++++++++++++ include/uapi/linux/nl80211.h | 30 +++++++++++ net/wireless/nl80211.c | 122 ++++++++++++++++++++++++++++++++++++++++++- net/wireless/rdev-ops.h | 12 +++++ net/wireless/trace.h | 33 ++++++++++++ 5 files changed, 229 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aeaf6dff6e05..b7495c72061c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -665,6 +665,30 @@ struct cfg80211_ap_settings { bool radar_required; }; +/** + * struct cfg80211_csa_settings - channel switch settings + * + * Used for channel switch + * + * @chandef: defines the channel to use after the switch + * @beacon_csa: beacon data while performing the switch + * @counter_offset_beacon: offset for the counter within the beacon (tail) + * @counter_offset_presp: offset for the counter within the probe response + * @beacon_after: beacon data to be used on the new channel + * @radar_required: whether radar detection is required on the new channel + * @block_tx: whether transmissions should be blocked while changing + * @count: number of beacons until switch + */ +struct cfg80211_csa_settings { + struct cfg80211_chan_def chandef; + struct cfg80211_beacon_data beacon_csa; + u16 counter_offset_beacon, counter_offset_presp; + struct cfg80211_beacon_data beacon_after; + bool radar_required; + bool block_tx; + u8 count; +}; + /** * enum station_parameters_apply_mask - station parameter values to apply * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp) @@ -2139,6 +2163,8 @@ struct cfg80211_update_ft_ies_params { * @crit_proto_stop: Indicates critical protocol no longer needs increased link * reliability. This operation can not fail. * @set_coalesce: Set coalesce parameters. + * + * @channel_switch: initiate channel-switch procedure (with CSA) */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2376,6 +2402,10 @@ struct cfg80211_ops { struct wireless_dev *wdev); int (*set_coalesce)(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce); + + int (*channel_switch)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_csa_settings *params); }; /* @@ -2441,6 +2471,8 @@ struct cfg80211_ops { * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX. * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call. * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. + * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in + * beaconing mode (AP, IBSS, Mesh, ...). */ enum wiphy_flags { WIPHY_FLAG_CUSTOM_REGULATORY = BIT(0), @@ -2465,6 +2497,7 @@ enum wiphy_flags { WIPHY_FLAG_OFFCHAN_TX = BIT(20), WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL = BIT(21), WIPHY_FLAG_SUPPORTS_5_10_MHZ = BIT(22), + WIPHY_FLAG_HAS_CHANNEL_SWITCH = BIT(23), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index eb68735b3318..1f42bc3dcb9c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -676,6 +676,16 @@ * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules. * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. * + * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the + * the new channel information (Channel Switch Announcement - CSA) + * in the beacon for some time (as defined in the + * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the + * new channel. Userspace provides the new channel information (using + * %NL80211_ATTR_WIPHY_FREQ and the attributes determining channel + * width). %NL80211_ATTR_CH_SWITCH_BLOCK_TX may be supplied to inform + * other station that transmission must be blocked until the channel + * switch is complete. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -841,6 +851,8 @@ enum nl80211_commands { NL80211_CMD_GET_COALESCE, NL80211_CMD_SET_COALESCE, + NL80211_CMD_CHANNEL_SWITCH, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1469,6 +1481,18 @@ enum nl80211_commands { * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * + * @NL80211_ATTR_CH_SWITCH_COUNT: u32 attribute specifying the number of TBTT's + * until the channel switch event. + * @NL80211_ATTR_CH_SWITCH_BLOCK_TX: flag attribute specifying that transmission + * must be blocked on the current channel (before the channel switch + * operation). + * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information + * for the time while performing a channel switch. + * @NL80211_ATTR_CSA_C_OFF_BEACON: Offset of the channel switch counter + * field in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter + * field in the probe response (%NL80211_ATTR_PROBE_RESP). + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1771,6 +1795,12 @@ enum nl80211_attrs { NL80211_ATTR_COALESCE_RULE, + NL80211_ATTR_CH_SWITCH_COUNT, + NL80211_ATTR_CH_SWITCH_BLOCK_TX, + NL80211_ATTR_CSA_IES, + NL80211_ATTR_CSA_C_OFF_BEACON, + NL80211_ATTR_CSA_C_OFF_PRESP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03d4ef95292e..f7cb12178bd2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -349,6 +349,11 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY, .len = IEEE80211_MAX_DATA_LEN }, [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 }, + [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, + [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, + [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, + [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 }, + [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1422,6 +1427,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev, if (state->split) { CMD(crit_proto_start, CRIT_PROTOCOL_START); CMD(crit_proto_stop, CRIT_PROTOCOL_STOP); + if (dev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) + CMD(channel_switch, CHANNEL_SWITCH); } #ifdef CONFIG_NL80211_TESTMODE @@ -5613,6 +5620,111 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, return err; } +static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_csa_settings params; + /* csa_attrs is defined static to avoid waste of stack size - this + * function is called under RTNL lock, so this should not be a problem. + */ + static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1]; + u8 radar_detect_width = 0; + int err; + + if (!rdev->ops->channel_switch || + !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)) + return -EOPNOTSUPP; + + /* may add IBSS support later */ + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; + + memset(¶ms, 0, sizeof(params)); + + if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] || + !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]) + return -EINVAL; + + /* only important for AP, IBSS and mesh create IEs internally */ + if (!info->attrs[NL80211_ATTR_CSA_IES]) + return -EINVAL; + + /* useless if AP is not running */ + if (!wdev->beacon_interval) + return -EINVAL; + + params.count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]); + + err = nl80211_parse_beacon(info->attrs, ¶ms.beacon_after); + if (err) + return err; + + err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_CSA_IES], + nl80211_policy); + if (err) + return err; + + err = nl80211_parse_beacon(csa_attrs, ¶ms.beacon_csa); + if (err) + return err; + + if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) + return -EINVAL; + + params.counter_offset_beacon = + nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + if (params.counter_offset_beacon >= params.beacon_csa.tail_len) + return -EINVAL; + + /* sanity check - counters should be the same */ + if (params.beacon_csa.tail[params.counter_offset_beacon] != + params.count) + return -EINVAL; + + if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { + params.counter_offset_presp = + nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (params.counter_offset_presp >= + params.beacon_csa.probe_resp_len) + return -EINVAL; + + if (params.beacon_csa.probe_resp[params.counter_offset_presp] != + params.count) + return -EINVAL; + } + + err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); + if (err) + return err; + + if (!cfg80211_reg_can_beacon(&rdev->wiphy, ¶ms.chandef)) + return -EINVAL; + + err = cfg80211_chandef_dfs_required(wdev->wiphy, ¶ms.chandef); + if (err < 0) { + return err; + } else if (err) { + radar_detect_width = BIT(params.chandef.width); + params.radar_required = true; + } + + err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype, + params.chandef.chan, + CHAN_MODE_SHARED, + radar_detect_width); + if (err) + return err; + + if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) + params.block_tx = true; + + return rdev_channel_switch(rdev, dev, ¶ms); +} + static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, u32 seq, int flags, struct cfg80211_registered_device *rdev, @@ -9361,7 +9473,15 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, - } + }, + { + .cmd = NL80211_CMD_CHANNEL_SWITCH, + .doit = nl80211_channel_switch, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 9f15f0ac824d..de870d4d0bcc 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -923,4 +923,16 @@ static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_csa_settings *params) +{ + int ret; + + trace_rdev_channel_switch(&rdev->wiphy, dev, params); + ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 09af6eb426a8..f0ebdcd394ef 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1841,6 +1841,39 @@ TRACE_EVENT(rdev_crit_proto_stop, WIPHY_PR_ARG, WDEV_PR_ARG) ); +TRACE_EVENT(rdev_channel_switch, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_csa_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + CHAN_DEF_ENTRY + __field(u16, counter_offset_beacon) + __field(u16, counter_offset_presp) + __field(bool, radar_required) + __field(bool, block_tx) + __field(u8, count) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(¶ms->chandef); + __entry->counter_offset_beacon = params->counter_offset_beacon; + __entry->counter_offset_presp = params->counter_offset_presp; + __entry->radar_required = params->radar_required; + __entry->block_tx = params->block_tx; + __entry->count = params->count; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT + ", block_tx: %d, count: %u, radar_required: %d" + ", counter offsets (beacon/presp): %u/%u", + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG, + __entry->block_tx, __entry->count, __entry->radar_required, + __entry->counter_offset_beacon, + __entry->counter_offset_presp) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3-71-gd317 From e216975ad97cfcfc436789aa66d59a0e93f337f7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 1 Aug 2013 16:17:47 -0700 Subject: uapi: Convert some uses of 6 to ETH_ALEN Use the #define where appropriate. Add #include where appropriate too. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/uapi/linux/dn.h | 3 ++- include/uapi/linux/if_bridge.h | 3 ++- include/uapi/linux/netfilter_bridge/ebt_802_3.h | 5 +++-- include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++- include/uapi/linux/virtio_net.h | 2 +- include/uapi/linux/wimax/i2400m.h | 4 ++-- 6 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dn.h b/include/uapi/linux/dn.h index 9c50445462d9..5fbdd3d49eba 100644 --- a/include/uapi/linux/dn.h +++ b/include/uapi/linux/dn.h @@ -2,6 +2,7 @@ #define _LINUX_DN_H #include +#include /* @@ -120,7 +121,7 @@ struct linkinfo_dn { * Ethernet address format (for DECnet) */ union etheraddress { - __u8 dne_addr[6]; /* Full ethernet address */ + __u8 dne_addr[ETH_ALEN]; /* Full ethernet address */ struct { __u8 dne_hiord[4]; /* DECnet HIORD prefix */ __u8 dne_nodeaddr[2]; /* DECnet node address */ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2d70d79ce2fd..39f621a9fe82 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -14,6 +14,7 @@ #define _UAPI_LINUX_IF_BRIDGE_H #include +#include #define SYSFS_BRIDGE_ATTR "bridge" #define SYSFS_BRIDGE_FDB "brforward" @@ -88,7 +89,7 @@ struct __port_info { }; struct __fdb_entry { - __u8 mac_addr[6]; + __u8 mac_addr[ETH_ALEN]; __u8 port_no; __u8 is_local; __u32 ageing_timer_value; diff --git a/include/uapi/linux/netfilter_bridge/ebt_802_3.h b/include/uapi/linux/netfilter_bridge/ebt_802_3.h index 5bf84912a082..f37522aade24 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_802_3.h +++ b/include/uapi/linux/netfilter_bridge/ebt_802_3.h @@ -2,6 +2,7 @@ #define _UAPI__LINUX_BRIDGE_EBT_802_3_H #include +#include #define EBT_802_3_SAP 0x01 #define EBT_802_3_TYPE 0x02 @@ -42,8 +43,8 @@ struct hdr_ni { }; struct ebt_802_3_hdr { - __u8 daddr[6]; - __u8 saddr[6]; + __u8 daddr[ETH_ALEN]; + __u8 saddr[ETH_ALEN]; __be16 len; union { struct hdr_ui ui; diff --git a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h index c6a204c97047..eac0f6548f47 100644 --- a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -2,6 +2,7 @@ #define _IPT_CLUSTERIP_H_target #include +#include enum clusterip_hashmode { CLUSTERIP_HASHMODE_SIP = 0, @@ -22,7 +23,7 @@ struct ipt_clusterip_tgt_info { __u32 flags; /* only relevant for new ones */ - __u8 clustermac[6]; + __u8 clustermac[ETH_ALEN]; __u16 num_total_nodes; __u16 num_local_nodes; __u16 local_nodes[CLUSTERIP_MAX_NODES]; diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 227d4ce84e79..172a7f00780c 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -60,7 +60,7 @@ struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ - __u8 mac[6]; + __u8 mac[ETH_ALEN]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ __u16 status; /* Maximum number of each of transmit and receive queues; diff --git a/include/uapi/linux/wimax/i2400m.h b/include/uapi/linux/wimax/i2400m.h index 62d356153565..fd198bc24a3c 100644 --- a/include/uapi/linux/wimax/i2400m.h +++ b/include/uapi/linux/wimax/i2400m.h @@ -122,7 +122,7 @@ #define __LINUX__WIMAX__I2400M_H__ #include - +#include /* * Host Device Interface (HDI) common to all busses @@ -487,7 +487,7 @@ struct i2400m_tlv_l4_message_versions { struct i2400m_tlv_detailed_device_info { struct i2400m_tlv_hdr hdr; __u8 reserved1[400]; - __u8 mac_address[6]; + __u8 mac_address[ETH_ALEN]; __u8 reserved2[2]; } __attribute__((packed)); -- cgit v1.2.3-71-gd317 From 6ef94cfafba159d6b1a902ccb3349ac6a34ff6ad Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Fri, 2 Aug 2013 17:19:56 +0200 Subject: fib_rules: add route suppression based on ifgroup This change adds the ability to suppress a routing decision based upon the interface group the selected interface belongs to. This allows it to exclude specific devices from a routing decision. Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 2 ++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 10 ++++++++++ net/ipv4/fib_rules.c | 23 +++++++++++++++++------ net/ipv6/fib6_rules.c | 16 +++++++++++++--- 5 files changed, 43 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 2f286dce9259..d13c461b4b59 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -18,6 +18,7 @@ struct fib_rule { u32 pref; u32 flags; u32 table; + int suppress_ifgroup; u8 table_prefixlen_min; u8 action; u32 target; @@ -84,6 +85,7 @@ struct fib_rules_ops { [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 59cd31b3455e..63e31166e85b 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -44,7 +44,7 @@ enum { FRA_FWMARK, /* mark */ FRA_FLOW, /* flow/class id */ FRA_UNUSED6, - FRA_UNUSED7, + FRA_SUPPRESS_IFGROUP, FRA_TABLE_PREFIXLEN_MIN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 2ef5040c99c8..5040a61bf28a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -343,6 +343,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_TABLE_PREFIXLEN_MIN]) rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_IFGROUP]) + rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -529,6 +532,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -588,6 +592,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))) goto nla_put_failure; + + if (rule->suppress_ifgroup != -1) { + if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) + goto nla_put_failure; + } + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 9f2906679d1f..b78fd28970c9 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -103,16 +103,27 @@ errout: static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { + struct fib_result *result = (struct fib_result *) arg->result; + struct net_device *dev = result->fi->fib_dev; + /* do not accept result if the route does * not meet the required prefix length */ - struct fib_result *result = (struct fib_result *) arg->result; - if (result->prefixlen < rule->table_prefixlen_min) { - if (!(arg->flags & FIB_LOOKUP_NOREF)) - fib_info_put(result->fi); - return true; - } + if (result->prefixlen < rule->table_prefixlen_min) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + return false; + +suppress_route: + if (!(arg->flags & FIB_LOOKUP_NOREF)) + fib_info_put(result->fi); + return true; } static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 554a4fbabfb3..36283267e2f8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -122,14 +122,24 @@ out: static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; + struct net_device *dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ - if (rt->rt6i_dst.plen < rule->table_prefixlen_min) { + if (rt->rt6i_dst.plen < rule->table_prefixlen_min) + goto suppress_route; + + /* do not accept result if the route uses a device + * belonging to a forbidden interface group + */ + if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) + goto suppress_route; + + return false; + +suppress_route: ip6_rt_put(rt); return true; - } - return false; } static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) -- cgit v1.2.3-71-gd317 From 73f5698e77219bfc3ea1903759fe8e20ab5b285e Mon Sep 17 00:00:00 2001 From: Stefan Tomanek Date: Sat, 3 Aug 2013 14:14:43 +0200 Subject: fib_rules: fix suppressor names and default values This change brings the suppressor attribute names into line; it also changes the data types to provide a more consistent interface. While -1 indicates that the suppressor is not enabled, values >= 0 for suppress_prefixlen or suppress_ifgroup reject routing decisions violating the constraint. This changes the previously presented behaviour of suppress_prefixlen, where a prefix length _less_ than the attribute value was rejected. After this change, a prefix length less than *or* equal to the value is considered a violation of the rule constraint. It also changes the default values for default and newly added rules (disabling any suppression for those). Signed-off-by: Stefan Tomanek Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 ++-- include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 15 +++++++++++---- net/ipv4/fib_rules.c | 2 +- net/ipv6/fib6_rules.c | 2 +- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index d13c461b4b59..9d0fcbaa9cbb 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,7 +19,7 @@ struct fib_rule { u32 flags; u32 table; int suppress_ifgroup; - u8 table_prefixlen_min; + int suppress_prefixlen; u8 action; u32 target; struct fib_rule __rcu *ctarget; @@ -84,7 +84,7 @@ struct fib_rules_ops { [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ - [FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \ + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 } diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 63e31166e85b..2b82d7e30974 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -45,7 +45,7 @@ enum { FRA_FLOW, /* flow/class id */ FRA_UNUSED6, FRA_SUPPRESS_IFGROUP, - FRA_TABLE_PREFIXLEN_MIN, + FRA_SUPPRESS_PREFIXLEN, FRA_TABLE, /* Extended table id */ FRA_FWMASK, /* mask for netfilter mark */ FRA_OIFNAME, diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 5040a61bf28a..2e654138433c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->flags = flags; r->fr_net = hold_net(ops->fro_net); + r->suppress_prefixlen = -1; + r->suppress_ifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); @@ -340,11 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (tb[FRA_TABLE_PREFIXLEN_MIN]) - rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]); + if (tb[FRA_SUPPRESS_PREFIXLEN]) + rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + else + rule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + else + rule->suppress_ifgroup = -1; if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); @@ -531,7 +538,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ - + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */ + + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4); /* FRA_FWMASK */ @@ -558,7 +565,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->table = rule->table; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; - if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min)) + if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b78fd28970c9..523be38e37de 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -109,7 +109,7 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg /* do not accept result if the route does * not meet the required prefix length */ - if (result->prefixlen < rule->table_prefixlen_min) + if (result->prefixlen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 36283267e2f8..a6c58ce43d34 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -126,7 +126,7 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg /* do not accept result if the route does * not meet the required prefix length */ - if (rt->rt6i_dst.plen < rule->table_prefixlen_min) + if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device -- cgit v1.2.3-71-gd317 From cf4957f17f2a89984915ea808876d9c82225b862 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Oct 2012 13:37:58 +0200 Subject: perf: Add PERF_EVENT_IOC_ID ioctl to return event ID The only way to get the event ID is by reading the event fd, followed by parsing the ID value out of the returned data. While this is ok for current read format used by perf tool, it is not ok when we use PERF_FORMAT_GROUP format. With this format the data are returned for the whole group and there's no way to find out what ID belongs to our fd (if we are not group leader event). Adding a simple ioctl that returns event primary ID for given fd. Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-v1bn5cto707jn0bon34afqr1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index efef1d37a371..62c25a25291c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -321,6 +321,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, u64 *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 916cf1f593b4..5200b608b481 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3568,6 +3568,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); + case PERF_EVENT_IOC_ID: + { + u64 id = primary_event_id(event); + + if (copy_to_user((void __user *)arg, &id, sizeof(id))) + return -EFAULT; + return 0; + } + case PERF_EVENT_IOC_SET_OUTPUT: { int ret; -- cgit v1.2.3-71-gd317 From 1f07d03e2069df2ecd82301936b598a3b257c6d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Aug 2013 03:32:11 -0700 Subject: net: add SNMP counters tracking incoming ECN bits With GRO/LRO processing, there is a problem because Ip[6]InReceives SNMP counters do not count the number of frames, but number of aggregated segments. Its probably too late to change this now. This patch adds four new counters, tracking number of frames, regardless of LRO/GRO, and on a per ECN status basis, for IPv4 and IPv6. Ip[6]NoECTPkts : Number of packets received with NOECT Ip[6]ECT1Pkts : Number of packets received with ECT(1) Ip[6]ECT0Pkts : Number of packets received with ECT(0) Ip[6]CEPkts : Number of packets received with Congestion Experienced lph37:~# nstat | egrep "Pkts|InReceive" IpInReceives 1634137 0.0 Ip6InReceives 3714107 0.0 Ip6InNoECTPkts 19205 0.0 Ip6InECT0Pkts 52651828 0.0 IpExtInNoECTPkts 33630 0.0 IpExtInECT0Pkts 15581379 0.0 IpExtInCEPkts 6 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 4 ++++ net/ipv4/ip_input.c | 8 ++++++++ net/ipv4/proc.c | 7 ++++++- net/ipv6/ip6_input.c | 6 +++++- net/ipv6/proc.c | 4 ++++ 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index af0a674cc677..60601d28da75 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -51,6 +51,10 @@ enum IPSTATS_MIB_INBCASTOCTETS, /* InBcastOctets */ IPSTATS_MIB_OUTBCASTOCTETS, /* OutBcastOctets */ IPSTATS_MIB_CSUMERRORS, /* InCsumErrors */ + IPSTATS_MIB_NOECTPKTS, /* InNoECTPkts */ + IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */ + IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */ + IPSTATS_MIB_CEPKTS, /* InCEPkts */ __IPSTATS_MIB_MAX }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 15e3e683adec..054a3e97d822 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; + BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); + BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); + BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); + IP_ADD_STATS_BH(dev_net(dev), + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a47..5f5fa612647f 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -/* Following RFC4293 items are displayed in /proc/net/netstat */ +/* Following items are displayed in /proc/net/netstat */ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES), SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), @@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS), SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), + /* Non RFC4293 fields */ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS), + SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 2bab2aa59745..302d6fb1ff2b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -44,7 +44,7 @@ #include #include #include - +#include int ip6_rcv_finish(struct sk_buff *skb) @@ -109,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; + IP6_ADD_STATS_BH(dev_net(dev), idev, + IPSTATS_MIB_NOECTPKTS + + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 51c3285b5d9b..091d066a57b3 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -91,6 +91,10 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS), SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS), /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */ + SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS), + SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS), + SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), + SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3-71-gd317 From 3a3bb00d5c73f3ae7833f6534982b5b2f56ac9b4 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 9 Aug 2013 19:52:00 +0530 Subject: kvm uapi: Add KICK_CPU and PV_UNHALT definition to uapi These are needed by both guest and host. Originally-from: Srivatsa Vaddagiri Signed-off-by: Raghavendra K T Link: http://lkml.kernel.org/r/1376058122-8248-13-git-send-email-raghavendra.kt@linux.vnet.ibm.com Acked-by: Gleb Natapov Acked-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/uapi/asm/kvm_para.h | 1 + include/uapi/linux/kvm_para.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e97..94dc8ca434e0 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index cea2c5c72d26..2841f86eae0b 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -19,6 +19,7 @@ #define KVM_HC_MMU_OP 2 #define KVM_HC_FEATURES 3 #define KVM_HC_PPC_MAP_MAGIC_PAGE 4 +#define KVM_HC_KICK_CPU 5 /* * hypercalls use architecture specific -- cgit v1.2.3-71-gd317 From 288a9376371d425edeeea41a0310922c5fb2092d Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 7 Aug 2013 11:33:25 +0300 Subject: net: rename busy poll MIB counter Rename mib counter from "low latency" to "busy poll" v1 also moved the counter to the ip MIB (suggested by Shawn Bohrer) Eric Dumazet suggested that the current location is better. So v2 just renames the counter to fit the new naming convention. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- include/net/busy_poll.h | 2 +- include/uapi/linux/snmp.h | 2 +- net/ipv4/proc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8e2dfc106aed..8a358a2c97e6 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -122,7 +122,7 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) if (rc > 0) /* local bh are disabled so it is ok to use _BH */ NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_LOWLATENCYRXPACKETS, rc); + LINUX_MIB_BUSYPOLLRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index af0a674cc677..a1356d3b54df 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,7 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ - LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ + LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6577a1149a47..463bd1273346 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,7 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), - SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), + SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_SENTINEL }; -- cgit v1.2.3-71-gd317 From bd0779370588386e4a67ba5d0b176cfded8e6a53 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Aug 2013 18:13:20 +0200 Subject: netfilter: nfnetlink_queue: allow to attach expectations to conntracks This patch adds the capability to attach expectations via nfnetlink_queue. This is required by conntrack helpers that trigger expectations based on the first packet seen like the TFTP and the DHCPv6 user-space helpers. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 + include/net/netfilter/nfnetlink_queue.h | 8 +++ include/uapi/linux/netfilter/nfnetlink_queue.h | 1 + net/netfilter/nf_conntrack_netlink.c | 95 ++++++++++++++++++++++---- net/netfilter/nfnetlink_queue_core.c | 9 ++- net/netfilter/nfnetlink_queue_ct.c | 15 ++++ 6 files changed, 114 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 655d5d198d49..e2cf786be22f 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -325,6 +325,8 @@ struct nfq_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); + int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report); }; extern struct nfq_ct_hook __rcu *nfq_ct_hook; diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h index 86267a529514..aff88ba91391 100644 --- a/include/net/netfilter/nfnetlink_queue.h +++ b/include/net/netfilter/nfnetlink_queue.h @@ -15,6 +15,8 @@ int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo); void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff); +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report); #else inline struct nf_conn * nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo) @@ -39,5 +41,11 @@ inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { } + +inline int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + return 0; +} #endif /* NF_CONNTRACK */ #endif diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index 3a9b92147339..0132bad79de7 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -46,6 +46,7 @@ enum nfqnl_attr_type { NFQA_CT_INFO, /* enum ip_conntrack_info */ NFQA_CAP_LEN, /* __u32 length of captured packet */ NFQA_SKB_INFO, /* __u32 skb meta information */ + NFQA_EXP, /* nf_conntrack_netlink.h */ __NFQA_MAX }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9aaa68bbbcdb..fa61fea63234 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1987,6 +1987,27 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static struct nf_conntrack_expect * +ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + struct nf_conntrack_helper *helper, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask); + #ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT static size_t ctnetlink_nfqueue_build_size(const struct nf_conn *ct) @@ -2127,10 +2148,69 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) return ret; } +static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, + const struct nf_conn *ct, + struct nf_conntrack_tuple *tuple, + struct nf_conntrack_tuple *mask) +{ + int err; + + err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, + nf_ct_l3num(ct)); + if (err < 0) + return err; + + return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, + nf_ct_l3num(ct)); +} + +static int +ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, + u32 portid, u32 report) +{ + struct nlattr *cda[CTA_EXPECT_MAX+1]; + struct nf_conntrack_tuple tuple, mask; + struct nf_conntrack_helper *helper; + struct nf_conntrack_expect *exp; + int err; + + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + if (err < 0) + return err; + + err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda, + ct, &tuple, &mask); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) + return -EOPNOTSUPP; + } + + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, + helper, &tuple, &mask); + if (IS_ERR(exp)) + return PTR_ERR(exp); + + err = nf_ct_expect_related_report(exp, portid, report); + if (err < 0) { + nf_ct_expect_put(exp); + return err; + } + + return 0; +} + static struct nfq_ct_hook ctnetlink_nfqueue_hook = { .build_size = ctnetlink_nfqueue_build_size, .build = ctnetlink_nfqueue_build, .parse = ctnetlink_nfqueue_parse, + .attach_expect = ctnetlink_nfqueue_attach_expect, }; #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ @@ -2498,21 +2578,6 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, return err; } -static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { - [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, - [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, - [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, - [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, - [CTA_EXPECT_ID] = { .type = NLA_U32 }, - [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, - .len = NF_CT_HELPER_NAME_LEN - 1 }, - [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, - [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, - [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, - [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, - [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, -}; - static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index ec9de12aa488..e8c9f3bb779c 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -859,6 +859,7 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, + [NFQA_EXP] = { .type = NLA_UNSPEC }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -987,8 +988,14 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, if (entry == NULL) return -ENOENT; - if (nfqa[NFQA_CT]) + if (nfqa[NFQA_CT]) { ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo); + if (ct && nfqa[NFQA_EXP]) { + nfqnl_attach_expect(ct, nfqa[NFQA_EXP], + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); + } + } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c index ab61d66bc0b9..be893039966d 100644 --- a/net/netfilter/nfnetlink_queue_ct.c +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -96,3 +96,18 @@ void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, if ((ct->status & IPS_NAT_MASK) && diff) nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff); } + +int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, + u32 portid, u32 report) +{ + struct nfq_ct_hook *nfq_ct; + + if (nf_ct_is_untracked(ct)) + return 0; + + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) + return -EOPNOTSUPP; + + return nfq_ct->attach_expect(attr, ct, portid, report); +} -- cgit v1.2.3-71-gd317 From ebd8b934e23f45ad3fc8a5a28bc5a96741a6a106 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 10 Aug 2013 15:22:58 -0700 Subject: pptp: fix byte order warnings Pptp driver has lots of byte order warnings from sparse. This was because the on-the-wire header is in network byte order (obviously) but the definition did not reflect that. Also, the address structure to user space actually put the call id in host order. Rather than break ABI compatibility, just acknowledge the existing design. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 10 +++++----- include/uapi/linux/if_pppox.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 9785a3ac02b3..6fa5ae00039f 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -83,11 +83,11 @@ static const struct proto_ops pptp_ops; struct pptp_gre_header { u8 flags; u8 ver; - u16 protocol; - u16 payload_len; - u16 call_id; - u32 seq; - u32 ack; + __be16 protocol; + __be16 payload_len; + __be16 call_id; + __be32 seq; + __be32 ack; } __packed; static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index e36a4aecd311..e128769331b5 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -46,7 +46,7 @@ struct pppoe_addr { * PPTP addressing definition */ struct pptp_addr { - __be16 call_id; + __u16 call_id; struct in_addr sin_addr; }; -- cgit v1.2.3-71-gd317 From 1972b5b3a66f1b6a9df04cc91faf401354919262 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 25 Jun 2013 12:42:54 +0200 Subject: NFC: Document secure element addition/removal netlink events Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 8137dd8d2adf..40ada984cb78 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -71,6 +71,11 @@ * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element. * @NFC_CMD_FW_DOWNLOAD: Request to Load/flash firmware, or event to inform * that some firmware was loaded + * @NFC_EVENT_SE_ADDED: Event emitted when a new secure element is discovered. + * This typically will be sent whenever a new NFC controller with either + * an embedded SE or an UICC one connected to it through SWP. + * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from + * the system, as a consequence of e.g. an NFC controller being unplugged. */ enum nfc_commands { NFC_CMD_UNSPEC, -- cgit v1.2.3-71-gd317 From 91a32269e31282405db405b9ec9ce1a30568e4b0 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 25 Jun 2013 16:22:08 +0200 Subject: NFC: Define secure element connectivity and transaction events The SE_CONNECTIVITY event is for an SE to request connection to e.g. a modem. The SE_TRANSACTION one is sent when an application running on a specific SE wants to notify the host CPU about the end of a transaction. Those events respectively map to the EVT_CONNECTIVITY and the EVT_TRANSACTION HCI events. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 40ada984cb78..539d60494c04 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -76,6 +76,14 @@ * an embedded SE or an UICC one connected to it through SWP. * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from * the system, as a consequence of e.g. an NFC controller being unplugged. + * @NFC_EVENT_SE_CONNECTIVITY: This event is emitted whenever a secure element + * is requesting connectivity access. For example a UICC SE may need to + * talk with a sleeping modem and will notify this need by sending this + * event. It is then up to userspace to decide if it will wake the modem + * up or not. + * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on + * a specific SE notifies us about the end of a transaction. The parameter + * for this event is the application ID (AID). */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -102,6 +110,8 @@ enum nfc_commands { NFC_CMD_FW_DOWNLOAD, NFC_EVENT_SE_ADDED, NFC_EVENT_SE_REMOVED, + NFC_EVENT_SE_CONNECTIVITY, + NFC_EVENT_SE_TRANSACTION, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; @@ -159,6 +169,7 @@ enum nfc_attrs { NFC_ATTR_FIRMWARE_NAME, NFC_ATTR_SE_INDEX, NFC_ATTR_SE_TYPE, + NFC_ATTR_SE_AID, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; -- cgit v1.2.3-71-gd317 From ac22ac466a659f1b2e02a2e2ee23fc5c42da2c95 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 24 Jul 2013 18:10:50 +0200 Subject: NFC: Add a GET_SE netlink API In order to fetch the discovered secure elements from an NFC controller, we need to send a netlink command that will dump the list of available SEs from NFC. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ net/nfc/netlink.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 539d60494c04..029921b067fd 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -84,6 +84,7 @@ * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on * a specific SE notifies us about the end of a transaction. The parameter * for this event is the application ID (AID). + * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -112,6 +113,7 @@ enum nfc_commands { NFC_EVENT_SE_REMOVED, NFC_EVENT_SE_CONNECTIVITY, NFC_EVENT_SE_TRANSACTION, + NFC_CMD_GET_SE, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f16fd59d4160..3b08ef90e045 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1191,6 +1191,91 @@ static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) return rc; } +static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, + u32 portid, u32 seq, + struct netlink_callback *cb, + int flags) +{ + void *hdr; + struct nfc_se *se, *n; + + list_for_each_entry_safe(se, n, &dev->secure_elements, list) { + hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, + NFC_CMD_GET_SE); + if (!hdr) + goto nla_put_failure; + + if (cb) + genl_dump_check_consistent(cb, hdr, &nfc_genl_family); + + if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || + nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) + goto nla_put_failure; + + if (genlmsg_end(msg, hdr) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int nfc_genl_dump_ses(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; + struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; + bool first_call = false; + + if (!iter) { + first_call = true; + iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + cb->args[0] = (long) iter; + } + + mutex_lock(&nfc_devlist_mutex); + + cb->seq = nfc_devlist_generation; + + if (first_call) { + nfc_device_iter_init(iter); + dev = nfc_device_iter_next(iter); + } + + while (dev) { + int rc; + + rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); + if (rc < 0) + break; + + dev = nfc_device_iter_next(iter); + } + + mutex_unlock(&nfc_devlist_mutex); + + cb->args[1] = (long) dev; + + return skb->len; +} + +static int nfc_genl_dump_ses_done(struct netlink_callback *cb) +{ + struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; + + nfc_device_iter_exit(iter); + kfree(iter); + + return 0; +} + static struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, @@ -1265,6 +1350,12 @@ static struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_disable_se, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_GET_SE, + .dumpit = nfc_genl_dump_ses, + .done = nfc_genl_dump_ses_done, + .policy = nfc_genl_policy, + }, }; -- cgit v1.2.3-71-gd317 From 352a5f5fb3ad8f829cfd4248fe6119895bda881f Mon Sep 17 00:00:00 2001 From: Eric Lapuyade Date: Fri, 19 Jul 2013 14:57:55 +0200 Subject: NFC: netlink: Add result of firmware operation to completion event Result is added as an NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS attribute containing the standard errno positive value of the completion result. This event will be sent when the firmare download operation is done and will contain the operation result. Signed-off-by: Eric Lapuyade Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 3 ++- include/uapi/linux/nfc.h | 2 ++ net/nfc/core.c | 12 ++++++++++-- net/nfc/netlink.c | 4 +++- net/nfc/nfc.h | 3 ++- 5 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 100595560584..f68ee68e4e3e 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -224,7 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gt, u8 gt_len); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len); -int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result); int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int ntargets); diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 029921b067fd..29bed72a4ac4 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -146,6 +146,7 @@ enum nfc_commands { * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version * @NFC_ATTR_SE_INDEX: Secure element index * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED) + * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -172,6 +173,7 @@ enum nfc_attrs { NFC_ATTR_SE_INDEX, NFC_ATTR_SE_TYPE, NFC_ATTR_SE_AID, + NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; diff --git a/net/nfc/core.c b/net/nfc/core.c index aad7f8f59784..d252912b8deb 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -77,11 +77,19 @@ error: return rc; } -int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name) +/** + * nfc_fw_download_done - inform that a firmware download was completed + * + * @dev: The nfc device to which firmware was downloaded + * @firmware_name: The firmware filename + * @result: The positive value of a standard errno value + */ +int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result) { dev->fw_download_in_progress = false; - return nfc_genl_fw_download_done(dev, firmware_name); + return nfc_genl_fw_download_done(dev, firmware_name, result); } EXPORT_SYMBOL(nfc_fw_download_done); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 3b08ef90e045..68063b2025da 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1114,7 +1114,8 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) return rc; } -int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name) +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result) { struct sk_buff *msg; void *hdr; @@ -1129,6 +1130,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || + nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 4e2e5a787c4a..aaf606fc1faa 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -124,7 +124,8 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter) } int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name); -int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name); +int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, + u32 result); int nfc_dev_up(struct nfc_dev *dev); -- cgit v1.2.3-71-gd317 From fc4eba58b4c1462ff3d6247b66fb47d6928db6d2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 14 Aug 2013 01:03:46 +0200 Subject: ipv6: make unsolicited report intervals configurable for mld Commit cab70040dfd95ee32144f02fade64f0cb94f31a0 ("net: igmp: Reduce Unsolicited report interval to 1s when using IGMPv3") and 2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 ("net: igmp: Allow user-space configuration of igmp unsolicited report interval") by William Manley made igmp unsolicited report intervals configurable per interface and corrected the interval of unsolicited igmpv3 report messages resendings to 1s. Same needs to be done for IPv6: MLDv1 (RFC2710 7.10.): 10 seconds MLDv2 (RFC3810 9.11.): 1 second Both intervals are configurable via new procfs knobs mldv1_unsolicited_report_interval and mldv2_unsolicited_report_interval. (also added .force_mld_version to ipv6_devconf_dflt to bring structs in line without semantic changes) v2: a) Joined documentation update for IPv4 and IPv6 MLD/IGMP unsolicited_report_interval procfs knobs. b) incorporate stylistic feedback from William Manley v3: a) add new DEVCONF_* values to the end of the enum (thanks to David Miller) Cc: Cong Wang Cc: William Manley Cc: Benjamin LaHaise Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++ include/linux/ipv6.h | 2 ++ include/uapi/linux/ipv6.h | 2 ++ net/ipv6/addrconf.c | 25 +++++++++++++++++++++++++ net/ipv6/mcast.c | 17 ++++++++++++++--- 5 files changed, 61 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 36be26b2ef7a..debfe857d8f9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1039,7 +1039,15 @@ disable_policy - BOOLEAN disable_xfrm - BOOLEAN Disable IPSEC encryption on this interface, whatever the policy +igmpv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv1 or IGMPv2 report retransmit will take place. + Default: 10000 (10 seconds) +igmpv3_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + IGMPv3 report retransmit will take place. + Default: 1000 (1 seconds) tag - INTEGER Allows you to write a number, which can be used as required. @@ -1331,6 +1339,16 @@ ndisc_notify - BOOLEAN 1 - Generate unsolicited neighbour advertisements when device is brought up or hardware address changes. +mldv1_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv1 report retransmit will take place. + Default: 10000 (10 seconds) + +mldv2_unsolicited_report_interval - INTEGER + The interval in milliseconds in which the next unsolicited + MLDv2 report retransmit will take place. + Default: 1000 (1 second) + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 850e95bc766c..77a478462d8e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -19,6 +19,8 @@ struct ipv6_devconf { __s32 rtr_solicit_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; + __s32 mldv1_unsolicited_report_interval; + __s32 mldv2_unsolicited_report_interval; #ifdef CONFIG_IPV6_PRIVACY __s32 use_tempaddr; __s32 temp_valid_lft; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 4bda4cf5b0f5..d07ac6903e59 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -160,6 +160,8 @@ enum { DEVCONF_ACCEPT_DAD, DEVCONF_FORCE_TLLAO, DEVCONF_NDISC_NOTIFY, + DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7fd8572bac80..ad12f7c43f25 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -177,6 +177,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_redirects = 1, .autoconf = 1, .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -211,6 +213,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra = 1, .accept_redirects = 1, .autoconf = 1, + .force_mld_version = 0, + .mldv1_unsolicited_report_interval = 10 * HZ, + .mldv2_unsolicited_report_interval = HZ, .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, @@ -4179,6 +4184,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; + array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval); + array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] = + jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval); #ifdef CONFIG_IPV6_PRIVACY array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; @@ -4862,6 +4871,22 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, #ifdef CONFIG_IPV6_PRIVACY { .procname = "use_tempaddr", diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index db25b8eb62bd..6c76df9909bf 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -108,7 +108,6 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); -#define IGMP6_UNSOLICITED_IVAL (10*HZ) #define MLD_QRV_DEFAULT 2 #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ @@ -129,6 +128,18 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; pmc != NULL; \ pmc = rcu_dereference(pmc->next)) +static int unsolicited_report_interval(struct inet6_dev *idev) +{ + int iv; + + if (MLD_V1_SEEN(idev)) + iv = idev->cnf.mldv1_unsolicited_report_interval; + else + iv = idev->cnf.mldv2_unsolicited_report_interval; + + return iv > 0 ? iv : 1; +} + int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct net_device *dev = NULL; @@ -2158,7 +2169,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = net_random() % IGMP6_UNSOLICITED_IVAL; + delay = net_random() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); if (del_timer(&ma->mca_timer)) { @@ -2325,7 +2336,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire, (unsigned long)idev); idev->mc_qrv = MLD_QRV_DEFAULT; - idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_maxdelay = unsolicited_report_interval(idev); idev->mc_v1_seen = 0; write_unlock_bh(&idev->lock); } -- cgit v1.2.3-71-gd317 From f0c03956ac40fdc4fbce7bed262ae74ac372e765 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 15:05:38 +0200 Subject: netfilter: export xt_rpfilter.h to userland This file contains the API for the match "rpfilter", hence it should be exported to userland. Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_rpfilter.h | 23 ----------------------- include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/netfilter/xt_rpfilter.h | 23 +++++++++++++++++++++++ 3 files changed, 24 insertions(+), 23 deletions(-) delete mode 100644 include/linux/netfilter/xt_rpfilter.h create mode 100644 include/uapi/linux/netfilter/xt_rpfilter.h (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/xt_rpfilter.h b/include/linux/netfilter/xt_rpfilter.h deleted file mode 100644 index 8358d4f71952..000000000000 --- a/include/linux/netfilter/xt_rpfilter.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _XT_RPATH_H -#define _XT_RPATH_H - -#include - -enum { - XT_RPFILTER_LOOSE = 1 << 0, - XT_RPFILTER_VALID_MARK = 1 << 1, - XT_RPFILTER_ACCEPT_LOCAL = 1 << 2, - XT_RPFILTER_INVERT = 1 << 3, -#ifdef __KERNEL__ - XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE | - XT_RPFILTER_VALID_MARK | - XT_RPFILTER_ACCEPT_LOCAL | - XT_RPFILTER_INVERT, -#endif -}; - -struct xt_rpfilter_info { - __u8 flags; -}; - -#endif diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index 41115776d76f..dc00927ffd62 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -68,6 +68,7 @@ header-y += xt_quota.h header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h +header-y += xt_rpfilter.h header-y += xt_sctp.h header-y += xt_set.h header-y += xt_socket.h diff --git a/include/uapi/linux/netfilter/xt_rpfilter.h b/include/uapi/linux/netfilter/xt_rpfilter.h new file mode 100644 index 000000000000..8358d4f71952 --- /dev/null +++ b/include/uapi/linux/netfilter/xt_rpfilter.h @@ -0,0 +1,23 @@ +#ifndef _XT_RPATH_H +#define _XT_RPATH_H + +#include + +enum { + XT_RPFILTER_LOOSE = 1 << 0, + XT_RPFILTER_VALID_MARK = 1 << 1, + XT_RPFILTER_ACCEPT_LOCAL = 1 << 2, + XT_RPFILTER_INVERT = 1 << 3, +#ifdef __KERNEL__ + XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE | + XT_RPFILTER_VALID_MARK | + XT_RPFILTER_ACCEPT_LOCAL | + XT_RPFILTER_INVERT, +#endif +}; + +struct xt_rpfilter_info { + __u8 flags; +}; + +#endif -- cgit v1.2.3-71-gd317 From 38c67328ac79cb9eaf61b5d4750fe3b9cff0dd15 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 13 Aug 2013 15:05:39 +0200 Subject: netfilter: export xt_HMARK.h to userland This file contains the API for the target "HMARK", hence it should be exported to userland. Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_HMARK.h | 50 --------------------------------- include/uapi/linux/netfilter/Kbuild | 1 + include/uapi/linux/netfilter/xt_HMARK.h | 50 +++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 50 deletions(-) delete mode 100644 include/linux/netfilter/xt_HMARK.h create mode 100644 include/uapi/linux/netfilter/xt_HMARK.h (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h deleted file mode 100644 index 826fc5807577..000000000000 --- a/include/linux/netfilter/xt_HMARK.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef XT_HMARK_H_ -#define XT_HMARK_H_ - -#include - -enum { - XT_HMARK_SADDR_MASK, - XT_HMARK_DADDR_MASK, - XT_HMARK_SPI, - XT_HMARK_SPI_MASK, - XT_HMARK_SPORT, - XT_HMARK_DPORT, - XT_HMARK_SPORT_MASK, - XT_HMARK_DPORT_MASK, - XT_HMARK_PROTO_MASK, - XT_HMARK_RND, - XT_HMARK_MODULUS, - XT_HMARK_OFFSET, - XT_HMARK_CT, - XT_HMARK_METHOD_L3, - XT_HMARK_METHOD_L3_4, -}; -#define XT_HMARK_FLAG(flag) (1 << flag) - -union hmark_ports { - struct { - __u16 src; - __u16 dst; - } p16; - struct { - __be16 src; - __be16 dst; - } b16; - __u32 v32; - __be32 b32; -}; - -struct xt_hmark_info { - union nf_inet_addr src_mask; - union nf_inet_addr dst_mask; - union hmark_ports port_mask; - union hmark_ports port_set; - __u32 flags; - __u16 proto_mask; - __u32 hashrnd; - __u32 hmodulus; - __u32 hoffset; /* Mark offset to start from */ -}; - -#endif /* XT_HMARK_H_ */ diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild index dc00927ffd62..174915420d3f 100644 --- a/include/uapi/linux/netfilter/Kbuild +++ b/include/uapi/linux/netfilter/Kbuild @@ -22,6 +22,7 @@ header-y += xt_CONNMARK.h header-y += xt_CONNSECMARK.h header-y += xt_CT.h header-y += xt_DSCP.h +header-y += xt_HMARK.h header-y += xt_IDLETIMER.h header-y += xt_LED.h header-y += xt_LOG.h diff --git a/include/uapi/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h new file mode 100644 index 000000000000..826fc5807577 --- /dev/null +++ b/include/uapi/linux/netfilter/xt_HMARK.h @@ -0,0 +1,50 @@ +#ifndef XT_HMARK_H_ +#define XT_HMARK_H_ + +#include + +enum { + XT_HMARK_SADDR_MASK, + XT_HMARK_DADDR_MASK, + XT_HMARK_SPI, + XT_HMARK_SPI_MASK, + XT_HMARK_SPORT, + XT_HMARK_DPORT, + XT_HMARK_SPORT_MASK, + XT_HMARK_DPORT_MASK, + XT_HMARK_PROTO_MASK, + XT_HMARK_RND, + XT_HMARK_MODULUS, + XT_HMARK_OFFSET, + XT_HMARK_CT, + XT_HMARK_METHOD_L3, + XT_HMARK_METHOD_L3_4, +}; +#define XT_HMARK_FLAG(flag) (1 << flag) + +union hmark_ports { + struct { + __u16 src; + __u16 dst; + } p16; + struct { + __be16 src; + __be16 dst; + } b16; + __u32 v32; + __be32 b32; +}; + +struct xt_hmark_info { + union nf_inet_addr src_mask; + union nf_inet_addr dst_mask; + union hmark_ports port_mask; + union hmark_ports port_set; + __u32 flags; + __u16 proto_mask; + __u32 hashrnd; + __u32 hmodulus; + __u32 hoffset; /* Mark offset to start from */ +}; + +#endif /* XT_HMARK_H_ */ -- cgit v1.2.3-71-gd317 From 8a8e3d84b1719a56f9151909e80ea6ebc5b8e318 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 14 Aug 2013 23:47:11 +0200 Subject: net_sched: restore "linklayer atm" handling commit 56b765b79 ("htb: improved accuracy at high rates") broke the "linklayer atm" handling. tc class add ... htb rate X ceil Y linklayer atm The linklayer setting is implemented by modifying the rate table which is send to the kernel. No direct parameter were transferred to the kernel indicating the linklayer setting. The commit 56b765b79 ("htb: improved accuracy at high rates") removed the use of the rate table system. To keep compatible with older iproute2 utils, this patch detects the linklayer by parsing the rate table. It also supports future versions of iproute2 to send this linklayer parameter to the kernel directly. This is done by using the __reserved field in struct tc_ratespec, to convey the choosen linklayer option, but only using the lower 4 bits of this field. Linklayer detection is limited to speeds below 100Mbit/s, because at high rates the rtab is gets too inaccurate, so bad that several fields contain the same values, this resembling the ATM detect. Fields even start to contain "0" time to send, e.g. at 1000Mbit/s sending a 96 bytes packet cost "0", thus the rtab have been more broken than we first realized. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 ++++++++- include/uapi/linux/pkt_sched.h | 10 +++++++++- net/sched/sch_api.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/sched/sch_generic.c | 1 + net/sched/sch_htb.c | 13 +++++++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6eab63363e59..e5ae0c50fa9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -683,13 +683,19 @@ struct psched_ratecfg { u64 rate_bytes_ps; /* bytes per second */ u32 mult; u16 overhead; + u8 linklayer; u8 shift; }; static inline u64 psched_l2t_ns(const struct psched_ratecfg *r, unsigned int len) { - return ((u64)(len + r->overhead) * r->mult) >> r->shift; + len += r->overhead; + + if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) + return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift; + + return ((u64)len * r->mult) >> r->shift; } extern void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf); @@ -700,6 +706,7 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, memset(res, 0, sizeof(*res)); res->rate = r->rate_bytes_ps; res->overhead = r->overhead; + res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index dbd71b0c7d8c..09d62b9228ff 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -73,9 +73,17 @@ struct tc_estimator { #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + struct tc_ratespec { unsigned char cell_log; - unsigned char __reserved; + __u8 linklayer; /* lower 4 bits */ unsigned short overhead; short cell_align; unsigned short mpu; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 281c1bded1f6..51b968d3febb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -285,6 +285,45 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) return q; } +/* The linklayer setting were not transferred from iproute2, in older + * versions, and the rate tables lookup systems have been dropped in + * the kernel. To keep backward compatible with older iproute2 tc + * utils, we detect the linklayer setting by detecting if the rate + * table were modified. + * + * For linklayer ATM table entries, the rate table will be aligned to + * 48 bytes, thus some table entries will contain the same value. The + * mpu (min packet unit) is also encoded into the old rate table, thus + * starting from the mpu, we find low and high table entries for + * mapping this cell. If these entries contain the same value, when + * the rate tables have been modified for linklayer ATM. + * + * This is done by rounding mpu to the nearest 48 bytes cell/entry, + * and then roundup to the next cell, calc the table entry one below, + * and compare. + */ +static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) +{ + int low = roundup(r->mpu, 48); + int high = roundup(low+1, 48); + int cell_low = low >> r->cell_log; + int cell_high = (high >> r->cell_log) - 1; + + /* rtab is too inaccurate at rates > 100Mbit/s */ + if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { + pr_debug("TC linklayer: Giving up ATM detection\n"); + return TC_LINKLAYER_ETHERNET; + } + + if ((cell_high > cell_low) && (cell_high < 256) + && (rtab[cell_low] == rtab[cell_high])) { + pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", + cell_low, cell_high, rtab[cell_high]); + return TC_LINKLAYER_ATM; + } + return TC_LINKLAYER_ETHERNET; +} + static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) @@ -308,6 +347,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *ta rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); + if (r->linklayer == TC_LINKLAYER_UNAWARE) + r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index eeb8276d7a89..48be3d5c0d92 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -909,6 +909,7 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->rate_bytes_ps = conf->rate; + r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* * The deal here is to replace a divide by a reciprocal one diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 45e751527dfc..c2178b15ca6e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1329,6 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; @@ -1350,6 +1351,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + /* Keeping backward compatible with rate_table based iproute2 tc */ + if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) { + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); + if (rtab) + qdisc_put_rtab(rtab); + } + if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) { + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); + if (ctab) + qdisc_put_rtab(ctab); + } + if (!cl) { /* new class */ struct Qdisc *new_q; int prio; -- cgit v1.2.3-71-gd317 From 7869a4a6c5caa7b2e5c41ccaf46eb3371f88eea7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Aug 2013 22:05:14 -0400 Subject: ext4: add support for extent pre-caching Add a new fiemap flag which forces the all of the extents in an inode to be cached in the extent_status tree. This is critically important when using AIO to a preallocated file, since if we need to read in blocks from the extent tree, the io_submit(2) system call becomes synchronous, and the AIO is no longer "A", which is bad. In addition, for most files which have an external leaf tree block, the cost of caching the information in the extent status tree will be less than caching the entire 4k block in the buffer cache. So it is generally a win to keep the extent information cached. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 17 ++++++----- fs/ext4/extents.c | 73 ++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/extents_status.c | 72 +++++++++++++++++++++++++++++++------------- fs/ext4/ioctl.c | 3 ++ include/uapi/linux/fiemap.h | 1 + 5 files changed, 137 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c74b1948feb0..635135e6148e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -561,15 +561,16 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* - * The bit position of this flag must not overlap with any of the - * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(), + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), - * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to - * indicate that the we shouldn't be caching the extents when reading - * from the extent tree while a truncate or punch hole operation - * is in progress. + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. */ #define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 /* * Flags used by ext4_free_blocks @@ -601,6 +602,7 @@ enum { #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -1386,6 +1388,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); - +extern int ext4_ext_precache(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 08c1ac976479..01838875fcaf 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (err < 0) goto errout; } - if (buffer_verified(bh)) + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh), depth, pblk); @@ -526,6 +526,71 @@ errout: __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ (depth), (flags)) +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} + #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { @@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return error; } + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 1dc5df016e25..0e88a367b535 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, write_lock(&EXT4_I(inode)->i_es_lock); es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); - if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end))) - goto out; - - __es_insert_extent(inode, &newes); -out: + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); write_unlock(&EXT4_I(inode)->i_es_lock); } @@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, eia = list_entry(a, struct ext4_inode_info, i_es_lru); eib = list_entry(b, struct ext4_inode_info, i_es_lru); + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; if (eia->i_touch_when == eib->i_touch_when) return 0; if (time_after(eia->i_touch_when, eib->i_touch_when)) @@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, { struct ext4_inode_info *ei; struct list_head *cur, *tmp; - LIST_HEAD(skiped); + LIST_HEAD(skipped); int ret, nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); - /* - * If the inode that is at the head of LRU list is newer than - * last_sorted time, that means that we need to sort this list. - */ - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; - } - +retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { /* * If we have already reclaimed all extents from extent @@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - /* Skip the inode that is newer than the last_sorted time */ - if (sbi->s_es_last_sorted < ei->i_touch_when) { - list_move_tail(cur, &skiped); + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); continue; } @@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, } /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skiped, &sbi->s_es_lru); + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + spin_unlock(&sbi->s_es_lru_lock); if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); return nr_shrunk; } @@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct rb_node *node; struct extent_status *es; int nr_shrunk = 0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (ei->i_es_lru_nr == 0) return 0; + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && + __ratelimit(&_rs)) + ext4_warning(inode->i_sb, "forced shrink of precached extents"); + node = rb_first(&tree->root); while (node != NULL) { es = rb_entry(node, struct extent_status, rb_node); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c0427e2f6648..5498f75a1648 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -624,6 +624,8 @@ resizefs_out: return 0; } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_MOVE_EXT: case FITRIM: case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index d830747f5c0b..0c51d617dae9 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -40,6 +40,7 @@ struct fiemap { #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ +#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) -- cgit v1.2.3-71-gd317 From bc9028e1d38419f9249cb0d1285e290be7e67223 Mon Sep 17 00:00:00 2001 From: Arun Kumar K Date: Tue, 9 Jul 2013 01:24:41 -0300 Subject: [media] V4L: Add VP8 encoder controls This patch adds new V4L controls for VP8 encoding. Signed-off-by: Kiran AVND Signed-off-by: Arun Kumar K Acked-by: Hans Verkuil Signed-off-by: Kamil Debski Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/controls.xml | 168 ++++++++++++++++++++++++++- drivers/media/v4l2-core/v4l2-ctrls.c | 39 ++++++- include/uapi/linux/v4l2-controls.h | 29 +++++ 3 files changed, 229 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index c2fc9ec1417e..7a3b49b3cc3b 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml @@ -722,17 +722,22 @@ for more details.
- MPEG Control Reference + Codec Control Reference - Below all controls within the MPEG control class are + Below all controls within the Codec control class are described. First the generic controls, then controls specific for certain hardware. + Note: These controls are applicable to all codecs and +not just MPEG. The defines are prefixed with V4L2_CID_MPEG/V4L2_MPEG +as the controls were originally made for MPEG codecs and later +extended to cover all encoding formats. +
- Generic MPEG Controls + Generic Codec Controls - MPEG Control IDs + Codec Control IDs @@ -752,7 +757,7 @@ certain hardware. V4L2_CID_MPEG_CLASS  class - The MPEG class + The Codec class descriptor. Calling &VIDIOC-QUERYCTRL; for this control will return a description of this control class. This description can be used as the caption of a Tab page in a GUI, for example. @@ -3009,6 +3014,159 @@ in by the application. 0 = do not insert, 1 = insert packets.
+ +
+ VPX Control Reference + + The VPX controls include controls for encoding parameters + of VPx video codec. + + + VPX Control IDs + + + + + + + + + + + ID + Type + Description + + + + + + + + V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS + enum v4l2_vp8_num_partitions + + The number of token partitions to use in VP8 encoder. +Possible values are: + + + + + + V4L2_CID_MPEG_VIDEO_VPX_1_PARTITION + 1 coefficient partition + + + V4L2_CID_MPEG_VIDEO_VPX_2_PARTITIONS + 2 coefficient partitions + + + V4L2_CID_MPEG_VIDEO_VPX_4_PARTITIONS + 4 coefficient partitions + + + V4L2_CID_MPEG_VIDEO_VPX_8_PARTITIONS + 8 coefficient partitions + + + + + + + + V4L2_CID_MPEG_VIDEO_VPX_IMD_DISABLE_4X4 + boolean + + Setting this prevents intra 4x4 mode in the intra mode decision. + + + + + V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES + enum v4l2_vp8_num_ref_frames + + The number of reference pictures for encoding P frames. +Possible values are: + + + + + + V4L2_CID_MPEG_VIDEO_VPX_1_REF_FRAME + Last encoded frame will be searched + + + V4L2_CID_MPEG_VIDEO_VPX_2_REF_FRAME + Two frames will be searched among the last encoded frame, the golden frame +and the alternate reference (altref) frame. The encoder implementation will decide which two are chosen. + + + V4L2_CID_MPEG_VIDEO_VPX_3_REF_FRAME + The last encoded frame, the golden frame and the altref frame will be searched. + + + + + + + + V4L2_CID_MPEG_VIDEO_VPX_FILTER_LEVEL + integer + + Indicates the loop filter level. The adjustment of the loop +filter level is done via a delta value against a baseline loop filter value. + + + + + V4L2_CID_MPEG_VIDEO_VPX_FILTER_SHARPNESS + integer + + This parameter affects the loop filter. Anything above +zero weakens the deblocking effect on the loop filter. + + + + + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD + integer + + Sets the refresh period for the golden frame. The period is defined +in number of frames. For a value of 'n', every nth frame starting from the first key frame will be taken as a golden frame. +For eg. for encoding sequence of 0, 1, 2, 3, 4, 5, 6, 7 where the golden frame refresh period is set as 4, the frames +0, 4, 8 etc will be taken as the golden frames as frame 0 is always a key frame. + + + + + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL + enum v4l2_vp8_golden_frame_sel + + Selects the golden frame for encoding. +Possible values are: + + + + + + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_USE_PREV + Use the (n-2)th frame as a golden frame, current frame index being 'n'. + + + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_USE_REF_PERIOD + Use the previous specific frame indicated by +V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD as a golden frame. + + + + + + + + +
+ +
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index e03a2e852143..c6dc1fd427d7 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -424,6 +424,12 @@ const char * const *v4l2_ctrl_get_menu(u32 id) NULL, }; + static const char * const vpx_golden_frame_sel[] = { + "Use Previous Frame", + "Use Previous Specific Frame", + NULL, + }; + static const char * const flash_led_mode[] = { "Off", "Flash", @@ -538,6 +544,8 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return mpeg_mpeg4_level; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: return mpeg4_profile; + case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: + return vpx_golden_frame_sel; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return jpeg_chroma_subsampling; case V4L2_CID_DV_TX_MODE: @@ -552,13 +560,26 @@ const char * const *v4l2_ctrl_get_menu(u32 id) } EXPORT_SYMBOL(v4l2_ctrl_get_menu); +#define __v4l2_qmenu_int_len(arr, len) ({ *(len) = ARRAY_SIZE(arr); arr; }) /* * Returns NULL or an s64 type array containing the menu for given * control ID. The total number of the menu items is returned in @len. */ const s64 const *v4l2_ctrl_get_int_menu(u32 id, u32 *len) { + static const s64 const qmenu_int_vpx_num_partitions[] = { + 1, 2, 4, 8, + }; + + static const s64 const qmenu_int_vpx_num_ref_frames[] = { + 1, 2, 3, + }; + switch (id) { + case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: + return __v4l2_qmenu_int_len(qmenu_int_vpx_num_partitions, len); + case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: + return __v4l2_qmenu_int_len(qmenu_int_vpx_num_ref_frames, len); default: *len = 0; return NULL; @@ -614,9 +635,11 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_ALPHA_COMPONENT: return "Alpha Component"; case V4L2_CID_COLORFX_CBCR: return "Color Effects, CbCr"; - /* MPEG controls */ + /* Codec controls */ + /* The MPEG controls are applicable to all codec controls + * and the 'MPEG' part of the define is historical */ /* Keep the order of the 'case's the same as in videodev2.h! */ - case V4L2_CID_MPEG_CLASS: return "MPEG Encoder Controls"; + case V4L2_CID_MPEG_CLASS: return "Codec Controls"; case V4L2_CID_MPEG_STREAM_TYPE: return "Stream Type"; case V4L2_CID_MPEG_STREAM_PID_PMT: return "Stream PMT Program ID"; case V4L2_CID_MPEG_STREAM_PID_AUDIO: return "Stream Audio Program ID"; @@ -714,6 +737,15 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_VBV_DELAY: return "Initial Delay for VBV Control"; case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER: return "Repeat Sequence Header"; + /* VPX controls */ + case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: return "VPX Number of Partitions"; + case V4L2_CID_MPEG_VIDEO_VPX_IMD_DISABLE_4X4: return "VPX Intra Mode Decision Disable"; + case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: return "VPX No. of Refs for P Frame"; + case V4L2_CID_MPEG_VIDEO_VPX_FILTER_LEVEL: return "VPX Loop Filter Level Range"; + case V4L2_CID_MPEG_VIDEO_VPX_FILTER_SHARPNESS: return "VPX Deblocking Effect Control"; + case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD: return "VPX Golden Frame Refresh Period"; + case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: return "VPX Golden Frame Indicator"; + /* CAMERA controls */ /* Keep the order of the 'case's the same as in videodev2.h! */ case V4L2_CID_CAMERA_CLASS: return "Camera Controls"; @@ -928,6 +960,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_DV_RX_RGB_RANGE: case V4L2_CID_TEST_PATTERN: case V4L2_CID_TUNE_DEEMPHASIS: + case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: *type = V4L2_CTRL_TYPE_MENU; break; case V4L2_CID_LINK_FREQ: @@ -939,6 +972,8 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, break; case V4L2_CID_ISO_SENSITIVITY: case V4L2_CID_AUTO_EXPOSURE_BIAS: + case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: + case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: *type = V4L2_CTRL_TYPE_INTEGER_MENU; break; case V4L2_CID_USER_CLASS: diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index e90a88a8708f..083bb5a5aae2 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -161,6 +161,8 @@ enum v4l2_colorfx { #define V4L2_CID_USER_SI476X_BASE (V4L2_CID_USER_BASE + 0x1040) /* MPEG-class control IDs */ +/* The MPEG controls are applicable to all codec controls + * and the 'MPEG' part of the define is historical */ #define V4L2_CID_MPEG_BASE (V4L2_CTRL_CLASS_MPEG | 0x900) #define V4L2_CID_MPEG_CLASS (V4L2_CTRL_CLASS_MPEG | 1) @@ -522,6 +524,33 @@ enum v4l2_mpeg_video_mpeg4_profile { }; #define V4L2_CID_MPEG_VIDEO_MPEG4_QPEL (V4L2_CID_MPEG_BASE+407) +/* Control IDs for VP8 streams + * Although VP8 is not part of MPEG we add these controls to the MPEG class + * as that class is already handling other video compression standards + */ +#define V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS (V4L2_CID_MPEG_BASE+500) +enum v4l2_vp8_num_partitions { + V4L2_CID_MPEG_VIDEO_VPX_1_PARTITION = 0, + V4L2_CID_MPEG_VIDEO_VPX_2_PARTITIONS = 1, + V4L2_CID_MPEG_VIDEO_VPX_4_PARTITIONS = 2, + V4L2_CID_MPEG_VIDEO_VPX_8_PARTITIONS = 3, +}; +#define V4L2_CID_MPEG_VIDEO_VPX_IMD_DISABLE_4X4 (V4L2_CID_MPEG_BASE+501) +#define V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES (V4L2_CID_MPEG_BASE+502) +enum v4l2_vp8_num_ref_frames { + V4L2_CID_MPEG_VIDEO_VPX_1_REF_FRAME = 0, + V4L2_CID_MPEG_VIDEO_VPX_2_REF_FRAME = 1, + V4L2_CID_MPEG_VIDEO_VPX_3_REF_FRAME = 2, +}; +#define V4L2_CID_MPEG_VIDEO_VPX_FILTER_LEVEL (V4L2_CID_MPEG_BASE+503) +#define V4L2_CID_MPEG_VIDEO_VPX_FILTER_SHARPNESS (V4L2_CID_MPEG_BASE+504) +#define V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD (V4L2_CID_MPEG_BASE+505) +#define V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL (V4L2_CID_MPEG_BASE+506) +enum v4l2_vp8_golden_frame_sel { + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_USE_PREV = 0, + V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_USE_REF_PERIOD = 1, +}; + /* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_MPEG_CX2341X_BASE (V4L2_CTRL_CLASS_MPEG | 0x1000) #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE (V4L2_CID_MPEG_CX2341X_BASE+0) -- cgit v1.2.3-71-gd317 From f57fa2102cd5c0b50359def79a3d39cda8431204 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 5 Jun 2013 04:19:53 -0300 Subject: [media] v4l: Add media format codes for ARGB8888 and AYUV8888 on 32-bit busses Signed-off-by: Laurent Pinchart Reviewed-by: Sylwester Nawrocki Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/subdev-formats.xml | 609 +++++++++------------ Documentation/DocBook/media_api.tmpl | 6 + include/uapi/linux/v4l2-mediabus.h | 6 +- 3 files changed, 254 insertions(+), 367 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml index 0c2b1f2e8754..f72c1cc93a9b 100644 --- a/Documentation/DocBook/media/v4l/subdev-formats.xml +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml @@ -97,31 +97,39 @@ - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Identifier @@ -133,6 +141,14 @@ Bit + 31 + 30 + 29 + 28 + 27 + 26 + 25 + 24 23 22 21 @@ -164,7 +180,7 @@ V4L2_MBUS_FMT_RGB444_2X8_PADHI_BE 0x1001 - &dash-ent-16; + &dash-ent-24; 0 0 0 @@ -178,7 +194,7 @@ - &dash-ent-16; + &dash-ent-24; g3 g2 g1 @@ -192,7 +208,7 @@ V4L2_MBUS_FMT_RGB444_2X8_PADHI_LE 0x1002 - &dash-ent-16; + &dash-ent-24; g3 g2 g1 @@ -206,7 +222,7 @@ - &dash-ent-16; + &dash-ent-24; 0 0 0 @@ -220,7 +236,7 @@ V4L2_MBUS_FMT_RGB555_2X8_PADHI_BE 0x1003 - &dash-ent-16; + &dash-ent-24; 0 r4 r3 @@ -234,7 +250,7 @@ - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -248,7 +264,7 @@ V4L2_MBUS_FMT_RGB555_2X8_PADHI_LE 0x1004 - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -262,7 +278,7 @@ - &dash-ent-16; + &dash-ent-24; 0 r4 r3 @@ -276,7 +292,7 @@ V4L2_MBUS_FMT_BGR565_2X8_BE 0x1005 - &dash-ent-16; + &dash-ent-24; b4 b3 b2 @@ -290,7 +306,7 @@ - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -304,7 +320,7 @@ V4L2_MBUS_FMT_BGR565_2X8_LE 0x1006 - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -318,7 +334,7 @@ - &dash-ent-16; + &dash-ent-24; b4 b3 b2 @@ -332,7 +348,7 @@ V4L2_MBUS_FMT_RGB565_2X8_BE 0x1007 - &dash-ent-16; + &dash-ent-24; r4 r3 r2 @@ -346,7 +362,7 @@ - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -360,7 +376,7 @@ V4L2_MBUS_FMT_RGB565_2X8_LE 0x1008 - &dash-ent-16; + &dash-ent-24; g2 g1 g0 @@ -374,7 +390,7 @@ - &dash-ent-16; + &dash-ent-24; r4 r3 r2 @@ -388,12 +404,7 @@ V4L2_MBUS_FMT_RGB666_1X18 0x1009 - - - - - - - - - - - - + &dash-ent-14; r5 r4 r3 @@ -417,6 +428,7 @@ V4L2_MBUS_FMT_RGB888_1X24 0x100a + &dash-ent-8; r7 r6 r5 @@ -446,9 +458,7 @@ V4L2_MBUS_FMT_RGB888_2X12_BE 0x100b - &dash-ent-10; - - - - + &dash-ent-20; r7 r6 r5 @@ -466,9 +476,7 @@ - &dash-ent-10; - - - - + &dash-ent-20; g3 g2 g1 @@ -486,9 +494,7 @@ V4L2_MBUS_FMT_RGB888_2X12_LE 0x100c - &dash-ent-10; - - - - + &dash-ent-20; g3 g2 g1 @@ -506,9 +512,7 @@ - &dash-ent-10; - - - - + &dash-ent-20; r7 r6 r5 @@ -522,6 +526,43 @@ g5 g4 + + V4L2_MBUS_FMT_ARGB888_1X32 + 0x100d + + a7 + a6 + a5 + a4 + a3 + a2 + a1 + a0 + r7 + r6 + r5 + r4 + r3 + r2 + r1 + r0 + g7 + g6 + g5 + g4 + g3 + g2 + g1 + g0 + b7 + b6 + b5 + b4 + b3 + b2 + b1 + b0 + @@ -1149,6 +1190,7 @@ yx for luma component bit number x ux for blue chroma component bit number x vx for red chroma component bit number x + ax for alpha component bit number x - for non-available bits (for positions higher than the bus width) d for dummy bits @@ -1159,37 +1201,39 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Identifier @@ -1201,6 +1245,8 @@ Bit + 31 + 30 29 28 27 @@ -1238,10 +1284,7 @@ V4L2_MBUS_FMT_Y8_1X8 0x2001 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1255,18 +1298,7 @@ V4L2_MBUS_FMT_UV8_1X8 0x2015 - - - - - - - - - - - - - - - - - - - - - - - - + &dash-ent-24; u7 u6 u5 @@ -1280,18 +1312,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - + &dash-ent-24; v7 v6 v5 @@ -1305,10 +1326,7 @@ V4L2_MBUS_FMT_UYVY8_1_5X8 0x2002 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1322,10 +1340,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1339,10 +1354,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1356,10 +1368,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1373,10 +1382,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1390,10 +1396,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1407,10 +1410,7 @@ V4L2_MBUS_FMT_VYUY8_1_5X8 0x2003 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1424,10 +1424,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1441,10 +1438,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1458,10 +1452,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1475,10 +1466,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1492,10 +1480,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1509,10 +1494,7 @@ V4L2_MBUS_FMT_YUYV8_1_5X8 0x2004 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1526,10 +1508,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1543,10 +1522,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1560,10 +1536,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1577,10 +1550,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1594,10 +1564,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1611,10 +1578,7 @@ V4L2_MBUS_FMT_YVYU8_1_5X8 0x2005 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1628,10 +1592,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1645,10 +1606,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1662,10 +1620,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1679,10 +1634,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1696,10 +1648,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1713,10 +1662,7 @@ V4L2_MBUS_FMT_UYVY8_2X8 0x2006 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1730,10 +1676,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1747,10 +1690,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1764,10 +1704,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1781,10 +1718,7 @@ V4L2_MBUS_FMT_VYUY8_2X8 0x2007 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1798,10 +1732,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1815,10 +1746,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1832,10 +1760,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1849,10 +1774,7 @@ V4L2_MBUS_FMT_YUYV8_2X8 0x2008 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1866,10 +1788,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1883,10 +1802,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1900,10 +1816,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1917,10 +1830,7 @@ V4L2_MBUS_FMT_YVYU8_2X8 0x2009 - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1934,10 +1844,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; v7 v6 v5 @@ -1951,10 +1858,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; y7 y6 y5 @@ -1968,10 +1872,7 @@ - &dash-ent-10; - &dash-ent-10; - - - - + &dash-ent-24; u7 u6 u5 @@ -1985,8 +1886,7 @@ V4L2_MBUS_FMT_Y10_1X10 0x200a - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; y9 y8 y7 @@ -2002,8 +1902,7 @@ V4L2_MBUS_FMT_YUYV10_2X10 0x200b - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; y9 y8 y7 @@ -2019,8 +1918,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; u9 u8 u7 @@ -2036,8 +1934,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; y9 y8 y7 @@ -2053,8 +1950,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; v9 v8 v7 @@ -2070,8 +1966,7 @@ V4L2_MBUS_FMT_YVYU10_2X10 0x200c - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; y9 y8 y7 @@ -2087,8 +1982,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; v9 v8 v7 @@ -2104,8 +1998,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; y9 y8 y7 @@ -2121,8 +2014,7 @@ - &dash-ent-10; - &dash-ent-10; + &dash-ent-22; u9 u8 u7 @@ -2138,15 +2030,7 @@ V4L2_MBUS_FMT_Y12_1X12 0x2013 - &dash-ent-10; - - - - - - - - - - - - - - - - + &dash-ent-20; y11 y10 y9 @@ -2164,11 +2048,7 @@ V4L2_MBUS_FMT_UYVY8_1X16 0x200f - &dash-ent-10; - - - - - - - - + &dash-ent-16; u7 u6 u5 @@ -2190,11 +2070,7 @@ - &dash-ent-10; - - - - - - - - + &dash-ent-16; v7 v6 v5 @@ -2216,11 +2092,7 @@ V4L2_MBUS_FMT_VYUY8_1X16 0x2010 - &dash-ent-10; - - - - - - - - + &dash-ent-16; v7 v6 v5 @@ -2242,11 +2114,7 @@ - &dash-ent-10; - - - - - - - - + &dash-ent-16; u7 u6 u5 @@ -2268,11 +2136,7 @@ V4L2_MBUS_FMT_YUYV8_1X16 0x2011 - &dash-ent-10; - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2294,11 +2158,7 @@ - &dash-ent-10; - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2320,11 +2180,7 @@ V4L2_MBUS_FMT_YVYU8_1X16 0x2012 - &dash-ent-10; - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2346,11 +2202,7 @@ - &dash-ent-10; - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2372,10 +2224,7 @@ V4L2_MBUS_FMT_YDYUYDYV8_1X16 0x2014 - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2397,10 +2246,7 @@ - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2422,10 +2268,7 @@ - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2447,10 +2290,7 @@ - - - - - - - - + &dash-ent-16; y7 y6 y5 @@ -2472,7 +2312,7 @@ V4L2_MBUS_FMT_YUYV10_1X20 0x200d - &dash-ent-10; + &dash-ent-12; y9 y8 y7 @@ -2498,7 +2338,7 @@ - &dash-ent-10; + &dash-ent-12; y9 y8 y7 @@ -2524,7 +2364,7 @@ V4L2_MBUS_FMT_YVYU10_1X20 0x200e - &dash-ent-10; + &dash-ent-12; y9 y8 y7 @@ -2550,7 +2390,7 @@ - &dash-ent-10; + &dash-ent-12; y9 y8 y7 @@ -2576,6 +2416,8 @@ V4L2_MBUS_FMT_YUV10_1X30 0x2016 + - + - y9 y8 y7 @@ -2607,6 +2449,43 @@ v1 v0 + + V4L2_MBUS_FMT_AYUV8_1X32 + 0x2017 + + a7 + a6 + a5 + a4 + a3 + a2 + a1 + a0 + y7 + y6 + y5 + y4 + y3 + y2 + y1 + y0 + u7 + u6 + u5 + u4 + u3 + u2 + u1 + u0 + v7 + v6 + v5 + v4 + v3 + v2 + v1 + v0 + diff --git a/Documentation/DocBook/media_api.tmpl b/Documentation/DocBook/media_api.tmpl index 6a8b7158697f..07e7eea52a0b 100644 --- a/Documentation/DocBook/media_api.tmpl +++ b/Documentation/DocBook/media_api.tmpl @@ -22,8 +22,14 @@ http://linuxtv.org/repo/"> +--------"> ----------"> +------------"> +--------------"> ----------------"> +--------------------"> +----------------------"> +------------------------"> ]> diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index 6ee63d09b32d..a9601257bb43 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -37,7 +37,7 @@ enum v4l2_mbus_pixelcode { V4L2_MBUS_FMT_FIXED = 0x0001, - /* RGB - next is 0x100d */ + /* RGB - next is 0x100e */ V4L2_MBUS_FMT_RGB444_2X8_PADHI_BE = 0x1001, V4L2_MBUS_FMT_RGB444_2X8_PADHI_LE = 0x1002, V4L2_MBUS_FMT_RGB555_2X8_PADHI_BE = 0x1003, @@ -50,8 +50,9 @@ enum v4l2_mbus_pixelcode { V4L2_MBUS_FMT_RGB888_1X24 = 0x100a, V4L2_MBUS_FMT_RGB888_2X12_BE = 0x100b, V4L2_MBUS_FMT_RGB888_2X12_LE = 0x100c, + V4L2_MBUS_FMT_ARGB8888_1X32 = 0x100d, - /* YUV (including grey) - next is 0x2017 */ + /* YUV (including grey) - next is 0x2018 */ V4L2_MBUS_FMT_Y8_1X8 = 0x2001, V4L2_MBUS_FMT_UV8_1X8 = 0x2015, V4L2_MBUS_FMT_UYVY8_1_5X8 = 0x2002, @@ -74,6 +75,7 @@ enum v4l2_mbus_pixelcode { V4L2_MBUS_FMT_YUYV10_1X20 = 0x200d, V4L2_MBUS_FMT_YVYU10_1X20 = 0x200e, V4L2_MBUS_FMT_YUV10_1X30 = 0x2016, + V4L2_MBUS_FMT_AYUV8_1X32 = 0x2017, /* Bayer - next is 0x3019 */ V4L2_MBUS_FMT_SBGGR8_1X8 = 0x3001, -- cgit v1.2.3-71-gd317 From 8493054844145e47a8f0668f57e2b0073aca850f Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 26 Jun 2013 09:46:42 -0300 Subject: [media] v4l: Add V4L2_PIX_FMT_NV16M and V4L2_PIX_FMT_NV61M formats NV16M and NV61M are planar YCbCr 4:2:2 and YCrCb 4:2:2 formats with a luma plane followed by an interleaved chroma plane. The planes are not required to be contiguous in memory, and the formats can only be used with the multi-planar formats API. Signed-off-by: Laurent Pinchart Reviewed-by: Sylwester Nawrocki Reviewed-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/pixfmt-nv16m.xml | 171 +++++++++++++++++++++++ Documentation/DocBook/media/v4l/pixfmt.xml | 1 + include/uapi/linux/videodev2.h | 2 + 3 files changed, 174 insertions(+) create mode 100644 Documentation/DocBook/media/v4l/pixfmt-nv16m.xml (limited to 'include/uapi/linux') diff --git a/Documentation/DocBook/media/v4l/pixfmt-nv16m.xml b/Documentation/DocBook/media/v4l/pixfmt-nv16m.xml new file mode 100644 index 000000000000..c51d5a4cda09 --- /dev/null +++ b/Documentation/DocBook/media/v4l/pixfmt-nv16m.xml @@ -0,0 +1,171 @@ + + + V4L2_PIX_FMT_NV16M ('NM16'), V4L2_PIX_FMT_NV61M ('NM61') + &manvol; + + + V4L2_PIX_FMT_NV16M + V4L2_PIX_FMT_NV61M + Variation of V4L2_PIX_FMT_NV16 and V4L2_PIX_FMT_NV61 with planes + non contiguous in memory. + + + Description + + This is a multi-planar, two-plane version of the YUV 4:2:0 format. +The three components are separated into two sub-images or planes. +V4L2_PIX_FMT_NV16M differs from V4L2_PIX_FMT_NV16 + in that the two planes are non-contiguous in memory, i.e. the chroma +plane does not necessarily immediately follows the luma plane. +The luminance data occupies the first plane. The Y plane has one byte per pixel. +In the second plane there is chrominance data with alternating chroma samples. +The CbCr plane is the same width and height, in bytes, as the Y plane. +Each CbCr pair belongs to four pixels. For example, +Cb0/Cr0 belongs to +Y'00, Y'01, +Y'10, Y'11. +V4L2_PIX_FMT_NV61M is the same as V4L2_PIX_FMT_NV16M +except the Cb and Cr bytes are swapped, the CrCb plane starts with a Cr byte. + + V4L2_PIX_FMT_NV16M and +V4L2_PIX_FMT_NV61M are intended to be used only in drivers +and applications that support the multi-planar API, described in +. + + + <constant>V4L2_PIX_FMT_NV16M</constant> 4 × 4 pixel image + + + Byte Order. + Each cell is one byte. + + + + + + start0 + 0: + Y'00 + Y'01 + Y'02 + Y'03 + + + start0 + 4: + Y'10 + Y'11 + Y'12 + Y'13 + + + start0 + 8: + Y'20 + Y'21 + Y'22 + Y'23 + + + start0 + 12: + Y'30 + Y'31 + Y'32 + Y'33 + + + + + + start1 + 0: + Cb00 + Cr00 + Cb02 + Cr02 + + + start1 + 4: + Cb10 + Cr10 + Cb12 + Cr12 + + + start1 + 8: + Cb20 + Cr20 + Cb22 + Cr22 + + + start1 + 12: + Cb30 + Cr30 + Cb32 + Cr32 + + + + + + + + + Color Sample Location. + + + + + + + 01 + 23 + + + 0 + YY + YY + + + + C + C + + + 1 + YY + YY + + + + C + C + + + + + + 2 + YY + YY + + + + C + C + + + 3 + YY + YY + + + + C + C + + + + + + + + + diff --git a/Documentation/DocBook/media/v4l/pixfmt.xml b/Documentation/DocBook/media/v4l/pixfmt.xml index 99b8d2ad6e4f..16db350848af 100644 --- a/Documentation/DocBook/media/v4l/pixfmt.xml +++ b/Documentation/DocBook/media/v4l/pixfmt.xml @@ -718,6 +718,7 @@ information. &sub-nv12m; &sub-nv12mt; &sub-nv16; + &sub-nv16m; &sub-nv24; &sub-m420;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 95ef4551edc1..fec0c205f38b 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -348,6 +348,8 @@ struct v4l2_pix_format { /* two non contiguous planes - one Y, one Cr + Cb interleaved */ #define V4L2_PIX_FMT_NV12M v4l2_fourcc('N', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 */ #define V4L2_PIX_FMT_NV21M v4l2_fourcc('N', 'M', '2', '1') /* 21 Y/CrCb 4:2:0 */ +#define V4L2_PIX_FMT_NV16M v4l2_fourcc('N', 'M', '1', '6') /* 16 Y/CbCr 4:2:2 */ +#define V4L2_PIX_FMT_NV61M v4l2_fourcc('N', 'M', '6', '1') /* 16 Y/CrCb 4:2:2 */ #define V4L2_PIX_FMT_NV12MT v4l2_fourcc('T', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 64x32 macroblocks */ #define V4L2_PIX_FMT_NV12MT_16X16 v4l2_fourcc('V', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 16x16 macroblocks */ -- cgit v1.2.3-71-gd317 From 299878fa3c373dbf74edf5872c79ef4c65b80a04 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 29 Jul 2013 08:40:54 -0300 Subject: [media] v4l2-dv-timings.h: remove duplicate V4L2_DV_BT_DMT_1366X768P60 This particular DMT timing definition was duplicated in the header. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-dv-timings.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 4e0c58d25ff0..be709fe29552 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -823,12 +823,4 @@ V4L2_DV_FL_REDUCED_BLANKING) \ } -#define V4L2_DV_BT_DMT_1366X768P60 { \ - .type = V4L2_DV_BT_656_1120, \ - V4L2_INIT_BT_TIMINGS(1366, 768, 0, \ - V4L2_DV_HSYNC_POS_POL | V4L2_DV_VSYNC_POS_POL, \ - 85500000, 70, 143, 213, 3, 3, 24, 0, 0, 0, \ - V4L2_DV_BT_STD_DMT, 0) \ -} - #endif -- cgit v1.2.3-71-gd317 From 7f68127fa11f08c5468537eb28ca6b8b95d70f08 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 29 Jul 2013 08:40:58 -0300 Subject: [media] videodev2.h: defines to calculate blanking and frame sizes It is very common to have to calculate the total width and height of the blanking and the full frame, so add a few defines that deal with that. Signed-off-by: Hans Verkuil Acked-by: Lad, Prabhakar Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index fec0c205f38b..437f1b0f8937 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1057,6 +1057,16 @@ struct v4l2_bt_timings { or used depends on the hardware. */ #define V4L2_DV_FL_HALF_LINE (1 << 3) +/* A few useful defines to calculate the total blanking and frame sizes */ +#define V4L2_DV_BT_BLANKING_WIDTH(bt) \ + (bt->hfrontporch + bt->hsync + bt->hbackporch) +#define V4L2_DV_BT_FRAME_WIDTH(bt) \ + (bt->width + V4L2_DV_BT_BLANKING_WIDTH(bt)) +#define V4L2_DV_BT_BLANKING_HEIGHT(bt) \ + (bt->vfrontporch + bt->vsync + bt->vbackporch + \ + bt->il_vfrontporch + bt->il_vsync + bt->il_vbackporch) +#define V4L2_DV_BT_FRAME_HEIGHT(bt) \ + (bt->height + V4L2_DV_BT_BLANKING_HEIGHT(bt)) /** struct v4l2_dv_timings - DV timings * @type: the type of the timings -- cgit v1.2.3-71-gd317 From 58264848a5a7b91195f43c4729072e8cc980288d Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 19 Aug 2013 11:23:34 -0700 Subject: openvswitch: Add vxlan tunneling support. Following patch adds vxlan vport type for openvswitch using vxlan api. So now there is vxlan dependency for openvswitch. CC: Jesse Gross Signed-off-by: Pravin B Shelar Acked-by: Jesse Gross Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 11 +++ net/openvswitch/Kconfig | 13 +++ net/openvswitch/Makefile | 4 + net/openvswitch/vport-vxlan.c | 204 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 6 files changed, 236 insertions(+) create mode 100644 net/openvswitch/vport-vxlan.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index c55efaaa9bb4..52490b0e62b5 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -165,6 +165,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_NETDEV, /* network device */ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ + OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -211,6 +212,16 @@ enum ovs_vport_attr { #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels. + */ +enum { + OVS_TUNNEL_ATTR_UNSPEC, + OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */ + __OVS_TUNNEL_ATTR_MAX +}; + +#define OVS_TUNNEL_ATTR_MAX (__OVS_TUNNEL_ATTR_MAX - 1) + /* Flows. */ #define OVS_FLOW_FAMILY "ovs_flow" diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 27ee56b688a3..bed30e69baa7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -40,3 +40,16 @@ config OPENVSWITCH_GRE Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_VXLAN + bool "Open vSwitch VXLAN tunneling support" + depends on INET + depends on OPENVSWITCH + depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 01bddb2991e3..82e4ee54a44b 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,3 +13,7 @@ openvswitch-y := \ vport-gre.o \ vport-internal_dev.o \ vport-netdev.o + +ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) +openvswitch-y += vport-vxlan.o +endif diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000000..36848bd54a77 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct vxlan_port - Keeps track of open UDP ports + * @vs: vxlan_sock created for the port. + * @name: vport name. + */ +struct vxlan_port { + struct vxlan_sock *vs; + char name[IFNAMSIZ]; +}; + +static inline struct vxlan_port *vxlan_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +/* Called with rcu_read_lock and BH disabled. */ +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) +{ + struct ovs_key_ipv4_tunnel tun_key; + struct vport *vport = vs->data; + struct iphdr *iph; + __be64 key; + + /* Save outer tunnel values */ + iph = ip_hdr(skb); + key = cpu_to_be64(ntohl(vx_vni) >> 8); + ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + + ovs_vport_receive(vport, skb, &tun_key); +} + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + return 0; +} + +static void vxlan_tnl_destroy(struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + + vxlan_sock_release(vxlan_port->vs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct vxlan_port *vxlan_port; + struct vxlan_sock *vs; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct vxlan_port), + &ovs_vxlan_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + vxlan_port = vxlan_vport(vport); + strncpy(vxlan_port->name, parms->name, IFNAMSIZ); + + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true); + if (IS_ERR(vs)) { + ovs_vport_free(vport); + return (void *)vs; + } + vxlan_port->vs = vs; + + return vport; + +error: + return ERR_PTR(err); +} + +static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct rtable *rt; + struct flowi4 fl; + __be16 src_port; + int port_min; + int port_max; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->tun_key)) { + err = -EINVAL; + goto error; + } + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; + fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->local_df = 1; + + inet_get_local_port_range(&port_min, &port_max); + src_port = vxlan_src_port(port_min, port_max, skb); + + err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb, + fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, + OVS_CB(skb)->tun_key->ipv4_tos, + OVS_CB(skb)->tun_key->ipv4_ttl, df, + src_port, dst_port, + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *vxlan_get_name(const struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + return vxlan_port->name; +} + +const struct vport_ops ovs_vxlan_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_tnl_create, + .destroy = vxlan_tnl_destroy, + .get_name = vxlan_get_name, + .get_options = vxlan_get_options, + .send = vxlan_tnl_send, +}; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d4c7fa04ce08..d69e0c06dfde 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -42,6 +42,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_GRE &ovs_gre_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_VXLAN + &ovs_vxlan_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 376045c42f8b..1a9fbcec6e1b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -199,6 +199,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; +extern const struct vport_ops ovs_vxlan_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3-71-gd317 From af30cb446dd5f4ad5b93d7d4188c49a864c0d643 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Tue, 6 Aug 2013 17:27:07 -0500 Subject: quota: Add a new quotactl command Q_XGETQSTATV XFS now supports three types of quotas (user, group and project). Current version of Q_XGETSTAT has support for only two types of quotas. In order to support three types of quotas, the interface, specifically struct fs_quota_stat, need to be expanded. Current version of fs_quota_stat does not allow expansion without breaking backward compatibility. So, a quotactl command and new fs_quota_stat structure need to be added. This patch adds a new command Q_XGETQSTATV to quotactl() which takes a new data structure fs_quota_statv. This new data structure provides support for future expansion and backward compatibility. Callers of the new quotactl command have to set the version of the data structure being passed, and kernel will fill as much data as requested. If the kernel does not support the user-space provided version, EINVAL will be returned. User-space can reduce the version number and call the same quotactl again. Signed-off-by: Chandra Seetharaman Reviewed-by: Jan Kara Reviewed-by: Rich Johnston Signed-off-by: Ben Myers [v2: Applied rjohnston's suggestions as per Chandra's request. -bpm] --- fs/quota/quota.c | 29 ++++++++++++++++++++++++++ include/linux/quota.h | 1 + include/uapi/linux/dqblk_xfs.h | 47 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index c7314f1771f5..dea86e8967ee 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -27,6 +27,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, case Q_SYNC: case Q_GETINFO: case Q_XGETQSTAT: + case Q_XGETQSTATV: case Q_XQUOTASYNC: break; /* allow to query information for dquots we "own" */ @@ -217,6 +218,31 @@ static int quota_getxstate(struct super_block *sb, void __user *addr) return ret; } +static int quota_getxstatev(struct super_block *sb, void __user *addr) +{ + struct fs_quota_statv fqs; + int ret; + + if (!sb->s_qcop->get_xstatev) + return -ENOSYS; + + memset(&fqs, 0, sizeof(fqs)); + if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */ + return -EFAULT; + + /* If this kernel doesn't support user specified version, fail */ + switch (fqs.qs_version) { + case FS_QSTATV_VERSION1: + break; + default: + return -EINVAL; + } + ret = sb->s_qcop->get_xstatev(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { @@ -293,6 +319,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return quota_setxstate(sb, cmd, addr); case Q_XGETQSTAT: return quota_getxstate(sb, addr); + case Q_XGETQSTATV: + return quota_getxstatev(sb, addr); case Q_XSETQLIM: return quota_setxquota(sb, type, id, addr); case Q_XGETQUOTA: @@ -317,6 +345,7 @@ static int quotactl_cmd_write(int cmd) case Q_GETINFO: case Q_SYNC: case Q_XGETQSTAT: + case Q_XGETQSTATV: case Q_XGETQUOTA: case Q_XQUOTASYNC: return 0; diff --git a/include/linux/quota.h b/include/linux/quota.h index d13371134c59..cc7494a35429 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -328,6 +328,7 @@ struct quotactl_ops { int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *); int (*get_xstate)(struct super_block *, struct fs_quota_stat *); int (*set_xstate)(struct super_block *, unsigned int, int); + int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); }; struct quota_format_type { diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h index 86552807aed9..dcd75cc26196 100644 --- a/include/uapi/linux/dqblk_xfs.h +++ b/include/uapi/linux/dqblk_xfs.h @@ -38,6 +38,7 @@ #define Q_XGETQSTAT XQM_CMD(5) /* get quota subsystem status */ #define Q_XQUOTARM XQM_CMD(6) /* free disk space used by dquots */ #define Q_XQUOTASYNC XQM_CMD(7) /* delalloc flush, updates dquots */ +#define Q_XGETQSTATV XQM_CMD(8) /* newer version of get quota */ /* * fs_disk_quota structure: @@ -163,4 +164,50 @@ typedef struct fs_quota_stat { __u16 qs_iwarnlimit; /* limit for num warnings */ } fs_quota_stat_t; +/* + * fs_quota_statv is used by Q_XGETQSTATV for a given file system. It provides + * a centralized way to get meta information about the quota subsystem. eg. + * space taken up for user, group, and project quotas, number of dquots + * currently incore. + * + * This version has proper versioning support with appropriate padding for + * future expansions, and ability to expand for future without creating any + * backward compatibility issues. + * + * Q_XGETQSTATV uses the passed in value of the requested version via + * fs_quota_statv.qs_version to determine the return data layout of + * fs_quota_statv. The kernel will fill the data fields relevant to that + * version. + * + * If kernel does not support user space caller specified version, EINVAL will + * be returned. User space caller can then reduce the version number and retry + * the same command. + */ +#define FS_QSTATV_VERSION1 1 /* fs_quota_statv.qs_version */ +/* + * Some basic information about 'quota files' for Q_XGETQSTATV command + */ +struct fs_qfilestatv { + __u64 qfs_ino; /* inode number */ + __u64 qfs_nblks; /* number of BBs 512-byte-blks */ + __u32 qfs_nextents; /* number of extents */ + __u32 qfs_pad; /* pad for 8-byte alignment */ +}; + +struct fs_quota_statv { + __s8 qs_version; /* version for future changes */ + __u8 qs_pad1; /* pad for 16bit alignment */ + __u16 qs_flags; /* FS_QUOTA_.* flags */ + __u32 qs_incoredqs; /* number of dquots incore */ + struct fs_qfilestatv qs_uquota; /* user quota information */ + struct fs_qfilestatv qs_gquota; /* group quota information */ + struct fs_qfilestatv qs_pquota; /* project quota information */ + __s32 qs_btimelimit; /* limit for blks timer */ + __s32 qs_itimelimit; /* limit for inodes timer */ + __s32 qs_rtbtimelimit;/* limit for rt blks timer */ + __u16 qs_bwarnlimit; /* limit for num warnings */ + __u16 qs_iwarnlimit; /* limit for num warnings */ + __u64 qs_pad2[8]; /* for future proofing */ +}; + #endif /* _LINUX_DQBLK_XFS_H */ -- cgit v1.2.3-71-gd317 From fb7589a162162223e6bb6422dde3fb1ce07d9a78 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:31:38 +0400 Subject: tun: Add ability to create tun device with given index Tun devices cannot be created with ifidex user wants, but it's required by checkpoint-restore project. Long time ago such ability was implemented for rtnl_ops-based interface for creating links (9c7dafbf net: Allow to create links with given ifindex), but the only API for creating and managing tuntap devices is ioctl-based and is evolving with adding new ones (cde8b15f tuntap: add ioctl to attach or detach a file form tuntap device). Following that trend, here's how a new ioctl that sets the ifindex for device, that _will_ be created by TUNSETIFF ioctl looks like. So those who want a tuntap device with the ifindex N, should open the tun device, call ioctl(fd, TUNSETIFINDEX, &N), then call TUNSETIFF. If the index N is busy, then the register_netdev will find this out and the ioctl would be failed with -EBUSY. If setifindex is not called, then it will be generated as before. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 21 ++++++++++++++++++++- include/uapi/linux/if_tun.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7ed13cc0dcb2..4b65fbcc490f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -138,7 +138,10 @@ struct tun_file { struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; - u16 queue_index; + union { + u16 queue_index; + unsigned int ifindex; + }; struct list_head next; struct tun_struct *detached; }; @@ -1601,6 +1604,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; + dev->ifindex = tfile->ifindex; tun = netdev_priv(dev); tun->dev = dev; @@ -1817,6 +1821,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, kgid_t group; int sndbuf; int vnet_hdr_sz; + unsigned int ifindex; int ret; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) { @@ -1851,6 +1856,19 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = -EFAULT; goto unlock; } + if (cmd == TUNSETIFINDEX) { + ret = -EPERM; + if (tun) + goto unlock; + + ret = -EFAULT; + if (copy_from_user(&ifindex, argp, sizeof(ifindex))) + goto unlock; + + ret = 0; + tfile->ifindex = ifindex; + goto unlock; + } ret = -EBADFD; if (!tun) @@ -2099,6 +2117,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) rcu_assign_pointer(tfile->tun, NULL); tfile->net = get_net(current->nsproxy->net_ns); tfile->flags = 0; + tfile->ifindex = 0; rcu_assign_pointer(tfile->socket.wq, &tfile->wq); init_waitqueue_head(&tfile->wq.wait); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 1870ee29bb37..c58d023c4822 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -56,6 +56,7 @@ #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-71-gd317 From 849c9b6f93cc4cb5eb59301b6380a7a81b43f414 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:21 +0400 Subject: tun: Allow to skip filter on attach There's a small problem with sk-filters on tun devices. Consider an application doing this sequence of steps: fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); ioctl(fd, TUNATTACHFILTER, &my_filter); ioctl(fd, TUNSETPERSIST, 1); close(fd); At that point the tun0 will remain in the system and will keep in mind that there should be a socket filter at address '&my_filter'. If after that we do fd = open("/dev/net/tun"); ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" }); we most likely receive the -EFAULT error, since tun_attach() would try to connect the filter back. But (!) if we provide a filter at address &my_filter, then tun0 will be created and the "new" filter would be attached, but application may not know about that. This may create certain problems to anyone using tun-s, but it's critical problem for c/r -- if we meet a persistent tun device with a filter in mind, we will not be able to attach to it to dump its state (flags, owner, address, vnethdr size, etc.). The proposal is to allow to attach to tun device (with TUNSETIFF) w/o attaching the filter to the tun-file's socket. After this attach app may e.g clean the device by dropping the filter, it doesn't want to have one, or (in case of c/r) get information about the device with tun ioctls. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 12 +++++++----- include/uapi/linux/if_tun.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index db43a2409733..6acbdbccebcd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -501,7 +501,7 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file) +static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) { struct tun_file *tfile = file->private_data; int err; @@ -526,7 +526,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file) err = 0; /* Re-attach the filter to presist device */ - if (tun->filter_attached == true) { + if (!skip_filter && (tun->filter_attached == true)) { err = sk_attach_filter(&tun->fprog, tfile->socket.sk); if (!err) goto out; @@ -1557,7 +1557,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); if (err < 0) return err; @@ -1631,7 +1631,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) dev->vlan_features = dev->features; INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file); + err = tun_attach(tun, file, false); if (err < 0) goto err_free_dev; @@ -1795,7 +1795,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file); + ret = tun_attach(tun, file, false); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached) @@ -1883,6 +1883,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; + if (!tfile->socket.sk->sk_filter) + ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index c58d023c4822..cc127b2b4c3d 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -71,6 +71,7 @@ #define IFF_DETACH_QUEUE 0x0400 /* read-only flag */ #define IFF_PERSIST 0x0800 +#define IFF_NOFILTER 0x1000 /* Socket options */ #define TUN_TX_TIMESTAMP 1 -- cgit v1.2.3-71-gd317 From 76975e9cb4a7c6fe39478a3dc4dd292a5c6c8c74 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 21 Aug 2013 14:32:39 +0400 Subject: tun: Get skfilter layout The only thing we may have from tun device is the fprog, whic contains the number of filter elements and a pointer to (user-space) memory where the elements are. The program itself may not be available if the device is persistent and detached. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 ++++++++++ include/uapi/linux/if_tun.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 6acbdbccebcd..60a1e93e9d35 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2042,6 +2042,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, tun_detach_filter(tun, tun->numqueues); break; + case TUNGETFILTER: + ret = -EINVAL; + if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV) + break; + ret = -EFAULT; + if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) + break; + ret = 0; + break; + default: ret = -EINVAL; break; diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index cc127b2b4c3d..e9502dd1ee2c 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -57,6 +57,7 @@ #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) #define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3-71-gd317 From 4a5a8aa6c966eafc106543bd955ae388230420e5 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 21 Aug 2013 21:09:47 -0700 Subject: ipv4: expose IPV4_DEVCONF IP sends device configuration (see inet_fill_link_af) as an array in the netlink information, but the indices in that array are not exposed to userspace through any current santized header file. It was available back in 2.6.32 (in /usr/include/linux/sysctl.h) but was broken by: commit 02291680ffba92e5b5865bc0c5e7d1f3056b80ec Author: Eric W. Biederman Date: Sun Feb 14 03:25:51 2010 +0000 net ipv4: Decouple ipv4 interface parameters from binary sysctl numbers Eric was solving the sysctl problem but then the indices were re-exposed by a later addition of devconf support for IPV4 commit 9f0f7272ac9506f4c8c05cc597b7e376b0b9f3e4 Author: Thomas Graf Date: Tue Nov 16 04:32:48 2010 +0000 ipv4: AF_INET link address family Putting them in /usr/include/linux/ip.h seemed the logical match for the DEVCONF_ definitions for IPV6 in /usr/include/linux/ip6.h Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 34 +--------------------------------- include/uapi/linux/ip.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index b99cd23f3474..79640e015a86 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -5,45 +5,13 @@ #include #include +#include #include #include #include #include #include -enum -{ - IPV4_DEVCONF_FORWARDING=1, - IPV4_DEVCONF_MC_FORWARDING, - IPV4_DEVCONF_PROXY_ARP, - IPV4_DEVCONF_ACCEPT_REDIRECTS, - IPV4_DEVCONF_SECURE_REDIRECTS, - IPV4_DEVCONF_SEND_REDIRECTS, - IPV4_DEVCONF_SHARED_MEDIA, - IPV4_DEVCONF_RP_FILTER, - IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, - IPV4_DEVCONF_BOOTP_RELAY, - IPV4_DEVCONF_LOG_MARTIANS, - IPV4_DEVCONF_TAG, - IPV4_DEVCONF_ARPFILTER, - IPV4_DEVCONF_MEDIUM_ID, - IPV4_DEVCONF_NOXFRM, - IPV4_DEVCONF_NOPOLICY, - IPV4_DEVCONF_FORCE_IGMP_VERSION, - IPV4_DEVCONF_ARP_ANNOUNCE, - IPV4_DEVCONF_ARP_IGNORE, - IPV4_DEVCONF_PROMOTE_SECONDARIES, - IPV4_DEVCONF_ARP_ACCEPT, - IPV4_DEVCONF_ARP_NOTIFY, - IPV4_DEVCONF_ACCEPT_LOCAL, - IPV4_DEVCONF_SRC_VMARK, - IPV4_DEVCONF_PROXY_ARP_PVLAN, - IPV4_DEVCONF_ROUTE_LOCALNET, - __IPV4_DEVCONF_MAX -}; - -#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) - struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 6cf06bfd841b..2fee45bdec0a 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -133,4 +133,38 @@ struct ip_beet_phdr { __u8 reserved; }; +/* index values for the variables in ipv4_devconf */ +enum +{ + IPV4_DEVCONF_FORWARDING=1, + IPV4_DEVCONF_MC_FORWARDING, + IPV4_DEVCONF_PROXY_ARP, + IPV4_DEVCONF_ACCEPT_REDIRECTS, + IPV4_DEVCONF_SECURE_REDIRECTS, + IPV4_DEVCONF_SEND_REDIRECTS, + IPV4_DEVCONF_SHARED_MEDIA, + IPV4_DEVCONF_RP_FILTER, + IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, + IPV4_DEVCONF_BOOTP_RELAY, + IPV4_DEVCONF_LOG_MARTIANS, + IPV4_DEVCONF_TAG, + IPV4_DEVCONF_ARPFILTER, + IPV4_DEVCONF_MEDIUM_ID, + IPV4_DEVCONF_NOXFRM, + IPV4_DEVCONF_NOPOLICY, + IPV4_DEVCONF_FORCE_IGMP_VERSION, + IPV4_DEVCONF_ARP_ANNOUNCE, + IPV4_DEVCONF_ARP_IGNORE, + IPV4_DEVCONF_PROMOTE_SECONDARIES, + IPV4_DEVCONF_ARP_ACCEPT, + IPV4_DEVCONF_ARP_NOTIFY, + IPV4_DEVCONF_ACCEPT_LOCAL, + IPV4_DEVCONF_SRC_VMARK, + IPV4_DEVCONF_PROXY_ARP_PVLAN, + IPV4_DEVCONF_ROUTE_LOCALNET, + __IPV4_DEVCONF_MAX +}; + +#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1) + #endif /* _UAPI_LINUX_IP_H */ -- cgit v1.2.3-71-gd317 From 19504cf5f35fbe85db811fce9f4392a0cbdada2f Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Thu, 15 Aug 2013 14:51:28 +0300 Subject: cfg80211: add flags to cfg80211_rx_mgmt() Add flags intended to report various auxiliary information and introduce the NL80211_RXMGMT_FLAG_ANSWERED flag to report that the frame was already answered by the device. Signed-off-by: Vladimir Kondratiev [REPLIED->ANSWERED, reword commit message] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/wmi.c | 7 +++---- drivers/net/wireless/ath/wil6210/wmi.c | 2 +- drivers/net/wireless/brcm80211/brcmfmac/p2p.c | 4 ++-- drivers/net/wireless/mwifiex/util.c | 4 ++-- include/net/cfg80211.h | 3 ++- include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/mac80211/rx.c | 3 +-- net/wireless/mlme.c | 4 ++-- net/wireless/nl80211.c | 6 ++++-- net/wireless/nl80211.h | 2 +- 10 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c index 87aefb4c4c23..546d5da0b894 100644 --- a/drivers/net/wireless/ath/ath6kl/wmi.c +++ b/drivers/net/wireless/ath/ath6kl/wmi.c @@ -568,8 +568,8 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len, dlen, freq, vif->probe_req_report); if (vif->probe_req_report || vif->nw_type == AP_NETWORK) - cfg80211_rx_mgmt(&vif->wdev, freq, 0, - ev->data, dlen, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, + GFP_ATOMIC); return 0; } @@ -608,8 +608,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len, return -EINVAL; } ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, - ev->data, dlen, GFP_ATOMIC); + cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC); return 0; } diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c index dc8059ad4bab..21c791ee8178 100644 --- a/drivers/net/wireless/ath/wil6210/wmi.c +++ b/drivers/net/wireless/ath/wil6210/wmi.c @@ -339,7 +339,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len) } } else { cfg80211_rx_mgmt(wil->wdev, freq, signal, - (void *)rx_mgmt_frame, d_len, GFP_KERNEL); + (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL); } } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c index 79555f006d53..d7a974532909 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c @@ -1430,7 +1430,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp, IEEE80211_BAND_5GHZ); wdev = &ifp->vif->wdev; - cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, + cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0, GFP_ATOMIC); kfree(mgmt_frame); @@ -1895,7 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp, IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ); - cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, + cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0, GFP_ATOMIC); brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n", diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c index e57ac0dd3ab5..5d9e150f4111 100644 --- a/drivers/net/wireless/mwifiex/util.c +++ b/drivers/net/wireless/mwifiex/util.c @@ -171,8 +171,8 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv, rx_pd->rx_pkt_length = cpu_to_le16(pkt_len); cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq, - CAL_RSSI(rx_pd->snr, rx_pd->nf), - skb->data, pkt_len, GFP_ATOMIC); + CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len, + 0, GFP_ATOMIC); return 0; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9ab7a0690d93..d530c54a3662 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4056,6 +4056,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * @sig_dbm: signal strength in mBm, or 0 if unknown * @buf: Management frame (header + body) * @len: length of the frame data + * @flags: flags, as defined in enum nl80211_rxmgmt_flags * @gfp: context flags * * This function is called whenever an Action frame is received for a station @@ -4067,7 +4068,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * driver is responsible for rejecting the frame. */ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, u32 flags, gfp_t gfp); /** * cfg80211_mgmt_tx_status - notification of TX status for management frame diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f42bc3dcb9c..fde2c021b26d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1493,6 +1493,9 @@ enum nl80211_commands { * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter * field in the probe response (%NL80211_ATTR_PROBE_RESP). * + * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. + * As specified in the &enum nl80211_rxmgmt_flags. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1801,6 +1804,8 @@ enum nl80211_attrs { NL80211_ATTR_CSA_C_OFF_BEACON, NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_RXMGMT_FLAGS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3901,4 +3906,15 @@ enum nl80211_crit_proto_id { /* maximum duration for critical protocol measures */ #define NL80211_CRIT_PROTO_MAX_DURATION 5000 /* msec */ +/** + * enum nl80211_rxmgmt_flags - flags for received management frame. + * + * Used by cfg80211_rx_mgmt() + * + * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver. + */ +enum nl80211_rxmgmt_flags { + NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ffad155316a9..07901050812f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2678,8 +2678,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, - GFP_ATOMIC)) { + rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) { if (rx->sta) rx->sta->rx_packets++; dev_kfree_skb(rx->skb); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bfac5e186f57..8d49c1ce3dea 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -621,7 +621,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, } bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, - const u8 *buf, size_t len, gfp_t gfp) + const u8 *buf, size_t len, u32 flags, gfp_t gfp) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); @@ -664,7 +664,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, freq, sig_mbm, - buf, len, gfp)) + buf, len, flags, gfp)) continue; result = true; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 334697de5cc0..a51269d2d488 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10446,7 +10446,7 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame); int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlportid, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp) + const u8 *buf, size_t len, u32 flags, gfp_t gfp) { struct net_device *netdev = wdev->netdev; struct sk_buff *msg; @@ -10469,7 +10469,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || - nla_put(msg, NL80211_ATTR_FRAME, len, buf)) + nla_put(msg, NL80211_ATTR_FRAME, len, buf) || + (flags && + nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags))) goto nla_put_failure; genlmsg_end(msg, hdr); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 44341bf53cfc..2c0f2b3c07cb 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -66,7 +66,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlpid, int freq, int sig_dbm, - const u8 *buf, size_t len, gfp_t gfp); + const u8 *buf, size_t len, u32 flags, gfp_t gfp); void nl80211_radar_notify(struct cfg80211_registered_device *rdev, -- cgit v1.2.3-71-gd317 From 03f0d916aa0317592dda11bd17c7357858719b6c Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Wed, 7 Aug 2013 20:01:00 -0700 Subject: openvswitch: Mega flow implementation Add wildcarded flow support in kernel datapath. Wildcarded flow can improve OVS flow set up performance by avoid sending matching new flows to the user space program. The exact performance boost will largely dependent on wildcarded flow hit rate. In case all new flows hits wildcard flows, the flow set up rate is within 5% of that of linux bridge module. Pravin has made significant contributions to this patch. Including API clean ups and bug fixes. Signed-off-by: Pravin B Shelar Signed-off-by: Andy Zhou Signed-off-by: Jesse Gross --- Documentation/networking/openvswitch.txt | 40 + include/uapi/linux/openvswitch.h | 9 +- net/openvswitch/actions.c | 6 +- net/openvswitch/datapath.c | 140 ++- net/openvswitch/datapath.h | 6 + net/openvswitch/flow.c | 1387 ++++++++++++++++++++---------- net/openvswitch/flow.h | 96 ++- 7 files changed, 1171 insertions(+), 513 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt index 8fa2dd1e792e..37c20ee2455e 100644 --- a/Documentation/networking/openvswitch.txt +++ b/Documentation/networking/openvswitch.txt @@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.: in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...) +Wildcarded flow key format +-------------------------- + +A wildcarded flow is described with two sequences of Netlink attributes +passed over the Netlink socket. A flow key, exactly as described above, and an +optional corresponding flow mask. + +A wildcarded flow can represent a group of exact match flows. Each '1' bit +in the mask specifies a exact match with the corresponding bit in the flow key. +A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit +of a incoming packet. Using wildcarded flow can improve the flow set up rate +by reduce the number of new flows need to be processed by the user space program. + +Support for the mask Netlink attribute is optional for both the kernel and user +space program. The kernel can ignore the mask attribute, installing an exact +match flow, or reduce the number of don't care bits in the kernel to less than +what was specified by the user space program. In this case, variations in bits +that the kernel does not implement will simply result in additional flow setups. +The kernel module will also work with user space programs that neither support +nor supply flow mask attributes. + +Since the kernel may ignore or modify wildcard bits, it can be difficult for +the userspace program to know exactly what matches are installed. There are +two possible approaches: reactively install flows as they miss the kernel +flow table (and therefore not attempt to determine wildcard changes at all) +or use the kernel's response messages to determine the installed wildcards. + +When interacting with userspace, the kernel should maintain the match portion +of the key exactly as originally installed. This will provides a handle to +identify the flow for all future operations. However, when reporting the +mask of an installed flow, the mask should include any restrictions imposed +by the kernel. + +The behavior when using overlapping wildcarded flows is undefined. It is the +responsibility of the user space program to ensure that any incoming packet +can match at most one flow, wildcarded or not. The current implementation +performs best-effort detection of overlapping wildcarded flows and may reject +some but not all of them. However, this behavior may change in future versions. + + Basic rule for evolving flow keys --------------------------------- diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 52490b0e62b5..de1fa5d3780f 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -1,6 +1,6 @@ /* - * Copyright (c) 2007-2011 Nicira Networks. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -379,6 +379,12 @@ struct ovs_key_nd { * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the * last-used time, accumulated TCP flags, and statistics for this flow. * Otherwise ignored in requests. Never present in notifications. + * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the + * mask bits for wildcarded flow match. Mask bit value '1' specifies exact + * match with corresponding flow key bit, while mask bit value '0' specifies + * a wildcarded match. Omitting attribute is treated as wildcarding all + * corresponding fields. Optional for all requests. If not present, + * all flow key bits are exact match bits. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_FLOW_* commands. @@ -391,6 +397,7 @@ enum ovs_flow_attr { OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ + OVS_FLOW_ATTR_MASK, /* Sequence of OVS_KEY_ATTR_* attributes. */ __OVS_FLOW_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ab101f715447..1f680222f4f7 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -376,8 +376,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a; int rem; + BUG_ON(!OVS_CB(skb)->pkt_key); + upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = &OVS_CB(skb)->flow->key; + upcall.key = OVS_CB(skb)->pkt_key; upcall.userdata = NULL; upcall.portid = 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9d97ef3c9830..d29cd9aa4a67 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) struct sw_flow_key key; u64 *stats_counter; int error; - int key_len; stats = this_cpu_ptr(dp->stats_percpu); /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key, &key_len); + error = ovs_flow_extract(skb, p->port_no, &key); if (unlikely(error)) { kfree_skb(skb); return; } /* Look up flow. */ - flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + flow = ovs_flow_lookup(rcu_dereference(dp->table), &key); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) } OVS_CB(skb)->flow = flow; + OVS_CB(skb)->pkt_key = &key; stats_counter = &stats->n_hit; ovs_flow_used(OVS_CB(skb)->flow, skb); @@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, user_skb); + ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb); nla_nest_end(user_skb, nla); if (upcall_info->userdata) @@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp) rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); return 0; } @@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key) static int validate_and_copy_set_tun(const struct nlattr *attr, struct sw_flow_actions **sfa) { - struct ovs_key_ipv4_tunnel tun_key; + struct sw_flow_match match; + struct sw_flow_key key; int err, start; - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + ovs_match_init(&match, &key, NULL); + err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false); if (err) return err; @@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); + err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, + sizeof(match.key->tun_key)); add_nested_action_end(*sfa, start); return err; @@ -857,7 +860,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct ethhdr *eth; int len; int err; - int key_len; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -890,11 +892,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + err = ovs_flow_extract(packet, -1, &flow->key); if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]); + err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]); if (err) goto err_flow_free; acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); @@ -908,6 +910,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; OVS_CB(packet)->flow = flow; + OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; @@ -922,13 +925,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) local_bh_enable(); rcu_read_unlock(); - ovs_flow_free(flow); + ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: - ovs_flow_free(flow); + ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: @@ -1045,7 +1048,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); + err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key), + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); @@ -1093,6 +1097,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -1119,12 +1124,25 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, ovs_header->dp_ifindex = get_dpifindex(dp); + /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, skb); + + err = ovs_flow_to_nlattrs(&flow->unmasked_key, + &flow->unmasked_key, skb); + if (err) + goto error; + nla_nest_end(skb, nla); + + nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); + if (!nla) + goto nla_put_failure; + + err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb); if (err) goto error; + nla_nest_end(skb, nla); spin_lock_bh(&flow->lock); @@ -1214,20 +1232,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key; - struct sw_flow *flow; + struct sw_flow_key key, masked_key; + struct sw_flow *flow = NULL; + struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct flow_table *table; struct sw_flow_actions *acts = NULL; + struct sw_flow_match match; int error; - int key_len; /* Extract key. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) goto error; - error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + + ovs_match_init(&match, &key, &mask); + error = ovs_match_from_nlattrs(&match, + a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); if (error) goto error; @@ -1238,9 +1260,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(acts)) goto error; - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); - if (error) + ovs_flow_key_mask(&masked_key, &key, &mask); + error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], + &masked_key, 0, &acts); + if (error) { + OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree; + } } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { error = -EINVAL; goto error; @@ -1253,8 +1279,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + + /* Check if this is a duplicate flow */ + flow = ovs_flow_lookup(table, &key); if (!flow) { + struct sw_flow_mask *mask_p; /* Bail out if we're not allowed to create a new flow. */ error = -ENOENT; if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) @@ -1267,7 +1296,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) new_table = ovs_flow_tbl_expand(table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(table); + ovs_flow_tbl_destroy(table, true); table = ovsl_dereference(dp->table); } } @@ -1280,14 +1309,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) } clear_stats(flow); + flow->key = masked_key; + flow->unmasked_key = key; + + /* Make sure mask is unique in the system */ + mask_p = ovs_sw_flow_mask_find(table, &mask); + if (!mask_p) { + /* Allocate a new mask if none exsits. */ + mask_p = ovs_sw_flow_mask_alloc(); + if (!mask_p) + goto err_flow_free; + mask_p->key = mask.key; + mask_p->range = mask.range; + ovs_sw_flow_mask_insert(table, mask_p); + } + + ovs_sw_flow_mask_add_ref(mask_p); + flow->mask = mask_p; rcu_assign_pointer(flow->sf_acts, acts); /* Put flow in bucket. */ - ovs_flow_tbl_insert(table, flow, &key, key_len); + ovs_flow_insert(table, flow); reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, - OVS_FLOW_CMD_NEW); + info->snd_seq, OVS_FLOW_CMD_NEW); } else { /* We found a matching flow. */ struct sw_flow_actions *old_acts; @@ -1303,6 +1348,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) goto err_unlock_ovs; + /* The unmasked key has to be the same for flow updates. */ + error = -EINVAL; + if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) { + OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n"); + goto err_unlock_ovs; + } + /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); @@ -1327,6 +1379,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); return 0; +err_flow_free: + ovs_flow_free(flow, false); err_unlock_ovs: ovs_unlock(); err_kfree: @@ -1344,12 +1398,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); return -EINVAL; - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + } + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -1361,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1390,8 +1448,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct flow_table *table; + struct sw_flow_match match; int err; - int key_len; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -1404,12 +1462,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) err = flush_flows(dp); goto unlock; } - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + + ovs_match_init(&match, &key, NULL); + err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) goto unlock; table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + flow = ovs_flow_lookup_unmasked_key(table, &match); if (!flow) { err = -ENOENT; goto unlock; @@ -1421,13 +1481,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) goto unlock; } - ovs_flow_tbl_remove(table, flow); + ovs_flow_remove(table, flow); err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL); BUG_ON(err < 0); - ovs_flow_deferred_free(flow); + ovs_flow_free(flow, true); ovs_unlock(); ovs_notify(reply, info, &ovs_dp_flow_multicast_group); @@ -1457,7 +1517,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_tbl_next(table, &bucket, &obj); + flow = ovs_flow_dump_next(table, &bucket, &obj); if (!flow) break; @@ -1680,7 +1740,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); + ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -2287,7 +2347,7 @@ static void rehash_flow_table(struct work_struct *work) new_table = ovs_flow_tbl_rehash(old_table); if (!IS_ERR(new_table)) { rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); + ovs_flow_tbl_destroy(old_table, true); } } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a91486484916..4d109c176ef3 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -88,11 +88,13 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. + * @pkt_key: The flow information extracted from the packet. Must be nonnull. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. */ struct ovs_skb_cb { struct sw_flow *flow; + struct sw_flow_key *pkt_key; struct ovs_key_ipv4_tunnel *tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); + +#define OVS_NLERR(fmt, ...) \ + pr_info_once("netlink: " fmt, ##__VA_ARGS__) + #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index fca282520cee..1fceb9653598 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -46,6 +46,184 @@ static struct kmem_cache *flow_cache; +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val); + +static void update_range__(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range = NULL; + size_t start = offset; + size_t end = offset + size; + + if (!is_mask) + range = &match->range; + else if (match->mask) + range = &match->mask->range; + + if (!range) + return; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + (match)->mask->key.field = value; \ + } else { \ + (match)->key->field = value; \ + } \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + do { \ + update_range__(match, offsetof(struct sw_flow_key, field), \ + len, is_mask); \ + if (is_mask) { \ + if ((match)->mask) \ + memcpy(&(match)->mask->key.field, value_p, len);\ + } else { \ + memcpy(&(match)->key->field, value_p, len); \ + } \ + } while (0) + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static bool ovs_match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->ipv6.tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", + key_attrs, key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", + mask_attrs, mask_allowed); + return false; + } + + return true; +} + static int check_header(struct sk_buff *skb, int len) { if (unlikely(skb->len < len)) @@ -121,12 +299,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) return cur_ms - idle_ms; } -#define SW_FLOW_KEY_OFFSET(field) \ - (offsetof(struct sw_flow_key, field) + \ - FIELD_SIZEOF(struct sw_flow_key, field)) - -static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp) +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; @@ -136,8 +309,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, __be16 frag_off; int err; - *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); - err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; @@ -176,6 +347,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } +void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask) +{ + u8 *m = (u8 *)&mask->key + mask->range.start; + u8 *s = (u8 *)src + mask->range.start; + u8 *d = (u8 *)dst + mask->range.start; + int i; + + memset(dst, 0, sizeof(*dst)); + for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) { + *d = *s & *m; + d++, s++, m++; + } +} + #define TCP_FLAGS_OFFSET 13 #define TCP_FLAG_MASK 0x3f @@ -224,6 +410,7 @@ struct sw_flow *ovs_flow_alloc(void) spin_lock_init(&flow->lock); flow->sf_acts = NULL; + flow->mask = NULL; return flow; } @@ -263,7 +450,7 @@ static void free_buckets(struct flex_array *buckets) flex_array_free(buckets); } -struct flow_table *ovs_flow_tbl_alloc(int new_size) +static struct flow_table *__flow_tbl_alloc(int new_size) { struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); @@ -281,17 +468,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size) table->node_ver = 0; table->keep_flows = false; get_random_bytes(&table->hash_seed, sizeof(u32)); + table->mask_list = NULL; return table; } -void ovs_flow_tbl_destroy(struct flow_table *table) +static void __flow_tbl_destroy(struct flow_table *table) { int i; - if (!table) - return; - if (table->keep_flows) goto skip_flows; @@ -303,31 +488,55 @@ void ovs_flow_tbl_destroy(struct flow_table *table) hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { hlist_del(&flow->hash_node[ver]); - ovs_flow_free(flow); + ovs_flow_free(flow, false); } } + BUG_ON(!list_empty(table->mask_list)); + kfree(table->mask_list); + skip_flows: free_buckets(table->buckets); kfree(table); } +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = __flow_tbl_alloc(new_size); + + if (!table) + return NULL; + + table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!table->mask_list) { + table->keep_flows = true; + __flow_tbl_destroy(table); + return NULL; + } + INIT_LIST_HEAD(table->mask_list); + + return table; +} + static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { struct flow_table *table = container_of(rcu, struct flow_table, rcu); - ovs_flow_tbl_destroy(table); + __flow_tbl_destroy(table); } -void ovs_flow_tbl_deferred_destroy(struct flow_table *table) +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) { if (!table) return; - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + if (deferred) + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); + else + __flow_tbl_destroy(table); } -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last) { struct sw_flow *flow; struct hlist_head *head; @@ -353,11 +562,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la return NULL; } -static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +static void __tbl_insert(struct flow_table *table, struct sw_flow *flow) { struct hlist_head *head; + head = find_bucket(table, flow->hash); hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; } @@ -377,8 +588,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new head = flex_array_get(old->buckets, i); hlist_for_each_entry(flow, head, hash_node[old_ver]) - __flow_tbl_insert(new, flow); + __tbl_insert(new, flow); } + + new->mask_list = old->mask_list; old->keep_flows = true; } @@ -386,7 +599,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck { struct flow_table *new_table; - new_table = ovs_flow_tbl_alloc(n_buckets); + new_table = __flow_tbl_alloc(n_buckets); if (!new_table) return ERR_PTR(-ENOMEM); @@ -405,28 +618,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) return __flow_tbl_rehash(table, table->n_buckets * 2); } -void ovs_flow_free(struct sw_flow *flow) +static void __flow_free(struct sw_flow *flow) { - if (unlikely(!flow)) - return; - kfree((struct sf_flow_acts __force *)flow->sf_acts); kmem_cache_free(flow_cache, flow); } -/* RCU callback used by ovs_flow_deferred_free. */ static void rcu_free_flow_callback(struct rcu_head *rcu) { struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - ovs_flow_free(flow); + __flow_free(flow); } -/* Schedules 'flow' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free(struct sw_flow *flow) +void ovs_flow_free(struct sw_flow *flow, bool deferred) { - call_rcu(&flow->rcu, rcu_free_flow_callback); + if (!flow) + return; + + ovs_sw_flow_mask_del_ref(flow->mask, deferred); + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + __flow_free(flow); } /* Schedules 'sf_acts' to be freed after the next RCU grace period. @@ -497,18 +712,15 @@ static __be16 parse_ethertype(struct sk_buff *skb) } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp, int nh_len) + int nh_len) { struct icmp6hdr *icmp = icmp6_hdr(skb); - int error = 0; - int key_len; /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ key->ipv6.tp.src = htons(icmp->icmp6_type); key->ipv6.tp.dst = htons(icmp->icmp6_code); - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -517,21 +729,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, struct nd_msg *nd; int offset; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - /* In order to process neighbor discovery options, we need the * entire packet. */ if (unlikely(icmp_len < sizeof(*nd))) - goto out; - if (unlikely(skb_linearize(skb))) { - error = -ENOMEM; - goto out; - } + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; nd = (struct nd_msg *)skb_transport_header(skb); key->ipv6.nd.target = nd->target; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); icmp_len -= sizeof(*nd); offset = 0; @@ -541,7 +749,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, int opt_len = nd_opt->nd_opt_len * 8; if (unlikely(!opt_len || opt_len > icmp_len)) - goto invalid; + return 0; /* Store the link layer address if the appropriate * option is provided. It is considered an error if @@ -566,16 +774,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, } } - goto out; + return 0; invalid: memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); -out: - *key_lenp = key_len; - return error; + return 0; } /** @@ -584,7 +790,6 @@ out: * Ethernet header * @in_port: port number on which @skb was received. * @key: output flow key - * @key_lenp: length of output flow key * * The caller must ensure that skb->len >= ETH_HLEN. * @@ -602,11 +807,9 @@ out: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. */ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, - int *key_lenp) +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) { - int error = 0; - int key_len = SW_FLOW_KEY_OFFSET(eth); + int error; struct ethhdr *eth; memset(key, 0, sizeof(*key)); @@ -649,15 +852,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, struct iphdr *nh; __be16 offset; - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); - error = check_iphdr(skb); if (unlikely(error)) { if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; } - goto out; + return error; } nh = ip_hdr(skb); @@ -671,7 +872,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, offset = nh->frag_off & htons(IP_OFFSET); if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; - goto out; + return 0; } if (nh->frag_off & htons(IP_MF) || skb_shinfo(skb)->gso_type & SKB_GSO_UDP) @@ -679,21 +880,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv4.tp.src = tcp->source; key->ipv4.tp.dst = tcp->dest; } } else if (key->ip.proto == IPPROTO_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } } else if (key->ip.proto == IPPROTO_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit @@ -722,53 +920,49 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ - nh_len = parse_ipv6hdr(skb, key, &key_len); + nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - if (nh_len == -EINVAL) + if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; - else + error = 0; + } else { error = nh_len; - goto out; + } + return error; } if (key->ip.frag == OVS_FRAG_TYPE_LATER) - goto out; + return 0; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->ipv6.tp.src = tcp->source; key->ipv6.tp.dst = tcp->dest; } } else if (key->ip.proto == NEXTHDR_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->ipv6.tp.src = udp->source; key->ipv6.tp.dst = udp->dest; } } else if (key->ip.proto == NEXTHDR_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp6hdr_ok(skb)) { - error = parse_icmpv6(skb, key, &key_len, nh_len); - if (error < 0) - goto out; + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; } } } -out: - *key_lenp = key_len; - return error; + return 0; } static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) @@ -777,7 +971,7 @@ static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_l DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); } -static int flow_key_start(struct sw_flow_key *key) +static int flow_key_start(const struct sw_flow_key *key) { if (key->tun_key.ipv4_dst) return 0; @@ -785,39 +979,95 @@ static int flow_key_start(struct sw_flow_key *key) return offsetof(struct sw_flow_key, phy); } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int key_len) +static bool __cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, int key_start, int key_len) +{ + return !memcmp((u8 *)key1 + key_start, + (u8 *)key2 + key_start, (key_len - key_start)); +} + +static bool __flow_cmp_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_len) +{ + return __cmp_key(&flow->key, key, key_start, key_len); +} + +static bool __flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_start, int key_len) +{ + return __cmp_key(&flow->unmasked_key, key, key_start, key_len); +} + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_len) +{ + int key_start; + key_start = flow_key_start(key); + + return __flow_cmp_unmasked_key(flow, key, key_start, key_len); + +} + +struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, + struct sw_flow_match *match) +{ + struct sw_flow_key *unmasked = match->key; + int key_len = match->range.end; + struct sw_flow *flow; + + flow = ovs_flow_lookup(table, unmasked); + if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len))) + flow = NULL; + + return flow; +} + +static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table, + const struct sw_flow_key *flow_key, + struct sw_flow_mask *mask) { struct sw_flow *flow; struct hlist_head *head; - u8 *_key; - int key_start; + int key_start = mask->range.start; + int key_len = mask->range.end; u32 hash; + struct sw_flow_key masked_key; - key_start = flow_key_start(key); - hash = ovs_flow_hash(key, key_start, key_len); - - _key = (u8 *) key + key_start; + ovs_flow_key_mask(&masked_key, flow_key, mask); + hash = ovs_flow_hash(&masked_key, key_start, key_len); head = find_bucket(table, hash); hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - - if (flow->hash == hash && - !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { + if (flow->mask == mask && + __flow_cmp_key(flow, &masked_key, key_start, key_len)) return flow; - } } return NULL; } -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len) +struct sw_flow *ovs_flow_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) { - flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len); - memcpy(&flow->key, key, sizeof(flow->key)); - __flow_tbl_insert(table, flow); + struct sw_flow *flow = NULL; + struct sw_flow_mask *mask; + + list_for_each_entry_rcu(mask, tbl->mask_list, list) { + flow = ovs_masked_flow_lookup(tbl, key, mask); + if (flow) /* Found */ + break; + } + + return flow; } -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) + +void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow) +{ + flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start, + flow->mask->range.end); + __tbl_insert(table, flow); +} + +void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow) { BUG_ON(table->count == 0); hlist_del_rcu(&flow->hash_node[table->node_ver]); @@ -844,149 +1094,84 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = -1, }; -static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) -{ - const struct ovs_key_icmp *icmp_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv4.tp.src = tcp_key->tcp_src; - swkey->ipv4.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv4.tp.src = udp_key->udp_src; - swkey->ipv4.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMP: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - swkey->ipv4.tp.src = htons(icmp_key->icmp_type); - swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); - break; - } - - return 0; -} - -static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) +static bool is_all_zero(const u8 *fp, size_t size) { - const struct ovs_key_icmpv6 *icmpv6_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv6.tp.src = tcp_key->tcp_src; - swkey->ipv6.tp.dst = tcp_key->tcp_dst; - break; - - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv6.tp.src = udp_key->udp_src; - swkey->ipv6.tp.dst = udp_key->udp_dst; - break; - - case IPPROTO_ICMPV6: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); - swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); + int i; - if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - const struct ovs_key_nd *nd_key; + if (!fp) + return false; - if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ND); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, - sizeof(swkey->ipv6.nd.target)); - memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); - memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); - } - break; - } + for (i = 0; i < size; i++) + if (fp[i]) + return false; - return 0; + return true; } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u32 *attrsp) +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool nz) { const struct nlattr *nla; u32 attrs; int rem; - attrs = 0; + attrs = *attrsp; nla_for_each_nested(nla, attr, rem) { u16 type = nla_type(nla); int expected_len; - if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + type, OVS_KEY_ATTR_MAX); + } + + if (attrs & (1 << type)) { + OVS_NLERR("Duplicate key attribute (type %d).\n", type); return -EINVAL; + } expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) + if (nla_len(nla) != expected_len && expected_len != -1) { + OVS_NLERR("Key attribute has unexpected length (type=%d" + ", length=%d, expected=%d).\n", type, + nla_len(nla), expected_len); return -EINVAL; + } - attrs |= 1 << type; - a[type] = nla; + if (!nz || !is_all_zero(nla_data(nla), expected_len)) { + attrs |= 1 << type; + a[type] = nla; + } } - if (rem) + if (rem) { + OVS_NLERR("Message has %d unknown bytes.\n", rem); return -EINVAL; + } *attrsp = attrs; return 0; } +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp) +{ + return __parse_flow_nlattrs(attr, a, attrsp, false); +} + int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key) + struct sw_flow_match *match, bool is_mask) { struct nlattr *a; int rem; bool ttl = false; - - memset(tun_key, 0, sizeof(*tun_key)); + __be16 tun_flags = 0; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -1000,53 +1185,78 @@ int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, }; - if (type > OVS_TUNNEL_KEY_ATTR_MAX || - ovs_tunnel_key_lens[type] != nla_len(a)) + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", + type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; + } + + if (ovs_tunnel_key_lens[type] != nla_len(a)) { + OVS_NLERR("IPv4 tunnel attribute type has unexpected " + " length (type=%d, length=%d, expected=%d).\n", + type, nla_len(a), ovs_tunnel_key_lens[type]); + return -EINVAL; + } switch (type) { case OVS_TUNNEL_KEY_ATTR_ID: - tun_key->tun_id = nla_get_be64(a); - tun_key->tun_flags |= TUNNEL_KEY; + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - tun_key->ipv4_src = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - tun_key->ipv4_dst = nla_get_be32(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + nla_get_be32(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - tun_key->ipv4_tos = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - tun_key->ipv4_ttl = nla_get_u8(a); + SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + nla_get_u8(a), is_mask); ttl = true; break; case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; + tun_flags |= TUNNEL_DONT_FRAGMENT; break; case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_key->tun_flags |= TUNNEL_CSUM; + tun_flags |= TUNNEL_CSUM; break; default: return -EINVAL; - } } - if (rem > 0) - return -EINVAL; - if (!tun_key->ipv4_dst) - return -EINVAL; + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - if (!ttl) + if (rem > 0) { + OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); return -EINVAL; + } + + if (!is_mask) { + if (!match->key->tun_key.ipv4_dst) { + OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + return -EINVAL; + } + } return 0; } int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key) + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output) { struct nlattr *nla; @@ -1054,23 +1264,24 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, if (!nla) return -EMSGSIZE; - if (tun_key->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (tun_key->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) + if (output->ipv4_src && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; - if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) + if (output->ipv4_dst && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; - if (tun_key->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) + if (output->ipv4_tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && + if ((output->tun_flags & TUNNEL_CSUM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; @@ -1078,176 +1289,372 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } -/** - * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. - * @swkey: receives the extracted flow key. - * @key_lenp: number of bytes used in @swkey. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - */ -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, - const struct nlattr *attr) +static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, + const struct nlattr **a, bool is_mask) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct ovs_key_ethernet *eth_key; - int key_len; - u32 attrs; - int err; + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } - memset(swkey, 0, sizeof(struct sw_flow_key)); - key_len = SW_FLOW_KEY_OFFSET(eth); + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return err; + if (is_mask) + in_port = 0xffffffff; /* Always exact match in_port. */ + else if (in_port >= DP_MAX_PORTS) + return -EINVAL; - /* Metadata attributes. */ - if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); - attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); } - if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (in_port >= DP_MAX_PORTS) - return -EINVAL; - swkey->phy.in_port = in_port; - attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else { - swkey->phy.in_port = DP_MAX_PORTS; + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); } - if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask)) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + return 0; +} - if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); - if (err) - return err; +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) +{ + int err; + u64 orig_attrs = attrs; - attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } + err = metadata_from_nlattrs(match, &attrs, a, is_mask); + if (err) + return err; - /* Data attributes. */ - if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); - memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } - if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && - nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { - const struct nlattr *encap; + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { __be16 tci; - if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | - (1 << OVS_KEY_ATTR_ETHERTYPE) | - (1 << OVS_KEY_ATTR_ENCAP))) - return -EINVAL; - - encap = a[OVS_KEY_ATTR_ENCAP]; tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (tci & htons(VLAN_TAG_PRESENT)) { - swkey->eth.tci = tci; - - err = parse_flow_nlattrs(encap, a, &attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) - return -EINVAL; + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + else + OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); - swkey->eth.type = htons(ETH_P_8021Q); - *key_lenp = key_len; - return 0; - } else { return -EINVAL; } - } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } else if (!is_mask) + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { + OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", + ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - } else { - swkey->eth.type = htons(ETH_P_802_2); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); } - if (swkey->eth.type == htons(ETH_P_IP)) { + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { const struct ovs_key_ipv4 *ipv4_key; - if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ip.proto = ipv4_key->ipv4_proto; - swkey->ip.tos = ipv4_key->ipv4_tos; - swkey->ip.ttl = ipv4_key->ipv4_ttl; - swkey->ip.frag = ipv4_key->ipv4_frag; - swkey->ipv4.addr.src = ipv4_key->ipv4_src; - swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } - if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; - key_len = SW_FLOW_KEY_OFFSET(ipv6.label); ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; - swkey->ipv6.label = ipv6_key->ipv6_label; - swkey->ip.proto = ipv6_key->ipv6_proto; - swkey->ip.tos = ipv6_key->ipv6_tclass; - swkey->ip.ttl = ipv6_key->ipv6_hlimit; - swkey->ip.frag = ipv6_key->ipv6_frag; - memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, - sizeof(swkey->ipv6.addr.src)); - memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, - sizeof(swkey->ipv6.addr.dst)); - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; } - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { const struct ovs_key_arp *arp_key; - if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + arp_key->arp_op); return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - swkey->ipv4.addr.src = arp_key->arp_sip; - swkey->ipv4.addr.dst = arp_key->arp_tip; - if (arp_key->arp_op & htons(0xff00)) + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + tcp_key->tcp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + tcp_key->tcp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + udp_key->udp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + udp_key->udp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) + return -EINVAL; + + return 0; +} + +/** + * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + */ +int ovs_match_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *key, + const struct nlattr *mask) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(key, a, &key_attrs); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR("Invalid Vlan frame.\n"); return -EINVAL; - swkey->ip.proto = ntohs(arp_key->arp_op); - memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); - memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + return -EINVAL; + } + } else { + OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + return -EINVAL; + } } - if (attrs) + err = ovs_key_from_nlattrs(match, key_attrs, a, false); + if (err) + return err; + + if (mask) { + err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + if (err) + return err; + + if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + return -EINVAL; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + } else { + OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", + ntohs(eth_type)); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + if (err) + return err; + } else { + /* Populate exact match flow's key mask. */ + if (match->mask) + ovs_sw_flow_mask_set(match->mask, &match->range, 0xff); + } + + if (!ovs_match_validate(match, key_attrs, mask_attrs)) return -EINVAL; - *key_lenp = key_len; return 0; } @@ -1255,7 +1662,6 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, /** * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @key_len: Length of key in @flow. Used for calculating flow hash. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -1264,102 +1670,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, * get the metadata, that is, the parts of the flow key that cannot be * extracted from the packet itself. */ -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr) + +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, + const struct nlattr *attr) { struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *nla; - int rem; + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; + struct sw_flow_match match; flow->key.phy.in_port = DP_MAX_PORTS; flow->key.phy.priority = 0; flow->key.phy.skb_mark = 0; memset(tun_key, 0, sizeof(flow->key.tun_key)); - nla_for_each_nested(nla, attr, rem) { - int type = nla_type(nla); - - if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { - int err; - - if (nla_len(nla) != ovs_key_lens[type]) - return -EINVAL; - - switch (type) { - case OVS_KEY_ATTR_PRIORITY: - flow->key.phy.priority = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_TUNNEL: - err = ovs_ipv4_tun_from_nlattr(nla, tun_key); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IN_PORT: - if (nla_get_u32(nla) >= DP_MAX_PORTS) - return -EINVAL; - flow->key.phy.in_port = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_SKB_MARK: - flow->key.phy.skb_mark = nla_get_u32(nla); - break; - } - } - } - if (rem) + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) return -EINVAL; - flow->hash = ovs_flow_hash(&flow->key, - flow_key_start(&flow->key), key_len); + memset(&match, 0, sizeof(match)); + match.key = &flow->key; + + err = metadata_from_nlattrs(&match, &attrs, a, false); + if (err) + return err; return 0; } -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; struct nlattr *nla, *encap; + bool is_mask = (swkey != output); - if (swkey->phy.priority && - nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if (swkey->tun_key.ipv4_dst && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) + if ((swkey->tun_key.ipv4_dst || is_mask) && + ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) goto nla_put_failure; - if (swkey->phy.in_port != DP_MAX_PORTS && - nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) - goto nla_put_failure; + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; - if (swkey->phy.skb_mark && - nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; + eth_key = nla_data(nla); - memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); + memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN); if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci)) + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); if (!swkey->eth.tci) goto unencap; - } else { + } else encap = NULL; - } - if (swkey->eth.type == htons(ETH_P_802_2)) + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; goto unencap; + } - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type)) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; if (swkey->eth.type == htons(ETH_P_IP)) { @@ -1369,12 +1773,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = swkey->ipv4.addr.src; - ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; - ipv4_key->ipv4_proto = swkey->ip.proto; - ipv4_key->ipv4_tos = swkey->ip.tos; - ipv4_key->ipv4_ttl = swkey->ip.ttl; - ipv4_key->ipv4_frag = swkey->ip.frag; + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { struct ovs_key_ipv6 *ipv6_key; @@ -1382,15 +1786,15 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = swkey->ipv6.label; - ipv6_key->ipv6_proto = swkey->ip.proto; - ipv6_key->ipv6_tclass = swkey->ip.tos; - ipv6_key->ipv6_hlimit = swkey->ip.ttl; - ipv6_key->ipv6_frag = swkey->ip.frag; + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; } else if (swkey->eth.type == htons(ETH_P_ARP) || swkey->eth.type == htons(ETH_P_RARP)) { struct ovs_key_arp *arp_key; @@ -1400,11 +1804,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; arp_key = nla_data(nla); memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = swkey->ipv4.addr.src; - arp_key->arp_tip = swkey->ipv4.addr.dst; - arp_key->arp_op = htons(swkey->ip.proto); - memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN); } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1419,11 +1823,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; tcp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = swkey->ipv4.tp.src; - tcp_key->tcp_dst = swkey->ipv4.tp.dst; + tcp_key->tcp_src = output->ipv4.tp.src; + tcp_key->tcp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = swkey->ipv6.tp.src; - tcp_key->tcp_dst = swkey->ipv6.tp.dst; + tcp_key->tcp_src = output->ipv6.tp.src; + tcp_key->tcp_dst = output->ipv6.tp.dst; } } else if (swkey->ip.proto == IPPROTO_UDP) { struct ovs_key_udp *udp_key; @@ -1433,11 +1837,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) goto nla_put_failure; udp_key = nla_data(nla); if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = swkey->ipv4.tp.src; - udp_key->udp_dst = swkey->ipv4.tp.dst; + udp_key->udp_src = output->ipv4.tp.src; + udp_key->udp_dst = output->ipv4.tp.dst; } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = swkey->ipv6.tp.src; - udp_key->udp_dst = swkey->ipv6.tp.dst; + udp_key->udp_src = output->ipv6.tp.src; + udp_key->udp_dst = output->ipv6.tp.dst; } } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { @@ -1447,8 +1851,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); - icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); + icmp_key->icmp_type = ntohs(output->ipv4.tp.src); + icmp_key->icmp_code = ntohs(output->ipv4.tp.dst); } else if (swkey->eth.type == htons(ETH_P_IPV6) && swkey->ip.proto == IPPROTO_ICMPV6) { struct ovs_key_icmpv6 *icmpv6_key; @@ -1458,8 +1862,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); + icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst); if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { @@ -1469,10 +1873,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) if (!nla) goto nla_put_failure; nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, + memcpy(nd_key->nd_target, &output->ipv6.nd.target, sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); + memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN); } } } @@ -1504,3 +1908,84 @@ void ovs_flow_exit(void) { kmem_cache_destroy(flow_cache); } + +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 0; + + return mask; +} + +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask) +{ + mask->ref_count++; +} + +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred) +{ + if (!mask) + return; + + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + if (deferred) + kfree_rcu(mask, rcu); + else + kfree(mask); + } +} + +static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + u8 *a_ = (u8 *)&a->key + a->range.start; + u8 *b_ = (u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0); +} + +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (ovs_sw_flow_mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/** + * add a new mask into the mask list. + * The caller needs to make sure that 'mask' is not the same + * as any masks that are already on the list. + */ +void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + list_add_rcu(&mask->list, tbl->mask_list); +} + +/** + * Set 'range' fields in the mask to the value of 'val'. + */ +static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask, + struct sw_flow_key_range *range, u8 val) +{ + u8 *m = (u8 *)&mask->key + range->start; + + mask->range = *range; + memset(m, val, ovs_sw_flow_mask_size_roundup(mask)); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 66ef7220293e..9674e45f6969 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira, Inc. + * Copyright (c) 2007-2013 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -33,6 +33,8 @@ #include struct sk_buff; +struct sw_flow_mask; +struct flow_table; struct sw_flow_actions { struct rcu_head rcu; @@ -131,6 +133,8 @@ struct sw_flow { u32 hash; struct sw_flow_key key; + struct sw_flow_key unmasked_key; + struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; spinlock_t lock; /* Lock for values below. */ @@ -140,6 +144,25 @@ struct sw_flow { u8 tcp_flags; /* Union of seen TCP flags. */ }; +struct sw_flow_key_range { + size_t start; + size_t end; +}; + +static inline u16 ovs_sw_flow_key_range_actual_size(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + struct arp_eth_header { __be16 ar_hrd; /* format of hardware address */ __be16 ar_pro; /* format of protocol address */ @@ -159,21 +182,21 @@ void ovs_flow_exit(void); struct sw_flow *ovs_flow_alloc(void); void ovs_flow_deferred_free(struct sw_flow *); -void ovs_flow_free(struct sw_flow *flow); +void ovs_flow_free(struct sw_flow *, bool deferred); struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); void ovs_flow_deferred_free_acts(struct sw_flow_actions *); -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, - int *key_lenp); +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); void ovs_flow_used(struct sw_flow *, struct sk_buff *); u64 ovs_flow_used_time(unsigned long flow_jiffies); - -int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, +int ovs_flow_to_nlattrs(const struct sw_flow_key *, + const struct sw_flow_key *, struct sk_buff *); +int ovs_match_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *, const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr); +int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, + const struct nlattr *attr); #define MAX_ACTIONS_BUFSIZE (32 * 1024) #define TBL_MIN_BUCKETS 1024 @@ -182,6 +205,7 @@ struct flow_table { struct flex_array *buckets; unsigned int count, n_buckets; struct rcu_head rcu; + struct list_head *mask_list; int node_ver; u32 hash_seed; bool keep_flows; @@ -197,22 +221,56 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) return (table->count > table->n_buckets); } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int len); -void ovs_flow_tbl_destroy(struct flow_table *table); -void ovs_flow_tbl_deferred_destroy(struct flow_table *table); +struct sw_flow *ovs_flow_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table, + struct sw_flow_match *match); + +void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); struct flow_table *ovs_flow_tbl_alloc(int new_size); struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len); -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); +void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow); + +struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx); extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key); + struct sw_flow_match *match, bool is_mask); int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key); + const struct ovs_key_ipv4_tunnel *tun_key, + const struct ovs_key_ipv4_tunnel *output); + +bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, int key_len); + +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; + +static inline u16 +ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask) +{ + return ovs_sw_flow_key_range_actual_size(&mask->range); +} + +static inline u16 +ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask) +{ + return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32)); +} +struct sw_flow_mask *ovs_sw_flow_mask_alloc(void); +void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *); +void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred); +void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *); +struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *, + const struct sw_flow_mask *); +void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src, + const struct sw_flow_mask *mask); #endif /* flow.h */ -- cgit v1.2.3-71-gd317 From 4b0a8670852c1f577014bf2dc4f8a9d1fd908bb9 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Mon, 26 Aug 2013 14:18:33 +0530 Subject: kvm uapi: Add KICK_CPU and PV_UNHALT definition to uapi this is needed by both guest and host. Originally-from: Srivatsa Vaddagiri Signed-off-by: Raghavendra K T Acked-by: Gleb Natapov Acked-by: Ingo Molnar Signed-off-by: Gleb Natapov --- arch/x86/include/uapi/asm/kvm_para.h | 1 + include/uapi/linux/kvm_para.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e97..94dc8ca434e0 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index cea2c5c72d26..2841f86eae0b 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -19,6 +19,7 @@ #define KVM_HC_MMU_OP 2 #define KVM_HC_FEATURES 3 #define KVM_HC_PPC_MAP_MAGIC_PAGE 4 +#define KVM_HC_KICK_CPU 5 /* * hypercalls use architecture specific -- cgit v1.2.3-71-gd317 From 0bd50dc971aad3c29043de4fb7bce45c351d1b67 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 1 Aug 2013 14:44:24 +1000 Subject: KVM: PPC: reserve a capability number for multitce support This is to reserve a capablity number for upcoming support of H_PUT_TCE_INDIRECT and H_STUFF_TCE pseries hypercalls which support mulptiple DMA map/unmap operations per one call. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Gleb Natapov --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index acccd08be6c7..99c25338ede8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_RTAS 91 #define KVM_CAP_IRQ_XICS 92 #define KVM_CAP_ARM_EL1_32BIT 93 +#define KVM_CAP_SPAPR_MULTITCE 94 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-71-gd317 From a175a723301a8a4a9fedf9ce5b8ca586e7a97b40 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 22 Aug 2013 12:30:48 -0700 Subject: openvswitch: Add SCTP support This patch adds support for rewriting SCTP src,dst ports similar to the functionality already available for TCP/UDP. Rewriting SCTP ports is expensive due to double-recalculation of the SCTP checksums; this is performed to ensure that packets traversing OVS with invalid checksums will continue to the destination with any checksum corruption intact. Reviewed-by: Simon Horman Signed-off-by: Joe Stringer Signed-off-by: Ben Pfaff Signed-off-by: Jesse Gross --- include/uapi/linux/openvswitch.h | 6 ++++ net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 39 ++++++++++++++++++++++++ net/openvswitch/datapath.c | 6 ++++ net/openvswitch/flow.c | 65 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/flow.h | 8 ++--- 6 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index de1fa5d3780f..a74d375b439b 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -259,6 +259,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ + OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -333,6 +334,11 @@ struct ovs_key_udp { __be16 udp_dst; }; +struct ovs_key_sctp { + __be16 sctp_src; + __be16 sctp_dst; +}; + struct ovs_key_icmp { __u8 icmp_type; __u8 icmp_code; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index bed30e69baa7..6ecf491ad509 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -4,6 +4,7 @@ config OPENVSWITCH tristate "Open vSwitch" + select LIBCRC32C ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1f680222f4f7..65cfaa816075 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include "datapath.h" #include "vport.h" @@ -352,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) return 0; } +static int set_sctp(struct sk_buff *skb, + const struct ovs_key_sctp *sctp_port_key) +{ + struct sctphdr *sh; + int err; + unsigned int sctphoff = skb_transport_offset(skb); + + err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); + if (unlikely(err)) + return err; + + sh = sctp_hdr(skb); + if (sctp_port_key->sctp_src != sh->source || + sctp_port_key->sctp_dst != sh->dest) { + __le32 old_correct_csum, new_csum, old_csum; + + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); + + sh->source = sctp_port_key->sctp_src; + sh->dest = sctp_port_key->sctp_dst; + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb->rxhash = 0; + } + + return 0; +} + static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { struct vport *vport; @@ -461,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_UDP: err = set_udp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, nla_data(nested_attr)); + break; } return err; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d29cd9aa4a67..2aa13bd7f2b2 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -712,6 +712,12 @@ static int validate_set(const struct nlattr *a, return validate_tp_port(flow_key); + case OVS_KEY_ATTR_SCTP: + if (flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + return validate_tp_port(flow_key); + default: return -EINVAL; } diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 1fceb9653598..2b4785590b56 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +130,7 @@ static bool ovs_match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_IPV6) | (1 << OVS_KEY_ATTR_TCP) | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) @@ -159,6 +161,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_UDP; } + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; if (match->mask && (match->mask->key.ip.proto == 0xff)) @@ -185,6 +193,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_UDP; } + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + if (match->key->ip.proto == IPPROTO_TCP) { key_expected |= 1 << OVS_KEY_ATTR_TCP; if (match->mask && (match->mask->key.ip.proto == 0xff)) @@ -280,6 +294,12 @@ static bool udphdr_ok(struct sk_buff *skb) sizeof(struct udphdr)); } +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + static bool icmphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + @@ -891,6 +911,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ipv4.tp.src = udp->source; key->ipv4.tp.dst = udp->dest; } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv4.tp.src = sctp->source; + key->ipv4.tp.dst = sctp->dest; + } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); @@ -953,6 +979,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) key->ipv6.tp.src = udp->source; key->ipv6.tp.dst = udp->dest; } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->ipv6.tp.src = sctp->source; + key->ipv6.tp.dst = sctp->dest; + } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { error = parse_icmpv6(skb, key, nh_len); @@ -1087,6 +1119,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp), [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), @@ -1500,6 +1533,24 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_UDP); } + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { + SW_FLOW_KEY_PUT(match, ipv4.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.tp.dst, + sctp_key->sctp_dst, is_mask); + } else { + SW_FLOW_KEY_PUT(match, ipv6.tp.src, + sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv6.tp.dst, + sctp_key->sctp_dst, is_mask); + } + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { const struct ovs_key_icmp *icmp_key; @@ -1843,6 +1894,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, udp_key->udp_src = output->ipv6.tp.src; udp_key->udp_dst = output->ipv6.tp.dst; } + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + sctp_key->sctp_src = swkey->ipv4.tp.src; + sctp_key->sctp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + sctp_key->sctp_src = swkey->ipv6.tp.src; + sctp_key->sctp_dst = swkey->ipv6.tp.dst; + } } else if (swkey->eth.type == htons(ETH_P_IP) && swkey->ip.proto == IPPROTO_ICMP) { struct ovs_key_icmp *icmp_key; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9674e45f6969..d08dcf78dbf3 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -99,8 +99,8 @@ struct sw_flow_key { } addr; union { struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ } tp; struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. */ @@ -115,8 +115,8 @@ struct sw_flow_key { } addr; __be32 label; /* IPv6 flow label. */ struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ } tp; struct { struct in6_addr target; /* ND target address. */ -- cgit v1.2.3-71-gd317 From d2ab1fa68c61f01b28ab0859a972c892d81f5d32 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2013 11:11:10 -0600 Subject: PCI: Rename PCIe capability definitions to follow convention All other PCIe capability register fields include "PCI_EXP" + + . This renames PCI_EXP_OBFF_MASK, PCI_EXP_IDO_REQ_EN, PCI_EXP_LTR_EN, and related fields using the same convention. No functional change. Signed-off-by: Bjorn Helgaas Acked-by: Samuel Ortiz # for MFD driver --- drivers/mfd/rts5227.c | 2 +- drivers/pci/pci.c | 31 +++++++++++++++++-------------- include/uapi/linux/pci_regs.h | 24 ++++++++++++------------ 3 files changed, 30 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/mfd/rts5227.c b/drivers/mfd/rts5227.c index fc831dcb1480..164b7faa70c9 100644 --- a/drivers/mfd/rts5227.c +++ b/drivers/mfd/rts5227.c @@ -44,7 +44,7 @@ static int rts5227_extra_init_hw(struct rtsx_pcr *pcr) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, OLT_LED_CTL, 0x0F, 0x02); /* Configure LTR */ pcie_capability_read_word(pcr->pci, PCI_EXP_DEVCTL2, &cap); - if (cap & PCI_EXP_LTR_EN) + if (cap & PCI_EXP_DEVCTL2_LTR_EN) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, LTR_CTL, 0xFF, 0xA3); /* Configure OBFF */ rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, OBFF_CFG, 0x03, 0x03); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 42e5f86e2387..3d5d45cdc4dd 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2095,9 +2095,9 @@ void pci_enable_ido(struct pci_dev *dev, unsigned long type) u16 ctrl = 0; if (type & PCI_EXP_IDO_REQUEST) - ctrl |= PCI_EXP_IDO_REQ_EN; + ctrl |= PCI_EXP_DEVCTL2_IDO_REQ_EN; if (type & PCI_EXP_IDO_COMPLETION) - ctrl |= PCI_EXP_IDO_CMP_EN; + ctrl |= PCI_EXP_DEVCTL2_IDO_CMP_EN; if (ctrl) pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, ctrl); } @@ -2113,9 +2113,9 @@ void pci_disable_ido(struct pci_dev *dev, unsigned long type) u16 ctrl = 0; if (type & PCI_EXP_IDO_REQUEST) - ctrl |= PCI_EXP_IDO_REQ_EN; + ctrl |= PCI_EXP_DEVCTL2_IDO_REQ_EN; if (type & PCI_EXP_IDO_COMPLETION) - ctrl |= PCI_EXP_IDO_CMP_EN; + ctrl |= PCI_EXP_DEVCTL2_IDO_CMP_EN; if (ctrl) pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, ctrl); } @@ -2147,7 +2147,7 @@ int pci_enable_obff(struct pci_dev *dev, enum pci_obff_signal_type type) int ret; pcie_capability_read_dword(dev, PCI_EXP_DEVCAP2, &cap); - if (!(cap & PCI_EXP_OBFF_MASK)) + if (!(cap & PCI_EXP_DEVCAP2_OBFF_MASK)) return -ENOTSUPP; /* no OBFF support at all */ /* Make sure the topology supports OBFF as well */ @@ -2158,17 +2158,17 @@ int pci_enable_obff(struct pci_dev *dev, enum pci_obff_signal_type type) } pcie_capability_read_word(dev, PCI_EXP_DEVCTL2, &ctrl); - if (cap & PCI_EXP_OBFF_WAKE) - ctrl |= PCI_EXP_OBFF_WAKE_EN; + if (cap & PCI_EXP_DEVCAP2_OBFF_WAKE) + ctrl |= PCI_EXP_DEVCTL2_OBFF_WAKE_EN; else { switch (type) { case PCI_EXP_OBFF_SIGNAL_L0: - if (!(ctrl & PCI_EXP_OBFF_WAKE_EN)) - ctrl |= PCI_EXP_OBFF_MSGA_EN; + if (!(ctrl & PCI_EXP_DEVCTL2_OBFF_WAKE_EN)) + ctrl |= PCI_EXP_DEVCTL2_OBFF_MSGA_EN; break; case PCI_EXP_OBFF_SIGNAL_ALWAYS: - ctrl &= ~PCI_EXP_OBFF_WAKE_EN; - ctrl |= PCI_EXP_OBFF_MSGB_EN; + ctrl &= ~PCI_EXP_DEVCTL2_OBFF_WAKE_EN; + ctrl |= PCI_EXP_DEVCTL2_OBFF_MSGB_EN; break; default: WARN(1, "bad OBFF signal type\n"); @@ -2189,7 +2189,8 @@ EXPORT_SYMBOL(pci_enable_obff); */ void pci_disable_obff(struct pci_dev *dev) { - pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_OBFF_WAKE_EN); + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_OBFF_WAKE_EN); } EXPORT_SYMBOL(pci_disable_obff); @@ -2237,7 +2238,8 @@ int pci_enable_ltr(struct pci_dev *dev) return ret; } - return pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_LTR_EN); + return pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); } EXPORT_SYMBOL(pci_enable_ltr); @@ -2254,7 +2256,8 @@ void pci_disable_ltr(struct pci_dev *dev) if (!pci_ltr_supported(dev)) return; - pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, PCI_EXP_LTR_EN); + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_LTR_EN); } EXPORT_SYMBOL(pci_disable_ltr); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c3cc01d474b0..4b8f2e3bad58 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -550,19 +550,19 @@ * to use these fields safely. */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ -#define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ -#define PCI_EXP_DEVCAP2_LTR 0x800 /* Latency tolerance reporting */ -#define PCI_EXP_OBFF_MASK 0xc0000 /* OBFF support mechanism */ -#define PCI_EXP_OBFF_MSG 0x40000 /* New message signaling */ -#define PCI_EXP_OBFF_WAKE 0x80000 /* Re-use WAKE# for OBFF */ +#define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCAP2_LTR 0x800 /* Latency tolerance reporting */ +#define PCI_EXP_DEVCAP2_OBFF_MASK 0xc0000 /* OBFF support mechanism */ +#define PCI_EXP_DEVCAP2_OBFF_MSG 0x40000 /* New message signaling */ +#define PCI_EXP_DEVCAP2_OBFF_WAKE 0x80000 /* Re-use WAKE# for OBFF */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ -#define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ -#define PCI_EXP_IDO_REQ_EN 0x100 /* ID-based ordering request enable */ -#define PCI_EXP_IDO_CMP_EN 0x200 /* ID-based ordering completion enable */ -#define PCI_EXP_LTR_EN 0x400 /* Latency tolerance reporting */ -#define PCI_EXP_OBFF_MSGA_EN 0x2000 /* OBFF enable with Message type A */ -#define PCI_EXP_OBFF_MSGB_EN 0x4000 /* OBFF enable with Message type B */ -#define PCI_EXP_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ +#define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x100 /* ID-based ordering request enable */ +#define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x200 /* ID-based ordering completion enable */ +#define PCI_EXP_DEVCTL2_LTR_EN 0x400 /* Latency tolerance reporting */ +#define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* OBFF enable with Message type A */ +#define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* OBFF enable with Message type B */ +#define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints end here */ #define PCI_EXP_LNKCAP2 44 /* Link Capability 2 */ #define PCI_EXP_LNKCAP2_SLS_2_5GB 0x02 /* Supported Link Speed 2.5GT/s */ -- cgit v1.2.3-71-gd317 From 41d73ec053d2424599c4ed8452b889374d523ade Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:12 +0200 Subject: netfilter: nf_conntrack: make sequence number adjustments usuable without NAT Split out sequence number adjustments from NAT and move them to the conntrack core to make them usable for SYN proxying. The sequence number adjustment information is moved to a seperate extend. The extend is added to new conntracks when a NAT mapping is set up for a connection using a helper. As a side effect, this saves 24 bytes per connection with NAT in the common case that a connection does not have a helper assigned. Signed-off-by: Patrick McHardy Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 9 +- include/net/netfilter/nf_conntrack_extend.h | 2 + include/net/netfilter/nf_conntrack_seqadj.h | 49 +++++ include/net/netfilter/nf_nat.h | 10 - include/net/netfilter/nf_nat_helper.h | 19 -- include/uapi/linux/netfilter/nf_conntrack_common.h | 3 +- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 15 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 7 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 7 +- net/netfilter/Makefile | 2 +- net/netfilter/nf_conntrack_core.c | 16 +- net/netfilter/nf_conntrack_netlink.c | 115 +++++------ net/netfilter/nf_conntrack_proto_tcp.c | 18 +- net/netfilter/nf_conntrack_seqadj.c | 218 ++++++++++++++++++++ net/netfilter/nf_nat_core.c | 16 +- net/netfilter/nf_nat_helper.c | 228 +-------------------- net/netfilter/nf_nat_sip.c | 3 +- net/netfilter/nfnetlink_queue_ct.c | 8 +- 18 files changed, 369 insertions(+), 376 deletions(-) create mode 100644 include/net/netfilter/nf_conntrack_seqadj.h create mode 100644 net/netfilter/nf_conntrack_seqadj.c (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e2cf786be22f..708fe72ab913 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -319,6 +319,7 @@ extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; struct nf_conn; +enum ip_conntrack_info; struct nlattr; struct nfq_ct_hook { @@ -327,14 +328,10 @@ struct nfq_ct_hook { int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); -}; -extern struct nfq_ct_hook __rcu *nfq_ct_hook; - -struct nfq_ct_nat_hook { void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, s32 off); + enum ip_conntrack_info ctinfo, s32 off); }; -extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook; +extern struct nfq_ct_hook __rcu *nfq_ct_hook; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 977bc8a46444..2a22bcbfe6e4 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -9,6 +9,7 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, + NF_CT_EXT_SEQADJ, #endif NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS @@ -31,6 +32,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat +#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h new file mode 100644 index 000000000000..30bfbbed9f47 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -0,0 +1,49 @@ +#ifndef _NF_CONNTRACK_SEQADJ_H +#define _NF_CONNTRACK_SEQADJ_H + +#include + +/** + * struct nf_ct_seqadj - sequence number adjustment information + * + * @correction_pos: position of the last TCP sequence number modification + * @offset_before: sequence number offset before last modification + * @offset_after: sequence number offset after last modification + */ +struct nf_ct_seqadj { + u32 correction_pos; + s32 offset_before; + s32 offset_after; +}; + +struct nf_conn_seqadj { + struct nf_ct_seqadj seq[IP_CT_DIR_MAX]; +}; + +static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct) +{ + return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ); +} + +static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) +{ + return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); +} + +extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off); +extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + s32 off); + +extern int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff); +extern s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, + u32 seq); + +extern int nf_conntrack_seqadj_init(void); +extern void nf_conntrack_seqadj_fini(void); + +#endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e2441413675c..59a192420053 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -13,15 +13,6 @@ enum nf_nat_manip_type { #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) -/* NAT sequence number modifications */ -struct nf_nat_seq { - /* position of the last TCP sequence number modification (if any) */ - u_int32_t correction_pos; - - /* sequence number offset before and after last modification */ - int32_t offset_before, offset_after; -}; - #include #include #include @@ -39,7 +30,6 @@ struct nf_conn; /* The structure embedded in the conntrack structure. */ struct nf_conn_nat { struct hlist_node bysource; - struct nf_nat_seq seq[IP_CT_DIR_MAX]; struct nf_conn *ct; union nf_conntrack_nat_help help; #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index 194c34794923..404324d1d0c4 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -39,28 +39,9 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len); -extern void nf_nat_set_seq_adjust(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - __be32 seq, s32 off); -extern int nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); -extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff); - /* Setup NAT on this expected conntrack so it follows master, but goes * to port ct->master->saved_proto. */ extern void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); -extern s32 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); - -extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 dir, s32 off); - #endif diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index d69483fb3825..8dd803818ebe 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -99,7 +99,8 @@ enum ip_conntrack_events { IPCT_PROTOINFO, /* protocol information has changed */ IPCT_HELPER, /* new helper has been set */ IPCT_MARK, /* new mark has been set */ - IPCT_NATSEQADJ, /* NAT is doing sequence adjustment */ + IPCT_SEQADJ, /* sequence adjustment has changed */ + IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ }; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 08fabc6c93f3..acad6c52a652 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -42,8 +42,10 @@ enum ctattr_type { CTA_ID, CTA_NAT_DST, CTA_TUPLE_MASTER, - CTA_NAT_SEQ_ADJ_ORIG, - CTA_NAT_SEQ_ADJ_REPLY, + CTA_SEQ_ADJ_ORIG, + CTA_NAT_SEQ_ADJ_ORIG = CTA_SEQ_ADJ_ORIG, + CTA_SEQ_ADJ_REPLY, + CTA_NAT_SEQ_ADJ_REPLY = CTA_SEQ_ADJ_REPLY, CTA_SECMARK, /* obsolete */ CTA_ZONE, CTA_SECCTX, @@ -165,6 +167,15 @@ enum ctattr_protonat { }; #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) +enum ctattr_seqadj { + CTA_SEQADJ_UNSPEC, + CTA_SEQADJ_CORRECTION_POS, + CTA_SEQADJ_OFFSET_BEFORE, + CTA_SEQADJ_OFFSET_AFTER, + __CTA_SEQADJ_MAX +}; +#define CTA_SEQADJ_MAX (__CTA_SEQADJ_MAX - 1) + enum ctattr_natseq { CTA_NAT_SEQ_UNSPEC, CTA_NAT_SEQ_CORRECTION_POS, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 0a2e0e3e95ba..86f5b34a4ed1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index c9b6a6e6a1e8..d6e4dd8b58df 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* adjust seqs for loopback traffic only in outgoing direction */ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb)) { - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, protoff)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); return NF_DROP; } diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index ebfa7dc747cd..89a9c1658f5e 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,6 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da6f1787a102..00a7a94d4132 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ void nf_conntrack_cleanup_end(void) nf_ct_extend_unregister(&nf_ct_zone_extend); #endif nf_conntrack_proto_fini(); + nf_conntrack_seqadj_fini(); nf_conntrack_labels_fini(); nf_conntrack_helper_fini(); nf_conntrack_timeout_fini(); @@ -1531,6 +1533,10 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_labels; + ret = nf_conntrack_seqadj_init(); + if (ret < 0) + goto err_seqadj; + #ifdef CONFIG_NF_CONNTRACK_ZONES ret = nf_ct_extend_register(&nf_ct_zone_extend); if (ret < 0) @@ -1555,6 +1561,8 @@ err_proto: nf_ct_extend_unregister(&nf_ct_zone_extend); err_extend: #endif + nf_conntrack_seqadj_fini(); +err_seqadj: nf_conntrack_labels_fini(); err_labels: nf_conntrack_helper_fini(); @@ -1577,9 +1585,6 @@ void nf_conntrack_init_end(void) /* For use by REJECT target */ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack); - - /* Howto get NAT offsets */ - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); } /* @@ -1666,8 +1671,3 @@ err_slabname: err_stat: return ret; } - -s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); -EXPORT_SYMBOL_GPL(nf_ct_nat_offset); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fa61fea63234..7c55745ececf 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -381,9 +382,8 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_NAT_NEEDED static int -dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) { struct nlattr *nest_parms; @@ -391,12 +391,12 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) if (!nest_parms) goto nla_put_failure; - if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS, - htonl(natseq->correction_pos)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, - htonl(natseq->offset_before)) || - nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER, - htonl(natseq->offset_after))) + if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS, + htonl(seq->correction_pos)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE, + htonl(seq->offset_before)) || + nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER, + htonl(seq->offset_after))) goto nla_put_failure; nla_nest_end(skb, nest_parms); @@ -408,27 +408,24 @@ nla_put_failure: } static inline int -ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) { - struct nf_nat_seq *natseq; - struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *seq; - if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; - natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) return -1; - natseq = &nat->seq[IP_CT_DIR_REPLY]; - if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + seq = &seqadj->seq[IP_CT_DIR_REPLY]; + if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) return -1; return 0; } -#else -#define ctnetlink_dump_nat_seq_adj(a, b) (0) -#endif static inline int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -502,7 +499,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -707,8 +704,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; - if (events & (1 << IPCT_NATSEQADJ) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + if (events & (1 << IPCT_SEQADJ) && + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; } @@ -1439,66 +1436,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[] return err; } -#ifdef CONFIG_NF_NAT_NEEDED -static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { - [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, - [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { + [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static inline int -change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { int err; - struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + struct nlattr *cda[CTA_SEQADJ_MAX+1]; - err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); if (err < 0) return err; - if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; - natseq->correction_pos = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + seq->correction_pos = + ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); - if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; - natseq->offset_before = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + seq->offset_before = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); - if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; - natseq->offset_after = - ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + seq->offset_after = + ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int -ctnetlink_change_nat_seq_adj(struct nf_conn *ct, - const struct nlattr * const cda[]) +ctnetlink_change_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) { + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; - struct nf_conn_nat *nat = nfct_nat(ct); - if (!nat) + if (!seqadj) return 0; - if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], - cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (cda[CTA_SEQ_ADJ_ORIG]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) return ret; ct->status |= IPS_SEQ_ADJUST; } - if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { - ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], - cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (cda[CTA_SEQ_ADJ_REPLY]) { + ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], + cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) return ret; @@ -1507,7 +1503,6 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct, return 0; } -#endif static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) @@ -1573,13 +1568,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); #endif -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) return err; } -#endif + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1684,13 +1678,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, goto err2; } -#ifdef CONFIG_NF_NAT_NEEDED - if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { - err = ctnetlink_change_nat_seq_adj(ct, cda); + if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } -#endif memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { @@ -1804,7 +1796,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -1827,7 +1819,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_HELPER) | (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | - (1 << IPCT_NATSEQADJ) | + (1 << IPCT_SEQADJ) | (1 << IPCT_MARK), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); @@ -2082,7 +2074,7 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if ((ct->status & IPS_SEQ_ADJUST) && - ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK @@ -2211,6 +2203,7 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = { .build = ctnetlink_nfqueue_build, .parse = ctnetlink_nfqueue_parse, .attach_expect = ctnetlink_nfqueue_attach_expect, + .seq_adjust = nf_ct_tcp_seqadj_set, }; #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index d224e001f14f..984a8d1a3359 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -495,21 +496,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -#ifdef CONFIG_NF_NAT_NEEDED -static inline s32 nat_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); - - return get_offset != NULL ? get_offset(ct, dir, seq) : 0; -} -#define NAT_OFFSET(ct, dir, seq) \ - (nat_offset(ct, dir, seq)) -#else -#define NAT_OFFSET(ct, dir, seq) 0 -#endif - static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, @@ -540,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct, tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ - receiver_offset = NAT_OFFSET(ct, !dir, ack - 1); + receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c new file mode 100644 index 000000000000..483eb9ce3216 --- /dev/null +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -0,0 +1,218 @@ +#include +#include +#include + +#include +#include +#include + +int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s32 off) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + spin_lock_bh(&ct->lock); + this_way = &seqadj->seq[dir]; + if (this_way->offset_before == this_way->offset_after || + before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += off; + } + spin_unlock_bh(&ct->lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); + +void nf_ct_tcp_seqadj_set(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP) + return; + + th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); +} +EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); + +/* Adjust one found SACK option including checksum correction */ +static void nf_ct_sack_block_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct nf_ct_seqadj *seq) +{ + while (sackoff < sackend) { + struct tcp_sack_block_wire *sack; + __be32 new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - seq->offset_before, + seq->correction_pos)) + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_after); + else + new_start_seq = htonl(ntohl(sack->start_seq) - + seq->offset_before); + + if (after(ntohl(sack->end_seq) - seq->offset_before, + seq->correction_pos)) + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_after); + else + new_end_seq = htonl(ntohl(sack->end_seq) - + seq->offset_before); + + pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *tcph, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + tcph->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK && + op[1] >= 2+TCPOLEN_SACK_PERBLOCK && + ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + nf_ct_sack_block_adjust(skb, tcph, optoff + 2, + optoff+op[1], + &seqadj->seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ +int nf_ct_seq_adjust(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct tcphdr *tcph; + __be32 newseq, newack; + s32 seqoff, ackoff; + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way, *other_way; + int res; + + this_way = &seqadj->seq[dir]; + other_way = &seqadj->seq[!dir]; + + if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + return 0; + + tcph = (void *)skb->data + protoff; + spin_lock_bh(&ct->lock); + if (after(ntohl(tcph->seq), this_way->correction_pos)) + seqoff = this_way->offset_after; + else + seqoff = this_way->offset_before; + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + ackoff = other_way->offset_after; + else + ackoff = other_way->offset_before; + + newseq = htonl(ntohl(tcph->seq) + seqoff); + newack = htonl(ntohl(tcph->ack_seq) - ackoff); + + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + + pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); + spin_unlock_bh(&ct->lock); + + return res; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); + +s32 nf_ct_seq_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); + struct nf_ct_seqadj *this_way; + + if (!seqadj) + return 0; + + this_way = &seqadj->seq[dir]; + return after(seq, this_way->correction_pos) ? + this_way->offset_after : this_way->offset_before; +} +EXPORT_SYMBOL_GPL(nf_ct_seq_offset); + +static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { + .len = sizeof(struct nf_conn_seqadj), + .align = __alignof__(struct nf_conn_seqadj), + .id = NF_CT_EXT_SEQADJ, +}; + +int nf_conntrack_seqadj_init(void) +{ + return nf_ct_extend_register(&nf_ct_seqadj_extend); +} + +void nf_conntrack_seqadj_fini(void) +{ + nf_ct_extend_unregister(&nf_ct_seqadj_extend); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6ff808375b5e..6f0f4f7f68a5 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -402,6 +403,9 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; + + if (nfct_help(ct)) + nfct_seqadj_ext_add(ct); } if (maniptype == NF_NAT_MANIP_SRC) { @@ -764,10 +768,6 @@ static struct nf_ct_helper_expectfn follow_master_nat = { .expectfn = nf_nat_follow_master, }; -static struct nfq_ct_nat_hook nfq_ct_nat = { - .seq_adjust = nf_nat_tcp_seq_adjust, -}; - static int __init nf_nat_init(void) { int ret; @@ -787,14 +787,9 @@ static int __init nf_nat_init(void) /* Initialize fake conntrack so that NAT will skip it */ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - BUG_ON(nf_nat_seq_adjust_hook != NULL); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); - BUG_ON(nf_ct_nat_offset != NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset); - RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat); #ifdef CONFIG_XFRM BUG_ON(nf_nat_decode_session_hook != NULL); RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); @@ -813,10 +808,7 @@ static void __exit nf_nat_cleanup(void) unregister_pernet_subsys(&nf_nat_net_ops); nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); - RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); - RCU_INIT_POINTER(nf_ct_nat_offset, NULL); - RCU_INIT_POINTER(nfq_ct_nat_hook, NULL); #ifdef CONFIG_XFRM RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); #endif diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 46b9baa845a6..2840abb5bb99 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -20,67 +20,13 @@ #include #include #include +#include #include #include #include #include #include -#define DUMP_OFFSET(x) \ - pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \ - x->offset_before, x->offset_after, x->correction_pos); - -/* Setup TCP sequence correction given this change at this sequence */ -static inline void -adjust_tcp_sequence(u32 seq, - int sizediff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way = &nat->seq[dir]; - - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", - seq, sizediff); - - pr_debug("adjust_tcp_sequence: Seq_offset before: "); - DUMP_OFFSET(this_way); - - spin_lock_bh(&ct->lock); - - /* SYN adjust. If it's uninitialized, or this is after last - * correction, record it: we don't handle more than one - * adjustment in the window, but do deal with common case of a - * retransmit */ - if (this_way->offset_before == this_way->offset_after || - before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; - } - spin_unlock_bh(&ct->lock); - - pr_debug("adjust_tcp_sequence: Seq_offset after: "); - DUMP_OFFSET(this_way); -} - -/* Get the offset value, for conntrack. Caller must have the conntrack locked */ -s32 nf_nat_get_offset(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq) -{ - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way; - - if (!nat) - return 0; - - this_way = &nat->seq[dir]; - return after(seq, this_way->correction_pos) - ? this_way->offset_after : this_way->offset_before; -} - /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -135,30 +81,6 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) return 1; } -void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - __be32 seq, s32 off) -{ - if (!off) - return; - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); -} -EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); - -void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, - u32 ctinfo, int off) -{ - const struct tcphdr *th; - - if (nf_ct_protonum(ct) != IPPROTO_TCP) - return; - - th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb)); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); -} -EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust); - /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -203,8 +125,8 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, datalen, oldlen); if (adjust && rep_len != match_len) - nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, - (int)rep_len - (int)match_len); + nf_ct_seqadj_set(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); return 1; } @@ -264,150 +186,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, } EXPORT_SYMBOL(nf_nat_mangle_udp_packet); -/* Adjust one found SACK option including checksum correction */ -static void -sack_adjust(struct sk_buff *skb, - struct tcphdr *tcph, - unsigned int sackoff, - unsigned int sackend, - struct nf_nat_seq *natseq) -{ - while (sackoff < sackend) { - struct tcp_sack_block_wire *sack; - __be32 new_start_seq, new_end_seq; - - sack = (void *)skb->data + sackoff; - if (after(ntohl(sack->start_seq) - natseq->offset_before, - natseq->correction_pos)) - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_after); - else - new_start_seq = htonl(ntohl(sack->start_seq) - - natseq->offset_before); - - if (after(ntohl(sack->end_seq) - natseq->offset_before, - natseq->correction_pos)) - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_after); - else - new_end_seq = htonl(ntohl(sack->end_seq) - - natseq->offset_before); - - pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", - ntohl(sack->start_seq), new_start_seq, - ntohl(sack->end_seq), new_end_seq); - - inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); - sack->start_seq = new_start_seq; - sack->end_seq = new_end_seq; - sackoff += sizeof(*sack); - } -} - -/* TCP SACK sequence number adjustment */ -static inline unsigned int -nf_nat_sack_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *tcph, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - unsigned int dir, optoff, optend; - struct nf_conn_nat *nat = nfct_nat(ct); - - optoff = protoff + sizeof(struct tcphdr); - optend = protoff + tcph->doff * 4; - - if (!skb_make_writable(skb, optend)) - return 0; - - dir = CTINFO2DIR(ctinfo); - - while (optoff < optend) { - /* Usually: option, length. */ - unsigned char *op = skb->data + optoff; - - switch (op[0]) { - case TCPOPT_EOL: - return 1; - case TCPOPT_NOP: - optoff++; - continue; - default: - /* no partial options */ - if (optoff + 1 == optend || - optoff + op[1] > optend || - op[1] < 2) - return 0; - if (op[0] == TCPOPT_SACK && - op[1] >= 2+TCPOLEN_SACK_PERBLOCK && - ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) - sack_adjust(skb, tcph, optoff+2, - optoff+op[1], &nat->seq[!dir]); - optoff += op[1]; - } - } - return 1; -} - -/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ -int -nf_nat_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff) -{ - struct tcphdr *tcph; - int dir; - __be32 newseq, newack; - s32 seqoff, ackoff; - struct nf_conn_nat *nat = nfct_nat(ct); - struct nf_nat_seq *this_way, *other_way; - int res; - - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; - - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) - return 0; - - tcph = (void *)skb->data + protoff; - spin_lock_bh(&ct->lock); - if (after(ntohl(tcph->seq), this_way->correction_pos)) - seqoff = this_way->offset_after; - else - seqoff = this_way->offset_before; - - if (after(ntohl(tcph->ack_seq) - other_way->offset_before, - other_way->correction_pos)) - ackoff = other_way->offset_after; - else - ackoff = other_way->offset_before; - - newseq = htonl(ntohl(tcph->seq) + seqoff); - newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", - ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), - ntohl(newack)); - - tcph->seq = newseq; - tcph->ack_seq = newack; - - res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo); - spin_unlock_bh(&ct->lock); - - return res; -} - /* Setup NAT on this expected conntrack so it follows master. */ /* If we fail to get a free NAT slot, we'll get dropped on confirm */ void nf_nat_follow_master(struct nf_conn *ct, diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index dac11f73868e..f9790405b7ff 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -20,6 +20,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -308,7 +309,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, return; th = (struct tcphdr *)(skb->data + protoff); - nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); + nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } /* Handles expected signalling connections and media streams */ diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c index be893039966d..96cac50e0d12 100644 --- a/net/netfilter/nfnetlink_queue_ct.c +++ b/net/netfilter/nfnetlink_queue_ct.c @@ -87,14 +87,14 @@ nla_put_failure: void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { - struct nfq_ct_nat_hook *nfq_nat_ct; + struct nfq_ct_hook *nfq_ct; - nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook); - if (nfq_nat_ct == NULL) + nfq_ct = rcu_dereference(nfq_ct_hook); + if (nfq_ct == NULL) return; if ((ct->status & IPS_NAT_MASK) && diff) - nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff); + nfq_ct->seq_adjust(skb, ct, ctinfo, diff); } int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr, -- cgit v1.2.3-71-gd317 From 48b1de4c110a7afa4b85862f6c75af817db26fad Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 27 Aug 2013 08:50:14 +0200 Subject: netfilter: add SYNPROXY core/target Add a SYNPROXY for netfilter. The code is split into two parts, the synproxy core with common functions and an address family specific target. The SYNPROXY receives the connection request from the client, responds with a SYN/ACK containing a SYN cookie and announcing a zero window and checks whether the final ACK from the client contains a valid cookie. It then establishes a connection to the original destination and, if successful, sends a window update to the client with the window size announced by the server. Support for timestamps, SACK, window scaling and MSS options can be statically configured as target parameters if the features of the server are known. If timestamps are used, the timestamp value sent back to the client in the SYN/ACK will be different from the real timestamp of the server. In order to now break PAWS, the timestamps are translated in the direction server->client. Signed-off-by: Patrick McHardy Tested-by: Martin Topholm Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 6 +- include/net/netfilter/nf_conntrack_seqadj.h | 2 + include/net/netfilter/nf_conntrack_synproxy.h | 77 +++++ include/uapi/linux/netfilter/xt_SYNPROXY.h | 16 + net/ipv4/netfilter/Kconfig | 13 + net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_SYNPROXY.c | 472 ++++++++++++++++++++++++++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 3 + net/netfilter/nf_conntrack_core.c | 6 + net/netfilter/nf_conntrack_proto_tcp.c | 16 + net/netfilter/nf_conntrack_seqadj.c | 20 ++ net/netfilter/nf_synproxy_core.c | 432 +++++++++++++++++++++++ 13 files changed, 1066 insertions(+), 1 deletion(-) create mode 100644 include/net/netfilter/nf_conntrack_synproxy.h create mode 100644 include/uapi/linux/netfilter/xt_SYNPROXY.h create mode 100644 net/ipv4/netfilter/ipt_SYNPROXY.c create mode 100644 net/netfilter/nf_synproxy_core.c (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 2a22bcbfe6e4..ff95434e50ca 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -9,8 +9,8 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, - NF_CT_EXT_SEQADJ, #endif + NF_CT_EXT_SEQADJ, NF_CT_EXT_ACCT, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, @@ -26,6 +26,9 @@ enum nf_ct_ext_id { #endif #ifdef CONFIG_NF_CONNTRACK_LABELS NF_CT_EXT_LABELS, +#endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + NF_CT_EXT_SYNPROXY, #endif NF_CT_EXT_NUM, }; @@ -39,6 +42,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels +#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h index 30bfbbed9f47..f6177a5fe0ca 100644 --- a/include/net/netfilter/nf_conntrack_seqadj.h +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -30,6 +30,8 @@ static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct) return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC); } +extern int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off); extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off); extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb, diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h new file mode 100644 index 000000000000..806f54a290d6 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -0,0 +1,77 @@ +#ifndef _NF_CONNTRACK_SYNPROXY_H +#define _NF_CONNTRACK_SYNPROXY_H + +#include + +struct nf_conn_synproxy { + u32 isn; + u32 its; + u32 tsoff; +}; + +static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); +#else + return NULL; +#endif +} + +static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); +#else + return NULL; +#endif +} + +struct synproxy_stats { + unsigned int syn_received; + unsigned int cookie_invalid; + unsigned int cookie_valid; + unsigned int cookie_retrans; + unsigned int conn_reopened; +}; + +struct synproxy_net { + struct nf_conn *tmpl; + struct synproxy_stats __percpu *stats; +}; + +extern int synproxy_net_id; +static inline struct synproxy_net *synproxy_pernet(struct net *net) +{ + return net_generic(net, synproxy_net_id); +} + +struct synproxy_options { + u8 options; + u8 wscale; + u16 mss; + u32 tsval; + u32 tsecr; +}; + +struct tcphdr; +struct xt_synproxy_info; +extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, + struct synproxy_options *opts); +extern unsigned int synproxy_options_size(const struct synproxy_options *opts); +extern void synproxy_build_options(struct tcphdr *th, + const struct synproxy_options *opts); + +extern void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts); +extern void synproxy_check_timestamp_cookie(struct synproxy_options *opts); + +extern unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy); + +#endif /* _NF_CONNTRACK_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h new file mode 100644 index 000000000000..2d59fbaa93c6 --- /dev/null +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -0,0 +1,16 @@ +#ifndef _XT_SYNPROXY_H +#define _XT_SYNPROXY_H + +#define XT_SYNPROXY_OPT_MSS 0x01 +#define XT_SYNPROXY_OPT_WSCALE 0x02 +#define XT_SYNPROXY_OPT_SACK_PERM 0x04 +#define XT_SYNPROXY_OPT_TIMESTAMP 0x08 +#define XT_SYNPROXY_OPT_ECN 0x10 + +struct xt_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _XT_SYNPROXY_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4e9028017428..1657e39b291f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_SYNPROXY + tristate "SYNPROXY target support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY target allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_ULOG tristate "ULOG target support (obsolete)" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 007b128eecc9..3622b248b6dd 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o # generic ARP tables diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c new file mode 100644 index 000000000000..94371db6aecc --- /dev/null +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2013 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct iphdr * +synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nskb->nfct = nfct; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nfct); + } + + ip_local_out(nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +static void +synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_syn(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & XT_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(const struct synproxy_net *snet, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (nskb == NULL) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static bool +synproxy_recv_client_ack(const struct synproxy_net *snet, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + return true; +} + +static unsigned int +synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_synproxy_info *info = par->targinfo; + struct synproxy_net *snet = synproxy_pernet(dev_net(par->in)); + struct synproxy_options opts = {}; + struct tcphdr *th, _th; + + if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP)) + return NF_DROP; + + th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + synproxy_parse_options(skb, par->thoff, th, &opts); + + if (th->syn && !th->ack) { + /* Initial SYN from client */ + this_cpu_inc(snet->stats->syn_received); + + if (th->ece && th->cwr) + opts.options |= XT_SYNPROXY_OPT_ECN; + + opts.options &= info->options; + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, &opts); + else + opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM | + XT_SYNPROXY_OPT_ECN); + + synproxy_send_client_synack(skb, th, &opts); + } else if (th->ack && !(th->fin || th->rst)) + /* ACK from client */ + synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + + return NF_DROP; +} + +static unsigned int ipv4_synproxy_hook(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out)); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (synproxy == NULL) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (th == NULL) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + synproxy_parse_options(skb, thoff, th, &opts); + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(snet, skb, th, &opts, + ntohl(th->seq) + 1)) + this_cpu_inc(snet->stats->cookie_retrans); + + return NF_DROP; + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + synproxy_parse_options(skb, thoff, th, &opts); + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + synproxy->tsoff = opts.tsval - synproxy->its; + + opts.options &= ~(XT_SYNPROXY_OPT_MSS | + XT_SYNPROXY_OPT_WSCALE | + XT_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(snet, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(snet, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} + +static int synproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.proto != IPPROTO_TCP || + e->ip.invflags & XT_INV_PROTO) + return -EINVAL; + + return nf_ct_l3proto_try_module_get(par->family); +} + +static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target synproxy_tg4_reg __read_mostly = { + .name = "SYNPROXY", + .family = NFPROTO_IPV4, + .target = synproxy_tg4, + .targetsize = sizeof(struct xt_synproxy_info), + .checkentry = synproxy_tg4_check, + .destroy = synproxy_tg4_destroy, + .me = THIS_MODULE, +}; + +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +static int __init synproxy_tg4_init(void) +{ + int err; + + err = nf_register_hooks(ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err < 0) + goto err1; + + err = xt_register_target(&synproxy_tg4_reg); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +err1: + return err; +} + +static void __exit synproxy_tg4_exit(void) +{ + xt_unregister_target(&synproxy_tg4_reg); + nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); +} + +module_init(synproxy_tg4_init); +module_exit(synproxy_tg4_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index c45fc1a60e0d..62a171ab204f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -408,6 +408,9 @@ config NF_NAT_TFTP depends on NF_CONNTRACK && NF_NAT default NF_NAT && NF_CONNTRACK_TFTP +config NETFILTER_SYNPROXY + tristate + endif # NF_CONNTRACK config NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 89a9c1658f5e..c3a0a12907f6 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -61,6 +61,9 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o +# SYNPROXY +obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 00a7a94d4132..5d892febd64c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -799,6 +800,11 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (IS_ERR(ct)) return (struct nf_conntrack_tuple_hash *)ct; + if (tmpl && nfct_synproxy(tmpl)) { + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); + } + timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; if (timeout_ext) timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 984a8d1a3359..44d1ea32570a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -946,6 +947,21 @@ static int tcp_packet(struct nf_conn *ct, "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: + /* Special case for SYN proxy: when the SYN to the server or + * the SYN/ACK from the server is lost, the client may transmit + * a keep-alive packet while in SYN_SENT state. This needs to + * be associated with the original conntrack entry in order to + * generate a new SYN with the correct sequence number. + */ + if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && + index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && + ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { + pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + } + /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index 483eb9ce3216..5f9bfd060dea 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -6,6 +6,26 @@ #include #include +int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + s32 off) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct nf_conn_seqadj *seqadj; + struct nf_ct_seqadj *this_way; + + if (off == 0) + return 0; + + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + + seqadj = nfct_seqadj(ct); + this_way = &seqadj->seq[dir]; + this_way->offset_before = off; + this_way->offset_after = off; + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); + int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c new file mode 100644 index 000000000000..d23dc791aca7 --- /dev/null +++ b/net/netfilter/nf_synproxy_core.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2013 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +int synproxy_net_id; +EXPORT_SYMBOL_GPL(synproxy_net_id); + +void +synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, + const struct tcphdr *th, struct synproxy_options *opts) +{ + int length = (th->doff * 4) - sizeof(*th); + u8 buf[40], *ptr; + + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); + BUG_ON(ptr == NULL); + + opts->options = 0; + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > length) + return; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss = get_unaligned_be16(ptr); + opts->options |= XT_SYNPROXY_OPT_MSS; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale = *ptr; + if (opts->wscale > 14) + opts->wscale = 14; + opts->options |= XT_SYNPROXY_OPT_WSCALE; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + opts->tsval = get_unaligned_be32(ptr); + opts->tsecr = get_unaligned_be32(ptr + 4); + opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) + opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + break; + } + + ptr += opsize - 2; + length -= opsize; + } + } +} +EXPORT_SYMBOL_GPL(synproxy_parse_options); + +unsigned int synproxy_options_size(const struct synproxy_options *opts) +{ + unsigned int size = 0; + + if (opts->options & XT_SYNPROXY_OPT_MSS) + size += TCPOLEN_MSS_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + size += TCPOLEN_TSTAMP_ALIGNED; + else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + size += TCPOLEN_SACKPERM_ALIGNED; + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + size += TCPOLEN_WSCALE_ALIGNED; + + return size; +} +EXPORT_SYMBOL_GPL(synproxy_options_size); + +void +synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) +{ + __be32 *ptr = (__be32 *)(th + 1); + u8 options = opts->options; + + if (options & XT_SYNPROXY_OPT_MSS) + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + + if (options & XT_SYNPROXY_OPT_TIMESTAMP) { + if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + else + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); + + if (options & XT_SYNPROXY_OPT_WSCALE) + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + opts->wscale); +} +EXPORT_SYMBOL_GPL(synproxy_build_options); + +void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, + struct synproxy_options *opts) +{ + opts->tsecr = opts->tsval; + opts->tsval = tcp_time_stamp & ~0x3f; + + if (opts->options & XT_SYNPROXY_OPT_WSCALE) + opts->tsval |= info->wscale; + else + opts->tsval |= 0xf; + + if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + opts->tsval |= 1 << 4; + + if (opts->options & XT_SYNPROXY_OPT_ECN) + opts->tsval |= 1 << 5; +} +EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); + +void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +{ + opts->wscale = opts->tsecr & 0xf; + if (opts->wscale != 0xf) + opts->options |= XT_SYNPROXY_OPT_WSCALE; + + opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + + opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; +} +EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); + +unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, + unsigned int protoff, + struct tcphdr *th, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) +{ + unsigned int optoff, optend; + u32 *ptr, old; + + if (synproxy->tsoff == 0) + return 1; + + optoff = protoff + sizeof(struct tcphdr); + optend = protoff + th->doff * 4; + + if (!skb_make_writable(skb, optend)) + return 0; + + while (optoff < optend) { + unsigned char *op = skb->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + if (optoff + 1 == optend || + optoff + op[1] > optend || + op[1] < 2) + return 0; + if (op[0] == TCPOPT_TIMESTAMP && + op[1] == TCPOLEN_TIMESTAMP) { + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + ptr = (u32 *)&op[2]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) - + synproxy->tsoff); + } else { + ptr = (u32 *)&op[6]; + old = *ptr; + *ptr = htonl(ntohl(*ptr) + + synproxy->tsoff); + } + inet_proto_csum_replace4(&th->check, skb, + old, *ptr, 0); + return 1; + } + optoff += op[1]; + } + } + return 1; +} +EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); + +static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { + .len = sizeof(struct nf_conn_synproxy), + .align = __alignof__(struct nf_conn_synproxy), + .id = NF_CT_EXT_SYNPROXY, +}; + +#ifdef CONFIG_PROC_FS +static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq)); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(snet->stats, cpu); + } + + return NULL; +} + +static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v) +{ + return; +} + +static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct synproxy_stats *stats = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); + return 0; + } + + seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0, + stats->syn_received, + stats->cookie_invalid, + stats->cookie_valid, + stats->cookie_retrans, + stats->conn_reopened); + + return 0; +} + +static const struct seq_operations synproxy_cpu_seq_ops = { + .start = synproxy_cpu_seq_start, + .next = synproxy_cpu_seq_next, + .stop = synproxy_cpu_seq_stop, + .show = synproxy_cpu_seq_show, +}; + +static int synproxy_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &synproxy_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations synproxy_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = synproxy_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init synproxy_proc_init(struct net *net) +{ + if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + &synproxy_cpu_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + remove_proc_entry("synproxy", net->proc_net_stat); +} +#else +static int __net_init synproxy_proc_init(struct net *net) +{ + return 0; +} + +static void __net_exit synproxy_proc_exit(struct net *net) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init synproxy_net_init(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct nf_conntrack_tuple t; + struct nf_conn *ct; + int err = -ENOMEM; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto err1; + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + if (!nfct_seqadj_ext_add(ct)) + goto err2; + if (!nfct_synproxy_ext_add(ct)) + goto err2; + + snet->tmpl = ct; + + snet->stats = alloc_percpu(struct synproxy_stats); + if (snet->stats == NULL) + goto err2; + + err = synproxy_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + free_percpu(snet->stats); +err2: + nf_conntrack_free(ct); +err1: + return err; +} + +static void __net_exit synproxy_net_exit(struct net *net) +{ + struct synproxy_net *snet = synproxy_pernet(net); + + nf_conntrack_free(snet->tmpl); + synproxy_proc_exit(net); + free_percpu(snet->stats); +} + +static struct pernet_operations synproxy_net_ops = { + .init = synproxy_net_init, + .exit = synproxy_net_exit, + .id = &synproxy_net_id, + .size = sizeof(struct synproxy_net), +}; + +static int __init synproxy_core_init(void) +{ + int err; + + err = nf_ct_extend_register(&nf_ct_synproxy_extend); + if (err < 0) + goto err1; + + err = register_pernet_subsys(&synproxy_net_ops); + if (err < 0) + goto err2; + + return 0; + +err2: + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +err1: + return err; +} + +static void __exit synproxy_core_exit(void) +{ + unregister_pernet_subsys(&synproxy_net_ops); + nf_ct_extend_unregister(&nf_ct_synproxy_extend); +} + +module_init(synproxy_core_init); +module_exit(synproxy_core_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); -- cgit v1.2.3-71-gd317 From 355fe568e285815e6f62fcd004ca80085ae46a69 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 27 Aug 2013 16:02:18 -0700 Subject: Revert "OMAP: UART: Keep the TX fifo full when possible" This reverts commit c4415084218c68c5ee2fc583431e89a78d896b19. Kevin writes: Hmm, another OMAP serial patch that wasn't Cc'd to linux-omap where OMAP users might have seen it. :( I just bisected a strange problem in linux-next on OMAP3 down to this patch. Reverting it fixes the problem. On OMAP3530 Beagle and Overo, after boot, doing a 'cat /proc/cpuinfo' was not returning to a prompt, suggesting something strange with the FIFO. Hitting return gets me back to a prompt. Greg, this one should also be dropped from tty-next until it can be further investgated and the problem solved. Reported-by: Kevin Hilman Cc: Dmitry Fink Cc: Alexander Savchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/omap-serial.c | 3 +-- include/uapi/linux/serial_reg.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index b0f46e666e1a..816d1a23f9d0 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -350,8 +350,7 @@ static void transmit_chars(struct uart_omap_port *up, unsigned int lsr) serial_omap_stop_tx(&up->port); return; } - count = up->port.fifosize - - (serial_in(up, UART_OMAP_TXFIFO_LVL) & 0xFF); + count = up->port.fifosize / 4; do { serial_out(up, UART_TX, xmit->buf[xmit->tail]); xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index 97c26beae605..e6322605b138 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -366,7 +366,6 @@ #define UART_OMAP_MDR1_FIR_MODE 0x05 /* FIR mode */ #define UART_OMAP_MDR1_CIR_MODE 0x06 /* CIR mode */ #define UART_OMAP_MDR1_DISABLE 0x07 /* Disable (default state) */ -#define UART_OMAP_TXFIFO_LVL 0x1A /* TX FIFO fullness */ /* * These are definitions for the Exar XR17V35X and XR17(C|D)15X -- cgit v1.2.3-71-gd317 From fbf501c347b2eea8451a615bd823b6b91a1a8eed Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2013 10:28:25 -0600 Subject: PCI: Clarify PCI_EXP_TYPE_PCI_BRIDGE comment The PCI_EXP_TYPE_PCI_BRIDGE is a *PCIe* function that is a bridge to PCI/PCI-X. See PCIe spec r3.0, sec 7.8.2. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 4b8f2e3bad58..5bb33f485271 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -421,8 +421,8 @@ #define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */ #define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ -#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ -#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 /* PCI/PCI-X to PCIE Bridge */ +#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCIe to PCI/PCI-X Bridge */ +#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 /* PCI/PCI-X to PCIe Bridge */ #define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ #define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ -- cgit v1.2.3-71-gd317 From 1b121c24dd4a9bb7156f10c7da3f39515a9fa6f0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2013 11:10:02 -0600 Subject: PCI: Remove obsolete comment reference to pci_pcie_cap2() pci_pcie_cap2() was replaced by pcie_capability_read_word() and similar functions, so update the comment. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 5bb33f485271..591bb3b2ee4a 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -544,10 +544,12 @@ #define PCI_EXP_RTSTA_PME 0x10000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x20000 /* PME pending */ /* - * Note that the following PCI Express 'Capability Structure' registers - * were introduced with 'Capability Version' 0x2 (v2). These registers - * do not exist on devices with Capability Version 1. Use pci_pcie_cap2() - * to use these fields safely. + * The Device Capabilities 2, Device Status 2, Device Control 2, + * Link Capabilities 2, Link Status 2, Link Control 2, + * Slot Capabilities 2, Slot Status 2, and Slot Control 2 registers + * are only present on devices with PCIe Capability version 2. + * Use pcie_capability_read_word() and similar interfaces to use them + * safely. */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ -- cgit v1.2.3-71-gd317 From c0b4b3815d4e65c082d6e85d0fccf25b230e7890 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2013 11:28:36 -0600 Subject: PCI: Tidy bitmasks and spacing of PCIe capability definitions The convention of showing bits in a mask of the full register width, e.g., "0x00000007" instead of "0x07" for a field in a 32-bit register, is common but not universal in this file. This patch makes it consistently used at least for the PCIe capability. Whitespace and zero-extension changes only; no functional change. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 90 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 45 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 591bb3b2ee4a..28c83eceefe3 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -428,17 +428,17 @@ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ -#define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */ -#define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */ -#define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */ -#define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */ -#define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */ -#define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */ -#define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */ -#define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */ -#define PCI_EXP_DEVCAP_RBER 0x8000 /* Role-Based Error Reporting */ -#define PCI_EXP_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */ -#define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */ +#define PCI_EXP_DEVCAP_PAYLOAD 0x00000007 /* Max_Payload_Size */ +#define PCI_EXP_DEVCAP_PHANTOM 0x00000018 /* Phantom functions */ +#define PCI_EXP_DEVCAP_EXT_TAG 0x00000020 /* Extended tags */ +#define PCI_EXP_DEVCAP_L0S 0x000001c0 /* L0s Acceptable Latency */ +#define PCI_EXP_DEVCAP_L1 0x00000e00 /* L1 Acceptable Latency */ +#define PCI_EXP_DEVCAP_ATN_BUT 0x00001000 /* Attention Button Present */ +#define PCI_EXP_DEVCAP_ATN_IND 0x00002000 /* Attention Indicator Present */ +#define PCI_EXP_DEVCAP_PWR_IND 0x00004000 /* Power Indicator Present */ +#define PCI_EXP_DEVCAP_RBER 0x00008000 /* Role-Based Error Reporting */ +#define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */ +#define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */ #define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */ #define PCI_EXP_DEVCTL 8 /* Device Control */ #define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ @@ -454,16 +454,16 @@ #define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */ #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ #define PCI_EXP_DEVSTA 10 /* Device Status */ -#define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */ -#define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */ -#define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */ -#define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */ -#define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */ -#define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */ +#define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ +#define PCI_EXP_DEVSTA_NFED 0x0002 /* Non-Fatal Error Detected */ +#define PCI_EXP_DEVSTA_FED 0x0004 /* Fatal Error Detected */ +#define PCI_EXP_DEVSTA_URD 0x0008 /* Unsupported Request Detected */ +#define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */ +#define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */ #define PCI_EXP_LNKCAP 12 /* Link Capabilities */ #define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ -#define PCI_EXP_LNKCAP_SLS_2_5GB 0x1 /* LNKCAP2 SLS Vector bit 0 (2.5GT/s) */ -#define PCI_EXP_LNKCAP_SLS_5_0GB 0x2 /* LNKCAP2 SLS Vector bit 1 (5.0GT/s) */ +#define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ +#define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ @@ -475,21 +475,21 @@ #define PCI_EXP_LNKCAP_PN 0xff000000 /* Port Number */ #define PCI_EXP_LNKCTL 16 /* Link Control */ #define PCI_EXP_LNKCTL_ASPMC 0x0003 /* ASPM Control */ -#define PCI_EXP_LNKCTL_ASPM_L0S 0x01 /* L0s Enable */ -#define PCI_EXP_LNKCTL_ASPM_L1 0x02 /* L1 Enable */ +#define PCI_EXP_LNKCTL_ASPM_L0S 0x0001 /* L0s Enable */ +#define PCI_EXP_LNKCTL_ASPM_L1 0x0002 /* L1 Enable */ #define PCI_EXP_LNKCTL_RCB 0x0008 /* Read Completion Boundary */ #define PCI_EXP_LNKCTL_LD 0x0010 /* Link Disable */ #define PCI_EXP_LNKCTL_RL 0x0020 /* Retrain Link */ #define PCI_EXP_LNKCTL_CCC 0x0040 /* Common Clock Configuration */ #define PCI_EXP_LNKCTL_ES 0x0080 /* Extended Synch */ -#define PCI_EXP_LNKCTL_CLKREQ_EN 0x100 /* Enable clkreq */ +#define PCI_EXP_LNKCTL_CLKREQ_EN 0x0100 /* Enable clkreq */ #define PCI_EXP_LNKCTL_HAWD 0x0200 /* Hardware Autonomous Width Disable */ #define PCI_EXP_LNKCTL_LBMIE 0x0400 /* Link Bandwidth Management Interrupt Enable */ #define PCI_EXP_LNKCTL_LABIE 0x0800 /* Lnk Autonomous Bandwidth Interrupt Enable */ #define PCI_EXP_LNKSTA 18 /* Link Status */ #define PCI_EXP_LNKSTA_CLS 0x000f /* Current Link Speed */ -#define PCI_EXP_LNKSTA_CLS_2_5GB 0x01 /* Current Link Speed 2.5GT/s */ -#define PCI_EXP_LNKSTA_CLS_5_0GB 0x02 /* Current Link Speed 5.0GT/s */ +#define PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 /* Current Link Speed 2.5GT/s */ +#define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */ #define PCI_EXP_LNKSTA_NLW 0x03f0 /* Nogotiated Link Width */ #define PCI_EXP_LNKSTA_NLW_SHIFT 4 /* start of NLW mask in link status */ #define PCI_EXP_LNKSTA_LT 0x0800 /* Link Training */ @@ -534,15 +534,15 @@ #define PCI_EXP_SLTSTA_EIS 0x0080 /* Electromechanical Interlock Status */ #define PCI_EXP_SLTSTA_DLLSC 0x0100 /* Data Link Layer State Changed */ #define PCI_EXP_RTCTL 28 /* Root Control */ -#define PCI_EXP_RTCTL_SECEE 0x01 /* System Error on Correctable Error */ -#define PCI_EXP_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */ -#define PCI_EXP_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */ -#define PCI_EXP_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */ -#define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ +#define PCI_EXP_RTCTL_SECEE 0x0001 /* System Error on Correctable Error */ +#define PCI_EXP_RTCTL_SENFEE 0x0002 /* System Error on Non-Fatal Error */ +#define PCI_EXP_RTCTL_SEFEE 0x0004 /* System Error on Fatal Error */ +#define PCI_EXP_RTCTL_PMEIE 0x0008 /* PME Interrupt Enable */ +#define PCI_EXP_RTCTL_CRSSVE 0x0010 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ -#define PCI_EXP_RTSTA_PME 0x10000 /* PME status */ -#define PCI_EXP_RTSTA_PENDING 0x20000 /* PME pending */ +#define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ +#define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ /* * The Device Capabilities 2, Device Status 2, Device Control 2, * Link Capabilities 2, Link Status 2, Link Control 2, @@ -552,25 +552,25 @@ * safely. */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ -#define PCI_EXP_DEVCAP2_ARI 0x20 /* Alternative Routing-ID */ -#define PCI_EXP_DEVCAP2_LTR 0x800 /* Latency tolerance reporting */ -#define PCI_EXP_DEVCAP2_OBFF_MASK 0xc0000 /* OBFF support mechanism */ -#define PCI_EXP_DEVCAP2_OBFF_MSG 0x40000 /* New message signaling */ -#define PCI_EXP_DEVCAP2_OBFF_WAKE 0x80000 /* Re-use WAKE# for OBFF */ +#define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */ +#define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */ +#define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ +#define PCI_EXP_DEVCAP2_OBFF_WAKE 0x00080000 /* Re-use WAKE# for OBFF */ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_ARI 0x20 /* Alternative Routing-ID */ -#define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x100 /* ID-based ordering request enable */ -#define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x200 /* ID-based ordering completion enable */ -#define PCI_EXP_DEVCTL2_LTR_EN 0x400 /* Latency tolerance reporting */ -#define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* OBFF enable with Message type A */ -#define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* OBFF enable with Message type B */ +#define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ +#define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ +#define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ +#define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* Enable OBFF Message type A */ +#define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */ #define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints end here */ #define PCI_EXP_LNKCAP2 44 /* Link Capability 2 */ -#define PCI_EXP_LNKCAP2_SLS_2_5GB 0x02 /* Supported Link Speed 2.5GT/s */ -#define PCI_EXP_LNKCAP2_SLS_5_0GB 0x04 /* Supported Link Speed 5.0GT/s */ -#define PCI_EXP_LNKCAP2_SLS_8_0GB 0x08 /* Supported Link Speed 8.0GT/s */ -#define PCI_EXP_LNKCAP2_CROSSLINK 0x100 /* Crosslink supported */ +#define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */ +#define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5.0GT/s */ +#define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8.0GT/s */ +#define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ #define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ -- cgit v1.2.3-71-gd317 From bd6fb762b595fb83758ae50d3610223244234a2d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2013 12:17:59 -0600 Subject: PCI: Add offsets of PCIe capability registers These offsets are not used, and in some cases are completely reserved even in the spec, but I'm adding them for completeness just to match the diagrams in the spec, e.g., PCIe spec r3.0, sec 7.8. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 28c83eceefe3..baa7852468ef 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -565,15 +565,18 @@ #define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* Enable OBFF Message type A */ #define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */ #define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ +#define PCI_EXP_DEVSTA2 42 /* Device Status 2 */ #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints end here */ -#define PCI_EXP_LNKCAP2 44 /* Link Capability 2 */ +#define PCI_EXP_LNKCAP2 44 /* Link Capabilities 2 */ #define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */ #define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5.0GT/s */ #define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8.0GT/s */ #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ +#define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ #define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ +#define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) -- cgit v1.2.3-71-gd317 From aaaafb7f953c60d9c9eeb1b10ecdbe6970421b24 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 28 Aug 2013 16:35:16 -0700 Subject: Omnikey Cardman 4000: pull in ioctl.h in user header This file uses the ioctl helpers (_IOR/_IOW/etc...), so include ioctl.h for the definitions. Signed-off-by: Mike Frysinger Cc: Harald Welte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/cm4000_cs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/cm4000_cs.h b/include/uapi/linux/cm4000_cs.h index bc51f77db918..1217f751a1bc 100644 --- a/include/uapi/linux/cm4000_cs.h +++ b/include/uapi/linux/cm4000_cs.h @@ -2,6 +2,7 @@ #define _UAPI_CM4000_H_ #include +#include #define MAX_ATR 33 -- cgit v1.2.3-71-gd317 From ff3d527cebc1fa3707c617bfe9e74f53fcfb0955 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 27 Aug 2013 11:23:07 +0300 Subject: perf: make events stream always parsable The event stream is not always parsable because the format of a sample is dependent on the sample_type of the selected event. When there is more than one selected event and the sample_types are not the same then parsing becomes problematic. A sample can be matched to its selected event using the ID that is allocated when the event is opened. Unfortunately, to get the ID from the sample means first parsing it. This patch adds a new sample format bit PERF_SAMPLE_IDENTIFER that puts the ID at a fixed position so that the ID can be retrieved without parsing the sample. For sample events, that is the first position immediately after the header. For non-sample events, that is the last position. In this respect parsing samples requires that the sample_type and ID values are recorded. For example, perf tools records struct perf_event_attr and the IDs within the perf.data file. Those must be read first before it is possible to parse samples found later in the perf.data file. Signed-off-by: Adrian Hunter Tested-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1377591794-30553-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++------- kernel/events/core.c | 11 ++++++++++- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 62c25a25291c..42cb7b62ca59 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -134,8 +134,9 @@ enum perf_event_sample_format { PERF_SAMPLE_STACK_USER = 1U << 13, PERF_SAMPLE_WEIGHT = 1U << 14, PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, - PERF_SAMPLE_MAX = 1U << 16, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 17, /* non-ABI */ }; /* @@ -492,12 +493,12 @@ enum perf_event_type { /* * If perf_event_attr.sample_id_all is set then all event types will * have the sample_type selected fields related to where/when - * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID) - * described in PERF_RECORD_SAMPLE below, it will be stashed just after - * the perf_event_header and the fields already present for the existing - * fields, i.e. at the end of the payload. That way a newer perf.data - * file will be supported by older perf tools, with these new optional - * fields being ignored. + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. * * struct sample_id { * { u32 pid, tid; } && PERF_SAMPLE_TID @@ -505,7 +506,12 @@ enum perf_event_type { * { u64 id; } && PERF_SAMPLE_ID * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. */ /* @@ -594,6 +600,13 @@ enum perf_event_type { * struct { * struct perf_event_header header; * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER * { u64 ip; } && PERF_SAMPLE_IP * { u32 pid, tid; } && PERF_SAMPLE_TID * { u64 time; } && PERF_SAMPLE_TIME diff --git a/kernel/events/core.c b/kernel/events/core.c index 928fae7ca8c7..15d0f2418e54 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1213,6 +1213,9 @@ static void perf_event__id_header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + size += sizeof(data->id); + if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); @@ -4280,7 +4283,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_TIME) data->time = perf_clock(); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) @@ -4319,6 +4322,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, @@ -4432,6 +4438,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, *header); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); + if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); -- cgit v1.2.3-71-gd317 From b800c3b966bcf004bd8592293a49ed5cb7ea67a9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 27 Aug 2013 01:36:51 +0200 Subject: ipv6: drop fragmented ndisc packets by default (RFC 6980) This patch implements RFC6980: Drop fragmented ndisc packets by default. If a fragmented ndisc packet is received the user is informed that it is possible to disable the check. Cc: Fernando Gont Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 17 +++++++++++++++++ 5 files changed, 35 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index debfe857d8f9..a2be556032c9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1349,6 +1349,12 @@ mldv2_unsolicited_report_interval - INTEGER MLDv2 report retransmit will take place. Default: 1000 (1 second) +suppress_frag_ndisc - INTEGER + Control RFC 6980 (Security Implications of IPv6 Fragmentation + with IPv6 Neighbor Discovery) behavior: + 1 - (default) discard fragmented neighbor discovery packets + 0 - allow fragmented neighbor discovery packets + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9ac5047062c8..28ea38439313 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -50,6 +50,7 @@ struct ipv6_devconf { __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; + __s32 suppress_frag_ndisc; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d07ac6903e59..593b0e32d956 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -162,6 +162,7 @@ enum { DEVCONF_NDISC_NOTIFY, DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL, DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, + DEVCONF_SUPPRESS_FRAG_NDISC, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2d6d1793bbfe..a7183fc9bbc2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -204,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -241,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, .accept_dad = 1, + .suppress_frag_ndisc = 1, }; /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ @@ -4188,6 +4190,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; + array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; } static inline size_t inet6_ifla6_size(void) @@ -5001,6 +5004,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 04d31c2fbef1..41720feeaa64 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1519,10 +1519,27 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return true; + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && + idev->cnf.suppress_frag_ndisc) { + net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); + return true; + } + return false; +} + int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; + if (ndisc_suppress_frag_ndisc(skb)) + return 0; + if (skb_linearize(skb)) return 0; -- cgit v1.2.3-71-gd317 From 5df0ddfbc9209ffafc82236509ba0e975120e3c3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 28 Aug 2013 22:13:09 +0200 Subject: net: packet: add randomized fanout scheduler We currently allow for different fanout scheduling policies in pf_packet such as scheduling by skb's rxhash, round-robin, by cpu, and rollover. Also allow for a random, equidistributed selection of the socket from the fanout process group. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index b950c02030c0..dbf06667394b 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -56,6 +56,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_LB 1 #define PACKET_FANOUT_CPU 2 #define PACKET_FANOUT_ROLLOVER 3 +#define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1fdf9ab91c3f..bee9bfdc8d05 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -88,7 +88,7 @@ #include #include #include - +#include #ifdef CONFIG_INET #include #endif @@ -1158,6 +1158,13 @@ static unsigned int fanout_demux_cpu(struct packet_fanout *f, return smp_processor_id() % num; } +static unsigned int fanout_demux_rnd(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + return reciprocal_divide(prandom_u32(), num); +} + static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, unsigned int skip, @@ -1215,6 +1222,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; + case PACKET_FANOUT_RND: + idx = fanout_demux_rnd(f, skb, num); + break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num); break; @@ -1284,6 +1294,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: + case PACKET_FANOUT_RND: break; default: return -EINVAL; -- cgit v1.2.3-71-gd317 From 391ac1282dd7ff1cb8245cccc5262e8e4173edc4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 26 Aug 2013 15:05:36 +0200 Subject: can: gw: add a per rule limitation of frame hops Usually the received CAN frames can be processed/routed as much as 'max_hops' times (which is given at module load time of the can-gw module). Introduce a new configuration option to reduce the number of possible hops for a specific gateway rule to a value smaller then max_hops. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/gw.h | 9 ++++++++- net/can/gw.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index ae07bec74f4b..4e27c82b564a 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -45,6 +45,7 @@ enum { CGW_DST_IF, /* ifindex of destination network interface */ CGW_FILTER, /* specify struct can_filter on source CAN device */ CGW_DELETED, /* number of deleted CAN frames (see max_hops param) */ + CGW_LIM_HOPS, /* limit the number of hops of this specific rule */ __CGW_MAX }; @@ -116,13 +117,19 @@ enum { * Sets a CAN receive filter for the gateway job specified by the * struct can_filter described in include/linux/can.h * - * CGW_MOD_XXX (length 17 bytes): + * CGW_MOD_(AND|OR|XOR|SET) (length 17 bytes): * Specifies a modification that's done to a received CAN frame before it is * send out to the destination interface. * * data used as operator * affected CAN frame elements * + * CGW_LIM_HOPS (length 1 byte): + * Limit the number of hops of this specific rule. Usually the received CAN + * frame can be processed as much as 'max_hops' times (which is given at module + * load time of the can-gw module). This value is used to reduce the number of + * possible hops for this gateway rule to a value smaller then max_hops. + * * CGW_CS_XOR (length 4 bytes): * Set a simple XOR checksum starting with an initial value into * data[result-idx] using data[start-idx] .. data[end-idx] diff --git a/net/can/gw.c b/net/can/gw.c index 2f291f961a17..3f9b0f3a2818 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -146,6 +146,7 @@ struct cgw_job { /* tbc */ }; u8 gwtype; + u8 limit_hops; u16 flags; }; @@ -402,6 +403,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; + + /* first processing of this CAN frame -> adjust to private hop limit */ + if (gwj->limit_hops && cgw_hops(nskb) == 1) + cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; + nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ @@ -509,6 +515,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, /* check non default settings of attributes */ + if (gwj->limit_hops) { + if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) + goto cancel; + } + if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; @@ -606,11 +617,12 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, + [CGW_LIM_HOPS] = { .type = NLA_U8 }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, - u8 gwtype, void *gwtypeattr) + u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX+1]; struct cgw_frame_mod mb; @@ -625,6 +637,13 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, if (err < 0) return err; + if (tb[CGW_LIM_HOPS]) { + *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); + + if (*limhops < 1 || *limhops > max_hops) + return -EINVAL; + } + /* check for AND/OR/XOR/SET modifications */ if (tb[CGW_MOD_AND]) { @@ -782,6 +801,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct rtcanmsg *r; struct cgw_job *gwj; + u8 limhops = 0; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -808,7 +828,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) gwj->flags = r->flags; gwj->gwtype = r->gwtype; - err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw); + err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw, + &limhops); if (err < 0) goto out; @@ -836,6 +857,8 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->dst.dev->type != ARPHRD_CAN || gwj->dst.dev->header_ops) goto put_src_dst_out; + gwj->limit_hops = limhops; + ASSERT_RTNL(); err = cgw_register_filter(gwj); @@ -867,13 +890,14 @@ static void cgw_remove_all_jobs(void) } } -static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) +static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) { struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; + u8 limhops = 0; int err = 0; if (!capable(CAP_NET_ADMIN)) @@ -890,7 +914,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; - err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw); + err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; @@ -910,6 +934,9 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->flags != r->flags) continue; + if (gwj->limit_hops != limhops) + continue; + if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; -- cgit v1.2.3-71-gd317 From afe4fd062416b158a8a8538b23adc1930a9b88dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Aug 2013 15:49:55 -0700 Subject: pkt_sched: fq: Fair Queue packet scheduler - Uses perfect flow match (not stochastic hash like SFQ/FQ_codel) - Uses the new_flow/old_flow separation from FQ_codel - New flows get an initial credit allowing IW10 without added delay. - Special FIFO queue for high prio packets (no need for PRIO + FQ) - Uses a hash table of RB trees to locate the flows at enqueue() time - Smart on demand gc (at enqueue() time, RB tree lookup evicts old unused flows) - Dynamic memory allocations. - Designed to allow millions of concurrent flows per Qdisc. - Small memory footprint : ~8K per Qdisc, and 104 bytes per flow. - Single high resolution timer for throttled flows (if any). - One RB tree to link throttled flows. - Ability to have a max rate per flow. We might add a socket option to add per socket limitation. Attempts have been made to add TCP pacing in TCP stack, but this seems to add complex code to an already complex stack. TCP pacing is welcomed for flows having idle times, as the cwnd permits TCP stack to queue a possibly large number of packets. This removes the 'slow start after idle' choice, hitting badly large BDP flows, and applications delivering chunks of data as video streams. Nicely spaced packets : Here interface is 10Gbit, but flow bottleneck is ~20Mbit cwin is big, yet FQ avoids the typical bursts generated by TCP (as in netperf TCP_RR -- -r 100000,100000) 15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 15:01:23.545394 IP B > A: . ack 81089 win 3668 15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 15:01:23.546565 IP B > A: . ack 83985 win 3668 15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 15:01:23.547778 IP B > A: . ack 86881 win 3668 15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 15:01:23.548949 IP B > A: . ack 89777 win 3668 15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 15:01:23.550182 IP B > A: . ack 92673 win 3668 15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 15:01:23.551406 IP B > A: . ack 95569 win 3668 15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 15:01:23.552576 IP B > A: . ack 98465 win 3668 15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 15:01:23.554204 IP B > A: . ack 100001 win 3668 15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 TCP timestamps show that most packets from B were queued in the same ms timeframe (TSval 1159799{3,4}), but FQ managed to send them right in time to avoid a big burst. In slow start or steady state, very few packets are throttled [1] FQ gets a bunch of tunables as : limit : max number of packets on whole Qdisc (default 10000) flow_limit : max number of packets per flow (default 100) quantum : the credit per RR round (default is 2 MTU) initial_quantum : initial credit for new flows (default is 10 MTU) maxrate : max per flow rate (default : unlimited) buckets : number of RB trees (default : 1024) in hash table. (consumes 8 bytes per bucket) [no]pacing : disable/enable pacing (default is enable) All of them can be changed on a live qdisc. $ tc qd add dev eth0 root fq help Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ] [ quantum BYTES ] [ initial_quantum BYTES ] [ maxrate RATE ] [ buckets NUMBER ] [ [no]pacing ] $ tc -s -d qd qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14) backlog 0b 0p requeues 14 511 flows, 511 inactive, 0 throttled 110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit [1] Except if initial srtt is overestimated, as if using cached srtt in tcp metrics. We'll provide a fix for this issue. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 41 +++ net/sched/Kconfig | 14 + net/sched/Makefile | 1 + net/sched/sch_fq.c | 792 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 848 insertions(+) create mode 100644 net/sched/sch_fq.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 09d62b9228ff..9b829134d422 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -744,4 +744,45 @@ struct tc_fq_codel_xstats { }; }; +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate, + * use the following rate + */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 pad; +}; #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 235e01acac51..c03a32a0418e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_FQ + tristate "Fair Queue" + help + Say Y here if you want to use the FQ packet scheduling algorithm. + + FQ does flow separation, and is able to respect pacing requirements + set by TCP stack into sk->sk_pacing_rate (for localy generated + traffic) + + To compile this driver as a module, choose M here: the module + will be called sch_fq. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 978cbf004e80..e5f9abe9a5db 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c new file mode 100644 index 000000000000..91ceca7bcb52 --- /dev/null +++ b/net/sched/sch_fq.c @@ -0,0 +1,792 @@ +/* + * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) + * + * Copyright (C) 2013 Eric Dumazet + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Meant to be mostly used for localy generated traffic : + * Fast classification depends on skb->sk being set before reaching us. + * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. + * All packets belonging to a socket are considered as a 'flow'. + * + * Flows are dynamically allocated and stored in a hash table of RB trees + * They are also part of one Round Robin 'queues' (new or old flows) + * + * Burst avoidance (aka pacing) capability : + * + * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a + * bunch of packets, and this packet scheduler adds delay between + * packets to respect rate limitation. + * + * enqueue() : + * - lookup one RB tree (out of 1024 or more) to find the flow. + * If non existent flow, create it, add it to the tree. + * Add skb to the per flow list of skb (fifo). + * - Use a special fifo for high prio packets + * + * dequeue() : serves flows in Round Robin + * Note : When a flow becomes empty, we do not immediately remove it from + * rb trees, for performance reasons (its expected to send additional packets, + * or SLAB cache will reuse socket for another flow) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Per flow structure, dynamically allocated + */ +struct fq_flow { + struct sk_buff *head; /* list of skbs for this flow : first skb */ + union { + struct sk_buff *tail; /* last skb in the list */ + unsigned long age; /* jiffies when flow was emptied, for gc */ + }; + struct rb_node fq_node; /* anchor in fq_root[] trees */ + struct sock *sk; + int qlen; /* number of packets in flow queue */ + int credit; + u32 socket_hash; /* sk_hash */ + struct fq_flow *next; /* next pointer in RR lists, or &detached */ + + struct rb_node rate_node; /* anchor in q->delayed tree */ + u64 time_next_packet; +}; + +struct fq_flow_head { + struct fq_flow *first; + struct fq_flow *last; +}; + +struct fq_sched_data { + struct fq_flow_head new_flows; + + struct fq_flow_head old_flows; + + struct rb_root delayed; /* for rate limited flows */ + u64 time_next_delayed_flow; + + struct fq_flow internal; /* for non classified or high prio packets */ + u32 quantum; + u32 initial_quantum; + u32 flow_default_rate;/* rate per flow : bytes per second */ + u32 flow_max_rate; /* optional max rate per flow */ + u32 flow_plimit; /* max packets per flow */ + struct rb_root *fq_root; + u8 rate_enable; + u8 fq_trees_log; + + u32 flows; + u32 inactive_flows; + u32 throttled_flows; + + u64 stat_gc_flows; + u64 stat_internal_packets; + u64 stat_tcp_retrans; + u64 stat_throttled; + u64 stat_flows_plimit; + u64 stat_pkts_too_long; + u64 stat_allocation_errors; + struct qdisc_watchdog watchdog; +}; + +/* special value to mark a detached flow (not on old/new list) */ +static struct fq_flow detached, throttled; + +static void fq_flow_set_detached(struct fq_flow *f) +{ + f->next = &detached; +} + +static bool fq_flow_is_detached(const struct fq_flow *f) +{ + return f->next == &detached; +} + +static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) +{ + struct rb_node **p = &q->delayed.rb_node, *parent = NULL; + + while (*p) { + struct fq_flow *aux; + + parent = *p; + aux = container_of(parent, struct fq_flow, rate_node); + if (f->time_next_packet >= aux->time_next_packet) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&f->rate_node, parent, p); + rb_insert_color(&f->rate_node, &q->delayed); + q->throttled_flows++; + q->stat_throttled++; + + f->next = &throttled; + if (q->time_next_delayed_flow > f->time_next_packet) + q->time_next_delayed_flow = f->time_next_packet; +} + + +static struct kmem_cache *fq_flow_cachep __read_mostly; + +static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) +{ + if (head->first) + head->last->next = flow; + else + head->first = flow; + head->last = flow; + flow->next = NULL; +} + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*HZ) + +static bool fq_gc_candidate(const struct fq_flow *f) +{ + return fq_flow_is_detached(f) && + time_after(jiffies, f->age + FQ_GC_AGE); +} + +static void fq_gc(struct fq_sched_data *q, + struct rb_root *root, + struct sock *sk) +{ + struct fq_flow *f, *tofree[FQ_GC_MAX]; + struct rb_node **p, *parent; + int fcnt = 0; + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) + break; + + if (fq_gc_candidate(f)) { + tofree[fcnt++] = f; + if (fcnt == FQ_GC_MAX) + break; + } + + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; + while (fcnt) { + struct fq_flow *f = tofree[--fcnt]; + + rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + } +} + +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; + +static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +{ + struct rb_node **p, *parent; + struct sock *sk = skb->sk; + struct rb_root *root; + struct fq_flow *f; + int band; + + /* warning: no starvation prevention... */ + band = prio2band[skb->priority & TC_PRIO_MAX]; + if (unlikely(band == 0)) + return &q->internal; + + if (unlikely(!sk)) { + /* By forcing low order bit to 1, we make sure to not + * collide with a local flow (socket pointers are word aligned) + */ + sk = (struct sock *)(skb_get_rxhash(skb) | 1L); + } + + root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; + + if (q->flows >= (2U << q->fq_trees_log) && + q->inactive_flows > q->flows/2) + fq_gc(q, root, sk); + + p = &root->rb_node; + parent = NULL; + while (*p) { + parent = *p; + + f = container_of(parent, struct fq_flow, fq_node); + if (f->sk == sk) { + /* socket might have been reallocated, so check + * if its sk_hash is the same. + * It not, we need to refill credit with + * initial quantum + */ + if (unlikely(skb->sk && + f->socket_hash != sk->sk_hash)) { + f->credit = q->initial_quantum; + f->socket_hash = sk->sk_hash; + } + return f; + } + if (f->sk > sk) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + + f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!f)) { + q->stat_allocation_errors++; + return &q->internal; + } + fq_flow_set_detached(f); + f->sk = sk; + if (skb->sk) + f->socket_hash = sk->sk_hash; + f->credit = q->initial_quantum; + + rb_link_node(&f->fq_node, parent, p); + rb_insert_color(&f->fq_node, root); + + q->flows++; + q->inactive_flows++; + return f; +} + + +/* remove one skb from head of flow queue */ +static struct sk_buff *fq_dequeue_head(struct fq_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + flow->qlen--; + } + return skb; +} + +/* We might add in the future detection of retransmits + * For the time being, just return false + */ +static bool skb_is_retransmit(struct sk_buff *skb) +{ + return false; +} + +/* add skb to flow queue + * flow queue is a linked list, kind of FIFO, except for TCP retransmits + * We special case tcp retransmits to be transmitted before other packets. + * We rely on fact that TCP retransmits are unlikely, so we do not waste + * a separate queue or a pointer. + * head-> [retrans pkt 1] + * [retrans pkt 2] + * [ normal pkt 1] + * [ normal pkt 2] + * [ normal pkt 3] + * tail-> [ normal pkt 4] + */ +static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) +{ + struct sk_buff *prev, *head = flow->head; + + skb->next = NULL; + if (!head) { + flow->head = skb; + flow->tail = skb; + return; + } + if (likely(!skb_is_retransmit(skb))) { + flow->tail->next = skb; + flow->tail = skb; + return; + } + + /* This skb is a tcp retransmit, + * find the last retrans packet in the queue + */ + prev = NULL; + while (skb_is_retransmit(head)) { + prev = head; + head = head->next; + if (!head) + break; + } + if (!prev) { /* no rtx packet in queue, become the new head */ + skb->next = flow->head; + flow->head = skb; + } else { + if (prev == flow->tail) + flow->tail = skb; + else + skb->next = prev->next; + prev->next = skb; + } +} + +static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct fq_flow *f; + + if (unlikely(sch->q.qlen >= sch->limit)) + return qdisc_drop(skb, sch); + + f = fq_classify(skb, q); + if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch); + } + + f->qlen++; + flow_queue_add(f, skb); + if (skb_is_retransmit(skb)) + q->stat_tcp_retrans++; + sch->qstats.backlog += qdisc_pkt_len(skb); + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (q->quantum > f->credit) + f->credit = q->quantum; + q->inactive_flows--; + qdisc_unthrottled(sch); + } + if (unlikely(f == &q->internal)) { + q->stat_internal_packets++; + qdisc_unthrottled(sch); + } + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static void fq_check_throttled(struct fq_sched_data *q, u64 now) +{ + struct rb_node *p; + + if (q->time_next_delayed_flow > now) + return; + + q->time_next_delayed_flow = ~0ULL; + while ((p = rb_first(&q->delayed)) != NULL) { + struct fq_flow *f = container_of(p, struct fq_flow, rate_node); + + if (f->time_next_packet > now) { + q->time_next_delayed_flow = f->time_next_packet; + break; + } + rb_erase(p, &q->delayed); + q->throttled_flows--; + fq_flow_add_tail(&q->old_flows, f); + } +} + +static struct sk_buff *fq_dequeue(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct fq_flow_head *head; + struct sk_buff *skb; + struct fq_flow *f; + + skb = fq_dequeue_head(&q->internal); + if (skb) + goto out; + fq_check_throttled(q, now); +begin: + head = &q->new_flows; + if (!head->first) { + head = &q->old_flows; + if (!head->first) { + if (q->time_next_delayed_flow != ~0ULL) + qdisc_watchdog_schedule_ns(&q->watchdog, + q->time_next_delayed_flow); + return NULL; + } + } + f = head->first; + + if (f->credit <= 0) { + f->credit += q->quantum; + head->first = f->next; + fq_flow_add_tail(&q->old_flows, f); + goto begin; + } + + if (unlikely(f->head && now < f->time_next_packet)) { + head->first = f->next; + fq_flow_set_throttled(q, f); + goto begin; + } + + skb = fq_dequeue_head(f); + if (!skb) { + head->first = f->next; + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && q->old_flows.first) { + fq_flow_add_tail(&q->old_flows, f); + } else { + fq_flow_set_detached(f); + f->age = jiffies; + q->inactive_flows++; + } + goto begin; + } + f->time_next_packet = now; + f->credit -= qdisc_pkt_len(skb); + + if (f->credit <= 0 && + q->rate_enable && + skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) { + u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate; + + rate = min(rate, q->flow_max_rate); + if (rate) { + u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC; + + do_div(len, rate); + /* Since socket rate can change later, + * clamp the delay to 125 ms. + * TODO: maybe segment the too big skb, as in commit + * e43ac79a4bc ("sch_tbf: segment too big GSO packets") + */ + if (unlikely(len > 125 * NSEC_PER_MSEC)) { + len = 125 * NSEC_PER_MSEC; + q->stat_pkts_too_long++; + } + + f->time_next_packet = now + len; + } + } +out: + prefetch(&skb->end); + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + qdisc_unthrottled(sch); + return skb; +} + +static void fq_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void fq_rehash(struct fq_sched_data *q, + struct rb_root *old_array, u32 old_log, + struct rb_root *new_array, u32 new_log) +{ + struct rb_node *op, **np, *parent; + struct rb_root *oroot, *nroot; + struct fq_flow *of, *nf; + int fcnt = 0; + u32 idx; + + for (idx = 0; idx < (1U << old_log); idx++) { + oroot = &old_array[idx]; + while ((op = rb_first(oroot)) != NULL) { + rb_erase(op, oroot); + of = container_of(op, struct fq_flow, fq_node); + if (fq_gc_candidate(of)) { + fcnt++; + kmem_cache_free(fq_flow_cachep, of); + continue; + } + nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; + + np = &nroot->rb_node; + parent = NULL; + while (*np) { + parent = *np; + + nf = container_of(parent, struct fq_flow, fq_node); + BUG_ON(nf->sk == of->sk); + + if (nf->sk > of->sk) + np = &parent->rb_right; + else + np = &parent->rb_left; + } + + rb_link_node(&of->fq_node, parent, np); + rb_insert_color(&of->fq_node, nroot); + } + } + q->flows -= fcnt; + q->inactive_flows -= fcnt; + q->stat_gc_flows += fcnt; +} + +static int fq_resize(struct fq_sched_data *q, u32 log) +{ + struct rb_root *array; + u32 idx; + + if (q->fq_root && log == q->fq_trees_log) + return 0; + + array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL); + if (!array) + return -ENOMEM; + + for (idx = 0; idx < (1U << log); idx++) + array[idx] = RB_ROOT; + + if (q->fq_root) { + fq_rehash(q, q->fq_root, q->fq_trees_log, array, log); + kfree(q->fq_root); + } + q->fq_root = array; + q->fq_trees_log = log; + + return 0; +} + +static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, + [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, + [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, + [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, + [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, +}; + +static int fq_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_MAX + 1]; + int err, drop_count = 0; + u32 fq_log; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + fq_log = q->fq_trees_log; + + if (tb[TCA_FQ_BUCKETS_LOG]) { + u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); + + if (nval >= 1 && nval <= ilog2(256*1024)) + fq_log = nval; + else + err = -EINVAL; + } + if (tb[TCA_FQ_PLIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); + + if (tb[TCA_FQ_FLOW_PLIMIT]) + q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); + + if (tb[TCA_FQ_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (tb[TCA_FQ_INITIAL_QUANTUM]) + q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); + + if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) + q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]); + + if (tb[TCA_FQ_FLOW_MAX_RATE]) + q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + + if (tb[TCA_FQ_RATE_ENABLE]) { + u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); + + if (enable <= 1) + q->rate_enable = enable; + else + err = -EINVAL; + } + + if (!err) + err = fq_resize(q, fq_log); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_dequeue(sch); + + kfree_skb(skb); + drop_count++; + } + qdisc_tree_decrease_qlen(sch, drop_count); + + sch_tree_unlock(sch); + return err; +} + +static void fq_destroy(struct Qdisc *sch) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct rb_root *root; + struct rb_node *p; + unsigned int idx; + + if (q->fq_root) { + for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { + root = &q->fq_root[idx]; + while ((p = rb_first(root)) != NULL) { + rb_erase(p, root); + kmem_cache_free(fq_flow_cachep, + container_of(p, struct fq_flow, fq_node)); + } + } + kfree(q->fq_root); + } + qdisc_watchdog_cancel(&q->watchdog); +} + +static int fq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_sched_data *q = qdisc_priv(sch); + int err; + + sch->limit = 10000; + q->flow_plimit = 100; + q->quantum = 2 * psched_mtu(qdisc_dev(sch)); + q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); + q->flow_default_rate = 0; + q->flow_max_rate = ~0U; + q->rate_enable = 1; + q->new_flows.first = NULL; + q->old_flows.first = NULL; + q->delayed = RB_ROOT; + q->fq_root = NULL; + q->fq_trees_log = ilog2(1024); + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) + err = fq_change(sch, opt); + else + err = fq_resize(q, q->fq_trees_log); + + return err; +} + +static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || + nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || + nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || + nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || + nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) || + nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_sched_data *q = qdisc_priv(sch); + u64 now = ktime_to_ns(ktime_get()); + struct tc_fq_qd_stats st = { + .gc_flows = q->stat_gc_flows, + .highprio_packets = q->stat_internal_packets, + .tcp_retrans = q->stat_tcp_retrans, + .throttled = q->stat_throttled, + .flows_plimit = q->stat_flows_plimit, + .pkts_too_long = q->stat_pkts_too_long, + .allocation_errors = q->stat_allocation_errors, + .flows = q->flows, + .inactive_flows = q->inactive_flows, + .throttled_flows = q->throttled_flows, + .time_next_delayed_flow = q->time_next_delayed_flow - now, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc_ops fq_qdisc_ops __read_mostly = { + .id = "fq", + .priv_size = sizeof(struct fq_sched_data), + + .enqueue = fq_enqueue, + .dequeue = fq_dequeue, + .peek = qdisc_peek_dequeued, + .init = fq_init, + .reset = fq_reset, + .destroy = fq_destroy, + .change = fq_change, + .dump = fq_dump, + .dump_stats = fq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_module_init(void) +{ + int ret; + + fq_flow_cachep = kmem_cache_create("fq_flow_cache", + sizeof(struct fq_flow), + 0, 0, NULL); + if (!fq_flow_cachep) + return -ENOMEM; + + ret = register_qdisc(&fq_qdisc_ops); + if (ret) + kmem_cache_destroy(fq_flow_cachep); + return ret; +} + +static void __exit fq_module_exit(void) +{ + unregister_qdisc(&fq_qdisc_ops); + kmem_cache_destroy(fq_flow_cachep); +} + +module_init(fq_module_init) +module_exit(fq_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-71-gd317 From e4c7ed415387cf718ffbec305396c30cee092987 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 31 Aug 2013 13:44:33 +0800 Subject: vxlan: add ipv6 support This patch adds IPv6 support to vxlan device, as the new version RFC already mentions it: http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03 Cc: David Stevens Cc: Stephen Hemminger Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 764 ++++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 2 +- include/uapi/linux/if_link.h | 2 + net/openvswitch/vport-vxlan.c | 2 +- 4 files changed, 622 insertions(+), 148 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3b21aca0c0c2..faf131e02a72 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -6,9 +6,6 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. - * - * TODO - * - IPv6 (not in RFC) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -43,6 +40,11 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#include +#endif #define VXLAN_VERSION "0.1" @@ -59,6 +61,8 @@ #define VXLAN_VID_MASK (VXLAN_N_VID - 1) /* IP header + UDP + VXLAN + Ethernet header */ #define VXLAN_HEADROOM (20 + 8 + 8 + 14) +/* IPv6 header + UDP + VXLAN + Ethernet header */ +#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ @@ -92,8 +96,14 @@ struct vxlan_net { spinlock_t sock_lock; }; +union vxlan_addr { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr sa; +}; + struct vxlan_rdst { - __be32 remote_ip; + union vxlan_addr remote_ip; __be16 remote_port; u32 remote_vni; u32 remote_ifindex; @@ -120,7 +130,7 @@ struct vxlan_dev { struct vxlan_sock *vn_sock; /* listening socket */ struct net_device *dev; struct vxlan_rdst default_dst; /* default destination */ - __be32 saddr; /* source address */ + union vxlan_addr saddr; /* source address */ __be16 dst_port; __u16 port_min; /* source port range */ __u16 port_max; @@ -146,6 +156,7 @@ struct vxlan_dev { #define VXLAN_F_RSC 0x04 #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 +#define VXLAN_F_IPV6 0x20 /* internal flag */ /* salt for hash table */ static u32 vxlan_salt __read_mostly; @@ -153,6 +164,96 @@ static struct workqueue_struct *vxlan_wq; static void vxlan_sock_work(struct work_struct *work); +#if IS_ENABLED(CONFIG_IPV6) +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + if (a->sa.sa_family != b->sa.sa_family) + return false; + if (a->sa.sa_family == AF_INET6) + return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); + else + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_any(&ipa->sin6.sin6_addr); + else + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + if (ipa->sa.sa_family == AF_INET6) + return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); + else + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(struct in6_addr)) { + nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr)); + ip->sa.sa_family = AF_INET6; + return 0; + } else if (nla_len(nla) >= sizeof(__be32)) { + ip->sin.sin_addr.s_addr = nla_get_be32(nla); + ip->sa.sa_family = AF_INET; + return 0; + } else { + return -EAFNOSUPPORT; + } +} + +static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, + const union vxlan_addr *ip) +{ + if (ip->sa.sa_family == AF_INET6) + return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr); + else + return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); +} + +#else /* !CONFIG_IPV6 */ + +static inline +bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) +{ + return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; +} + +static inline bool vxlan_addr_any(const union vxlan_addr *ipa) +{ + return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); +} + +static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) +{ + return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); +} + +static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) +{ + if (nla_len(nla) >= sizeof(struct in6_addr)) { + return -EAFNOSUPPORT; + } else if (nla_len(nla) >= sizeof(__be32)) { + ip->sin.sin_addr.s_addr = nla_get_be32(nla); + ip->sa.sa_family = AF_INET; + return 0; + } else { + return -EAFNOSUPPORT; + } +} + +static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, + const union vxlan_addr *ip) +{ + return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); +} +#endif + /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) { @@ -239,7 +340,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (type == RTM_GETNEIGH) { ndm->ndm_family = AF_INET; - send_ip = rdst->remote_ip != htonl(INADDR_ANY); + send_ip = !vxlan_addr_any(&rdst->remote_ip); send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; @@ -251,7 +352,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; - if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip)) + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && @@ -283,7 +384,7 @@ static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(__be32)) /* NDA_DST */ + + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ @@ -317,14 +418,14 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } -static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) +static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { - .remote_ip = ipa, /* goes to NDA_DST */ + .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = VXLAN_N_VID, }; @@ -397,13 +498,13 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, - __be32 ip, __be16 port, + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { - if (rd->remote_ip == ip && + if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) @@ -415,7 +516,7 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, - __be32 ip, __be16 port, __u32 vni, __u32 ifindex) + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; @@ -426,7 +527,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f, rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; - rd->remote_ip = ip; + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; @@ -435,7 +536,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f, /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, - __be32 ip, __be16 port, __u32 vni, __u32 ifindex) + union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) { struct vxlan_rdst *rd; @@ -446,7 +547,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOBUFS; - rd->remote_ip = ip; + rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; @@ -458,7 +559,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, /* Add new entry to forwarding table -- assumes lock held */ static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, __be32 ip, + const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __u32 vni, __u32 ifindex, __u8 ndm_flags) @@ -517,7 +618,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; - netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip); + netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -ENOMEM; @@ -565,17 +666,26 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, - __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex) + union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex) { struct net *net = dev_net(vxlan->dev); + int err; if (tb[NDA_DST]) { - if (nla_len(tb[NDA_DST]) != sizeof(__be32)) - return -EAFNOSUPPORT; - - *ip = nla_get_be32(tb[NDA_DST]); + err = vxlan_nla_get_addr(ip, tb[NDA_DST]); + if (err) + return err; } else { - *ip = htonl(INADDR_ANY); + union vxlan_addr *remote = &vxlan->default_dst.remote_ip; + if (remote->sa.sa_family == AF_INET) { + ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); + ip->sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + ip->sin6.sin6_addr = in6addr_any; + ip->sa.sa_family = AF_INET6; +#endif + } } if (tb[NDA_PORT]) { @@ -618,7 +728,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ - __be32 ip; + union vxlan_addr ip; __be16 port; u32 vni, ifindex; int err; @@ -637,7 +747,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return err; spin_lock_bh(&vxlan->hash_lock); - err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, + err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, port, vni, ifindex, ndm->ndm_flags); spin_unlock_bh(&vxlan->hash_lock); @@ -652,7 +762,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; - __be32 ip; + union vxlan_addr ip; __be16 port; u32 vni, ifindex; int err; @@ -668,8 +778,8 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (!f) goto out; - if (ip != htonl(INADDR_ANY)) { - rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); + if (!vxlan_addr_any(&ip)) { + rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } @@ -732,7 +842,7 @@ out: * Return true if packet is bogus and should be droppped. */ static bool vxlan_snoop(struct net_device *dev, - __be32 src_ip, const u8 *src_mac) + union vxlan_addr *src_ip, const u8 *src_mac) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; @@ -741,7 +851,7 @@ static bool vxlan_snoop(struct net_device *dev, if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); - if (likely(rdst->remote_ip == src_ip)) + if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) return false; /* Don't migrate static entries, drop packets */ @@ -750,10 +860,10 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, - "%pM migrated from %pI4 to %pI4\n", + "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip, &src_ip); - rdst->remote_ip = src_ip; + rdst->remote_ip = *src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH); } else { @@ -775,7 +885,7 @@ static bool vxlan_snoop(struct net_device *dev, } /* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) +static bool vxlan_group_used(struct vxlan_net *vn, union vxlan_addr *remote_ip) { struct vxlan_dev *vxlan; @@ -783,7 +893,8 @@ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip) if (!netif_running(vxlan->dev)) continue; - if (vxlan->default_dst.remote_ip == remote_ip) + if (vxlan_addr_equal(&vxlan->default_dst.remote_ip, + remote_ip)) return true; } @@ -819,13 +930,23 @@ static void vxlan_igmp_join(struct work_struct *work) struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; + union vxlan_addr *ip = &vxlan->default_dst.remote_ip; + int ifindex = vxlan->default_dst.remote_ifindex; lock_sock(sk); - ip_mc_join_group(sk, &mreq); + if (ip->sa.sa_family == AF_INET) { + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + ip_mc_join_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + } else { + ipv6_stub->ipv6_sock_mc_join(sk, ifindex, + &ip->sin6.sin6_addr); +#endif + } release_sock(sk); vxlan_sock_release(vs); @@ -838,13 +959,24 @@ static void vxlan_igmp_leave(struct work_struct *work) struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; - struct ip_mreqn mreq = { - .imr_multiaddr.s_addr = vxlan->default_dst.remote_ip, - .imr_ifindex = vxlan->default_dst.remote_ifindex, - }; + union vxlan_addr *ip = &vxlan->default_dst.remote_ip; + int ifindex = vxlan->default_dst.remote_ifindex; lock_sock(sk); - ip_mc_leave_group(sk, &mreq); + if (ip->sa.sa_family == AF_INET) { + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, + .imr_ifindex = ifindex, + }; + + ip_mc_leave_group(sk, &mreq); +#if IS_ENABLED(CONFIG_IPV6) + } else { + ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, + &ip->sin6.sin6_addr); +#endif + } + release_sock(sk); vxlan_sock_release(vs); @@ -896,11 +1028,14 @@ error: static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct iphdr *oip; + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; struct vxlan_dev *vxlan; struct pcpu_tstats *stats; + union vxlan_addr saddr; __u32 vni; - int err; + int err = 0; + union vxlan_addr *remote_ip; vni = ntohl(vx_vni) >> 8; /* Is this VNI defined? */ @@ -908,6 +1043,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, if (!vxlan) goto drop; + remote_ip = &vxlan->default_dst.remote_ip; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); @@ -917,9 +1053,20 @@ static void vxlan_rcv(struct vxlan_sock *vs, goto drop; /* Re-examine inner Ethernet packet */ - oip = ip_hdr(skb); + if (remote_ip->sa.sa_family == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source)) + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); @@ -935,11 +1082,20 @@ static void vxlan_rcv(struct vxlan_sock *vs, skb->encapsulation = 0; - err = IP_ECN_decapsulate(oip, skb); + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); + if (log_ecn_error) { + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &oip->saddr, oip->tos); + } if (err > 1) { ++vxlan->dev->stats.rx_frame_errors; ++vxlan->dev->stats.rx_errors; @@ -1009,7 +1165,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) } f = vxlan_find_mac(vxlan, n->ha); - if (f && first_remote_rcu(f)->remote_ip == htonl(INADDR_ANY)) { + if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; @@ -1027,8 +1183,14 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb) if (netif_rx_ni(reply) == NET_RX_DROP) dev->stats.rx_dropped++; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, tip); + } else if (vxlan->flags & VXLAN_F_L3MISS) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = tip, + .sa.sa_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); + } out: consume_skb(skb); return NETDEV_TX_OK; @@ -1050,6 +1212,16 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); + if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { + union vxlan_addr ipa = { + .sin.sin_addr.s_addr = pip->daddr, + .sa.sa_family = AF_INET, + }; + + vxlan_ip_miss(dev, &ipa); + return false; + } + break; default: return false; @@ -1066,8 +1238,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) } neigh_release(n); return diff; - } else if (vxlan->flags & VXLAN_F_L3MISS) - vxlan_ip_miss(dev, pip->daddr); + } + return false; } @@ -1118,6 +1290,102 @@ static int handle_offloads(struct sk_buff *skb) return 0; } +#if IS_ENABLED(CONFIG_IPV6) +static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct dst_entry *dst, struct sk_buff *skb, + struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr, __u8 prio, __u8 ttl, + __be16 src_port, __be16 dst_port, __be32 vni) +{ + struct ipv6hdr *ip6h; + struct vxlanhdr *vxh; + struct udphdr *uh; + int min_headroom; + int err; + + if (!skb->encapsulation) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + + VXLAN_HLEN + sizeof(struct ipv6hdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + /* Need space for new headers (invalidates iph ptr) */ + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (WARN_ON(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) + return -ENOMEM; + + skb->vlan_tci = 0; + } + + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); + vxh->vx_flags = htonl(VXLAN_FLAGS); + vxh->vx_vni = vni; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | + IPSKB_REROUTED); + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) { + __wsum csum = skb_checksum(skb, 0, skb->len, 0); + skb->ip_summed = CHECKSUM_UNNECESSARY; + uh->check = csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_ipv6_magic(saddr, daddr, + skb->len, IPPROTO_UDP, 0); + } + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->priority = prio; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + + vxlan_set_owner(vs->sock->sk, skb); + + err = handle_offloads(skb); + if (err) + return err; + + ip6tunnel_xmit(skb, dev); + return 0; +} +#endif + int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, @@ -1182,15 +1450,26 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, { struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); + union vxlan_addr loopback; + union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); + if (remote_ip->sa.sa_family == AF_INET) { + loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + loopback.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + loopback.sin6.sin6_addr = in6addr_loopback; + loopback.sa.sa_family = AF_INET6; +#endif + } + if (dst_vxlan->flags & VXLAN_F_LEARN) - vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK), - eth_hdr(skb)->h_source); + vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); u64_stats_update_begin(&tx_stats->syncp); tx_stats->tx_packets++; @@ -1211,11 +1490,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct rtable *rt; + struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; - __be32 dst; - __be16 src_port, dst_port; + union vxlan_addr *dst; + __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; @@ -1223,9 +1502,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; - dst = rdst->remote_ip; + dst = &rdst->remote_ip; - if (!dst) { + if (vxlan_addr_any(dst)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan); @@ -1237,7 +1516,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, old_iph = ip_hdr(skb); ttl = vxlan->ttl; - if (!ttl && IN_MULTICAST(ntohl(dst))) + if (!ttl && vxlan_addr_multicast(dst)) ttl = 1; tos = vxlan->tos; @@ -1246,48 +1525,101 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb); - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; - fl4.flowi4_tos = RT_TOS(tos); - fl4.daddr = dst; - fl4.saddr = vxlan->saddr; - - rt = ip_route_output_key(dev_net(dev), &fl4); - if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", &dst); - dev->stats.tx_carrier_errors++; - goto tx_error; - } + if (dst->sa.sa_family == AF_INET) { + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_tos = RT_TOS(tos); + fl4.daddr = dst->sin.sin_addr.s_addr; + fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; + + rt = ip_route_output_key(dev_net(dev), &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.tx_carrier_errors++; + goto tx_error; + } - if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to %pI4\n", &dst); - dev->stats.collisions++; - goto rt_tx_error; - } + if (rt->dst.dev == dev) { + netdev_dbg(dev, "circular route to %pI4\n", + &dst->sin.sin_addr.s_addr); + dev->stats.collisions++; + goto tx_error; + } + + /* Bypass encapsulation if the destination is local */ + if (rt->rt_flags & RTCF_LOCAL && + !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + ip_rt_put(rt); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; + } - /* Bypass encapsulation if the destination is local */ - if (rt->rt_flags & RTCF_LOCAL && - !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { - struct vxlan_dev *dst_vxlan; + tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - ip_rt_put(rt); - dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); - if (!dst_vxlan) + err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, + fl4.saddr, dst->sin.sin_addr.s_addr, + tos, ttl, df, src_port, dst_port, + htonl(vni << 8)); + + if (err < 0) + goto rt_tx_error; + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct sock *sk = vxlan->vn_sock->sock->sk; + struct dst_entry *ndst; + struct flowi6 fl6; + u32 flags; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = rdst->remote_ifindex; + fl6.daddr = dst->sin6.sin6_addr; + fl6.saddr = vxlan->saddr.sin6.sin6_addr; + fl6.flowi6_proto = skb->protocol; + + if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { + netdev_dbg(dev, "no route to %pI6\n", + &dst->sin6.sin6_addr); + dev->stats.tx_carrier_errors++; goto tx_error; - vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return; - } + } - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + if (ndst->dev == dev) { + netdev_dbg(dev, "circular route to %pI6\n", + &dst->sin6.sin6_addr); + dst_release(ndst); + dev->stats.collisions++; + goto tx_error; + } - err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, - fl4.saddr, dst, tos, ttl, df, - src_port, dst_port, htonl(vni << 8)); + /* Bypass encapsulation if the destination is local */ + flags = ((struct rt6_info *)ndst)->rt6i_flags; + if (flags & RTF_LOCAL && + !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + struct vxlan_dev *dst_vxlan; + + dst_release(ndst); + dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port); + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); + return; + } - if (err < 0) - goto rt_tx_error; - iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + ttl = ttl ? : ip6_dst_hoplimit(ndst); + + err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb, + dev, &fl6.saddr, &fl6.daddr, 0, ttl, + src_port, dst_port, htonl(vni << 8)); +#endif + } return; @@ -1464,8 +1796,8 @@ static int vxlan_open(struct net_device *dev) if (!vs) return -ENOTCONN; - if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && - vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { + if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_join); @@ -1503,8 +1835,8 @@ static int vxlan_stop(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_sock *vs = vxlan->vn_sock; - if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) && - ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) { + if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && + ! vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) { vxlan_sock_hold(vs); dev_hold(dev); queue_work(vxlan_wq, &vxlan->igmp_leave); @@ -1552,7 +1884,10 @@ static void vxlan_setup(struct net_device *dev) eth_hw_addr_random(dev); ether_setup(dev); - dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; + if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) + dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM; + else + dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM; dev->netdev_ops = &vxlan_netdev_ops; dev->destructor = free_netdev; @@ -1597,8 +1932,10 @@ static void vxlan_setup(struct net_device *dev) static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, + [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, @@ -1669,58 +2006,132 @@ static void vxlan_del_work(struct work_struct *work) kfree_rcu(vs, rcu); } -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data) +#if IS_ENABLED(CONFIG_IPV6) +/* Create UDP socket for encapsulation receive. AF_INET6 socket + * could be used for both IPv4 and IPv6 communications, but + * users may set bindv6only=1. + */ +static int create_v6_sock(struct net *net, __be16 port, struct socket **psock) +{ + struct sock *sk; + struct socket *sock; + struct sockaddr_in6 vxlan_addr = { + .sin6_family = AF_INET6, + .sin6_port = port, + }; + int rc, val = 1; + + rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (rc < 0) { + pr_debug("UDPv6 socket create failed\n"); + return rc; + } + + /* Put in proper namespace */ + sk = sock->sk; + sk_change_net(sk, net); + + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + rc = kernel_bind(sock, (struct sockaddr *)&vxlan_addr, + sizeof(struct sockaddr_in6)); + if (rc < 0) { + pr_debug("bind for UDPv6 socket %pI6:%u (%d)\n", + &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc); + sk_release_kernel(sk); + return rc; + } + /* At this point, IPv6 module should have been loaded in + * sock_create_kern(). + */ + BUG_ON(!ipv6_stub); + + *psock = sock; + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + return 0; +} + +#else + +static int create_v6_sock(struct net *net, __be16 port, struct socket **psock) +{ + return -EPFNOSUPPORT; +} +#endif + +static int create_v4_sock(struct net *net, __be16 port, struct socket **psock) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; struct sock *sk; + struct socket *sock; struct sockaddr_in vxlan_addr = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = port, }; int rc; - unsigned int h; - - vs = kmalloc(sizeof(*vs), GFP_KERNEL); - if (!vs) { - pr_debug("memory alocation failure\n"); - return ERR_PTR(-ENOMEM); - } - - for (h = 0; h < VNI_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vs->vni_list[h]); - - INIT_WORK(&vs->del_work, vxlan_del_work); /* Create UDP socket for encapsulation receive. */ - rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock); + rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (rc < 0) { pr_debug("UDP socket create failed\n"); - kfree(vs); - return ERR_PTR(rc); + return rc; } /* Put in proper namespace */ - sk = vs->sock->sk; + sk = sock->sk; sk_change_net(sk, net); - rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr, + rc = kernel_bind(sock, (struct sockaddr *) &vxlan_addr, sizeof(vxlan_addr)); if (rc < 0) { pr_debug("bind for UDP socket %pI4:%u (%d)\n", &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc); sk_release_kernel(sk); + return rc; + } + + *psock = sock; + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + return 0; +} + +/* Create new listen socket if needed */ +static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, bool ipv6) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *vs; + struct socket *sock; + struct sock *sk; + int rc = 0; + unsigned int h; + + vs = kmalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) + return ERR_PTR(-ENOMEM); + + for (h = 0; h < VNI_HASH_SIZE; ++h) + INIT_HLIST_HEAD(&vs->vni_list[h]); + + INIT_WORK(&vs->del_work, vxlan_del_work); + + if (ipv6) + rc = create_v6_sock(net, port, &sock); + else + rc = create_v4_sock(net, port, &sock); + if (rc < 0) { kfree(vs); return ERR_PTR(rc); } + + vs->sock = sock; + sk = sock->sk; atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; - /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); spin_unlock(&vn->sock_lock); @@ -1728,18 +2139,24 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, /* Mark socket as an encapsulation socket. */ udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; - udp_encap_enable(); +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6) + ipv6_stub->udpv6_encap_enable(); + else +#endif + udp_encap_enable(); + return vs; } struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, vxlan_rcv_t *rcv, void *data, - bool no_share) + bool no_share, bool ipv6) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; - vs = vxlan_socket_create(net, port, rcv, data); + vs = vxlan_socket_create(net, port, rcv, data, ipv6); if (!IS_ERR(vs)) return vs; @@ -1772,7 +2189,7 @@ static void vxlan_sock_work(struct work_struct *work) __be16 port = vxlan->dst_port; struct vxlan_sock *nvs; - nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false); + nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags & VXLAN_F_IPV6); spin_lock(&vn->sock_lock); if (!IS_ERR(nvs)) vxlan_vs_add_dev(nvs, vxlan); @@ -1789,6 +2206,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, struct vxlan_rdst *dst = &vxlan->default_dst; __u32 vni; int err; + bool use_ipv6 = false; if (!data[IFLA_VXLAN_ID]) return -EINVAL; @@ -1796,11 +2214,32 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, vni = nla_get_u32(data[IFLA_VXLAN_ID]); dst->remote_vni = vni; - if (data[IFLA_VXLAN_GROUP]) - dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]); + if (data[IFLA_VXLAN_GROUP]) { + dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]); + dst->remote_ip.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_GROUP6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6], + sizeof(struct in6_addr)); + dst->remote_ip.sa.sa_family = AF_INET6; + use_ipv6 = true; + } - if (data[IFLA_VXLAN_LOCAL]) - vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + if (data[IFLA_VXLAN_LOCAL]) { + vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); + vxlan->saddr.sa.sa_family = AF_INET; + } else if (data[IFLA_VXLAN_LOCAL6]) { + if (!IS_ENABLED(CONFIG_IPV6)) + return -EPFNOSUPPORT; + + /* TODO: respect scope id */ + nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6], + sizeof(struct in6_addr)); + vxlan->saddr.sa.sa_family = AF_INET6; + use_ipv6 = true; + } if (data[IFLA_VXLAN_LINK] && (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { @@ -1812,12 +2251,23 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, return -ENODEV; } +#if IS_ENABLED(CONFIG_IPV6) + if (use_ipv6) { + struct inet6_dev *idev = __in6_dev_get(lowerdev); + if (idev && idev->cnf.disable_ipv6) { + pr_info("IPv6 is disabled via sysctl\n"); + return -EPERM; + } + vxlan->flags |= VXLAN_F_IPV6; + } +#endif + if (!tb[IFLA_MTU]) - dev->mtu = lowerdev->mtu - VXLAN_HEADROOM; + dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); /* update header length based on lower device */ dev->hard_header_len = lowerdev->hard_header_len + - VXLAN_HEADROOM; + (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); } if (data[IFLA_VXLAN_TOS]) @@ -1868,7 +2318,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, /* create an fdb entry for default destination */ err = vxlan_fdb_create(vxlan, all_zeros_mac, - vxlan->default_dst.remote_ip, + &vxlan->default_dst.remote_ip, NUD_REACHABLE|NUD_PERMANENT, NLM_F_EXCL|NLM_F_CREATE, vxlan->dst_port, vxlan->default_dst.remote_vni, @@ -1905,9 +2355,9 @@ static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ - nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ - nla_total_size(sizeof(__be32))+ /* IFLA_VXLAN_LOCAL */ + nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ @@ -1934,14 +2384,36 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) goto nla_put_failure; - if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip)) - goto nla_put_failure; + if (!vxlan_addr_any(&dst->remote_ip)) { + if (dst->remote_ip.sa.sa_family == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_GROUP, + dst->remote_ip.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr), + &dst->remote_ip.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; - if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr)) - goto nla_put_failure; + if (!vxlan_addr_any(&vxlan->saddr)) { + if (vxlan->saddr.sa.sa_family == AF_INET) { + if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, + vxlan->saddr.sin.sin_addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_IPV6) + } else { + if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), + &vxlan->saddr.sin6.sin6_addr)) + goto nla_put_failure; +#endif + } + } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad342e3688a0..d2b88cafa7a2 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -25,7 +25,7 @@ struct vxlan_sock { struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, vxlan_rcv_t *rcv, void *data, - bool no_share); + bool no_share, bool ipv6); void vxlan_sock_release(struct vxlan_sock *vs); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 04c0e7a5d484..80394e8dc3a3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -314,6 +314,8 @@ enum { IFLA_VXLAN_L2MISS, IFLA_VXLAN_L3MISS, IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 36848bd54a77..a0060245b4e1 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -123,7 +123,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) vxlan_port = vxlan_vport(vport); strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true); + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false); if (IS_ERR(vs)) { ovs_vport_free(vport); return (void *)vs; -- cgit v1.2.3-71-gd317 From 416161db9b63e353a8fb79d1369779175102fca1 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 6 Aug 2013 11:42:51 -0700 Subject: btrfs: offline dedupe This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME which will try to de-duplicate a list of extents across a range of files. Internally, the ioctl re-uses code from the clone ioctl. This avoids rewriting a large chunk of extent handling code. Userspace passes in an array of file, offset pairs along with a length argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison of the user data before deduping the extent. Status and number of bytes deduped are returned for each operation. Signed-off-by: Mark Fasheh Reviewed-by: Zach Brown Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 279 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 28 +++++ 2 files changed, 307 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b5148a1b0d3..022d8364e072 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -57,6 +58,9 @@ #include "send.h" #include "dev-replace.h" +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff); + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) { @@ -2470,6 +2474,34 @@ out: return ret; } +static struct page *extent_same_get_page(struct inode *inode, u64 off) +{ + struct page *page; + pgoff_t index; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + index = off >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, + 0)) + return NULL; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + } + unlock_page(page); + + return page; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } +static void btrfs_double_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); +} + +static void btrfs_double_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + lock_extent_range(inode1, loff1, len); + if (inode1 != inode2) { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + lock_extent_range(inode2, loff2, len); + } +} + +static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, + u64 dst_loff, u64 len) +{ + int ret = 0; + struct page *src_page, *dst_page; + unsigned int cmp_len = PAGE_CACHE_SIZE; + void *addr, *dst_addr; + + while (len) { + if (len < PAGE_CACHE_SIZE) + cmp_len = len; + + src_page = extent_same_get_page(src, loff); + if (!src_page) + return -EINVAL; + dst_page = extent_same_get_page(dst, dst_loff); + if (!dst_page) { + page_cache_release(src_page); + return -EINVAL; + } + addr = kmap_atomic(src_page); + dst_addr = kmap_atomic(dst_page); + + flush_dcache_page(src_page); + flush_dcache_page(dst_page); + + if (memcmp(addr, dst_addr, cmp_len)) + ret = BTRFS_SAME_DATA_DIFFERS; + + kunmap_atomic(addr); + kunmap_atomic(dst_addr); + page_cache_release(src_page); + page_cache_release(dst_page); + + if (ret) + break; + + loff += cmp_len; + dst_loff += cmp_len; + len -= cmp_len; + } + + return ret; +} + +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +{ + u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; + + if (off + len > inode->i_size || off + len < off) + return -EINVAL; + /* Check that we are block aligned - btrfs_clone() requires this */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) + return -EINVAL; + + return 0; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) +{ + int ret; + + /* + * btrfs_clone() can't handle extents in the same file + * yet. Once that works, we can drop this check and replace it + * with a check for the same inode, but overlapping extents. + */ + if (src == dst) + return -EINVAL; + + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, len); + if (ret) + goto out_unlock; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + if (ret == 0) + ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + +out_unlock: + btrfs_double_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) + +static long btrfs_ioctl_file_extent_same(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_same_args *args = argp; + struct btrfs_ioctl_same_args same; + struct btrfs_ioctl_same_extent_info info; + struct inode *src = file->f_dentry->d_inode; + struct file *dst_file = NULL; + struct inode *dst; + u64 off; + u64 len; + int i; + int ret; + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + bool is_admin = capable(CAP_SYS_ADMIN); + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (copy_from_user(&same, + (struct btrfs_ioctl_same_args __user *)argp, + sizeof(same))) { + ret = -EFAULT; + goto out; + } + + off = same.logical_offset; + len = same.length; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > BTRFS_MAX_DEDUPE_LEN) + len = BTRFS_MAX_DEDUPE_LEN; + + if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + ret = -EINVAL; + goto out; + } + + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; + + ret = -EACCES; + if (!S_ISREG(src->i_mode)) + goto out; + + ret = 0; + for (i = 0; i < same.dest_count; i++) { + if (copy_from_user(&info, &args->info[i], sizeof(info))) { + ret = -EFAULT; + goto out; + } + + info.bytes_deduped = 0; + + dst_file = fget(info.fd); + if (!dst_file) { + info.status = -EBADF; + goto next; + } + + if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info.status = -EINVAL; + goto next; + } + + info.status = -EXDEV; + if (file->f_path.mnt != dst_file->f_path.mnt) + goto next; + + dst = dst_file->f_dentry->d_inode; + if (src->i_sb != dst->i_sb) + goto next; + + if (S_ISDIR(dst->i_mode)) { + info.status = -EISDIR; + goto next; + } + + if (!S_ISREG(dst->i_mode)) { + info.status = -EACCES; + goto next; + } + + info.status = btrfs_extent_same(src, off, len, dst, + info.logical_offset); + if (info.status == 0) + info.bytes_deduped += len; + +next: + if (dst_file) + fput(dst_file); + + if (__put_user_unaligned(info.status, &args->info[i].status) || + __put_user_unaligned(info.bytes_deduped, + &args->info[i].bytes_deduped)) { + ret = -EFAULT; + goto out; + } + } + +out: + mnt_drop_write_file(file); + return ret; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); + case BTRFS_IOC_FILE_EXTENT_SAME: + return btrfs_ioctl_file_extent_same(file, argp); } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 05aed70627e2..90d7bd9d839c 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -305,6 +305,31 @@ struct btrfs_ioctl_clone_range_args { #define BTRFS_DEFRAG_RANGE_COMPRESS 1 #define BTRFS_DEFRAG_RANGE_START_IO 2 +#define BTRFS_SAME_DATA_DIFFERS 1 +/* For extent-same ioctl */ +struct btrfs_ioctl_same_extent_info { + __s64 fd; /* in - destination file */ + __u64 logical_offset; /* in - start of extent in destination */ + __u64 bytes_deduped; /* out - total # of bytes we were able + * to dedupe from this file */ + /* status of this dedupe operation: + * 0 if dedup succeeds + * < 0 for error + * == BTRFS_SAME_DATA_DIFFERS if data differs + */ + __s32 status; /* out - see above description */ + __u32 reserved; +}; + +struct btrfs_ioctl_same_args { + __u64 logical_offset; /* in - start of extent in source */ + __u64 length; /* in - length of extent */ + __u16 dest_count; /* in - total elements in info array */ + __u16 reserved1; + __u32 reserved2; + struct btrfs_ioctl_same_extent_info info[0]; +}; + struct btrfs_ioctl_space_info { __u64 flags; __u64 total_bytes; @@ -579,4 +604,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ struct btrfs_ioctl_dev_replace_args) +#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \ + struct btrfs_ioctl_same_args) + #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3-71-gd317 From 68b823ef41f17b05ce8d04e10f0635e068451f6c Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 16 Aug 2013 14:02:33 -0400 Subject: Btrfs: use __u64 in exported user headers Signed-off-by: Mike Frysinger Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- include/uapi/linux/btrfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 90d7bd9d839c..45e618921c61 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -549,7 +549,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_search_args) #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) -- cgit v1.2.3-71-gd317 From 13d7a2410fa637f450a29ecb515ac318ee40c741 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 21 Aug 2013 12:10:24 +0200 Subject: perf: Add attr->mmap2 attribute to an event Adds a new PERF_RECORD_MMAP2 record type which is essence an expanded version of PERF_RECORD_MMAP. Used to request mmap records with more information about the mapping, including device major, minor and the inode number and generation for mappings associated with files or shared memory segments. Works for code and data (with attr->mmap_data set). Existing PERF_RECORD_MMAP record is unmodified by this patch. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Al Viro Link: http://lkml.kernel.org/r/1377079825-19057-2-git-send-email-eranian@google.com [ Added Al to the Cc:. Are the ino, maj/min exports of vma->vm_file OK? ] Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 24 ++++++++++++++++++++- kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 42cb7b62ca59..a77f43af72b8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -276,8 +276,9 @@ struct perf_event_attr { exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ - __reserved_1 : 41; + __reserved_1 : 40; union { __u32 wakeup_events; /* wakeup every n events */ @@ -651,6 +652,27 @@ enum perf_event_type { */ PERF_RECORD_SAMPLE = 9, + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 15d0f2418e54..c7ee497c39a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4776,7 +4776,7 @@ next: /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { @@ -4796,8 +4796,9 @@ struct perf_task_event { static int perf_event_task_match(struct perf_event *event) { - return event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task; + return event->attr.comm || event->attr.mmap || + event->attr.mmap2 || event->attr.mmap_data || + event->attr.task; } static void perf_event_task_output(struct perf_event *event, @@ -4992,6 +4993,9 @@ struct perf_mmap_event { const char *file_name; int file_size; + int maj, min; + u64 ino; + u64 ino_generation; struct { struct perf_event_header header; @@ -5012,7 +5016,7 @@ static int perf_event_mmap_match(struct perf_event *event, int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || - (executable && event->attr.mmap); + (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, @@ -5027,6 +5031,13 @@ static void perf_event_mmap_output(struct perf_event *event, if (!perf_event_mmap_match(event, data)) return; + if (event->attr.mmap2) { + mmap_event->event_id.header.type = PERF_RECORD_MMAP2; + mmap_event->event_id.header.size += sizeof(mmap_event->maj); + mmap_event->event_id.header.size += sizeof(mmap_event->min); + mmap_event->event_id.header.size += sizeof(mmap_event->ino); + } + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, mmap_event->event_id.header.size); @@ -5037,6 +5048,14 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); + + if (event->attr.mmap2) { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -5051,6 +5070,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; + int maj = 0, min = 0; + u64 ino = 0, gen = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -5059,6 +5080,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) memset(tmp, 0, sizeof(tmp)); if (file) { + struct inode *inode; + dev_t dev; /* * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle @@ -5074,6 +5097,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; } + inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + gen = inode->i_generation; + maj = MAJOR(dev); + min = MINOR(dev); + } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), @@ -5104,6 +5134,10 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + mmap_event->maj = maj; + mmap_event->min = min; + mmap_event->ino = ino; + mmap_event->ino_generation = gen; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5140,6 +5174,10 @@ void perf_event_mmap(struct vm_area_struct *vma) .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, + /* .maj (attr_mmap2 only) */ + /* .min (attr_mmap2 only) */ + /* .ino (attr_mmap2 only) */ + /* .ino_generation (attr_mmap2 only) */ }; perf_event_mmap_event(&mmap_event); -- cgit v1.2.3-71-gd317 From 274481de6cb69abdb49403ff32abb63c23743413 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 23 Aug 2013 15:51:03 -0400 Subject: perf: Export struct perf_branch_entry to userspace If PERF_SAMPLE_BRANCH_STACK is enabled then samples are returned with the format { u64 from, to, flags } but the flags layout is not specified. This field has the type struct perf_branch_entry; move this definition into include/uapi/linux/perf_event.h so users can access these fields. This is similar to the existing inclusion of perf_mem_data_src in the include/uapi/linux/perf_event.h file. Signed-off-by: Vince Weaver Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1308231544420.1889@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 ------------------------ include/uapi/linux/perf_event.h | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c43f6eabad5b..4019d82c3d03 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -63,30 +63,6 @@ struct perf_raw_record { void *data; }; -/* - * single taken branch record layout: - * - * from: source instruction (may not always be a branch insn) - * to: branch target - * mispred: branch target was mispredicted - * predicted: branch target was predicted - * - * support for mispred, predicted is optional. In case it - * is not supported mispred = predicted = 0. - * - * in_tx: running in a hardware transaction - * abort: aborting a hardware transaction - */ -struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 mispred:1, /* target mispredicted */ - predicted:1,/* target predicted */ - in_tx:1, /* in transaction */ - abort:1, /* transaction abort */ - reserved:60; -}; - /* * branch stack layout: * nr: number of taken branches stored in entries[] diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a77f43af72b8..408b8c730731 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -757,4 +757,28 @@ union perf_mem_data_src { #define PERF_MEM_S(a, s) \ (((u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction + */ +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + reserved:60; +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v1.2.3-71-gd317 From 909e3ee4119f87b85c6e1b8534b2287ed1ea3ca2 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Wed, 28 Aug 2013 14:24:53 +0100 Subject: Move the EM_ARM and EM_AARCH64 definitions to uapi/linux/elf-em.h Signed-off-by: Dan Aloni Signed-off-by: Catalin Marinas --- arch/arm/include/asm/elf.h | 2 -- arch/arm64/include/asm/elf.h | 3 --- include/uapi/linux/elf-em.h | 2 ++ 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 56211f2084ef..f4b46d39b9cf 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -19,8 +19,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_fp elf_fpregset_t; -#define EM_ARM 40 - #define EF_ARM_EABI_MASK 0xff000000 #define EF_ARM_EABI_UNKNOWN 0x00000000 #define EF_ARM_EABI_VER1 0x01000000 diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index fe32c0e4ac01..e7fa87f9201b 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -33,8 +33,6 @@ typedef unsigned long elf_greg_t; typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_fpsimd_state elf_fpregset_t; -#define EM_AARCH64 183 - /* * AArch64 static relocation types. */ @@ -151,7 +149,6 @@ extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk #ifdef CONFIG_COMPAT -#define EM_ARM 40 #define COMPAT_ELF_PLATFORM ("v8l") #define COMPAT_ELF_ET_DYN_BASE (randomize_et_dyn(2 * TASK_SIZE_32 / 3)) diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 8e2b7bac4378..59c17a2d38ad 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -22,6 +22,7 @@ #define EM_PPC 20 /* PowerPC */ #define EM_PPC64 21 /* PowerPC64 */ #define EM_SPU 23 /* Cell BE SPU */ +#define EM_ARM 40 /* ARM 32 bit */ #define EM_SH 42 /* SuperH */ #define EM_SPARCV9 43 /* SPARC v9 64-bit */ #define EM_IA_64 50 /* HP/Intel IA-64 */ @@ -34,6 +35,7 @@ #define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ +#define EM_AARCH64 183 /* ARM 64 bit */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ #define EM_AVR32 0x18ad /* Atmel AVR32 */ -- cgit v1.2.3-71-gd317 From fa0097ee690693006ab1aea6c01ad3c851b65c77 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Sat, 31 Aug 2013 21:50:51 +0300 Subject: perf: Add a dummy software event to keep tracking When an event is disabled the "tracking" events selected by the 'mmap', 'comm' and 'task' bits of struct perf_event_attr, are also disabled. However, the information those events provide is necessary to resolve symbols for when the main event is re-enabled. The "tracking" events can be kept enabled by putting them on another event, but that requires an event that otherwise does nothing. A new software event PERF_COUNT_SW_DUMMY is added for that purpose. Signed-off-by: Adrian Hunter Acked-by: Peter Zijlstra Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1377975053-3811-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 408b8c730731..ca1d90bcb74d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -109,6 +109,7 @@ enum perf_sw_ids { PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, PERF_COUNT_SW_EMULATION_FAULTS = 8, + PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_MAX, /* non-ABI */ }; -- cgit v1.2.3-71-gd317 From b5c6c1a72afcc416c11ad932589054dcd3125782 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 12 Aug 2013 14:11:44 -0400 Subject: tilegx: Add tty serial support for TILE-Gx on-chip UART Signed-off-by: Chris Metcalf Acked-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + drivers/tty/serial/Kconfig | 9 + drivers/tty/serial/Makefile | 1 + drivers/tty/serial/tilegx.c | 708 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 5 files changed, 722 insertions(+) create mode 100644 drivers/tty/serial/tilegx.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 8f4919842dff..23479fa2adf0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8293,6 +8293,7 @@ F: drivers/edac/tile_edac.c F: drivers/net/ethernet/tile/ F: drivers/rtc/rtc-tile.c F: drivers/tty/hvc/hvc_tile.c +F: drivers/tty/serial/tilegx.c F: drivers/usb/host/*-tilegx.c F: include/linux/usb/tilegx.h diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 5e3d68917ffe..fdcaaefcdf8c 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1436,6 +1436,15 @@ config SERIAL_EFM32_UART_CONSOLE depends on SERIAL_EFM32_UART=y select SERIAL_CORE_CONSOLE +config SERIAL_TILEGX + tristate "TILE-Gx on-chip serial port support" + depends on TILEGX + select TILE_GXIO_UART + select SERIAL_CORE + ---help--- + This device provides access to the on-chip UARTs on the TILE-Gx + processor. + config SERIAL_ARC tristate "ARC UART driver support" select SERIAL_CORE diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index cf650f0cd6e4..3d0f097e82ff 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_SERIAL_KGDB_NMI) += kgdb_nmi.o obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o obj-$(CONFIG_SERIAL_OMAP) += omap-serial.o obj-$(CONFIG_SERIAL_ALTERA_UART) += altera_uart.o +obj-$(CONFIG_SERIAL_TILEGX) += tilegx.o obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o obj-$(CONFIG_SERIAL_QE) += ucc_uart.o obj-$(CONFIG_SERIAL_TIMBERDALE) += timbuart.o diff --git a/drivers/tty/serial/tilegx.c b/drivers/tty/serial/tilegx.c new file mode 100644 index 000000000000..f92d7e6bd876 --- /dev/null +++ b/drivers/tty/serial/tilegx.c @@ -0,0 +1,708 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * + * TILEGx UART driver. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +/* + * Use device name ttyS, major 4, minor 64-65. + * This is the usual serial port name, 8250 conventional range. + */ +#define TILEGX_UART_MAJOR TTY_MAJOR +#define TILEGX_UART_MINOR 64 +#define TILEGX_UART_NAME "ttyS" +#define DRIVER_NAME_STRING "TILEGx_Serial" +#define TILEGX_UART_REF_CLK 125000000; /* REF_CLK is always 125 MHz. */ + +struct tile_uart_port { + /* UART port. */ + struct uart_port uart; + + /* GXIO device context. */ + gxio_uart_context_t context; + + /* UART access mutex. */ + struct mutex mutex; + + /* CPU receiving interrupts. */ + int irq_cpu; +}; + +static struct tile_uart_port tile_uart_ports[TILEGX_UART_NR]; +static struct uart_driver tilegx_uart_driver; + + +/* + * Read UART rx fifo, and insert the chars into tty buffer. + */ +static void receive_chars(struct tile_uart_port *tile_uart, + struct tty_struct *tty) +{ + int i; + char c; + UART_FIFO_COUNT_t count; + gxio_uart_context_t *context = &tile_uart->context; + struct tty_port *port = tty->port; + + count.word = gxio_uart_read(context, UART_FIFO_COUNT); + for (i = 0; i < count.rfifo_count; i++) { + c = (char)gxio_uart_read(context, UART_RECEIVE_DATA); + tty_insert_flip_char(port, c, TTY_NORMAL); + } +} + + +/* + * Drain the Rx FIFO, called by interrupt handler. + */ +static void handle_receive(struct tile_uart_port *tile_uart) +{ + struct tty_port *port = &tile_uart->uart.state->port; + struct tty_struct *tty = tty_port_tty_get(port); + gxio_uart_context_t *context = &tile_uart->context; + + if (!tty) + return; + + /* First read UART rx fifo. */ + receive_chars(tile_uart, tty); + + /* Reset RFIFO_WE interrupt. */ + gxio_uart_write(context, UART_INTERRUPT_STATUS, + UART_INTERRUPT_MASK__RFIFO_WE_MASK); + + /* Final read, if any chars comes between the first read and + * the interrupt reset. + */ + receive_chars(tile_uart, tty); + + spin_unlock(&tile_uart->uart.lock); + tty_flip_buffer_push(port); + spin_lock(&tile_uart->uart.lock); + tty_kref_put(tty); +} + + +/* + * Push one char to UART Write FIFO. + * Return 0 on success, -1 if write filo is full. + */ +static int tilegx_putchar(gxio_uart_context_t *context, char c) +{ + UART_FLAG_t flag; + flag.word = gxio_uart_read(context, UART_FLAG); + if (flag.wfifo_full) + return -1; + + gxio_uart_write(context, UART_TRANSMIT_DATA, (unsigned long)c); + return 0; +} + + +/* + * Send chars to UART Write FIFO; called by interrupt handler. + */ +static void handle_transmit(struct tile_uart_port *tile_uart) +{ + unsigned char ch; + struct uart_port *port; + struct circ_buf *xmit; + gxio_uart_context_t *context = &tile_uart->context; + + /* First reset WFIFO_RE interrupt. */ + gxio_uart_write(context, UART_INTERRUPT_STATUS, + UART_INTERRUPT_MASK__WFIFO_RE_MASK); + + port = &tile_uart->uart; + xmit = &port->state->xmit; + if (port->x_char) { + if (tilegx_putchar(context, port->x_char)) + return; + port->x_char = 0; + port->icount.tx++; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) + return; + + while (!uart_circ_empty(xmit)) { + ch = xmit->buf[xmit->tail]; + if (tilegx_putchar(context, ch)) + break; + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + } + + /* Reset WFIFO_RE interrupt. */ + gxio_uart_write(context, UART_INTERRUPT_STATUS, + UART_INTERRUPT_MASK__WFIFO_RE_MASK); + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); +} + + +/* + * UART Interrupt handler. + */ +static irqreturn_t tilegx_interrupt(int irq, void *dev_id) +{ + unsigned long flags; + UART_INTERRUPT_STATUS_t intr_stat; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + struct uart_port *port = dev_id; + irqreturn_t ret = IRQ_NONE; + + spin_lock_irqsave(&port->lock, flags); + + tile_uart = container_of(port, struct tile_uart_port, uart); + context = &tile_uart->context; + intr_stat.word = gxio_uart_read(context, UART_INTERRUPT_STATUS); + + if (intr_stat.rfifo_we) { + handle_receive(tile_uart); + ret = IRQ_HANDLED; + } + if (intr_stat.wfifo_re) { + handle_transmit(tile_uart); + ret = IRQ_HANDLED; + } + + spin_unlock_irqrestore(&port->lock, flags); + return ret; +} + + +/* + * Return TIOCSER_TEMT when transmitter FIFO is empty. + */ +static u_int tilegx_tx_empty(struct uart_port *port) +{ + int ret; + UART_FLAG_t flag; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (!mutex_trylock(&tile_uart->mutex)) + return 0; + context = &tile_uart->context; + + flag.word = gxio_uart_read(context, UART_FLAG); + ret = (flag.wfifo_empty) ? TIOCSER_TEMT : 0; + mutex_unlock(&tile_uart->mutex); + + return ret; +} + + +/* + * Set state of the modem control output lines. + */ +static void tilegx_set_mctrl(struct uart_port *port, u_int mctrl) +{ + /* N/A */ +} + + +/* + * Get state of the modem control input lines. + */ +static u_int tilegx_get_mctrl(struct uart_port *port) +{ + return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR; +} + + +/* + * Stop transmitting. + */ +static void tilegx_stop_tx(struct uart_port *port) +{ + /* N/A */ +} + + +/* + * Start transmitting. + */ +static void tilegx_start_tx(struct uart_port *port) +{ + unsigned char ch; + struct circ_buf *xmit; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (!mutex_trylock(&tile_uart->mutex)) + return; + context = &tile_uart->context; + xmit = &port->state->xmit; + if (port->x_char) { + if (tilegx_putchar(context, port->x_char)) + return; + port->x_char = 0; + port->icount.tx++; + } + + if (uart_circ_empty(xmit) || uart_tx_stopped(port)) { + mutex_unlock(&tile_uart->mutex); + return; + } + + while (!uart_circ_empty(xmit)) { + ch = xmit->buf[xmit->tail]; + if (tilegx_putchar(context, ch)) + break; + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + mutex_unlock(&tile_uart->mutex); +} + + +/* + * Stop receiving - port is in process of being closed. + */ +static void tilegx_stop_rx(struct uart_port *port) +{ + int err; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + int cpu; + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (!mutex_trylock(&tile_uart->mutex)) + return; + + context = &tile_uart->context; + cpu = tile_uart->irq_cpu; + err = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu), + KERNEL_PL, -1); + mutex_unlock(&tile_uart->mutex); +} + + +/* + * Enable modem status interrupts. + */ +static void tilegx_enable_ms(struct uart_port *port) +{ + /* N/A */ +} + +/* + * Control the transmission of a break signal. + */ +static void tilegx_break_ctl(struct uart_port *port, int break_state) +{ + /* N/A */ +} + + +/* + * Perform initialization and enable port for reception. + */ +static int tilegx_startup(struct uart_port *port) +{ + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + int ret = 0; + int cpu = raw_smp_processor_id(); /* pick an arbitrary cpu */ + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (mutex_lock_interruptible(&tile_uart->mutex)) + return -EBUSY; + context = &tile_uart->context; + + /* Now open the hypervisor device if we haven't already. */ + if (context->fd < 0) { + UART_INTERRUPT_MASK_t intr_mask; + + /* Initialize UART device. */ + ret = gxio_uart_init(context, port->line); + if (ret) { + ret = -ENXIO; + goto err; + } + + /* Create our IRQs. */ + port->irq = create_irq(); + if (port->irq < 0) + goto err_uart_dest; + tile_irq_activate(port->irq, TILE_IRQ_PERCPU); + + /* Register our IRQs. */ + ret = request_irq(port->irq, tilegx_interrupt, 0, + tilegx_uart_driver.driver_name, port); + if (ret) + goto err_dest_irq; + + /* Request that the hardware start sending us interrupts. */ + tile_uart->irq_cpu = cpu; + ret = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu), + KERNEL_PL, port->irq); + if (ret) + goto err_free_irq; + + /* Enable UART Tx/Rx Interrupt. */ + intr_mask.word = gxio_uart_read(context, UART_INTERRUPT_MASK); + intr_mask.wfifo_re = 0; + intr_mask.rfifo_we = 0; + gxio_uart_write(context, UART_INTERRUPT_MASK, intr_mask.word); + + /* Reset the Tx/Rx interrupt in case it's set. */ + gxio_uart_write(context, UART_INTERRUPT_STATUS, + UART_INTERRUPT_MASK__WFIFO_RE_MASK | + UART_INTERRUPT_MASK__RFIFO_WE_MASK); + } + + mutex_unlock(&tile_uart->mutex); + return ret; + +err_free_irq: + free_irq(port->irq, port); +err_dest_irq: + destroy_irq(port->irq); +err_uart_dest: + gxio_uart_destroy(context); + ret = -ENXIO; +err: + mutex_unlock(&tile_uart->mutex); + return ret; +} + + +/* + * Release kernel resources if it is the last close, disable the port, + * free IRQ and close the port. + */ +static void tilegx_shutdown(struct uart_port *port) +{ + int err; + UART_INTERRUPT_MASK_t intr_mask; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + int cpu; + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (mutex_lock_interruptible(&tile_uart->mutex)) + return; + context = &tile_uart->context; + + /* Disable UART Tx/Rx Interrupt. */ + intr_mask.word = gxio_uart_read(context, UART_INTERRUPT_MASK); + intr_mask.wfifo_re = 1; + intr_mask.rfifo_we = 1; + gxio_uart_write(context, UART_INTERRUPT_MASK, intr_mask.word); + + /* Request that the hardware stop sending us interrupts. */ + cpu = tile_uart->irq_cpu; + err = gxio_uart_cfg_interrupt(context, cpu_x(cpu), cpu_y(cpu), + KERNEL_PL, -1); + + if (port->irq > 0) { + free_irq(port->irq, port); + destroy_irq(port->irq); + port->irq = 0; + } + + gxio_uart_destroy(context); + + mutex_unlock(&tile_uart->mutex); +} + + +/* + * Flush the buffer. + */ +static void tilegx_flush_buffer(struct uart_port *port) +{ + /* N/A */ +} + + +/* + * Change the port parameters. + */ +static void tilegx_set_termios(struct uart_port *port, + struct ktermios *termios, struct ktermios *old) +{ + int err; + UART_DIVISOR_t divisor; + UART_TYPE_t type; + unsigned int baud; + struct tile_uart_port *tile_uart; + gxio_uart_context_t *context; + + tile_uart = container_of(port, struct tile_uart_port, uart); + if (!mutex_trylock(&tile_uart->mutex)) + return; + context = &tile_uart->context; + + /* Open the hypervisor device if we haven't already. */ + if (context->fd < 0) { + err = gxio_uart_init(context, port->line); + if (err) { + mutex_unlock(&tile_uart->mutex); + return; + } + } + + divisor.word = gxio_uart_read(context, UART_DIVISOR); + type.word = gxio_uart_read(context, UART_TYPE); + + /* Divisor. */ + baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16); + divisor.divisor = uart_get_divisor(port, baud); + + /* Byte size. */ + if ((termios->c_cflag & CSIZE) == CS7) + type.dbits = UART_TYPE__DBITS_VAL_SEVEN_DBITS; + else + type.dbits = UART_TYPE__DBITS_VAL_EIGHT_DBITS; + + /* Parity. */ + if (termios->c_cflag & PARENB) { + /* Mark or Space parity. */ + if (termios->c_cflag & CMSPAR) + if (termios->c_cflag & PARODD) + type.ptype = UART_TYPE__PTYPE_VAL_MARK; + else + type.ptype = UART_TYPE__PTYPE_VAL_SPACE; + else if (termios->c_cflag & PARODD) + type.ptype = UART_TYPE__PTYPE_VAL_ODD; + else + type.ptype = UART_TYPE__PTYPE_VAL_EVEN; + } else + type.ptype = UART_TYPE__PTYPE_VAL_NONE; + + /* Stop bits. */ + if (termios->c_cflag & CSTOPB) + type.sbits = UART_TYPE__SBITS_VAL_TWO_SBITS; + else + type.sbits = UART_TYPE__SBITS_VAL_ONE_SBITS; + + /* Set the uart paramters. */ + gxio_uart_write(context, UART_DIVISOR, divisor.word); + gxio_uart_write(context, UART_TYPE, type.word); + + mutex_unlock(&tile_uart->mutex); +} + + +/* + * Return string describing the specified port. + */ +static const char *tilegx_type(struct uart_port *port) +{ + return port->type == PORT_TILEGX ? DRIVER_NAME_STRING : NULL; +} + + +/* + * Release the resources being used by 'port'. + */ +static void tilegx_release_port(struct uart_port *port) +{ + /* Nothing to release. */ +} + + +/* + * Request the resources being used by 'port'. + */ +static int tilegx_request_port(struct uart_port *port) +{ + /* Always present. */ + return 0; +} + + +/* + * Configure/autoconfigure the port. + */ +static void tilegx_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) + port->type = PORT_TILEGX; +} + + +/* + * Verify the new serial_struct (for TIOCSSERIAL). + */ +static int tilegx_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + if ((ser->type != PORT_UNKNOWN) && (ser->type != PORT_TILEGX)) + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_CONSOLE_POLL + +/* + * Console polling routines for writing and reading from the uart while + * in an interrupt or debug context. + */ + +static int tilegx_poll_get_char(struct uart_port *port) +{ + UART_FIFO_COUNT_t count; + gxio_uart_context_t *context; + struct tile_uart_port *tile_uart; + + tile_uart = container_of(port, struct tile_uart_port, uart); + context = &tile_uart->context; + count.word = gxio_uart_read(context, UART_FIFO_COUNT); + if (count.rfifo_count == 0) + return NO_POLL_CHAR; + return (char)gxio_uart_read(context, UART_RECEIVE_DATA); +} + +static void tilegx_poll_put_char(struct uart_port *port, unsigned char c) +{ + gxio_uart_context_t *context; + struct tile_uart_port *tile_uart; + + tile_uart = container_of(port, struct tile_uart_port, uart); + context = &tile_uart->context; + gxio_uart_write(context, UART_TRANSMIT_DATA, (unsigned long)c); +} + +#endif /* CONFIG_CONSOLE_POLL */ + + +static const struct uart_ops tilegx_ops = { + .tx_empty = tilegx_tx_empty, + .set_mctrl = tilegx_set_mctrl, + .get_mctrl = tilegx_get_mctrl, + .stop_tx = tilegx_stop_tx, + .start_tx = tilegx_start_tx, + .stop_rx = tilegx_stop_rx, + .enable_ms = tilegx_enable_ms, + .break_ctl = tilegx_break_ctl, + .startup = tilegx_startup, + .shutdown = tilegx_shutdown, + .flush_buffer = tilegx_flush_buffer, + .set_termios = tilegx_set_termios, + .type = tilegx_type, + .release_port = tilegx_release_port, + .request_port = tilegx_request_port, + .config_port = tilegx_config_port, + .verify_port = tilegx_verify_port, +#ifdef CONFIG_CONSOLE_POLL + .poll_get_char = tilegx_poll_get_char, + .poll_put_char = tilegx_poll_put_char, +#endif +}; + + +static void tilegx_init_ports(void) +{ + int i; + struct uart_port *port; + + for (i = 0; i < TILEGX_UART_NR; i++) { + port = &tile_uart_ports[i].uart; + port->ops = &tilegx_ops; + port->line = i; + port->type = PORT_TILEGX; + port->uartclk = TILEGX_UART_REF_CLK; + port->flags = UPF_BOOT_AUTOCONF; + + tile_uart_ports[i].context.fd = -1; + mutex_init(&tile_uart_ports[i].mutex); + } +} + + +static struct uart_driver tilegx_uart_driver = { + .owner = THIS_MODULE, + .driver_name = DRIVER_NAME_STRING, + .dev_name = TILEGX_UART_NAME, + .major = TILEGX_UART_MAJOR, + .minor = TILEGX_UART_MINOR, + .nr = TILEGX_UART_NR, +}; + + +static int __init tilegx_init(void) +{ + int i; + int ret; + struct tty_driver *tty_drv; + + ret = uart_register_driver(&tilegx_uart_driver); + if (ret) + return ret; + tty_drv = tilegx_uart_driver.tty_driver; + tty_drv->init_termios.c_cflag = B115200 | CS8 | CREAD | HUPCL | CLOCAL; + tty_drv->init_termios.c_ispeed = 115200; + tty_drv->init_termios.c_ospeed = 115200; + + tilegx_init_ports(); + + for (i = 0; i < TILEGX_UART_NR; i++) { + struct uart_port *port = &tile_uart_ports[i].uart; + ret = uart_add_one_port(&tilegx_uart_driver, port); + } + + return 0; +} + + +static void __exit tilegx_exit(void) +{ + int i; + struct uart_port *port; + + for (i = 0; i < TILEGX_UART_NR; i++) { + port = &tile_uart_ports[i].uart; + uart_remove_one_port(&tilegx_uart_driver, port); + } + + uart_unregister_driver(&tilegx_uart_driver); +} + + +module_init(tilegx_init); +module_exit(tilegx_exit); + +MODULE_AUTHOR("Tilera Corporation"); +MODULE_DESCRIPTION("TILEGx serial port driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 9119cc0977bf..b466487bb5da 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -232,4 +232,7 @@ /* SH-SCI */ #define PORT_HSCIF 104 +/* Tilera TILE-Gx UART */ +#define PORT_TILEGX 105 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3-71-gd317 From 42c7768316905dc64ad22256d6cbff273e3fbf55 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 25 Jun 2013 15:14:56 -0400 Subject: NVMe: Split header file into user-visible and kernel-visible pieces To build user programs that call the NVMe ioctls, we need to have a user header file. Catch up to the new way of doing that by splitting the header file into kernel and uapi portions. Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 461 +-------------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/nvme.h | 471 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 477 insertions(+), 456 deletions(-) create mode 100644 include/uapi/linux/nvme.h (limited to 'include/uapi/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5d7c07946fbe..8d0041513e1a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,6 +1,6 @@ /* * Definitions for the NVM Express interface - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2013, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -19,7 +19,10 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H -#include +#include +#include +#include +#include struct nvme_bar { __u64 cap; /* Controller Capabilities */ @@ -59,460 +62,8 @@ enum { NVME_CSTS_SHST_CMPLT = 2 << 2, }; -struct nvme_id_power_state { - __le16 max_power; /* centiwatts */ - __u16 rsvd2; - __le32 entry_lat; /* microseconds */ - __le32 exit_lat; /* microseconds */ - __u8 read_tput; - __u8 read_lat; - __u8 write_tput; - __u8 write_lat; - __u8 rsvd16[16]; -}; - #define NVME_VS(major, minor) (major << 16 | minor) -struct nvme_id_ctrl { - __le16 vid; - __le16 ssvid; - char sn[20]; - char mn[40]; - char fr[8]; - __u8 rab; - __u8 ieee[3]; - __u8 mic; - __u8 mdts; - __u8 rsvd78[178]; - __le16 oacs; - __u8 acl; - __u8 aerl; - __u8 frmw; - __u8 lpa; - __u8 elpe; - __u8 npss; - __u8 rsvd264[248]; - __u8 sqes; - __u8 cqes; - __u8 rsvd514[2]; - __le32 nn; - __le16 oncs; - __le16 fuses; - __u8 fna; - __u8 vwc; - __le16 awun; - __le16 awupf; - __u8 rsvd530[1518]; - struct nvme_id_power_state psd[32]; - __u8 vs[1024]; -}; - -enum { - NVME_CTRL_ONCS_COMPARE = 1 << 0, - NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, - NVME_CTRL_ONCS_DSM = 1 << 2, -}; - -struct nvme_lbaf { - __le16 ms; - __u8 ds; - __u8 rp; -}; - -struct nvme_id_ns { - __le64 nsze; - __le64 ncap; - __le64 nuse; - __u8 nsfeat; - __u8 nlbaf; - __u8 flbas; - __u8 mc; - __u8 dpc; - __u8 dps; - __u8 rsvd30[98]; - struct nvme_lbaf lbaf[16]; - __u8 rsvd192[192]; - __u8 vs[3712]; -}; - -enum { - NVME_NS_FEAT_THIN = 1 << 0, - NVME_LBAF_RP_BEST = 0, - NVME_LBAF_RP_BETTER = 1, - NVME_LBAF_RP_GOOD = 2, - NVME_LBAF_RP_DEGRADED = 3, -}; - -struct nvme_smart_log { - __u8 critical_warning; - __u8 temperature[2]; - __u8 avail_spare; - __u8 spare_thresh; - __u8 percent_used; - __u8 rsvd6[26]; - __u8 data_units_read[16]; - __u8 data_units_written[16]; - __u8 host_reads[16]; - __u8 host_writes[16]; - __u8 ctrl_busy_time[16]; - __u8 power_cycles[16]; - __u8 power_on_hours[16]; - __u8 unsafe_shutdowns[16]; - __u8 media_errors[16]; - __u8 num_err_log_entries[16]; - __u8 rsvd192[320]; -}; - -enum { - NVME_SMART_CRIT_SPARE = 1 << 0, - NVME_SMART_CRIT_TEMPERATURE = 1 << 1, - NVME_SMART_CRIT_RELIABILITY = 1 << 2, - NVME_SMART_CRIT_MEDIA = 1 << 3, - NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, -}; - -struct nvme_lba_range_type { - __u8 type; - __u8 attributes; - __u8 rsvd2[14]; - __u64 slba; - __u64 nlb; - __u8 guid[16]; - __u8 rsvd48[16]; -}; - -enum { - NVME_LBART_TYPE_FS = 0x01, - NVME_LBART_TYPE_RAID = 0x02, - NVME_LBART_TYPE_CACHE = 0x03, - NVME_LBART_TYPE_SWAP = 0x04, - - NVME_LBART_ATTRIB_TEMP = 1 << 0, - NVME_LBART_ATTRIB_HIDE = 1 << 1, -}; - -/* I/O commands */ - -enum nvme_opcode { - nvme_cmd_flush = 0x00, - nvme_cmd_write = 0x01, - nvme_cmd_read = 0x02, - nvme_cmd_write_uncor = 0x04, - nvme_cmd_compare = 0x05, - nvme_cmd_dsm = 0x09, -}; - -struct nvme_common_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[2]; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le32 cdw10[6]; -}; - -struct nvme_rw_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le32 reftag; - __le16 apptag; - __le16 appmask; -}; - -enum { - NVME_RW_LR = 1 << 15, - NVME_RW_FUA = 1 << 14, - NVME_RW_DSM_FREQ_UNSPEC = 0, - NVME_RW_DSM_FREQ_TYPICAL = 1, - NVME_RW_DSM_FREQ_RARE = 2, - NVME_RW_DSM_FREQ_READS = 3, - NVME_RW_DSM_FREQ_WRITES = 4, - NVME_RW_DSM_FREQ_RW = 5, - NVME_RW_DSM_FREQ_ONCE = 6, - NVME_RW_DSM_FREQ_PREFETCH = 7, - NVME_RW_DSM_FREQ_TEMP = 8, - NVME_RW_DSM_LATENCY_NONE = 0 << 4, - NVME_RW_DSM_LATENCY_IDLE = 1 << 4, - NVME_RW_DSM_LATENCY_NORM = 2 << 4, - NVME_RW_DSM_LATENCY_LOW = 3 << 4, - NVME_RW_DSM_SEQ_REQ = 1 << 6, - NVME_RW_DSM_COMPRESSED = 1 << 7, -}; - -struct nvme_dsm_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 nr; - __le32 attributes; - __u32 rsvd12[4]; -}; - -enum { - NVME_DSMGMT_IDR = 1 << 0, - NVME_DSMGMT_IDW = 1 << 1, - NVME_DSMGMT_AD = 1 << 2, -}; - -struct nvme_dsm_range { - __le32 cattr; - __le32 nlb; - __le64 slba; -}; - -/* Admin commands */ - -enum nvme_admin_opcode { - nvme_admin_delete_sq = 0x00, - nvme_admin_create_sq = 0x01, - nvme_admin_get_log_page = 0x02, - nvme_admin_delete_cq = 0x04, - nvme_admin_create_cq = 0x05, - nvme_admin_identify = 0x06, - nvme_admin_abort_cmd = 0x08, - nvme_admin_set_features = 0x09, - nvme_admin_get_features = 0x0a, - nvme_admin_async_event = 0x0c, - nvme_admin_activate_fw = 0x10, - nvme_admin_download_fw = 0x11, - nvme_admin_format_nvm = 0x80, - nvme_admin_security_send = 0x81, - nvme_admin_security_recv = 0x82, -}; - -enum { - NVME_QUEUE_PHYS_CONTIG = (1 << 0), - NVME_CQ_IRQ_ENABLED = (1 << 1), - NVME_SQ_PRIO_URGENT = (0 << 1), - NVME_SQ_PRIO_HIGH = (1 << 1), - NVME_SQ_PRIO_MEDIUM = (2 << 1), - NVME_SQ_PRIO_LOW = (3 << 1), - NVME_FEAT_ARBITRATION = 0x01, - NVME_FEAT_POWER_MGMT = 0x02, - NVME_FEAT_LBA_RANGE = 0x03, - NVME_FEAT_TEMP_THRESH = 0x04, - NVME_FEAT_ERR_RECOVERY = 0x05, - NVME_FEAT_VOLATILE_WC = 0x06, - NVME_FEAT_NUM_QUEUES = 0x07, - NVME_FEAT_IRQ_COALESCE = 0x08, - NVME_FEAT_IRQ_CONFIG = 0x09, - NVME_FEAT_WRITE_ATOMIC = 0x0a, - NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, - NVME_FWACT_REPL = (0 << 3), - NVME_FWACT_REPL_ACTV = (1 << 3), - NVME_FWACT_ACTV = (2 << 3), -}; - -struct nvme_identify { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 cns; - __u32 rsvd11[5]; -}; - -struct nvme_features { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 fid; - __le32 dword11; - __u32 rsvd12[4]; -}; - -struct nvme_create_cq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 cqid; - __le16 qsize; - __le16 cq_flags; - __le16 irq_vector; - __u32 rsvd12[4]; -}; - -struct nvme_create_sq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 sqid; - __le16 qsize; - __le16 sq_flags; - __le16 cqid; - __u32 rsvd12[4]; -}; - -struct nvme_delete_queue { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[9]; - __le16 qid; - __u16 rsvd10; - __u32 rsvd11[5]; -}; - -struct nvme_download_firmware { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; - __le32 numd; - __le32 offset; - __u32 rsvd12[4]; -}; - -struct nvme_format_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[4]; - __le32 cdw10; - __u32 rsvd11[5]; -}; - -struct nvme_command { - union { - struct nvme_common_command common; - struct nvme_rw_command rw; - struct nvme_identify identify; - struct nvme_features features; - struct nvme_create_cq create_cq; - struct nvme_create_sq create_sq; - struct nvme_delete_queue delete_queue; - struct nvme_download_firmware dlfw; - struct nvme_format_cmd format; - struct nvme_dsm_cmd dsm; - }; -}; - -enum { - NVME_SC_SUCCESS = 0x0, - NVME_SC_INVALID_OPCODE = 0x1, - NVME_SC_INVALID_FIELD = 0x2, - NVME_SC_CMDID_CONFLICT = 0x3, - NVME_SC_DATA_XFER_ERROR = 0x4, - NVME_SC_POWER_LOSS = 0x5, - NVME_SC_INTERNAL = 0x6, - NVME_SC_ABORT_REQ = 0x7, - NVME_SC_ABORT_QUEUE = 0x8, - NVME_SC_FUSED_FAIL = 0x9, - NVME_SC_FUSED_MISSING = 0xa, - NVME_SC_INVALID_NS = 0xb, - NVME_SC_CMD_SEQ_ERROR = 0xc, - NVME_SC_LBA_RANGE = 0x80, - NVME_SC_CAP_EXCEEDED = 0x81, - NVME_SC_NS_NOT_READY = 0x82, - NVME_SC_CQ_INVALID = 0x100, - NVME_SC_QID_INVALID = 0x101, - NVME_SC_QUEUE_SIZE = 0x102, - NVME_SC_ABORT_LIMIT = 0x103, - NVME_SC_ABORT_MISSING = 0x104, - NVME_SC_ASYNC_LIMIT = 0x105, - NVME_SC_FIRMWARE_SLOT = 0x106, - NVME_SC_FIRMWARE_IMAGE = 0x107, - NVME_SC_INVALID_VECTOR = 0x108, - NVME_SC_INVALID_LOG_PAGE = 0x109, - NVME_SC_INVALID_FORMAT = 0x10a, - NVME_SC_BAD_ATTRIBUTES = 0x180, - NVME_SC_WRITE_FAULT = 0x280, - NVME_SC_READ_ERROR = 0x281, - NVME_SC_GUARD_CHECK = 0x282, - NVME_SC_APPTAG_CHECK = 0x283, - NVME_SC_REFTAG_CHECK = 0x284, - NVME_SC_COMPARE_FAILED = 0x285, - NVME_SC_ACCESS_DENIED = 0x286, -}; - -struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; - __le16 sq_head; /* how much of this queue may be reclaimed */ - __le16 sq_id; /* submission queue that generated this entry */ - __u16 command_id; /* of the command which completed */ - __le16 status; /* did the command fail, and if so, why? */ -}; - -struct nvme_user_io { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -}; - -struct nvme_admin_cmd { - __u8 opcode; - __u8 flags; - __u16 rsvd1; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u32 cdw10; - __u32 cdw11; - __u32 cdw12; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u32 timeout_ms; - __u32 result; -}; - -#define NVME_IOCTL_ID _IO('N', 0x40) -#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) -#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) - -#ifdef __KERNEL__ -#include -#include -#include - #define NVME_IO_TIMEOUT (5 * HZ) /* @@ -614,6 +165,4 @@ struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); int nvme_sg_get_version_num(int __user *ip); -#endif - #endif /* _LINUX_NVME_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ab5d4992e568..a5e677a7070f 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -281,6 +281,7 @@ header-y += nfs_mount.h header-y += nfsacl.h header-y += nl80211.h header-y += nubus.h +header-y += nvme.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h new file mode 100644 index 000000000000..31ed566f81f6 --- /dev/null +++ b/include/uapi/linux/nvme.h @@ -0,0 +1,471 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _UAPI_LINUX_NVME_H +#define _UAPI_LINUX_NVME_H + +#include + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u16 rsvd2; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __u8 rsvd16[16]; +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 mic; + __u8 mdts; + __u8 rsvd78[178]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 rsvd264[248]; + __u8 sqes; + __u8 cqes; + __u8 rsvd514[2]; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 rsvd530[1518]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 rsvd30[98]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __u8 rsvd192[320]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_dsm = 0x09, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 cns; + __u32 rsvd11[5]; +}; + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 fid; + __le32 dword11; + __u32 rsvd12[4]; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + }; +}; + +enum { + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, +}; + +struct nvme_completion { + __le32 result; /* Used by admin commands to return data */ + __u32 rsvd; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_admin_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) + +#endif /* _UAPI_LINUX_NVME_H */ -- cgit v1.2.3-71-gd317 From 685585c25e8269cc355b59d1cd6fc65b8c5c4878 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 25 Jun 2013 15:15:23 -0600 Subject: NVMe: Update nvme_id_power_state with latest spec Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- include/uapi/linux/nvme.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 31ed566f81f6..989c04e0c563 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -23,7 +23,8 @@ struct nvme_id_power_state { __le16 max_power; /* centiwatts */ - __u16 rsvd2; + __u8 rsvd2; + __u8 flags; __le32 entry_lat; /* microseconds */ __le32 exit_lat; /* microseconds */ __u8 read_tput; @@ -33,6 +34,11 @@ struct nvme_id_power_state { __u8 rsvd16[16]; }; +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + struct nvme_id_ctrl { __le16 vid; __le16 ssvid; -- cgit v1.2.3-71-gd317 From 61e76b178dbe7145e8d6afa84bb4ccea71918994 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Fri, 30 Aug 2013 11:18:45 +0200 Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination unreachable) messages: 5 - Source address failed ingress/egress policy 6 - Reject route to destination Now they are treated as protocol error and icmpv6_err_convert() converts them to EPROTO. RFC 4443 says: "Codes 5 and 6 are more informative subsets of code 1." Treat codes 5 and 6 as code 1 (EACCES) Btw, connect() returning -EPROTO confuses firefox, so that fallback to other/IPv4 addresses does not work: https://bugzilla.mozilla.org/show_bug.cgi?id=910773 Signed-off-by: Jiri Bohac Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/icmpv6.h | 2 ++ net/ipv6/icmp.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index e0133c73c304..590beda78ea0 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -115,6 +115,8 @@ struct icmp6hdr { #define ICMPV6_NOT_NEIGHBOUR 2 #define ICMPV6_ADDR_UNREACH 3 #define ICMPV6_PORT_UNREACH 4 +#define ICMPV6_POLICY_FAIL 5 +#define ICMPV6_REJECT_ROUTE 6 /* * Codes for Time Exceeded diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7cfc8d284870..67ae4e0d40bf 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -940,6 +940,14 @@ static const struct icmp6_err { .err = ECONNREFUSED, .fatal = 1, }, + { /* POLICY_FAIL */ + .err = EACCES, + .fatal = 1, + }, + { /* REJECT_ROUTE */ + .err = EACCES, + .fatal = 1, + }, }; int icmpv6_err_convert(u8 type, u8 code, int *err) @@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) switch (type) { case ICMPV6_DEST_UNREACH: fatal = 1; - if (code <= ICMPV6_PORT_UNREACH) { + if (code < ARRAY_SIZE(tab_unreach)) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; } -- cgit v1.2.3-71-gd317 From 61e00655e9cb82e034eb72b95a51072e718d14a7 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Mon, 26 Aug 2013 19:14:46 +0200 Subject: Input: introduce BTN/ABS bits for drums and guitars There are a bunch of guitar and drums devices out there that all report similar data. To avoid reporting this as BTN_MISC or ABS_MISC, we allocate some proper namespace for them. Note that most of these devices are toys and we cannot report any sophisticated physics via this API. I did some google-images research and tried to provide definitions that work with all common devices. That's why I went with 4 toms, 4 cymbals, one bass, one hi-hat. I haven't seen other drums and I doubt that we need any additions to that. Anyway, the naming-scheme is intentionally done in an extensible way. For guitars, we support 5 frets (normally aligned vertically, compared to the real horizontal layouts), a single strum-bar with up/down directions, an optional fret-board and a whammy-bar. Most of the devices provide pressure values so I went with ABS_* bits. If we ever support devices which only provide digital input, we have to decide whether to emulate pressure data or add additional BTN_* bits. If someone is not familiar with these devices, here are two pictures which provide almost all introduced interfaces (or try the given keywords with a google-image search): Guitar: ("guitar hero world tour guitar") http://images1.wikia.nocookie.net/__cb20120911023442/applezone/es/images/f/f9/Wii_Guitar.jpg Drums: ("guitar hero drums") http://oyster.ignimgs.com/franchises/images/03/55/35526_band-hero-drum-set-hands-on-20090929040735768.jpg Signed-off-by: David Herrmann Acked-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- include/linux/mod_devicetable.h | 2 +- include/uapi/linux/input.h | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index b62d4af6c667..65f8a8c4ebbc 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -277,7 +277,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_KEY_MIN_INTERESTING 0x71 #define INPUT_DEVICE_ID_KEY_MAX 0x2ff #define INPUT_DEVICE_ID_REL_MAX 0x0f -#define INPUT_DEVICE_ID_ABS_MAX 0x3f +#define INPUT_DEVICE_ID_ABS_MAX 0x4f #define INPUT_DEVICE_ID_MSC_MAX 0x07 #define INPUT_DEVICE_ID_LED_MAX 0x0f #define INPUT_DEVICE_ID_SND_MAX 0x07 diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index d584047b072b..76457eef172a 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -716,6 +716,14 @@ struct input_keymap_entry { #define BTN_DPAD_LEFT 0x222 #define BTN_DPAD_RIGHT 0x223 +#define BTN_FRET_FAR_UP 0x224 +#define BTN_FRET_UP 0x225 +#define BTN_FRET_MID 0x226 +#define BTN_FRET_LOW 0x227 +#define BTN_FRET_FAR_LOW 0x228 +#define BTN_STRUM_BAR_UP 0x229 +#define BTN_STRUM_BAR_DOWN 0x22a + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 @@ -829,8 +837,21 @@ struct input_keymap_entry { #define ABS_MT_TOOL_X 0x3c /* Center X tool position */ #define ABS_MT_TOOL_Y 0x3d /* Center Y tool position */ - -#define ABS_MAX 0x3f +/* Drums and guitars (mostly toys) */ +#define ABS_TOM_FAR_LEFT 0x40 +#define ABS_TOM_LEFT 0x41 +#define ABS_TOM_RIGHT 0x42 +#define ABS_TOM_FAR_RIGHT 0x43 +#define ABS_CYMBAL_FAR_LEFT 0x44 +#define ABS_CYMBAL_LEFT 0x45 +#define ABS_CYMBAL_RIGHT 0x46 +#define ABS_CYMBAL_FAR_RIGHT 0x47 +#define ABS_BASS 0x48 +#define ABS_HI_HAT 0x49 +#define ABS_FRET_BOARD 0x4a /* Guitar fret board, vertical pos */ +#define ABS_WHAMMY_BAR 0x4b /* Guitar whammy bar (or vibrato) */ + +#define ABS_MAX 0x4f #define ABS_CNT (ABS_MAX+1) /* -- cgit v1.2.3-71-gd317 From cfd280c91253cc28e4919e349fa7a813b63e71e8 Mon Sep 17 00:00:00 2001 From: Carlos O'Donell Date: Thu, 15 Aug 2013 17:28:10 +0800 Subject: net: sync some IP headers with glibc Solution: ========= - Synchronize linux's `include/uapi/linux/in6.h' with glibc's `inet/netinet/in.h'. - Synchronize glibc's `inet/netinet/in.h with linux's `include/uapi/linux/in6.h'. - Allow including the headers in either other. - First header included defines the structures and macros. Details: ======== The kernel promises not to break the UAPI ABI so I don't see why we can't just have the two userspace headers coordinate? If you include the kernel headers first you get those, and if you include the glibc headers first you get those, and the following patch arranges a coordination and synchronization between the two. Let's handle `include/uapi/linux/in6.h' from linux, and `inet/netinet/in.h' from glibc and ensure they compile in any order and preserve the required ABI. These two patches pass the following compile tests: cat >> test1.c <> test2.c < Cc: Thomas Backlund Cc: libc-alpha@sourceware.org Cc: YOSHIFUJI Hideaki Cc: David S. Miller Tested-by: Cong Wang Signed-off-by: Carlos O'Donell Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/in.h | 49 ++++++++++++++----- include/uapi/linux/in6.h | 36 +++++++++++--- include/uapi/linux/libc-compat.h | 103 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 20 deletions(-) create mode 100644 include/uapi/linux/libc-compat.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 997f9f2f0963..e7c94eeb9475 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -227,6 +227,7 @@ header-y += kvm_para.h endif header-y += l2tp.h +header-y += libc-compat.h header-y += limits.h header-y += llc.h header-y += loop.h diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 9edb441df827..f9e8e496ae5d 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -24,30 +24,53 @@ /* Standard well-defined IP protocols. */ enum { IPPROTO_IP = 0, /* Dummy protocol for TCP */ +#define IPPROTO_IP IPPROTO_IP IPPROTO_ICMP = 1, /* Internet Control Message Protocol */ +#define IPPROTO_ICMP IPPROTO_ICMP IPPROTO_IGMP = 2, /* Internet Group Management Protocol */ +#define IPPROTO_IGMP IPPROTO_IGMP IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */ +#define IPPROTO_IPIP IPPROTO_IPIP IPPROTO_TCP = 6, /* Transmission Control Protocol */ +#define IPPROTO_TCP IPPROTO_TCP IPPROTO_EGP = 8, /* Exterior Gateway Protocol */ +#define IPPROTO_EGP IPPROTO_EGP IPPROTO_PUP = 12, /* PUP protocol */ +#define IPPROTO_PUP IPPROTO_PUP IPPROTO_UDP = 17, /* User Datagram Protocol */ +#define IPPROTO_UDP IPPROTO_UDP IPPROTO_IDP = 22, /* XNS IDP protocol */ +#define IPPROTO_IDP IPPROTO_IDP + IPPROTO_TP = 29, /* SO Transport Protocol Class 4 */ +#define IPPROTO_TP IPPROTO_TP IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ - IPPROTO_RSVP = 46, /* RSVP protocol */ +#define IPPROTO_DCCP IPPROTO_DCCP + IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ +#define IPPROTO_IPV6 IPPROTO_IPV6 + IPPROTO_RSVP = 46, /* RSVP Protocol */ +#define IPPROTO_RSVP IPPROTO_RSVP IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ - - IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */ - - IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ - IPPROTO_AH = 51, /* Authentication Header protocol */ - IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ - IPPROTO_PIM = 103, /* Protocol Independent Multicast */ - - IPPROTO_COMP = 108, /* Compression Header protocol */ - IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ +#define IPPROTO_GRE IPPROTO_GRE + IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */ +#define IPPROTO_ESP IPPROTO_ESP + IPPROTO_AH = 51, /* Authentication Header protocol */ +#define IPPROTO_AH IPPROTO_AH + IPPROTO_MTP = 92, /* Multicast Transport Protocol */ +#define IPPROTO_MTP IPPROTO_MTP + IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */ +#define IPPROTO_BEETPH IPPROTO_BEETPH + IPPROTO_ENCAP = 98, /* Encapsulation Header */ +#define IPPROTO_ENCAP IPPROTO_ENCAP + IPPROTO_PIM = 103, /* Protocol Independent Multicast */ +#define IPPROTO_PIM IPPROTO_PIM + IPPROTO_COMP = 108, /* Compression Header Protocol */ +#define IPPROTO_COMP IPPROTO_COMP + IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ +#define IPPROTO_SCTP IPPROTO_SCTP IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ - - IPPROTO_RAW = 255, /* Raw IP packets */ +#define IPPROTO_UDPLITE IPPROTO_UDPLITE + IPPROTO_RAW = 255, /* Raw IP packets */ +#define IPPROTO_RAW IPPROTO_RAW IPPROTO_MAX }; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 53b1d56a6e7f..440d5c479145 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -22,22 +22,30 @@ #define _UAPI_LINUX_IN6_H #include +#include /* * IPv6 address structure */ +#if __UAPI_DEF_IN6_ADDR struct in6_addr { union { __u8 u6_addr8[16]; +#if __UAPI_DEF_IN6_ADDR_ALT __be16 u6_addr16[8]; __be32 u6_addr32[4]; +#endif } in6_u; #define s6_addr in6_u.u6_addr8 +#if __UAPI_DEF_IN6_ADDR_ALT #define s6_addr16 in6_u.u6_addr16 #define s6_addr32 in6_u.u6_addr32 +#endif }; +#endif /* __UAPI_DEF_IN6_ADDR */ +#if __UAPI_DEF_SOCKADDR_IN6 struct sockaddr_in6 { unsigned short int sin6_family; /* AF_INET6 */ __be16 sin6_port; /* Transport layer port # */ @@ -45,7 +53,9 @@ struct sockaddr_in6 { struct in6_addr sin6_addr; /* IPv6 address */ __u32 sin6_scope_id; /* scope id (new in RFC2553) */ }; +#endif /* __UAPI_DEF_SOCKADDR_IN6 */ +#if __UAPI_DEF_IPV6_MREQ struct ipv6_mreq { /* IPv6 multicast address of group */ struct in6_addr ipv6mr_multiaddr; @@ -53,6 +63,7 @@ struct ipv6_mreq { /* local IPv6 address of interface */ int ipv6mr_ifindex; }; +#endif /* __UAPI_DEF_IVP6_MREQ */ #define ipv6mr_acaddr ipv6mr_multiaddr @@ -114,13 +125,24 @@ struct in6_flowlabel_req { /* * IPV6 extension headers */ -#define IPPROTO_HOPOPTS 0 /* IPv6 hop-by-hop options */ -#define IPPROTO_ROUTING 43 /* IPv6 routing header */ -#define IPPROTO_FRAGMENT 44 /* IPv6 fragmentation header */ -#define IPPROTO_ICMPV6 58 /* ICMPv6 */ -#define IPPROTO_NONE 59 /* IPv6 no next header */ -#define IPPROTO_DSTOPTS 60 /* IPv6 destination options */ -#define IPPROTO_MH 135 /* IPv6 mobility header */ +#if __UAPI_DEF_IPPROTO_V6 +enum { + IPPROTO_HOPOPTS = 0, /* IPv6 hop-by-hop options */ +#define IPPROTO_HOPOPTS IPPROTO_HOPOPTS + IPPROTO_ROUTING = 43, /* IPv6 routing header */ +#define IPPROTO_ROUTING IPPROTO_ROUTING + IPPROTO_FRAGMENT = 44, /* IPv6 fragmentation header */ +#define IPPROTO_FRAGMENT IPPROTO_FRAGMENT + IPPROTO_ICMPV6 = 58, /* ICMPv6 */ +#define IPPROTO_ICMPV6 IPPROTO_ICMPV6 + IPPROTO_NONE = 59, /* IPv6 no next header */ +#define IPPROTO_NONE IPPROTO_NONE + IPPROTO_DSTOPTS = 60, /* IPv6 destination options */ +#define IPPROTO_DSTOPTS IPPROTO_DSTOPTS + IPPROTO_MH = 135, /* IPv6 mobility header */ +#define IPPROTO_MH IPPROTO_MH +}; +#endif /* __UAPI_DEF_IPPROTO_V6 */ /* * IPv6 TLV options. diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h new file mode 100644 index 000000000000..335e8a7cad39 --- /dev/null +++ b/include/uapi/linux/libc-compat.h @@ -0,0 +1,103 @@ +/* + * Compatibility interface for userspace libc header coordination: + * + * Define compatibility macros that are used to control the inclusion or + * exclusion of UAPI structures and definitions in coordination with another + * userspace C library. + * + * This header is intended to solve the problem of UAPI definitions that + * conflict with userspace definitions. If a UAPI header has such conflicting + * definitions then the solution is as follows: + * + * * Synchronize the UAPI header and the libc headers so either one can be + * used and such that the ABI is preserved. If this is not possible then + * no simple compatibility interface exists (you need to write translating + * wrappers and rename things) and you can't use this interface. + * + * Then follow this process: + * + * (a) Include libc-compat.h in the UAPI header. + * e.g. #include + * This include must be as early as possible. + * + * (b) In libc-compat.h add enough code to detect that the comflicting + * userspace libc header has been included first. + * + * (c) If the userspace libc header has been included first define a set of + * guard macros of the form __UAPI_DEF_FOO and set their values to 1, else + * set their values to 0. + * + * (d) Back in the UAPI header with the conflicting definitions, guard the + * definitions with: + * #if __UAPI_DEF_FOO + * ... + * #endif + * + * This fixes the situation where the linux headers are included *after* the + * libc headers. To fix the problem with the inclusion in the other order the + * userspace libc headers must be fixed like this: + * + * * For all definitions that conflict with kernel definitions wrap those + * defines in the following: + * #if !__UAPI_DEF_FOO + * ... + * #endif + * + * This prevents the redefinition of a construct already defined by the kernel. + */ +#ifndef _UAPI_LIBC_COMPAT_H +#define _UAPI_LIBC_COMPAT_H + +/* We have included glibc headers... */ +#if defined(__GLIBC__) + +/* Coordinate with glibc netinet/in.h header. */ +#if defined(_NETINET_IN_H) + +/* GLIBC headers included first so don't define anything + * that would already be defined. */ +#define __UAPI_DEF_IN6_ADDR 0 +/* The exception is the in6_addr macros which must be defined + * if the glibc code didn't define them. This guard matches + * the guard in glibc/inet/netinet/in.h which defines the + * additional in6_addr macros e.g. s6_addr16, and s6_addr32. */ +#if defined(__USE_MISC) || defined (__USE_GNU) +#define __UAPI_DEF_IN6_ADDR_ALT 0 +#else +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#endif +#define __UAPI_DEF_SOCKADDR_IN6 0 +#define __UAPI_DEF_IPV6_MREQ 0 +#define __UAPI_DEF_IPPROTO_V6 0 + +#else + +/* Linux headers included first, and we must define everything + * we need. The expectation is that glibc will check the + * __UAPI_DEF_* defines and adjust appropriately. */ +#define __UAPI_DEF_IN6_ADDR 1 +/* We unconditionally define the in6_addr macros and glibc must + * coordinate. */ +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#define __UAPI_DEF_IPV6_MREQ 1 +#define __UAPI_DEF_IPPROTO_V6 1 + +#endif /* _NETINET_IN_H */ + + +/* If we did not see any headers from any supported C libraries, + * or we are being included in the kernel, then define everything + * that we need. */ +#else /* !defined(__GLIBC__) */ + +/* Definitions for in6.h */ +#define __UAPI_DEF_IN6_ADDR 1 +#define __UAPI_DEF_IN6_ADDR_ALT 1 +#define __UAPI_DEF_SOCKADDR_IN6 1 +#define __UAPI_DEF_IPV6_MREQ 1 +#define __UAPI_DEF_IPPROTO_V6 1 + +#endif /* __GLIBC__ */ + +#endif /* _UAPI_LIBC_COMPAT_H */ -- cgit v1.2.3-71-gd317 From 8b27ee60bfd6bbb84d2df28fa706c5c5081066ca Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 4 Sep 2013 11:28:04 -0600 Subject: vfio-pci: PCI hot reset interface The current VFIO_DEVICE_RESET interface only maps to PCI use cases where we can isolate the reset to the individual PCI function. This means the device must support FLR (PCIe or AF), PM reset on D3hot->D0 transition, device specific reset, or be a singleton device on a bus for a secondary bus reset. FLR does not have widespread support, PM reset is not very reliable, and bus topology is dictated by the system and device design. We need to provide a means for a user to induce a bus reset in cases where the existing mechanisms are not available or not reliable. This device specific extension to VFIO provides the user with this ability. Two new ioctls are introduced: - VFIO_DEVICE_PCI_GET_HOT_RESET_INFO - VFIO_DEVICE_PCI_HOT_RESET The first provides the user with information about the extent of devices affected by a hot reset. This is essentially a list of devices and the IOMMU groups they belong to. The user may then initiate a hot reset by calling the second ioctl. We must be careful that the user has ownership of all the affected devices found via the first ioctl, so the second ioctl takes a list of file descriptors for the VFIO groups affected by the reset. Each group must have IOMMU protection established for the ioctl to succeed. Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 286 +++++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/vfio.h | 38 ++++++ 2 files changed, 323 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index cef6002acbd4..6ab71b9fcf8d 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -227,6 +228,110 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return 0; } +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) +{ + (*(int *)data)++; + return 0; +} + +struct vfio_pci_fill_info { + int max; + int cur; + struct vfio_pci_dependent_device *devices; +}; + +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_fill_info *fill = data; + struct iommu_group *iommu_group; + + if (fill->cur == fill->max) + return -EAGAIN; /* Something changed, try again */ + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ + + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); + fill->devices[fill->cur].bus = pdev->bus->number; + fill->devices[fill->cur].devfn = pdev->devfn; + fill->cur++; + iommu_group_put(iommu_group); + return 0; +} + +struct vfio_pci_group_entry { + struct vfio_group *group; + int id; +}; + +struct vfio_pci_group_info { + int count; + struct vfio_pci_group_entry *groups; +}; + +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_group_info *info = data; + struct iommu_group *group; + int id, i; + + group = iommu_group_get(&pdev->dev); + if (!group) + return -EPERM; + + id = iommu_group_id(group); + + for (i = 0; i < info->count; i++) + if (info->groups[i].id == id) + break; + + iommu_group_put(group); + + return (i == info->count) ? -EINVAL : 0; +} + +static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) +{ + for (; pdev; pdev = pdev->bus->self) + if (pdev->bus == slot->bus) + return (pdev->slot == slot); + return false; +} + +struct vfio_pci_walk_info { + int (*fn)(struct pci_dev *, void *data); + void *data; + struct pci_dev *pdev; + bool slot; + int ret; +}; + +static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_walk_info *walk = data; + + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) + walk->ret = walk->fn(pdev, walk->data); + + return walk->ret; +} + +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, + int (*fn)(struct pci_dev *, + void *data), void *data, + bool slot) +{ + struct vfio_pci_walk_info walk = { + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, + }; + + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); + + return walk.ret; +} + static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -407,10 +512,189 @@ static long vfio_pci_ioctl(void *device_data, return ret; - } else if (cmd == VFIO_DEVICE_RESET) + } else if (cmd == VFIO_DEVICE_RESET) { return vdev->reset_works ? pci_reset_function(vdev->pdev) : -EINVAL; + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz) + return -EINVAL; + + hdr.flags = 0; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; + + WARN_ON(!fill.max); /* Should always be at least one */ + + /* + * If there's enough space, fill it now, otherwise return + * -ENOSPC and the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } + + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_fill_devs, + &fill, slot); + + /* + * If a device was removed between counting and filling, + * we may come up short of fill.max. If a device was + * added, we'll have a return of -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; + +reset_info_exit: + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) + ret = -EFAULT; + } + + kfree(devices); + return ret; + + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct vfio_pci_group_entry *groups; + struct vfio_pci_group_info info; + bool slot = false; + int i, count = 0, ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* + * We can't let userspace give us an arbitrarily large + * buffer to copy, so verify how many we think there + * could be. Note groups can have multiple devices so + * one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; + + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); + if (!group_fds || !groups) { + kfree(group_fds); + kfree(groups); + return -ENOMEM; + } + + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(groups); + return -EFAULT; + } + + /* + * For each group_fd, get the group through the vfio external + * user interface and store the group and iommu ID. This + * ensures the group is held across the reset. + */ + for (i = 0; i < hdr.count; i++) { + struct vfio_group *group; + struct fd f = fdget(group_fds[i]); + if (!f.file) { + ret = -EBADF; + break; + } + + group = vfio_group_get_external_user(f.file); + fdput(f); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + break; + } + + groups[i].group = group; + groups[i].id = vfio_external_user_iommu_id(group); + } + + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; + + info.count = hdr.count; + info.groups = groups; + + /* + * Test whether all the affected devices are contained + * by the set of groups provided by the user. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_validate_devs, + &info, slot); + if (!ret) + /* User has access, do the reset */ + ret = slot ? pci_reset_slot(vdev->pdev->slot) : + pci_reset_bus(vdev->pdev->bus); + +hot_reset_release: + for (i--; i >= 0; i--) + vfio_group_put_external_user(groups[i].group); + + kfree(groups); + return ret; + } + return -ENOTTY; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 916e444e6f74..0fd47f5bc146 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -324,6 +324,44 @@ enum { VFIO_PCI_NUM_IRQS }; +/** + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, + * struct vfio_pci_hot_reset_info) + * + * Return: 0 on success, -errno on failure: + * -enospc = insufficient buffer, -enodev = unsupported for device. + */ +struct vfio_pci_dependent_device { + __u32 group_id; + __u16 segment; + __u8 bus; + __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ +}; + +struct vfio_pci_hot_reset_info { + __u32 argsz; + __u32 flags; + __u32 count; + struct vfio_pci_dependent_device devices[]; +}; + +#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +/** + * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, + * struct vfio_pci_hot_reset) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_pci_hot_reset { + __u32 argsz; + __u32 flags; + __u32 count; + __s32 group_fds[]; +}; + +#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13) + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3-71-gd317 From fd2ed4d252701d3bbed4cd3e3d267ad469bb832a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 16 Aug 2013 10:54:23 -0400 Subject: dm: add statistics support Support the collection of I/O statistics on user-defined regions of a DM device. If no regions are defined no statistics are collected so there isn't any performance impact. Only bio-based DM devices are currently supported. Each user-defined region specifies a starting sector, length and step. Individual statistics will be collected for each step-sized area within the range specified. The I/O statistics counters for each step-sized area of a region are in the same format as /sys/block/*/stat or /proc/diskstats but extra counters (12 and 13) are provided: total time spent reading and writing in milliseconds. All these counters may be accessed by sending the @stats_print message to the appropriate DM device via dmsetup. The creation of DM statistics will allocate memory via kmalloc or fallback to using vmalloc space. At most, 1/4 of the overall system memory may be allocated by DM statistics. The admin can see how much memory is used by reading /sys/module/dm_mod/parameters/stats_current_allocated_bytes See Documentation/device-mapper/statistics.txt for more details. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/statistics.txt | 186 ++++++ drivers/md/Makefile | 2 +- drivers/md/dm-ioctl.c | 22 +- drivers/md/dm-stats.c | 969 +++++++++++++++++++++++++++++ drivers/md/dm-stats.h | 40 ++ drivers/md/dm.c | 65 +- drivers/md/dm.h | 16 + include/linux/device-mapper.h | 9 + include/uapi/linux/dm-ioctl.h | 4 +- 9 files changed, 1299 insertions(+), 14 deletions(-) create mode 100644 Documentation/device-mapper/statistics.txt create mode 100644 drivers/md/dm-stats.c create mode 100644 drivers/md/dm-stats.h (limited to 'include/uapi/linux') diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt new file mode 100644 index 000000000000..2a1673adc200 --- /dev/null +++ b/Documentation/device-mapper/statistics.txt @@ -0,0 +1,186 @@ +DM statistics +============= + +Device Mapper supports the collection of I/O statistics on user-defined +regions of a DM device. If no regions are defined no statistics are +collected so there isn't any performance impact. Only bio-based DM +devices are currently supported. + +Each user-defined region specifies a starting sector, length and step. +Individual statistics will be collected for each step-sized area within +the range specified. + +The I/O statistics counters for each step-sized area of a region are +in the same format as /sys/block/*/stat or /proc/diskstats (see: +Documentation/iostats.txt). But two extra counters (12 and 13) are +provided: total time spent reading and writing in milliseconds. All +these counters may be accessed by sending the @stats_print message to +the appropriate DM device via dmsetup. + +Each region has a corresponding unique identifier, which we call a +region_id, that is assigned when the region is created. The region_id +must be supplied when querying statistics about the region, deleting the +region, etc. Unique region_ids enable multiple userspace programs to +request and process statistics for the same DM device without stepping +on each other's data. + +The creation of DM statistics will allocate memory via kmalloc or +fallback to using vmalloc space. At most, 1/4 of the overall system +memory may be allocated by DM statistics. The admin can see how much +memory is used by reading +/sys/module/dm_mod/parameters/stats_current_allocated_bytes + +Messages +======== + + @stats_create [ []] + + Create a new region and return the region_id. + + + "-" - whole device + "+" - a range of 512-byte sectors + starting with . + + + "" - the range is subdivided into areas each containing + sectors. + "/" - the range is subdivided into the specified + number of areas. + + + An optional parameter. A name that uniquely identifies + the userspace owner of the range. This groups ranges together + so that userspace programs can identify the ranges they + created and ignore those created by others. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use it for anything else. + + + An optional parameter. A word that provides auxiliary data + that is useful to the client program that created the range. + The kernel returns this string back in the output of + @stats_list message, but it doesn't use this value for anything. + + @stats_delete + + Delete the region with the specified id. + + + region_id returned from @stats_create + + @stats_clear + + Clear all the counters except the in-flight i/o counters. + + + region_id returned from @stats_create + + @stats_list [] + + List all regions registered with @stats_create. + + + An optional parameter. + If this parameter is specified, only matching regions + are returned. + If it is not specified, all regions are returned. + + Output format: + : + + + @stats_print [ ] + + Print counters for each step-sized area of a region. + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are returned. + + + The number of lines to include in the output. + If omitted, all lines are returned. + + Output format for each step-sized area of a region: + + + counters + + The first 11 counters have the same meaning as + /sys/block/*/stat or /proc/diskstats. + + Please refer to Documentation/iostats.txt for details. + + 1. the number of reads completed + 2. the number of reads merged + 3. the number of sectors read + 4. the number of milliseconds spent reading + 5. the number of writes completed + 6. the number of writes merged + 7. the number of sectors written + 8. the number of milliseconds spent writing + 9. the number of I/Os currently in progress + 10. the number of milliseconds spent doing I/Os + 11. the weighted number of milliseconds spent doing I/Os + + Additional counters: + 12. the total time spent reading in milliseconds + 13. the total time spent writing in milliseconds + + @stats_print_clear [ ] + + Atomically print and then clear all the counters except the + in-flight i/o counters. Useful when the client consuming the + statistics does not want to lose any statistics (those updated + between printing and clearing). + + + region_id returned from @stats_create + + + The index of the starting line in the output. + If omitted, all lines are printed and then cleared. + + + The number of lines to process. + If omitted, all lines are printed and then cleared. + + @stats_set_aux + + Store auxiliary data aux_data for the specified region. + + + region_id returned from @stats_create + + + The string that identifies data which is useful to the client + program that created the range. The kernel returns this + string back in the output of @stats_list message, but it + doesn't use this value for anything. + +Examples +======== + +Subdivide the DM device 'vol' into 100 pieces and start collecting +statistics on them: + + dmsetup message vol 0 @stats_create - /100 + +Set the auxillary data string to "foo bar baz" (the escape for each +space must also be escaped, otherwise the shell will consume them): + + dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz + +List the statistics: + + dmsetup message vol 0 @stats_list + +Print the statistics: + + dmsetup message vol 0 @stats_print 0 + +Delete the statistics: + + dmsetup message vol 0 @stats_delete 0 diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5ef78efc27f2..2acc43fe0229 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -3,7 +3,7 @@ # dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o dm-multipath-y += dm-path-selector.o dm-mpath.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index e9c0de75010e..afe08146f73e 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1455,20 +1455,26 @@ static int table_status(struct dm_ioctl *param, size_t param_size) return 0; } -static bool buffer_test_overflow(char *result, unsigned maxlen) -{ - return !maxlen || strlen(result) + 1 >= maxlen; -} - /* - * Process device-mapper dependent messages. + * Process device-mapper dependent messages. Messages prefixed with '@' + * are processed by the DM core. All others are delivered to the target. * Returns a number <= 1 if message was processed by device mapper. * Returns 2 if message should be delivered to the target. */ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, char *result, unsigned maxlen) { - return 2; + int r; + + if (**argv != '@') + return 2; /* no '@' prefix, deliver to target */ + + r = dm_stats_message(md, argc, argv, result, maxlen); + if (r < 2) + return r; + + DMERR("Unsupported message sent to DM core: %s", argv[0]); + return -EINVAL; } /* @@ -1542,7 +1548,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (r == 1) { param->flags |= DM_DATA_OUT_FLAG; - if (buffer_test_overflow(result, maxlen)) + if (dm_message_test_buffer_overflow(result, maxlen)) param->flags |= DM_BUFFER_FULL_FLAG; else param->data_size = param->data_start + strlen(result) + 1; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c new file mode 100644 index 000000000000..8ae31e8d3d64 --- /dev/null +++ b/drivers/md/dm-stats.c @@ -0,0 +1,969 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-stats.h" + +#define DM_MSG_PREFIX "stats" + +static int dm_stat_need_rcu_barrier; + +/* + * Using 64-bit values to avoid overflow (which is a + * problem that block/genhd.c's IO accounting has). + */ +struct dm_stat_percpu { + unsigned long long sectors[2]; + unsigned long long ios[2]; + unsigned long long merges[2]; + unsigned long long ticks[2]; + unsigned long long io_ticks[2]; + unsigned long long io_ticks_total; + unsigned long long time_in_queue; +}; + +struct dm_stat_shared { + atomic_t in_flight[2]; + unsigned long stamp; + struct dm_stat_percpu tmp; +}; + +struct dm_stat { + struct list_head list_entry; + int id; + size_t n_entries; + sector_t start; + sector_t end; + sector_t step; + const char *program_id; + const char *aux_data; + struct rcu_head rcu_head; + size_t shared_alloc_size; + size_t percpu_alloc_size; + struct dm_stat_percpu *stat_percpu[NR_CPUS]; + struct dm_stat_shared stat_shared[0]; +}; + +struct dm_stats_last_position { + sector_t last_sector; + unsigned last_rw; +}; + +/* + * A typo on the command line could possibly make the kernel run out of memory + * and crash. To prevent the crash we account all used memory. We fail if we + * exhaust 1/4 of all memory or 1/2 of vmalloc space. + */ +#define DM_STATS_MEMORY_FACTOR 4 +#define DM_STATS_VMALLOC_FACTOR 2 + +static DEFINE_SPINLOCK(shared_memory_lock); + +static unsigned long shared_memory_amount; + +static bool __check_shared_memory(size_t alloc_size) +{ + size_t a; + + a = shared_memory_amount + alloc_size; + if (a < shared_memory_amount) + return false; + if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + return false; +#ifdef CONFIG_MMU + if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) + return false; +#endif + return true; +} + +static bool check_shared_memory(size_t alloc_size) +{ + bool ret; + + spin_lock_irq(&shared_memory_lock); + + ret = __check_shared_memory(alloc_size); + + spin_unlock_irq(&shared_memory_lock); + + return ret; +} + +static bool claim_shared_memory(size_t alloc_size) +{ + spin_lock_irq(&shared_memory_lock); + + if (!__check_shared_memory(alloc_size)) { + spin_unlock_irq(&shared_memory_lock); + return false; + } + + shared_memory_amount += alloc_size; + + spin_unlock_irq(&shared_memory_lock); + + return true; +} + +static void free_shared_memory(size_t alloc_size) +{ + unsigned long flags; + + spin_lock_irqsave(&shared_memory_lock, flags); + + if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { + spin_unlock_irqrestore(&shared_memory_lock, flags); + DMCRIT("Memory usage accounting bug."); + return; + } + + shared_memory_amount -= alloc_size; + + spin_unlock_irqrestore(&shared_memory_lock, flags); +} + +static void *dm_kvzalloc(size_t alloc_size, int node) +{ + void *p; + + if (!claim_shared_memory(alloc_size)) + return NULL; + + if (alloc_size <= KMALLOC_MAX_SIZE) { + p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); + if (p) + return p; + } + p = vzalloc_node(alloc_size, node); + if (p) + return p; + + free_shared_memory(alloc_size); + + return NULL; +} + +static void dm_kvfree(void *ptr, size_t alloc_size) +{ + if (!ptr) + return; + + free_shared_memory(alloc_size); + + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + +static void dm_stat_free(struct rcu_head *head) +{ + int cpu; + struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + + kfree(s->program_id); + kfree(s->aux_data); + for_each_possible_cpu(cpu) + dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); + dm_kvfree(s, s->shared_alloc_size); +} + +static int dm_stat_in_flight(struct dm_stat_shared *shared) +{ + return atomic_read(&shared->in_flight[READ]) + + atomic_read(&shared->in_flight[WRITE]); +} + +void dm_stats_init(struct dm_stats *stats) +{ + int cpu; + struct dm_stats_last_position *last; + + mutex_init(&stats->mutex); + INIT_LIST_HEAD(&stats->list); + stats->last = alloc_percpu(struct dm_stats_last_position); + for_each_possible_cpu(cpu) { + last = per_cpu_ptr(stats->last, cpu); + last->last_sector = (sector_t)ULLONG_MAX; + last->last_rw = UINT_MAX; + } +} + +void dm_stats_cleanup(struct dm_stats *stats) +{ + size_t ni; + struct dm_stat *s; + struct dm_stat_shared *shared; + + while (!list_empty(&stats->list)) { + s = container_of(stats->list.next, struct dm_stat, list_entry); + list_del(&s->list_entry); + for (ni = 0; ni < s->n_entries; ni++) { + shared = &s->stat_shared[ni]; + if (WARN_ON(dm_stat_in_flight(shared))) { + DMCRIT("leaked in-flight counter at index %lu " + "(start %llu, end %llu, step %llu): reads %d, writes %d", + (unsigned long)ni, + (unsigned long long)s->start, + (unsigned long long)s->end, + (unsigned long long)s->step, + atomic_read(&shared->in_flight[READ]), + atomic_read(&shared->in_flight[WRITE])); + } + } + dm_stat_free(&s->rcu_head); + } + free_percpu(stats->last); +} + +static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, + sector_t step, const char *program_id, const char *aux_data, + void (*suspend_callback)(struct mapped_device *), + void (*resume_callback)(struct mapped_device *), + struct mapped_device *md) +{ + struct list_head *l; + struct dm_stat *s, *tmp_s; + sector_t n_entries; + size_t ni; + size_t shared_alloc_size; + size_t percpu_alloc_size; + struct dm_stat_percpu *p; + int cpu; + int ret_id; + int r; + + if (end < start || !step) + return -EINVAL; + + n_entries = end - start; + if (dm_sector_div64(n_entries, step)) + n_entries++; + + if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) + return -EOVERFLOW; + + shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); + if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) + return -EOVERFLOW; + + percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); + if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) + return -EOVERFLOW; + + if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size)) + return -ENOMEM; + + s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); + if (!s) + return -ENOMEM; + + s->n_entries = n_entries; + s->start = start; + s->end = end; + s->step = step; + s->shared_alloc_size = shared_alloc_size; + s->percpu_alloc_size = percpu_alloc_size; + + s->program_id = kstrdup(program_id, GFP_KERNEL); + if (!s->program_id) { + r = -ENOMEM; + goto out; + } + s->aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!s->aux_data) { + r = -ENOMEM; + goto out; + } + + for (ni = 0; ni < n_entries; ni++) { + atomic_set(&s->stat_shared[ni].in_flight[READ], 0); + atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + } + + for_each_possible_cpu(cpu) { + p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); + if (!p) { + r = -ENOMEM; + goto out; + } + s->stat_percpu[cpu] = p; + } + + /* + * Suspend/resume to make sure there is no i/o in flight, + * so that newly created statistics will be exact. + * + * (note: we couldn't suspend earlier because we must not + * allocate memory while suspended) + */ + suspend_callback(md); + + mutex_lock(&stats->mutex); + s->id = 0; + list_for_each(l, &stats->list) { + tmp_s = container_of(l, struct dm_stat, list_entry); + if (WARN_ON(tmp_s->id < s->id)) { + r = -EINVAL; + goto out_unlock_resume; + } + if (tmp_s->id > s->id) + break; + if (unlikely(s->id == INT_MAX)) { + r = -ENFILE; + goto out_unlock_resume; + } + s->id++; + } + ret_id = s->id; + list_add_tail_rcu(&s->list_entry, l); + mutex_unlock(&stats->mutex); + + resume_callback(md); + + return ret_id; + +out_unlock_resume: + mutex_unlock(&stats->mutex); + resume_callback(md); +out: + dm_stat_free(&s->rcu_head); + return r; +} + +static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + list_for_each_entry(s, &stats->list, list_entry) { + if (s->id > id) + break; + if (s->id == id) + return s; + } + + return NULL; +} + +static int dm_stats_delete(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + int cpu; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + list_del_rcu(&s->list_entry); + mutex_unlock(&stats->mutex); + + /* + * vfree can't be called from RCU callback + */ + for_each_possible_cpu(cpu) + if (is_vmalloc_addr(s->stat_percpu)) + goto do_sync_free; + if (is_vmalloc_addr(s)) { +do_sync_free: + synchronize_rcu_expedited(); + dm_stat_free(&s->rcu_head); + } else { + ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + call_rcu(&s->rcu_head, dm_stat_free); + } + return 0; +} + +static int dm_stats_list(struct dm_stats *stats, const char *program, + char *result, unsigned maxlen) +{ + struct dm_stat *s; + sector_t len; + unsigned sz = 0; + + /* + * Output format: + * : + + */ + + mutex_lock(&stats->mutex); + list_for_each_entry(s, &stats->list, list_entry) { + if (!program || !strcmp(program, s->program_id)) { + len = s->end - s->start; + DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id, + (unsigned long long)s->start, + (unsigned long long)len, + (unsigned long long)s->step, + s->program_id, + s->aux_data); + } + } + mutex_unlock(&stats->mutex); + + return 1; +} + +static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p) +{ + /* + * This is racy, but so is part_round_stats_single. + */ + unsigned long now = jiffies; + unsigned in_flight_read; + unsigned in_flight_write; + unsigned long difference = now - shared->stamp; + + if (!difference) + return; + in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); + in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); + if (in_flight_read) + p->io_ticks[READ] += difference; + if (in_flight_write) + p->io_ticks[WRITE] += difference; + if (in_flight_read + in_flight_write) { + p->io_ticks_total += difference; + p->time_in_queue += (in_flight_read + in_flight_write) * difference; + } + shared->stamp = now; +} + +static void dm_stat_for_entry(struct dm_stat *s, size_t entry, + unsigned long bi_rw, sector_t len, bool merged, + bool end, unsigned long duration) +{ + unsigned long idx = bi_rw & REQ_WRITE; + struct dm_stat_shared *shared = &s->stat_shared[entry]; + struct dm_stat_percpu *p; + + /* + * For strict correctness we should use local_irq_disable/enable + * instead of preempt_disable/enable. + * + * This is racy if the driver finishes bios from non-interrupt + * context as well as from interrupt context or from more different + * interrupts. + * + * However, the race only results in not counting some events, + * so it is acceptable. + * + * part_stat_lock()/part_stat_unlock() have this race too. + */ + preempt_disable(); + p = &s->stat_percpu[smp_processor_id()][entry]; + + if (!end) { + dm_stat_round(shared, p); + atomic_inc(&shared->in_flight[idx]); + } else { + dm_stat_round(shared, p); + atomic_dec(&shared->in_flight[idx]); + p->sectors[idx] += len; + p->ios[idx] += 1; + p->merges[idx] += merged; + p->ticks[idx] += duration; + } + + preempt_enable(); +} + +static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, + sector_t bi_sector, sector_t end_sector, + bool end, unsigned long duration, + struct dm_stats_aux *stats_aux) +{ + sector_t rel_sector, offset, todo, fragment_len; + size_t entry; + + if (end_sector <= s->start || bi_sector >= s->end) + return; + if (unlikely(bi_sector < s->start)) { + rel_sector = 0; + todo = end_sector - s->start; + } else { + rel_sector = bi_sector - s->start; + todo = end_sector - bi_sector; + } + if (unlikely(end_sector > s->end)) + todo -= (end_sector - s->end); + + offset = dm_sector_div64(rel_sector, s->step); + entry = rel_sector; + do { + if (WARN_ON_ONCE(entry >= s->n_entries)) { + DMCRIT("Invalid area access in region id %d", s->id); + return; + } + fragment_len = todo; + if (fragment_len > s->step - offset) + fragment_len = s->step - offset; + dm_stat_for_entry(s, entry, bi_rw, fragment_len, + stats_aux->merged, end, duration); + todo -= fragment_len; + entry++; + offset = 0; + } while (unlikely(todo != 0)); +} + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, + unsigned long duration, struct dm_stats_aux *stats_aux) +{ + struct dm_stat *s; + sector_t end_sector; + struct dm_stats_last_position *last; + + if (unlikely(!bi_sectors)) + return; + + end_sector = bi_sector + bi_sectors; + + if (!end) { + /* + * A race condition can at worst result in the merged flag being + * misrepresented, so we don't have to disable preemption here. + */ + last = __this_cpu_ptr(stats->last); + stats_aux->merged = + (bi_sector == (ACCESS_ONCE(last->last_sector) && + ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == + (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD))) + )); + ACCESS_ONCE(last->last_sector) = end_sector; + ACCESS_ONCE(last->last_rw) = bi_rw; + } + + rcu_read_lock(); + + list_for_each_entry_rcu(s, &stats->list, list_entry) + __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux); + + rcu_read_unlock(); +} + +static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, + struct dm_stat *s, size_t x) +{ + int cpu; + struct dm_stat_percpu *p; + + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + dm_stat_round(shared, p); + local_irq_enable(); + + memset(&shared->tmp, 0, sizeof(shared->tmp)); + for_each_possible_cpu(cpu) { + p = &s->stat_percpu[cpu][x]; + shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + } +} + +static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, + bool init_tmp_percpu_totals) +{ + size_t x; + struct dm_stat_shared *shared; + struct dm_stat_percpu *p; + + for (x = idx_start; x < idx_end; x++) { + shared = &s->stat_shared[x]; + if (init_tmp_percpu_totals) + __dm_stat_init_temporary_percpu_totals(shared, s, x); + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + p->sectors[READ] -= shared->tmp.sectors[READ]; + p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; + p->ios[READ] -= shared->tmp.ios[READ]; + p->ios[WRITE] -= shared->tmp.ios[WRITE]; + p->merges[READ] -= shared->tmp.merges[READ]; + p->merges[WRITE] -= shared->tmp.merges[WRITE]; + p->ticks[READ] -= shared->tmp.ticks[READ]; + p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; + p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; + p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; + p->io_ticks_total -= shared->tmp.io_ticks_total; + p->time_in_queue -= shared->tmp.time_in_queue; + local_irq_enable(); + } +} + +static int dm_stats_clear(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + __dm_stat_clear(s, 0, s->n_entries, true); + + mutex_unlock(&stats->mutex); + + return 1; +} + +/* + * This is like jiffies_to_msec, but works for 64-bit values. + */ +static unsigned long long dm_jiffies_to_msec64(unsigned long long j) +{ + unsigned long long result = 0; + unsigned mult; + + if (j) + result = jiffies_to_msecs(j & 0x3fffff); + if (j >= 1 << 22) { + mult = jiffies_to_msecs(1 << 22); + result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); + } + if (j >= 1ULL << 44) + result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); + + return result; +} + +static int dm_stats_print(struct dm_stats *stats, int id, + size_t idx_start, size_t idx_len, + bool clear, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct dm_stat *s; + size_t x; + sector_t start, end, step; + size_t idx_end; + struct dm_stat_shared *shared; + + /* + * Output format: + * + counters + */ + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + idx_end = idx_start + idx_len; + if (idx_end < idx_start || + idx_end > s->n_entries) + idx_end = s->n_entries; + + if (idx_start > idx_end) + idx_start = idx_end; + + step = s->step; + start = s->start + (step * idx_start); + + for (x = idx_start; x < idx_end; x++, start = end) { + shared = &s->stat_shared[x]; + end = start + step; + if (unlikely(end > s->end)) + end = s->end; + + __dm_stat_init_temporary_percpu_totals(shared, s, x); + + DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n", + (unsigned long long)start, + (unsigned long long)step, + shared->tmp.ios[READ], + shared->tmp.merges[READ], + shared->tmp.sectors[READ], + dm_jiffies_to_msec64(shared->tmp.ticks[READ]), + shared->tmp.ios[WRITE], + shared->tmp.merges[WRITE], + shared->tmp.sectors[WRITE], + dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]), + dm_stat_in_flight(shared), + dm_jiffies_to_msec64(shared->tmp.io_ticks_total), + dm_jiffies_to_msec64(shared->tmp.time_in_queue), + dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]), + dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE])); + + if (unlikely(sz + 1 >= maxlen)) + goto buffer_overflow; + } + + if (clear) + __dm_stat_clear(s, idx_start, idx_end, false); + +buffer_overflow: + mutex_unlock(&stats->mutex); + + return 1; +} + +static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) +{ + struct dm_stat *s; + const char *new_aux_data; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + new_aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!new_aux_data) { + mutex_unlock(&stats->mutex); + return -ENOMEM; + } + + kfree(s->aux_data); + s->aux_data = new_aux_data; + + mutex_unlock(&stats->mutex); + + return 0; +} + +static int message_stats_create(struct mapped_device *md, + unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int id; + char dummy; + unsigned long long start, end, len, step; + unsigned divisor; + const char *program_id, *aux_data; + + /* + * Input format: + * [ []] + */ + + if (argc < 3 || argc > 5) + return -EINVAL; + + if (!strcmp(argv[1], "-")) { + start = 0; + len = dm_get_size(md); + if (!len) + len = 1; + } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 || + start != (sector_t)start || len != (sector_t)len) + return -EINVAL; + + end = start + len; + if (start >= end) + return -EINVAL; + + if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) { + step = end - start; + if (do_div(step, divisor)) + step++; + if (!step) + step = 1; + } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 || + step != (sector_t)step || !step) + return -EINVAL; + + program_id = "-"; + aux_data = "-"; + + if (argc > 3) + program_id = argv[3]; + + if (argc > 4) + aux_data = argv[4]; + + /* + * If a buffer overflow happens after we created the region, + * it's too late (the userspace would retry with a larger + * buffer, but the region id that caused the overflow is already + * leaked). So we must detect buffer overflow in advance. + */ + snprintf(result, maxlen, "%d", INT_MAX); + if (dm_message_test_buffer_overflow(result, maxlen)) + return 1; + + id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, + dm_internal_suspend, dm_internal_resume, md); + if (id < 0) + return id; + + snprintf(result, maxlen, "%d", id); + + return 1; +} + +static int message_stats_delete(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_delete(dm_get_stats(md), id); +} + +static int message_stats_clear(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_clear(dm_get_stats(md), id); +} + +static int message_stats_list(struct mapped_device *md, + unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int r; + const char *program = NULL; + + if (argc < 1 || argc > 2) + return -EINVAL; + + if (argc > 1) { + program = kstrdup(argv[1], GFP_KERNEL); + if (!program) + return -ENOMEM; + } + + r = dm_stats_list(dm_get_stats(md), program, result, maxlen); + + kfree(program); + + return r; +} + +static int message_stats_print(struct mapped_device *md, + unsigned argc, char **argv, bool clear, + char *result, unsigned maxlen) +{ + int id; + char dummy; + unsigned long idx_start = 0, idx_len = ULONG_MAX; + + if (argc != 2 && argc != 4) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + if (argc > 3) { + if (strcmp(argv[2], "-") && + sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) + return -EINVAL; + if (strcmp(argv[3], "-") && + sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) + return -EINVAL; + } + + return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, + result, maxlen); +} + +static int message_stats_set_aux(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 3) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); +} + +int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int r; + + if (dm_request_based(md)) { + DMWARN("Statistics are only supported for bio-based devices"); + return -EOPNOTSUPP; + } + + /* All messages here must start with '@' */ + if (!strcasecmp(argv[0], "@stats_create")) + r = message_stats_create(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_delete")) + r = message_stats_delete(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_clear")) + r = message_stats_clear(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_list")) + r = message_stats_list(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print")) + r = message_stats_print(md, argc, argv, false, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print_clear")) + r = message_stats_print(md, argc, argv, true, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_set_aux")) + r = message_stats_set_aux(md, argc, argv); + else + return 2; /* this wasn't a stats message */ + + if (r == -EINVAL) + DMWARN("Invalid parameters for message %s", argv[0]); + + return r; +} + +int __init dm_statistics_init(void) +{ + dm_stat_need_rcu_barrier = 0; + return 0; +} + +void dm_statistics_exit(void) +{ + if (dm_stat_need_rcu_barrier) + rcu_barrier(); + if (WARN_ON(shared_memory_amount)) + DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); +} + +module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); +MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h new file mode 100644 index 000000000000..e7c4984bf235 --- /dev/null +++ b/drivers/md/dm-stats.h @@ -0,0 +1,40 @@ +#ifndef DM_STATS_H +#define DM_STATS_H + +#include +#include +#include + +int dm_statistics_init(void); +void dm_statistics_exit(void); + +struct dm_stats { + struct mutex mutex; + struct list_head list; /* list of struct dm_stat */ + struct dm_stats_last_position __percpu *last; + sector_t last_sector; + unsigned last_rw; +}; + +struct dm_stats_aux { + bool merged; +}; + +void dm_stats_init(struct dm_stats *st); +void dm_stats_cleanup(struct dm_stats *st); + +struct mapped_device; + +int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, + char *result, unsigned maxlen); + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, + unsigned long duration, struct dm_stats_aux *aux); + +static inline bool dm_stats_used(struct dm_stats *st) +{ + return !list_empty(&st->list); +} + +#endif diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7faeaa3d4835..6a5e9ed2fcc3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -60,6 +60,7 @@ struct dm_io { struct bio *bio; unsigned long start_time; spinlock_t endio_lock; + struct dm_stats_aux stats_aux; }; /* @@ -198,6 +199,8 @@ struct mapped_device { /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; + + struct dm_stats stats; }; /* @@ -269,6 +272,7 @@ static int (*_inits[])(void) __initdata = { dm_io_init, dm_kcopyd_init, dm_interface_init, + dm_statistics_init, }; static void (*_exits[])(void) = { @@ -279,6 +283,7 @@ static void (*_exits[])(void) = { dm_io_exit, dm_kcopyd_exit, dm_interface_exit, + dm_statistics_exit, }; static int __init dm_init(void) @@ -384,6 +389,16 @@ int dm_lock_for_deletion(struct mapped_device *md) return r; } +sector_t dm_get_size(struct mapped_device *md) +{ + return get_capacity(md->disk); +} + +struct dm_stats *dm_get_stats(struct mapped_device *md) +{ + return &md->stats; +} + static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -466,8 +481,9 @@ static int md_in_flight(struct mapped_device *md) static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; + struct bio *bio = io->bio; int cpu; - int rw = bio_data_dir(io->bio); + int rw = bio_data_dir(bio); io->start_time = jiffies; @@ -476,6 +492,10 @@ static void start_io_acct(struct dm_io *io) part_stat_unlock(); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); + + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + bio_sectors(bio), false, 0, &io->stats_aux); } static void end_io_acct(struct dm_io *io) @@ -491,6 +511,10 @@ static void end_io_acct(struct dm_io *io) part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); part_stat_unlock(); + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, + bio_sectors(bio), true, duration, &io->stats_aux); + /* * After this is decremented the bio must not be touched if it is * a flush. @@ -1519,7 +1543,7 @@ static void _dm_request(struct request_queue *q, struct bio *bio) return; } -static int dm_request_based(struct mapped_device *md) +int dm_request_based(struct mapped_device *md) { return blk_queue_stackable(md->queue); } @@ -1958,6 +1982,8 @@ static struct mapped_device *alloc_dev(int minor) md->flush_bio.bi_bdev = md->bdev; md->flush_bio.bi_rw = WRITE_FLUSH; + dm_stats_init(&md->stats); + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -2009,6 +2035,7 @@ static void free_dev(struct mapped_device *md) put_disk(md->disk); blk_cleanup_queue(md->queue); + dm_stats_cleanup(&md->stats); module_put(THIS_MODULE); kfree(md); } @@ -2150,7 +2177,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, /* * Wipe any geometry if the size of the table changed. */ - if (size != get_capacity(md->disk)) + if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); __set_size(md, size); @@ -2696,6 +2723,38 @@ out: return r; } +/* + * Internal suspend/resume works like userspace-driven suspend. It waits + * until all bios finish and prevents issuing new bios to the target drivers. + * It may be used only from the kernel. + * + * Internal suspend holds md->suspend_lock, which prevents interaction with + * userspace-driven suspend. + */ + +void dm_internal_suspend(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md)) + return; + + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + synchronize_srcu(&md->io_barrier); + flush_workqueue(md->wq); + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); +} + +void dm_internal_resume(struct mapped_device *md) +{ + if (dm_suspended_md(md)) + goto done; + + dm_queue_flush(md); + +done: + mutex_unlock(&md->suspend_lock); +} + /*----------------------------------------------------------------- * Event notification. *---------------------------------------------------------------*/ diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 8b4c075d9a2f..5e604cc7b4aa 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -16,6 +16,8 @@ #include #include +#include "dm-stats.h" + /* * Suspend feature flags */ @@ -157,10 +159,16 @@ void dm_destroy(struct mapped_device *md); void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md); +int dm_request_based(struct mapped_device *md); +sector_t dm_get_size(struct mapped_device *md); +struct dm_stats *dm_get_stats(struct mapped_device *md); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie); +void dm_internal_suspend(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + int dm_io_init(void); void dm_io_exit(void); @@ -173,4 +181,12 @@ void dm_kcopyd_exit(void); struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); +/* + * Helpers that are used by DM core + */ +static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) +{ + return !maxlen || strlen(result) + 1 >= maxlen; +} + #endif diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e151d4c9298d..653073de09e3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -10,6 +10,7 @@ #include #include +#include #include struct dm_dev; @@ -550,6 +551,14 @@ extern struct ratelimit_state dm_ratelimit_state; #define DM_MAPIO_REMAPPED 1 #define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE +#define dm_sector_div64(x, y)( \ +{ \ + u64 _res; \ + (x) = div64_u64_rem(x, y, &_res); \ + _res; \ +} \ +) + /* * Ceiling(n / sz) */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index afd0cbd52edb..f1e12bd40b3b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 25 +#define DM_VERSION_MINOR 26 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2013-06-26)" +#define DM_VERSION_EXTRA "-ioctl (2013-08-15)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-71-gd317 From b04c99e3b845892d754ee8052d6324c39c4040de Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 7 Sep 2013 09:48:41 -0700 Subject: Revert "Input: introduce BTN/ABS bits for drums and guitars" This reverts commits 61e00655e9cb, 73f8645db191 and 8e22ecb603c8: "Input: introduce BTN/ABS bits for drums and guitars" "HID: wiimote: add support for Guitar-Hero drums" "HID: wiimote: add support for Guitar-Hero guitars" The extra new ABS_xx values resulted in ABS_MAX no longer being a power-of-two, which broke the comparison logic. It also caused the ioctl numbers to overflow into the next byte, causing problems for that. We'll try again for 3.13. Reported-by: Markus Trippelsdorf Reported-by: Linus Torvalds Acked-by: David Herrmann Acked-by: Dmitry Torokhov Cc: Benjamin Tissoires Signed-off-by: Linus Torvalds --- drivers/hid/hid-wiimote-core.c | 14 -- drivers/hid/hid-wiimote-modules.c | 392 -------------------------------------- drivers/hid/hid-wiimote.h | 3 - include/linux/mod_devicetable.h | 2 +- include/uapi/linux/input.h | 25 +-- 5 files changed, 3 insertions(+), 433 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hid-wiimote-core.c b/drivers/hid/hid-wiimote-core.c index bd2bc4a1f378..abb20db2b443 100644 --- a/drivers/hid/hid-wiimote-core.c +++ b/drivers/hid/hid-wiimote-core.c @@ -455,12 +455,6 @@ static __u8 wiimote_cmd_read_ext(struct wiimote_data *wdata, __u8 *rmem) return WIIMOTE_EXT_BALANCE_BOARD; if (rmem[4] == 0x01 && rmem[5] == 0x20) return WIIMOTE_EXT_PRO_CONTROLLER; - if (rmem[0] == 0x01 && rmem[1] == 0x00 && - rmem[4] == 0x01 && rmem[5] == 0x03) - return WIIMOTE_EXT_GUITAR_HERO_DRUMS; - if (rmem[0] == 0x00 && rmem[1] == 0x00 && - rmem[4] == 0x01 && rmem[5] == 0x03) - return WIIMOTE_EXT_GUITAR_HERO_GUITAR; return WIIMOTE_EXT_UNKNOWN; } @@ -494,8 +488,6 @@ static bool wiimote_cmd_map_mp(struct wiimote_data *wdata, __u8 exttype) /* map MP with correct pass-through mode */ switch (exttype) { case WIIMOTE_EXT_CLASSIC_CONTROLLER: - case WIIMOTE_EXT_GUITAR_HERO_DRUMS: - case WIIMOTE_EXT_GUITAR_HERO_GUITAR: wmem = 0x07; break; case WIIMOTE_EXT_NUNCHUK: @@ -1083,8 +1075,6 @@ static const char *wiimote_exttype_names[WIIMOTE_EXT_NUM] = { [WIIMOTE_EXT_CLASSIC_CONTROLLER] = "Nintendo Wii Classic Controller", [WIIMOTE_EXT_BALANCE_BOARD] = "Nintendo Wii Balance Board", [WIIMOTE_EXT_PRO_CONTROLLER] = "Nintendo Wii U Pro Controller", - [WIIMOTE_EXT_GUITAR_HERO_DRUMS] = "Nintendo Wii Guitar Hero Drums", - [WIIMOTE_EXT_GUITAR_HERO_GUITAR] = "Nintendo Wii Guitar Hero Guitar", }; /* @@ -1670,10 +1660,6 @@ static ssize_t wiimote_ext_show(struct device *dev, return sprintf(buf, "balanceboard\n"); case WIIMOTE_EXT_PRO_CONTROLLER: return sprintf(buf, "procontroller\n"); - case WIIMOTE_EXT_GUITAR_HERO_DRUMS: - return sprintf(buf, "drums\n"); - case WIIMOTE_EXT_GUITAR_HERO_GUITAR: - return sprintf(buf, "guitar\n"); case WIIMOTE_EXT_UNKNOWN: /* fallthrough */ default: diff --git a/drivers/hid/hid-wiimote-modules.c b/drivers/hid/hid-wiimote-modules.c index 7e124c351e67..2e7d644dba18 100644 --- a/drivers/hid/hid-wiimote-modules.c +++ b/drivers/hid/hid-wiimote-modules.c @@ -1833,396 +1833,6 @@ static const struct wiimod_ops wiimod_pro = { .in_ext = wiimod_pro_in_ext, }; -/* - * Drums - * Guitar-Hero, Rock-Band and other games came bundled with drums which can - * be plugged as extension to a Wiimote. Drum-reports are still not entirely - * figured out, but the most important information is known. - * We create a separate device for drums and report all information via this - * input device. - */ - -static inline void wiimod_drums_report_pressure(struct wiimote_data *wdata, - __u8 none, __u8 which, - __u8 pressure, __u8 onoff, - __u8 *store, __u16 code, - __u8 which_code) -{ - static const __u8 default_pressure = 3; - - if (!none && which == which_code) { - *store = pressure; - input_report_abs(wdata->extension.input, code, *store); - } else if (onoff != !!*store) { - *store = onoff ? default_pressure : 0; - input_report_abs(wdata->extension.input, code, *store); - } -} - -static void wiimod_drums_in_ext(struct wiimote_data *wdata, const __u8 *ext) -{ - __u8 pressure, which, none, hhp, sx, sy; - __u8 o, r, y, g, b, bass, bm, bp; - - /* Byte | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 1 | 0 | 0 | SX <5:0> | - * 2 | 0 | 0 | SY <5:0> | - * -----+-----+-----+-----------------------------+-----+ - * 3 | HPP | NON | WHICH <5:1> | ? | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 4 | SOFT <7:5> | 0 | 1 | 1 | 0 | ? | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 5 | ? | 1 | 1 | B- | 1 | B+ | 1 | ? | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 6 | O | R | Y | G | B | BSS | 1 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * All buttons are 0 if pressed - * - * With Motion+ enabled, the following bits will get invalid: - * Byte | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 1 | 0 | 0 | SX <5:1> |XXXXX| - * 2 | 0 | 0 | SY <5:1> |XXXXX| - * -----+-----+-----+-----------------------------+-----+ - * 3 | HPP | NON | WHICH <5:1> | ? | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 4 | SOFT <7:5> | 0 | 1 | 1 | 0 | ? | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 5 | ? | 1 | 1 | B- | 1 | B+ | 1 |XXXXX| - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 6 | O | R | Y | G | B | BSS |XXXXX|XXXXX| - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - */ - - pressure = 7 - (ext[3] >> 5); - which = (ext[2] >> 1) & 0x1f; - none = !!(ext[2] & 0x40); - hhp = !(ext[2] & 0x80); - sx = ext[0] & 0x3f; - sy = ext[1] & 0x3f; - o = !(ext[5] & 0x80); - r = !(ext[5] & 0x40); - y = !(ext[5] & 0x20); - g = !(ext[5] & 0x10); - b = !(ext[5] & 0x08); - bass = !(ext[5] & 0x04); - bm = !(ext[4] & 0x10); - bp = !(ext[4] & 0x04); - - wiimod_drums_report_pressure(wdata, none, which, pressure, - o, &wdata->state.pressure_drums[0], - ABS_CYMBAL_RIGHT, 0x0e); - wiimod_drums_report_pressure(wdata, none, which, pressure, - r, &wdata->state.pressure_drums[1], - ABS_TOM_LEFT, 0x19); - wiimod_drums_report_pressure(wdata, none, which, pressure, - y, &wdata->state.pressure_drums[2], - ABS_CYMBAL_LEFT, 0x11); - wiimod_drums_report_pressure(wdata, none, which, pressure, - g, &wdata->state.pressure_drums[3], - ABS_TOM_FAR_RIGHT, 0x12); - wiimod_drums_report_pressure(wdata, none, which, pressure, - b, &wdata->state.pressure_drums[4], - ABS_TOM_RIGHT, 0x0f); - - /* Bass shares pressure with hi-hat (set via hhp) */ - wiimod_drums_report_pressure(wdata, none, hhp ? 0xff : which, pressure, - bass, &wdata->state.pressure_drums[5], - ABS_BASS, 0x1b); - /* Hi-hat has no on/off values, just pressure. Force to off/0. */ - wiimod_drums_report_pressure(wdata, none, hhp ? which : 0xff, pressure, - 0, &wdata->state.pressure_drums[6], - ABS_HI_HAT, 0x0e); - - input_report_abs(wdata->extension.input, ABS_X, sx - 0x20); - input_report_abs(wdata->extension.input, ABS_Y, sy - 0x20); - - input_report_key(wdata->extension.input, BTN_START, bp); - input_report_key(wdata->extension.input, BTN_SELECT, bm); - - input_sync(wdata->extension.input); -} - -static int wiimod_drums_open(struct input_dev *dev) -{ - struct wiimote_data *wdata = input_get_drvdata(dev); - unsigned long flags; - - spin_lock_irqsave(&wdata->state.lock, flags); - wdata->state.flags |= WIIPROTO_FLAG_EXT_USED; - wiiproto_req_drm(wdata, WIIPROTO_REQ_NULL); - spin_unlock_irqrestore(&wdata->state.lock, flags); - - return 0; -} - -static void wiimod_drums_close(struct input_dev *dev) -{ - struct wiimote_data *wdata = input_get_drvdata(dev); - unsigned long flags; - - spin_lock_irqsave(&wdata->state.lock, flags); - wdata->state.flags &= ~WIIPROTO_FLAG_EXT_USED; - wiiproto_req_drm(wdata, WIIPROTO_REQ_NULL); - spin_unlock_irqrestore(&wdata->state.lock, flags); -} - -static int wiimod_drums_probe(const struct wiimod_ops *ops, - struct wiimote_data *wdata) -{ - int ret; - - wdata->extension.input = input_allocate_device(); - if (!wdata->extension.input) - return -ENOMEM; - - input_set_drvdata(wdata->extension.input, wdata); - wdata->extension.input->open = wiimod_drums_open; - wdata->extension.input->close = wiimod_drums_close; - wdata->extension.input->dev.parent = &wdata->hdev->dev; - wdata->extension.input->id.bustype = wdata->hdev->bus; - wdata->extension.input->id.vendor = wdata->hdev->vendor; - wdata->extension.input->id.product = wdata->hdev->product; - wdata->extension.input->id.version = wdata->hdev->version; - wdata->extension.input->name = WIIMOTE_NAME " Drums"; - - set_bit(EV_KEY, wdata->extension.input->evbit); - set_bit(BTN_START, wdata->extension.input->keybit); - set_bit(BTN_SELECT, wdata->extension.input->keybit); - - set_bit(EV_ABS, wdata->extension.input->evbit); - set_bit(ABS_X, wdata->extension.input->absbit); - set_bit(ABS_Y, wdata->extension.input->absbit); - set_bit(ABS_TOM_LEFT, wdata->extension.input->absbit); - set_bit(ABS_TOM_RIGHT, wdata->extension.input->absbit); - set_bit(ABS_TOM_FAR_RIGHT, wdata->extension.input->absbit); - set_bit(ABS_CYMBAL_LEFT, wdata->extension.input->absbit); - set_bit(ABS_CYMBAL_RIGHT, wdata->extension.input->absbit); - set_bit(ABS_BASS, wdata->extension.input->absbit); - set_bit(ABS_HI_HAT, wdata->extension.input->absbit); - input_set_abs_params(wdata->extension.input, - ABS_X, -32, 31, 1, 1); - input_set_abs_params(wdata->extension.input, - ABS_Y, -32, 31, 1, 1); - input_set_abs_params(wdata->extension.input, - ABS_TOM_LEFT, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_TOM_RIGHT, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_TOM_FAR_RIGHT, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_CYMBAL_LEFT, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_CYMBAL_RIGHT, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_BASS, 0, 7, 0, 0); - input_set_abs_params(wdata->extension.input, - ABS_HI_HAT, 0, 7, 0, 0); - - ret = input_register_device(wdata->extension.input); - if (ret) - goto err_free; - - return 0; - -err_free: - input_free_device(wdata->extension.input); - wdata->extension.input = NULL; - return ret; -} - -static void wiimod_drums_remove(const struct wiimod_ops *ops, - struct wiimote_data *wdata) -{ - if (!wdata->extension.input) - return; - - input_unregister_device(wdata->extension.input); - wdata->extension.input = NULL; -} - -static const struct wiimod_ops wiimod_drums = { - .flags = 0, - .arg = 0, - .probe = wiimod_drums_probe, - .remove = wiimod_drums_remove, - .in_ext = wiimod_drums_in_ext, -}; - -/* - * Guitar - * Guitar-Hero, Rock-Band and other games came bundled with guitars which can - * be plugged as extension to a Wiimote. - * We create a separate device for guitars and report all information via this - * input device. - */ - -static void wiimod_guitar_in_ext(struct wiimote_data *wdata, const __u8 *ext) -{ - __u8 sx, sy, tb, wb, bd, bm, bp, bo, br, bb, bg, by, bu; - - /* Byte | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 1 | 0 | 0 | SX <5:0> | - * 2 | 0 | 0 | SY <5:0> | - * -----+-----+-----+-----+-----------------------------+ - * 3 | 0 | 0 | 0 | TB <4:0> | - * -----+-----+-----+-----+-----------------------------+ - * 4 | 0 | 0 | 0 | WB <4:0> | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 5 | 1 | BD | 1 | B- | 1 | B+ | 1 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 6 | BO | BR | BB | BG | BY | 1 | 1 | BU | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * All buttons are 0 if pressed - * - * With Motion+ enabled, the following bits will get invalid: - * Byte | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 1 | 0 | 0 | SX <5:1> |XXXXX| - * 2 | 0 | 0 | SY <5:1> |XXXXX| - * -----+-----+-----+-----+-----------------------+-----+ - * 3 | 0 | 0 | 0 | TB <4:0> | - * -----+-----+-----+-----+-----------------------------+ - * 4 | 0 | 0 | 0 | WB <4:0> | - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 5 | 1 | BD | 1 | B- | 1 | B+ | 1 |XXXXX| - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - * 6 | BO | BR | BB | BG | BY | 1 |XXXXX|XXXXX| - * -----+-----+-----+-----+-----+-----+-----+-----+-----+ - */ - - sx = ext[0] & 0x3f; - sy = ext[1] & 0x3f; - tb = ext[2] & 0x1f; - wb = ext[3] & 0x1f; - bd = !(ext[4] & 0x40); - bm = !(ext[4] & 0x10); - bp = !(ext[4] & 0x04); - bo = !(ext[5] & 0x80); - br = !(ext[5] & 0x40); - bb = !(ext[5] & 0x20); - bg = !(ext[5] & 0x10); - by = !(ext[5] & 0x08); - bu = !(ext[5] & 0x01); - - input_report_abs(wdata->extension.input, ABS_X, sx - 0x20); - input_report_abs(wdata->extension.input, ABS_Y, sy - 0x20); - input_report_abs(wdata->extension.input, ABS_FRET_BOARD, tb); - input_report_abs(wdata->extension.input, ABS_WHAMMY_BAR, wb - 0x10); - - input_report_key(wdata->extension.input, BTN_MODE, bm); - input_report_key(wdata->extension.input, BTN_START, bp); - input_report_key(wdata->extension.input, BTN_STRUM_BAR_UP, bu); - input_report_key(wdata->extension.input, BTN_STRUM_BAR_DOWN, bd); - input_report_key(wdata->extension.input, BTN_FRET_FAR_UP, bg); - input_report_key(wdata->extension.input, BTN_FRET_UP, br); - input_report_key(wdata->extension.input, BTN_FRET_MID, by); - input_report_key(wdata->extension.input, BTN_FRET_LOW, bb); - input_report_key(wdata->extension.input, BTN_FRET_FAR_LOW, bo); - - input_sync(wdata->extension.input); -} - -static int wiimod_guitar_open(struct input_dev *dev) -{ - struct wiimote_data *wdata = input_get_drvdata(dev); - unsigned long flags; - - spin_lock_irqsave(&wdata->state.lock, flags); - wdata->state.flags |= WIIPROTO_FLAG_EXT_USED; - wiiproto_req_drm(wdata, WIIPROTO_REQ_NULL); - spin_unlock_irqrestore(&wdata->state.lock, flags); - - return 0; -} - -static void wiimod_guitar_close(struct input_dev *dev) -{ - struct wiimote_data *wdata = input_get_drvdata(dev); - unsigned long flags; - - spin_lock_irqsave(&wdata->state.lock, flags); - wdata->state.flags &= ~WIIPROTO_FLAG_EXT_USED; - wiiproto_req_drm(wdata, WIIPROTO_REQ_NULL); - spin_unlock_irqrestore(&wdata->state.lock, flags); -} - -static int wiimod_guitar_probe(const struct wiimod_ops *ops, - struct wiimote_data *wdata) -{ - int ret; - - wdata->extension.input = input_allocate_device(); - if (!wdata->extension.input) - return -ENOMEM; - - input_set_drvdata(wdata->extension.input, wdata); - wdata->extension.input->open = wiimod_guitar_open; - wdata->extension.input->close = wiimod_guitar_close; - wdata->extension.input->dev.parent = &wdata->hdev->dev; - wdata->extension.input->id.bustype = wdata->hdev->bus; - wdata->extension.input->id.vendor = wdata->hdev->vendor; - wdata->extension.input->id.product = wdata->hdev->product; - wdata->extension.input->id.version = wdata->hdev->version; - wdata->extension.input->name = WIIMOTE_NAME " Guitar"; - - set_bit(EV_KEY, wdata->extension.input->evbit); - set_bit(BTN_MODE, wdata->extension.input->keybit); - set_bit(BTN_START, wdata->extension.input->keybit); - set_bit(BTN_FRET_FAR_UP, wdata->extension.input->keybit); - set_bit(BTN_FRET_UP, wdata->extension.input->keybit); - set_bit(BTN_FRET_MID, wdata->extension.input->keybit); - set_bit(BTN_FRET_LOW, wdata->extension.input->keybit); - set_bit(BTN_FRET_FAR_LOW, wdata->extension.input->keybit); - set_bit(BTN_STRUM_BAR_UP, wdata->extension.input->keybit); - set_bit(BTN_STRUM_BAR_DOWN, wdata->extension.input->keybit); - - set_bit(EV_ABS, wdata->extension.input->evbit); - set_bit(ABS_X, wdata->extension.input->absbit); - set_bit(ABS_Y, wdata->extension.input->absbit); - set_bit(ABS_FRET_BOARD, wdata->extension.input->absbit); - set_bit(ABS_WHAMMY_BAR, wdata->extension.input->absbit); - input_set_abs_params(wdata->extension.input, - ABS_X, -32, 31, 1, 1); - input_set_abs_params(wdata->extension.input, - ABS_Y, -32, 31, 1, 1); - input_set_abs_params(wdata->extension.input, - ABS_FRET_BOARD, 0, 0x1f, 1, 1); - input_set_abs_params(wdata->extension.input, - ABS_WHAMMY_BAR, 0, 0x0f, 1, 1); - - ret = input_register_device(wdata->extension.input); - if (ret) - goto err_free; - - return 0; - -err_free: - input_free_device(wdata->extension.input); - wdata->extension.input = NULL; - return ret; -} - -static void wiimod_guitar_remove(const struct wiimod_ops *ops, - struct wiimote_data *wdata) -{ - if (!wdata->extension.input) - return; - - input_unregister_device(wdata->extension.input); - wdata->extension.input = NULL; -} - -static const struct wiimod_ops wiimod_guitar = { - .flags = 0, - .arg = 0, - .probe = wiimod_guitar_probe, - .remove = wiimod_guitar_remove, - .in_ext = wiimod_guitar_in_ext, -}; - /* * Builtin Motion Plus * This module simply sets the WIIPROTO_FLAG_BUILTIN_MP protocol flag which @@ -2473,6 +2083,4 @@ const struct wiimod_ops *wiimod_ext_table[WIIMOTE_EXT_NUM] = { [WIIMOTE_EXT_CLASSIC_CONTROLLER] = &wiimod_classic, [WIIMOTE_EXT_BALANCE_BOARD] = &wiimod_bboard, [WIIMOTE_EXT_PRO_CONTROLLER] = &wiimod_pro, - [WIIMOTE_EXT_GUITAR_HERO_DRUMS] = &wiimod_drums, - [WIIMOTE_EXT_GUITAR_HERO_GUITAR] = &wiimod_guitar, }; diff --git a/drivers/hid/hid-wiimote.h b/drivers/hid/hid-wiimote.h index 379cdfb6bd25..f1474f372c0b 100644 --- a/drivers/hid/hid-wiimote.h +++ b/drivers/hid/hid-wiimote.h @@ -88,8 +88,6 @@ enum wiimote_exttype { WIIMOTE_EXT_CLASSIC_CONTROLLER, WIIMOTE_EXT_BALANCE_BOARD, WIIMOTE_EXT_PRO_CONTROLLER, - WIIMOTE_EXT_GUITAR_HERO_DRUMS, - WIIMOTE_EXT_GUITAR_HERO_GUITAR, WIIMOTE_EXT_NUM, }; @@ -137,7 +135,6 @@ struct wiimote_state { /* calibration data */ __u16 calib_bboard[4][3]; - __u8 pressure_drums[7]; }; struct wiimote_data { diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 329aa307cb77..45e921401b06 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -277,7 +277,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_KEY_MIN_INTERESTING 0x71 #define INPUT_DEVICE_ID_KEY_MAX 0x2ff #define INPUT_DEVICE_ID_REL_MAX 0x0f -#define INPUT_DEVICE_ID_ABS_MAX 0x4f +#define INPUT_DEVICE_ID_ABS_MAX 0x3f #define INPUT_DEVICE_ID_MSC_MAX 0x07 #define INPUT_DEVICE_ID_LED_MAX 0x0f #define INPUT_DEVICE_ID_SND_MAX 0x07 diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 76457eef172a..d584047b072b 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -716,14 +716,6 @@ struct input_keymap_entry { #define BTN_DPAD_LEFT 0x222 #define BTN_DPAD_RIGHT 0x223 -#define BTN_FRET_FAR_UP 0x224 -#define BTN_FRET_UP 0x225 -#define BTN_FRET_MID 0x226 -#define BTN_FRET_LOW 0x227 -#define BTN_FRET_FAR_LOW 0x228 -#define BTN_STRUM_BAR_UP 0x229 -#define BTN_STRUM_BAR_DOWN 0x22a - #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 @@ -837,21 +829,8 @@ struct input_keymap_entry { #define ABS_MT_TOOL_X 0x3c /* Center X tool position */ #define ABS_MT_TOOL_Y 0x3d /* Center Y tool position */ -/* Drums and guitars (mostly toys) */ -#define ABS_TOM_FAR_LEFT 0x40 -#define ABS_TOM_LEFT 0x41 -#define ABS_TOM_RIGHT 0x42 -#define ABS_TOM_FAR_RIGHT 0x43 -#define ABS_CYMBAL_FAR_LEFT 0x44 -#define ABS_CYMBAL_LEFT 0x45 -#define ABS_CYMBAL_RIGHT 0x46 -#define ABS_CYMBAL_FAR_RIGHT 0x47 -#define ABS_BASS 0x48 -#define ABS_HI_HAT 0x49 -#define ABS_FRET_BOARD 0x4a /* Guitar fret board, vertical pos */ -#define ABS_WHAMMY_BAR 0x4b /* Guitar whammy bar (or vibrato) */ - -#define ABS_MAX 0x4f + +#define ABS_MAX 0x3f #define ABS_CNT (ABS_MAX+1) /* -- cgit v1.2.3-71-gd317 From 8c3a2b4c420c5b988005b8697b7404ced076aaaa Mon Sep 17 00:00:00 2001 From: Scott Lovenberg Date: Fri, 9 Aug 2013 08:47:17 -0400 Subject: cifs: Move string length definitions to uapi The max string length definitions for user name, domain name, password, and share name have been moved into their own header file in uapi so the mount helper can use autoconf to define them instead of keeping the kernel side and userland side definitions in sync manually. The names have also been standardized with a "CIFS" prefix and "LEN" suffix. Signed-off-by: Scott Lovenberg Reviewed-by: Chen Gang Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 7 ++----- fs/cifs/connect.c | 12 ++++++------ fs/cifs/sess.c | 10 +++++----- include/uapi/linux/cifs/cifs_mount.h | 25 +++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 include/uapi/linux/cifs/cifs_mount.h (limited to 'include/uapi/linux') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ec6c3fbb29eb..633bbc5e8801 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -28,6 +28,7 @@ #include "cifsacl.h" #include #include +#include #ifdef CONFIG_CIFS_SMB2 #include "smb2pdu.h" #endif @@ -41,12 +42,8 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) +#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + CIFS_MAX_SHARE_LEN + 1) #define MAX_SERVER_SIZE 15 -#define MAX_SHARE_SIZE 80 -#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */ -#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ -#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ #define CIFS_MIN_RCV_POOL 4 diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 37950c65453c..b1bab99be83b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1577,8 +1577,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnlen(string, MAX_USERNAME_SIZE) > - MAX_USERNAME_SIZE) { + if (strnlen(string, CIFS_MAX_USERNAME_LEN) > + CIFS_MAX_USERNAME_LEN) { printk(KERN_WARNING "CIFS: username too long\n"); goto cifs_parse_mount_err; } @@ -2223,13 +2223,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) /* anything else takes username/password */ if (strncmp(ses->user_name, vol->username ? vol->username : "", - MAX_USERNAME_SIZE)) + CIFS_MAX_USERNAME_LEN)) return 0; if (strlen(vol->username) != 0 && ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", - MAX_PASSWORD_SIZE)) + CIFS_MAX_PASSWORD_LEN)) return 0; } return 1; @@ -2354,7 +2354,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } len = delim - payload; - if (len > MAX_USERNAME_SIZE || len <= 0) { + if (len > CIFS_MAX_USERNAME_LEN || len <= 0) { cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", len); rc = -EINVAL; @@ -2371,7 +2371,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); len = key->datalen - (len + 1); - if (len > MAX_PASSWORD_SIZE || len <= 0) { + if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) { cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); rc = -EINVAL; kfree(vol->username); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 08dd37bb23aa..a0a62db0f575 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -226,7 +226,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *(bcc_ptr+1) = 0; } else { bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); } bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null termination */ @@ -246,8 +246,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB what about null user mounts - check that we do this BB */ /* copy user */ if (ses->user_name != NULL) { - strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); + bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); } /* else null user mount */ *bcc_ptr = 0; @@ -501,7 +501,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); @@ -517,7 +517,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); diff --git a/include/uapi/linux/cifs/cifs_mount.h b/include/uapi/linux/cifs/cifs_mount.h new file mode 100644 index 000000000000..19063fef6219 --- /dev/null +++ b/include/uapi/linux/cifs/cifs_mount.h @@ -0,0 +1,25 @@ +/* + * include/uapi/linux/cifs/cifs_mount.h + * + * Author(s): Scott Lovenberg (scott.lovenberg@gmail.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + */ +#ifndef _CIFS_MOUNT_H +#define _CIFS_MOUNT_H + +/* Max string lengths for cifs mounting options. */ +#define CIFS_MAX_DOMAINNAME_LEN 256 /* max fully qualified domain name */ +#define CIFS_MAX_USERNAME_LEN 256 /* reasonable max for current servers */ +#define CIFS_MAX_PASSWORD_LEN 512 /* Windows max seems to be 256 wide chars */ +#define CIFS_MAX_SHARE_LEN 80 + +#endif /* _CIFS_MOUNT_H */ -- cgit v1.2.3-71-gd317 From 54fcf270dee6ed9d27e23b94129788b758df1e6b Mon Sep 17 00:00:00 2001 From: Scott Lovenberg Date: Fri, 9 Aug 2013 08:47:18 -0400 Subject: cifs: Expand max share name length to 256 The old max share name length limit was 80 due to Windows NET SHARE command not allowing more than that. However, share names can be much longer. This is a more reasonable maximum share name length. Signed-off-by: Scott Lovenberg Reviewed-by: Jeff Layton Signed-off-by: Steve French --- include/uapi/linux/cifs/cifs_mount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/cifs/cifs_mount.h b/include/uapi/linux/cifs/cifs_mount.h index 19063fef6219..5ac43f04f74a 100644 --- a/include/uapi/linux/cifs/cifs_mount.h +++ b/include/uapi/linux/cifs/cifs_mount.h @@ -20,6 +20,7 @@ #define CIFS_MAX_DOMAINNAME_LEN 256 /* max fully qualified domain name */ #define CIFS_MAX_USERNAME_LEN 256 /* reasonable max for current servers */ #define CIFS_MAX_PASSWORD_LEN 512 /* Windows max seems to be 256 wide chars */ -#define CIFS_MAX_SHARE_LEN 80 +#define CIFS_MAX_SHARE_LEN 256 /* reasonable max share name length */ + #endif /* _CIFS_MOUNT_H */ -- cgit v1.2.3-71-gd317 From cdf1246ffbb2ddd86a875c80b7290d22b9022805 Mon Sep 17 00:00:00 2001 From: Scott Lovenberg Date: Fri, 9 Aug 2013 08:47:19 -0400 Subject: cifs: Move and expand MAX_SERVER_SIZE definition MAX_SERVER_SIZE has been moved to cifs_mount.h and renamed CIFS_NI_MAXHOST for clarity. It has been expanded to 1024 as the previous value of 16 was very short. Signed-off-by: Scott Lovenberg Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 +-- include/uapi/linux/cifs/cifs_mount.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 633bbc5e8801..fb186f7bae49 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -42,8 +42,7 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + CIFS_MAX_SHARE_LEN + 1) -#define MAX_SERVER_SIZE 15 +#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1) #define CIFS_MIN_RCV_POOL 4 diff --git a/include/uapi/linux/cifs/cifs_mount.h b/include/uapi/linux/cifs/cifs_mount.h index 5ac43f04f74a..d7e4c6ce6171 100644 --- a/include/uapi/linux/cifs/cifs_mount.h +++ b/include/uapi/linux/cifs/cifs_mount.h @@ -21,6 +21,7 @@ #define CIFS_MAX_USERNAME_LEN 256 /* reasonable max for current servers */ #define CIFS_MAX_PASSWORD_LEN 512 /* Windows max seems to be 256 wide chars */ #define CIFS_MAX_SHARE_LEN 256 /* reasonable max share name length */ +#define CIFS_NI_MAXHOST 1024 /* max host name length (256 * 4 bytes) */ #endif /* _CIFS_MOUNT_H */ -- cgit v1.2.3-71-gd317 From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:17:53 +1000 Subject: fs: bump inode and dentry counters to long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This series reworks our current object cache shrinking infrastructure in two main ways: * Noticing that a lot of users copy and paste their own version of LRU lists for objects, we put some effort in providing a generic version. It is modeled after the filesystem users: dentries, inodes, and xfs (for various tasks), but we expect that other users could benefit in the near future with little or no modification. Let us know if you have any issues. * The underlying list_lru being proposed automatically and transparently keeps the elements in per-node lists, and is able to manipulate the node lists individually. Given this infrastructure, we are able to modify the up-to-now hammer called shrink_slab to proceed with node-reclaim instead of always searching memory from all over like it has been doing. Per-node lru lists are also expected to lead to less contention in the lru locks on multi-node scans, since we are now no longer fighting for a global lock. The locks usually disappear from the profilers with this change. Although we have no official benchmarks for this version - be our guest to independently evaluate this - earlier versions of this series were performance tested (details at http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no visible performance regressions while yielding a better qualitative behavior in NUMA machines. With this infrastructure in place, we can use the list_lru entry point to provide memcg isolation and per-memcg targeted reclaim. Historically, those two pieces of work have been posted together. This version presents only the infrastructure work, deferring the memcg work for a later time, so we can focus on getting this part tested. You can see more about the history of such work at http://lwn.net/Articles/552769/ Dave Chinner (18): dcache: convert dentry_stat.nr_unused to per-cpu counters dentry: move to per-sb LRU locks dcache: remove dentries from LRU before putting on dispose list mm: new shrinker API shrinker: convert superblock shrinkers to new API list: add a new LRU list type inode: convert inode lru list to generic lru list code. dcache: convert to use new lru list infrastructure list_lru: per-node list infrastructure shrinker: add node awareness fs: convert inode and dentry shrinking to be node aware xfs: convert buftarg LRU to generic code xfs: rework buffer dispose list tracking xfs: convert dquot cache lru to list_lru fs: convert fs shrinkers to new scan/count API drivers: convert shrinkers to new count/scan API shrinker: convert remaining shrinkers to count/scan API shrinker: Kill old ->shrink API. Glauber Costa (7): fs: bump inode and dentry counters to long super: fix calculation of shrinkable objects for small numbers list_lru: per-node API vmscan: per-node deferred work i915: bail out earlier when shrinker cannot acquire mutex hugepage: convert huge zero page shrinker to new shrinker API list_lru: dynamically adjust node arrays This patch: There are situations in very large machines in which we can have a large quantity of dirty inodes, unused dentries, etc. This is particularly true when umounting a filesystem, where eventually since every live object will eventually be discarded. Dave Chinner reported a problem with this while experimenting with the shrinker revamp patchset. So we believe it is time for a change. This patch just moves int to longs. Machines where it matters should have a big long anyway. Signed-off-by: Glauber Costa Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: Dave Chinner Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 18 +++++++++--------- fs/internal.h | 2 +- include/linux/dcache.h | 10 +++++----- include/linux/fs.h | 4 ++-- include/uapi/linux/fs.h | 6 +++--- kernel/sysctl.c | 6 +++--- 7 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 4d9df3c940e6..6ef1c2e1bbc4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -146,13 +146,13 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; @@ -162,7 +162,7 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif diff --git a/fs/inode.c b/fs/inode.c index 93a0625b46e4..2a3c37ea823d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -70,33 +70,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -109,7 +109,7 @@ int proc_nr_inodes(ctl_table *table, int write, { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif diff --git a/fs/internal.h b/fs/internal.h index 2be46ea5dd0b..b6495659d6e8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -121,7 +121,7 @@ extern void inode_add_lru(struct inode *inode); */ extern void inode_wb_list_del(struct inode *inode); -extern int get_nr_dirty_inodes(void); +extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index feaa8d88eef7..844a1ef387e4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -55,11 +55,11 @@ struct qstr { #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) struct dentry_stat_t { - int nr_dentry; - int nr_unused; - int age_limit; /* age in seconds */ - int want_pages; /* pages requested by system */ - int dummy[2]; + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long dummy[2]; }; extern struct dentry_stat_t dentry_stat; diff --git a/include/linux/fs.h b/include/linux/fs.h index 49e71b0f0e9f..3b3edac75df2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1271,12 +1271,12 @@ struct super_block { struct list_head s_mounts; /* list of mounts; _not_ for fs use */ /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ - int s_nr_dentry_unused; /* # of dentry on lru */ + long s_nr_dentry_unused; /* # of dentry on lru */ /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; struct list_head s_inode_lru; /* unused inode lru */ - int s_nr_inodes_unused; /* # of inodes on lru */ + long s_nr_inodes_unused; /* # of inodes on lru */ struct block_device *s_bdev; struct backing_dev_info *s_bdi; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a4ed56cf0eac..6c28b61bb690 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -49,9 +49,9 @@ struct files_stat_struct { }; struct inodes_stat_t { - int nr_inodes; - int nr_unused; - int dummy[5]; /* padding for sysctl ABI compatibility */ + long nr_inodes; + long nr_unused; + long dummy[5]; /* padding for sysctl ABI compatibility */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..7822cd88a95c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,14 +1471,14 @@ static struct ctl_table fs_table[] = { { .procname = "inode-nr", .data = &inodes_stat, - .maxlen = 2*sizeof(int), + .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, - .maxlen = 7*sizeof(int), + .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, @@ -1508,7 +1508,7 @@ static struct ctl_table fs_table[] = { { .procname = "dentry-state", .data = &dentry_stat, - .maxlen = 6*sizeof(int), + .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, -- cgit v1.2.3-71-gd317 From a8e0108cac181a7b141dacaa99ea52efaf9b5f07 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Tue, 17 Sep 2013 14:53:41 -0400 Subject: perf: Fix UAPI export of PERF_EVENT_IOC_ID Without the following patch I have problems compiling code using the new PERF_EVENT_IOC_ID ioctl(). It looks like u64 was used instead of __u64 Signed-off-by: Vince Weaver Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1309171450380.11444@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ca1d90bcb74d..40a1fb807396 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -324,7 +324,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, u64 *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, -- cgit v1.2.3-71-gd317 From c5ecceefdb840af45db436adc58219ae97b6ef3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 23:39:17 +0200 Subject: perf: Update ABI comment For some mysterious reason the sample_id field of PERF_RECORD_MMAP went AWOL. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 40a1fb807396..7f6d584c267b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -528,6 +528,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, -- cgit v1.2.3-71-gd317 From fa7315871046b9a4c48627905691dbde57e51033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Sep 2013 10:16:42 +0200 Subject: perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page' Solve the problems around the broken definition of perf_event_mmap_page:: cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially fixed by: 860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'") The problem with the fix (merged in v3.12-rc1 and not yet released officially), noticed by Vince Weaver is that the new behavior is not detectable by new user-space, and that due to the reuse of the field names it's easy to mis-compile a binary if old headers are used on a new kernel or new headers are used on an old kernel. To solve all that make this change explicit, detectable and self-contained, by iterating the ABI the following way: - Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not confuse old user-space binaries. RDPMC will be marked as unavailable to old binaries but that's within the ABI, this is a capability bit. - Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new libraries can reliably detect that bit 0 is deprecated and perma-zero without having to check the kernel version. - Use bits 2, 3, 4 for the newly defined, correct functionality: cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ cap_user_time : 1, /* The time_* fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - Rename all the bitfield names in perf_event.h to be different from the old names, to make sure it's not possible to mis-compile it accidentally with old assumptions. The 'size' field can then be used in the future to add new fields and it will act as a natural ABI version indicator as well. Also adjust tools/perf/ userspace for the new definitions, noticed by Adrian Hunter. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Also-Fixed-by: Adrian Hunter Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++----- include/uapi/linux/perf_event.h | 14 +++++++++----- kernel/events/core.c | 21 +++++++++++++++++++++ tools/perf/arch/x86/util/tsc.c | 6 +++--- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8355c84b9729..a9c606bb4945 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,9 +1883,9 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_time_zero = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -1894,13 +1894,13 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; if (sched_clock_stable && !check_tsc_disabled()) { - userpg->cap_usr_time_zero = 1; + userpg->cap_user_time_zero = 1; userpg->time_zero = this_cpu_read(cyc2ns_offset); } } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7f6d584c267b..009a655a5d35 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -380,10 +380,13 @@ struct perf_event_mmap_page { union { __u64 capabilities; struct { - __u64 cap_usr_time : 1, - cap_usr_rdpmc : 1, - cap_usr_time_zero : 1, - cap_____res : 61; + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_* fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_____res : 59; }; }; @@ -442,12 +445,13 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[119]; /* align to 1k */ + __u8 __reserved[118*8+4]; /* align to 1k. */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3a..cb4238e85b38 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -4044,6 +4064,7 @@ again: ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index 9570c2b0f83c..b2519e49424f 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -32,7 +32,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc) int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) { - bool cap_usr_time_zero; + bool cap_user_time_zero; u32 seq; int i = 0; @@ -42,7 +42,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, tc->time_mult = pc->time_mult; tc->time_shift = pc->time_shift; tc->time_zero = pc->time_zero; - cap_usr_time_zero = pc->cap_usr_time_zero; + cap_user_time_zero = pc->cap_user_time_zero; rmb(); if (pc->lock == seq && !(seq & 1)) break; @@ -52,7 +52,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } } - if (!cap_usr_time_zero) + if (!cap_user_time_zero) return -EOPNOTSUPP; return 0; -- cgit v1.2.3-71-gd317