From 07a3061e0832fe22932e0fa977581e45b9c42431 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:35 +0200
Subject: batman-adv: netlink: add routing_algo query

BATADV_CMD_GET_ROUTING_ALGOS is used to get the list of supported routing
algorithms.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven.eckelmann@open-mesh.com: Reduce the number of changes to
BATADV_CMD_GET_ROUTING_ALGOS, fix includes]
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 0fbf6fd4711b..7afce444a64e 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -73,6 +73,7 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv device
  * @BATADV_CMD_TP_METER: Start a tp meter session
  * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
+ * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -81,6 +82,7 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_MESH_INFO,
 	BATADV_CMD_TP_METER,
 	BATADV_CMD_TP_METER_CANCEL,
+	BATADV_CMD_GET_ROUTING_ALGOS,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
-- 
cgit v1.2.3-71-gd317


From b60620cf567b79da46096a0ba29b39f23b6e7f1c Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:36 +0200
Subject: batman-adv: netlink: hardif query

BATADV_CMD_GET_HARDIFS will return the list of hardifs (including index,
name and MAC address) of all hardifs for a given softif.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven.eckelmann@open-mesh.com: Reduce the number of changes to
BATADV_CMD_GET_HARDIFS, add policy for attributes]
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h |   4 ++
 net/batman-adv/netlink.c        | 128 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 130 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 7afce444a64e..8abcbca14b6a 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -40,6 +40,7 @@
  * @BATADV_ATTR_TPMETER_BYTES: amount of acked bytes during run
  * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
  * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
+ * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -60,6 +61,7 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_TPMETER_BYTES,
 	BATADV_ATTR_TPMETER_COOKIE,
 	BATADV_ATTR_PAD,
+	BATADV_ATTR_ACTIVE,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
@@ -74,6 +76,7 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_TP_METER: Start a tp meter session
  * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
  * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
+ * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -83,6 +86,7 @@ enum batadv_nl_commands {
 	BATADV_CMD_TP_METER,
 	BATADV_CMD_TP_METER_CANCEL,
 	BATADV_CMD_GET_ROUTING_ALGOS,
+	BATADV_CMD_GET_HARDIFS,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 19fb2657e274..3f872d6eec57 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -26,10 +26,14 @@
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
 #include <net/genetlink.h>
 #include <net/netlink.h>
+#include <net/sock.h>
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
@@ -37,8 +41,6 @@
 #include "soft-interface.h"
 #include "tp_meter.h"
 
-struct sk_buff;
-
 struct genl_family batadv_netlink_family = {
 	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
@@ -70,8 +72,24 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_TPMETER_TEST_TIME]	= { .type = NLA_U32 },
 	[BATADV_ATTR_TPMETER_BYTES]	= { .type = NLA_U64 },
 	[BATADV_ATTR_TPMETER_COOKIE]	= { .type = NLA_U32 },
+	[BATADV_ATTR_ACTIVE]		= { .type = NLA_FLAG },
 };
 
+/**
+ * batadv_netlink_get_ifindex - Extract an interface index from a message
+ * @nlh: Message header
+ * @attrtype: Attribute which holds an interface index
+ *
+ * Return: interface index, or 0.
+ */
+static int
+batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
+{
+	struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
+
+	return attr ? nla_get_u32(attr) : 0;
+}
+
 /**
  * batadv_netlink_mesh_info_put - fill in generic information about mesh
  *  interface
@@ -381,6 +399,106 @@ out:
 	return ret;
 }
 
+/**
+ * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hard_iface: Hard interface to dump
+ *
+ * Return: error code, or 0 on success
+ */
+static int
+batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
+				 struct batadv_hard_iface *hard_iface)
+{
+	struct net_device *net_dev = hard_iface->net_dev;
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+			  BATADV_CMD_GET_HARDIFS);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+			net_dev->ifindex) ||
+	    nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+			   net_dev->name) ||
+	    nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
+		    net_dev->dev_addr))
+		goto nla_put_failure;
+
+	if (hard_iface->if_status == BATADV_IF_ACTIVE) {
+		if (nla_put_flag(msg, BATADV_ATTR_ACTIVE))
+			goto nla_put_failure;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_dump_hardifs - Dump all hard interface into a messages
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: error code, or length of reply message on success
+ */
+static int
+batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_hard_iface *hard_iface;
+	int ifindex;
+	int portid = NETLINK_CB(cb->skb).portid;
+	int seq = cb->nlh->nlmsg_seq;
+	int skip = cb->args[0];
+	int i = 0;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh,
+					     BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface)
+		return -ENODEV;
+
+	if (!batadv_softif_is_valid(soft_iface)) {
+		dev_put(soft_iface);
+		return -ENODEV;
+	}
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+		if (hard_iface->soft_iface != soft_iface)
+			continue;
+
+		if (i++ < skip)
+			continue;
+
+		if (batadv_netlink_dump_hardif_entry(msg, portid, seq,
+						     hard_iface)) {
+			i--;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+
+	dev_put(soft_iface);
+
+	cb->args[0] = i;
+
+	return msg->len;
+}
+
 static struct genl_ops batadv_netlink_ops[] = {
 	{
 		.cmd = BATADV_CMD_GET_MESH_INFO,
@@ -406,6 +524,12 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_algo_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_HARDIFS,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_netlink_dump_hardifs,
+	},
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From d34f05507db245bef819b684ad84f9e0f9bb003d Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:37 +0200
Subject: batman-adv: netlink: add translation table query

This adds the commands BATADV_CMD_GET_TRANSTABLE_LOCAL and
BATADV_CMD_GET_TRANSTABLE_GLOBAL, which correspond to the transtable_local
and transtable_global debugfs files.

The batadv_tt_client_flags enum is moved to the UAPI to expose it as part
of the netlink API.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven.eckelmann@open-mesh.com: add policy for attributes, fix includes]
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
[sw@simonwunderlich.de: fix VID attributes content]
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h    |  56 ++++++
 net/batman-adv/netlink.c           |  23 ++-
 net/batman-adv/netlink.h           |   3 +
 net/batman-adv/packet.h            |  36 ----
 net/batman-adv/translation-table.c | 377 +++++++++++++++++++++++++++++++++++++
 net/batman-adv/translation-table.h |   4 +
 6 files changed, 462 insertions(+), 37 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 8abcbca14b6a..1168d058cd2b 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -22,6 +22,42 @@
 
 #define BATADV_NL_MCAST_GROUP_TPMETER	"tpmeter"
 
+/**
+ * enum batadv_tt_client_flags - TT client specific flags
+ * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
+ * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
+ *  update telling its new real location has not been received/sent yet
+ * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
+ *  This information is used by the "AP Isolation" feature
+ * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
+ *  information is used by the Extended Isolation feature
+ * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
+ * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
+ *  not been announced yet
+ * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
+ *  in the table for one more originator interval for consistency purposes
+ * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
+ *  the network but no nnode has already announced it
+ *
+ * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
+ * Bits from 8 to 15 are called _local flags_ because they are used for local
+ * computations only.
+ *
+ * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with
+ * the other nodes in the network. To achieve this goal these flags are included
+ * in the TT CRC computation.
+ */
+enum batadv_tt_client_flags {
+	BATADV_TT_CLIENT_DEL     = (1 << 0),
+	BATADV_TT_CLIENT_ROAM    = (1 << 1),
+	BATADV_TT_CLIENT_WIFI    = (1 << 4),
+	BATADV_TT_CLIENT_ISOLA	 = (1 << 5),
+	BATADV_TT_CLIENT_NOPURGE = (1 << 8),
+	BATADV_TT_CLIENT_NEW     = (1 << 9),
+	BATADV_TT_CLIENT_PENDING = (1 << 10),
+	BATADV_TT_CLIENT_TEMP	 = (1 << 11),
+};
+
 /**
  * enum batadv_nl_attrs - batman-adv netlink attributes
  *
@@ -41,6 +77,14 @@
  * @BATADV_ATTR_TPMETER_COOKIE: session cookie to match tp_meter session
  * @BATADV_ATTR_PAD: attribute used for padding for 64-bit alignment
  * @BATADV_ATTR_ACTIVE: Flag indicating if the hard interface is active
+ * @BATADV_ATTR_TT_ADDRESS: Client MAC address
+ * @BATADV_ATTR_TT_TTVN: Translation table version
+ * @BATADV_ATTR_TT_LAST_TTVN: Previous translation table version
+ * @BATADV_ATTR_TT_CRC32: CRC32 over translation table
+ * @BATADV_ATTR_TT_VID: VLAN ID
+ * @BATADV_ATTR_TT_FLAGS: Translation table client flags
+ * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
+ * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -62,6 +106,14 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_TPMETER_COOKIE,
 	BATADV_ATTR_PAD,
 	BATADV_ATTR_ACTIVE,
+	BATADV_ATTR_TT_ADDRESS,
+	BATADV_ATTR_TT_TTVN,
+	BATADV_ATTR_TT_LAST_TTVN,
+	BATADV_ATTR_TT_CRC32,
+	BATADV_ATTR_TT_VID,
+	BATADV_ATTR_TT_FLAGS,
+	BATADV_ATTR_FLAG_BEST,
+	BATADV_ATTR_LAST_SEEN_MSECS,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
@@ -77,6 +129,8 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_TP_METER_CANCEL: Cancel a tp meter session
  * @BATADV_CMD_GET_ROUTING_ALGOS: Query the list of routing algorithms.
  * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
+ * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
+ * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -87,6 +141,8 @@ enum batadv_nl_commands {
 	BATADV_CMD_TP_METER_CANCEL,
 	BATADV_CMD_GET_ROUTING_ALGOS,
 	BATADV_CMD_GET_HARDIFS,
+	BATADV_CMD_GET_TRANSTABLE_LOCAL,
+	BATADV_CMD_GET_TRANSTABLE_GLOBAL,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 3f872d6eec57..14360ec16513 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -40,6 +40,7 @@
 #include "hard-interface.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
+#include "translation-table.h"
 
 struct genl_family batadv_netlink_family = {
 	.id = GENL_ID_GENERATE,
@@ -73,6 +74,14 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_TPMETER_BYTES]	= { .type = NLA_U64 },
 	[BATADV_ATTR_TPMETER_COOKIE]	= { .type = NLA_U32 },
 	[BATADV_ATTR_ACTIVE]		= { .type = NLA_FLAG },
+	[BATADV_ATTR_TT_ADDRESS]	= { .len = ETH_ALEN },
+	[BATADV_ATTR_TT_TTVN]		= { .type = NLA_U8 },
+	[BATADV_ATTR_TT_LAST_TTVN]	= { .type = NLA_U8 },
+	[BATADV_ATTR_TT_CRC32]		= { .type = NLA_U32 },
+	[BATADV_ATTR_TT_VID]		= { .type = NLA_U16 },
+	[BATADV_ATTR_TT_FLAGS]		= { .type = NLA_U32 },
+	[BATADV_ATTR_FLAG_BEST]		= { .type = NLA_FLAG },
+	[BATADV_ATTR_LAST_SEEN_MSECS]	= { .type = NLA_U32 },
 };
 
 /**
@@ -82,7 +91,7 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
  *
  * Return: interface index, or 0.
  */
-static int
+int
 batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
 {
 	struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
@@ -530,6 +539,18 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_netlink_dump_hardifs,
 	},
+	{
+		.cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_tt_local_dump,
+	},
+	{
+		.cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_tt_global_dump,
+	},
 };
 
 /**
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index b399f49504df..52eb16281aba 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -23,8 +23,11 @@
 #include <linux/types.h>
 #include <net/genetlink.h>
 
+struct nlmsghdr;
+
 void batadv_netlink_register(void);
 void batadv_netlink_unregister(void);
+int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
 
 int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
 				  u8 result, u32 test_time, u64 total_bytes,
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 6b011ff64dd8..6afc0b86950e 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -128,42 +128,6 @@ enum batadv_tt_data_flags {
 	BATADV_TT_FULL_TABLE = BIT(4),
 };
 
-/**
- * enum batadv_tt_client_flags - TT client specific flags
- * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
- * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
- *  update telling its new real location has not been received/sent yet
- * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
- *  This information is used by the "AP Isolation" feature
- * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
- *  information is used by the Extended Isolation feature
- * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
- * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
- *  not been announced yet
- * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
- *  in the table for one more originator interval for consistency purposes
- * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
- *  the network but no nnode has already announced it
- *
- * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
- * Bits from 8 to 15 are called _local flags_ because they are used for local
- * computations only.
- *
- * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with
- * the other nodes in the network. To achieve this goal these flags are included
- * in the TT CRC computation.
- */
-enum batadv_tt_client_flags {
-	BATADV_TT_CLIENT_DEL     = BIT(0),
-	BATADV_TT_CLIENT_ROAM    = BIT(1),
-	BATADV_TT_CLIENT_WIFI    = BIT(4),
-	BATADV_TT_CLIENT_ISOLA	 = BIT(5),
-	BATADV_TT_CLIENT_NOPURGE = BIT(8),
-	BATADV_TT_CLIENT_NEW     = BIT(9),
-	BATADV_TT_CLIENT_PENDING = BIT(10),
-	BATADV_TT_CLIENT_TEMP	 = BIT(11),
-};
-
 /**
  * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
  * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index af2bfef6dca8..20804078293c 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -37,20 +37,27 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
+#include <linux/skbuff.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bridge_loop_avoidance.h"
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
 #include "multicast.h"
+#include "netlink.h"
 #include "originator.h"
 #include "packet.h"
 #include "soft-interface.h"
@@ -1108,6 +1115,164 @@ out:
 	return 0;
 }
 
+/**
+ * batadv_tt_local_dump_entry - Dump one TT local entry into a message
+ * @msg :Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @common: tt local & tt global common data
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			   struct batadv_priv *bat_priv,
+			   struct batadv_tt_common_entry *common)
+{
+	void *hdr;
+	struct batadv_softif_vlan *vlan;
+	struct batadv_tt_local_entry *local;
+	unsigned int last_seen_msecs;
+	u32 crc;
+
+	local = container_of(common, struct batadv_tt_local_entry, common);
+	last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen);
+
+	vlan = batadv_softif_vlan_get(bat_priv, common->vid);
+	if (!vlan)
+		return 0;
+
+	crc = vlan->tt.crc;
+
+	batadv_softif_vlan_put(vlan);
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI,
+			  BATADV_CMD_GET_TRANSTABLE_LOCAL);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+	    nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+	    nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+	    nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
+		goto nla_put_failure;
+
+	if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) &&
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @head: Pointer to the list containing the local tt entries
+ * @idx_s: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_priv *bat_priv,
+			    struct hlist_head *head, int *idx_s)
+{
+	struct batadv_tt_common_entry *common;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(common, head, hash_entry) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv,
+					       common)) {
+			rcu_read_unlock();
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+	rcu_read_unlock();
+
+	*idx_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_tt_local_dump - Dump TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or 0 on success
+ */
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_priv *bat_priv;
+	struct batadv_hard_iface *primary_if = NULL;
+	struct batadv_hashtable *hash;
+	struct hlist_head *head;
+	int ret;
+	int ifindex;
+	int bucket = cb->args[0];
+	int idx = cb->args[1];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	hash = bat_priv->tt.local_hash;
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
+						bat_priv, head, &idx))
+			break;
+
+		bucket++;
+	}
+
+	ret = msg->len;
+
+ out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+
+	return ret;
+}
+
 static void
 batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
 			    struct batadv_tt_local_entry *tt_local_entry,
@@ -1755,6 +1920,218 @@ out:
 	return 0;
 }
 
+/**
+ * batadv_tt_global_dump_subentry - Dump all TT local entries into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @common: tt local & tt global common data
+ * @orig: Originator node announcing a non-mesh client
+ * @best: Is the best originator for the TT entry
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+			       struct batadv_tt_common_entry *common,
+			       struct batadv_tt_orig_list_entry *orig,
+			       bool best)
+{
+	void *hdr;
+	struct batadv_orig_node_vlan *vlan;
+	u8 last_ttvn;
+	u32 crc;
+
+	vlan = batadv_orig_node_vlan_get(orig->orig_node,
+					 common->vid);
+	if (!vlan)
+		return 0;
+
+	crc = vlan->tt.crc;
+
+	batadv_orig_node_vlan_put(vlan);
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI,
+			  BATADV_CMD_GET_TRANSTABLE_GLOBAL);
+	if (!hdr)
+		return -ENOBUFS;
+
+	last_ttvn = atomic_read(&orig->orig_node->last_ttvn);
+
+	if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
+	    nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+		    orig->orig_node->orig) ||
+	    nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) ||
+	    nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) ||
+	    nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
+	    nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
+	    nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags))
+		goto nla_put_failure;
+
+	if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_tt_global_dump_entry - Dump one TT global entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @common: tt local & tt global common data
+ * @sub_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_priv *bat_priv,
+			    struct batadv_tt_common_entry *common, int *sub_s)
+{
+	struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
+	struct batadv_tt_global_entry *global;
+	struct hlist_head *head;
+	int sub = 0;
+	bool best;
+
+	global = container_of(common, struct batadv_tt_global_entry, common);
+	best_entry = batadv_transtable_best_orig(bat_priv, global);
+	head = &global->orig_list;
+
+	hlist_for_each_entry_rcu(orig_entry, head, list) {
+		if (sub++ < *sub_s)
+			continue;
+
+		best = (orig_entry == best_entry);
+
+		if (batadv_tt_global_dump_subentry(msg, portid, seq, common,
+						   orig_entry, best)) {
+			*sub_s = sub - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+	*sub_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_tt_global_dump_bucket - Dump one TT local bucket into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @head: Pointer to the list containing the global tt entries
+ * @idx_s: Number of entries to skip
+ * @sub: Number of entries to skip
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			     struct batadv_priv *bat_priv,
+			     struct hlist_head *head, int *idx_s, int *sub)
+{
+	struct batadv_tt_common_entry *common;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(common, head, hash_entry) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv,
+						common, sub)) {
+			rcu_read_unlock();
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+	rcu_read_unlock();
+
+	*idx_s = 0;
+	*sub = 0;
+	return 0;
+}
+
+/**
+ * batadv_tt_global_dump -  Dump TT global entries into a message
+ * @msg: Netlink message to dump into
+ * @cb: Parameters from query
+ *
+ * Return: Error code, or length of message on success
+ */
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_priv *bat_priv;
+	struct batadv_hard_iface *primary_if = NULL;
+	struct batadv_hashtable *hash;
+	struct hlist_head *head;
+	int ret;
+	int ifindex;
+	int bucket = cb->args[0];
+	int idx = cb->args[1];
+	int sub = cb->args[2];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	hash = bat_priv->tt.global_hash;
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_tt_global_dump_bucket(msg, portid,
+						 cb->nlh->nlmsg_seq, bat_priv,
+						 head, &idx, &sub))
+			break;
+
+		bucket++;
+	}
+
+	ret = msg->len;
+
+ out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+	cb->args[2] = sub;
+
+	return ret;
+}
+
 /**
  * _batadv_tt_global_del_orig_entry - remove and free an orig_entry
  * @tt_global_entry: the global entry to remove the orig_entry from
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 02b0f85527cc..783fdba84db2 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -22,8 +22,10 @@
 
 #include <linux/types.h>
 
+struct netlink_callback;
 struct net_device;
 struct seq_file;
+struct sk_buff;
 
 int batadv_tt_init(struct batadv_priv *bat_priv);
 bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
@@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
 			   const char *message, bool roaming);
 int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
 int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
+int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
 void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
 			       struct batadv_orig_node *orig_node,
 			       s32 match_vid, const char *message);
-- 
cgit v1.2.3-71-gd317


From 85cf8c859d53f6f53c37bb7f23a41f6171427021 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:39 +0200
Subject: batman-adv: netlink: add originator and neighbor table queries

Add BATADV_CMD_GET_ORIGINATORS and BATADV_CMD_GET_NEIGHBORS commands,
using handlers bat_orig_dump and bat_neigh_dump in batadv_algo_ops. Will
always return -EOPNOTSUPP for now, as no implementations exist yet.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven@narfation.org: Rewrite based on new algo_ops structures]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h |   4 +
 net/batman-adv/netlink.c        |  13 ++++
 net/batman-adv/originator.c     | 160 ++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/originator.h     |   4 +
 net/batman-adv/types.h          |   9 +++
 5 files changed, 190 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 1168d058cd2b..3f7a415f5e09 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -131,6 +131,8 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces
  * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations
  * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations
+ * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
+ * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -143,6 +145,8 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_HARDIFS,
 	BATADV_CMD_GET_TRANSTABLE_LOCAL,
 	BATADV_CMD_GET_TRANSTABLE_GLOBAL,
+	BATADV_CMD_GET_ORIGINATORS,
+	BATADV_CMD_GET_NEIGHBORS,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 0c0d20bb92ce..8469fc4ec5a3 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -39,6 +39,7 @@
 
 #include "bat_algo.h"
 #include "hard-interface.h"
+#include "originator.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
 #include "translation-table.h"
@@ -554,6 +555,18 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_tt_global_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_ORIGINATORS,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_orig_dump,
+	},
+	{
+		.cmd = BATADV_CMD_GET_NEIGHBORS,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_hardif_neigh_dump,
+	},
 };
 
 /**
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 3940b5d24421..95c85558c530 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -28,11 +28,15 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/seq_file.h>
+#include <linux/skbuff.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
 #include "distributed-arp-table.h"
@@ -42,8 +46,10 @@
 #include "hash.h"
 #include "log.h"
 #include "multicast.h"
+#include "netlink.h"
 #include "network-coding.h"
 #include "routing.h"
+#include "soft-interface.h"
 #include "translation-table.h"
 
 /* hash class keys */
@@ -720,6 +726,83 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
 	return 0;
 }
 
+/**
+ * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific
+ *  outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct net_device *hard_iface = NULL;
+	struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+	struct batadv_priv *bat_priv;
+	struct batadv_hard_iface *primary_if = NULL;
+	int ret;
+	int ifindex, hard_ifindex;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
+						  BATADV_ATTR_HARD_IFINDEX);
+	if (hard_ifindex) {
+		hard_iface = dev_get_by_index(net, hard_ifindex);
+		if (hard_iface)
+			hardif = batadv_hardif_get_by_netdev(hard_iface);
+
+		if (!hardif) {
+			ret = -ENODEV;
+			goto out;
+		}
+
+		if (hardif->soft_iface != soft_iface) {
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	if (!bat_priv->algo_ops->neigh.dump) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif);
+
+	ret = msg->len;
+
+ out:
+	if (hardif)
+		batadv_hardif_put(hardif);
+	if (hard_iface)
+		dev_put(hard_iface);
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
+
 /**
  * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
  *  free after rcu grace period
@@ -1330,6 +1413,83 @@ out:
 	return 0;
 }
 
+/**
+ * batadv_orig_dump - Dump to netlink the originator infos for a specific
+ *  outgoing interface
+ * @msg: message to dump into
+ * @cb: parameters for the dump
+ *
+ * Return: 0 or error value
+ */
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct net_device *hard_iface = NULL;
+	struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+	struct batadv_priv *bat_priv;
+	struct batadv_hard_iface *primary_if = NULL;
+	int ret;
+	int ifindex, hard_ifindex;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
+						  BATADV_ATTR_HARD_IFINDEX);
+	if (hard_ifindex) {
+		hard_iface = dev_get_by_index(net, hard_ifindex);
+		if (hard_iface)
+			hardif = batadv_hardif_get_by_netdev(hard_iface);
+
+		if (!hardif) {
+			ret = -ENODEV;
+			goto out;
+		}
+
+		if (hardif->soft_iface != soft_iface) {
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	if (!bat_priv->algo_ops->orig.dump) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif);
+
+	ret = msg->len;
+
+ out:
+	if (hardif)
+		batadv_hardif_put(hardif);
+	if (hard_iface)
+		dev_put(hard_iface);
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
+
 int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
 			    int max_if_num)
 {
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 566306bf05dc..ebc56183f358 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -31,7 +31,9 @@
 
 #include "hash.h"
 
+struct netlink_callback;
 struct seq_file;
+struct sk_buff;
 
 bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
 int batadv_originator_init(struct batadv_priv *bat_priv);
@@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
 			struct batadv_hard_iface *if_outgoing);
 void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
 
+int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
 int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
 
 struct batadv_orig_ifinfo *
@@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
 void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
 
 int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
 int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
 int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
 			    int max_if_num);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 72806a3c40df..968023a61598 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -28,6 +28,7 @@
 #include <linux/if_ether.h>
 #include <linux/kref.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/sched.h> /* for linux/wait.h */
 #include <linux/spinlock.h>
 #include <linux/types.h>
@@ -1418,6 +1419,7 @@ struct batadv_algo_iface_ops {
  * @is_similar_or_better: check if neigh1 is equally similar or better than
  *  neigh2 for their respective outgoing interface from the metric prospective
  * @print: print the single hop neighbor list (optional)
+ * @dump: dump neighbors to a netlink socket (optional)
  */
 struct batadv_algo_neigh_ops {
 	void (*hardif_init)(struct batadv_hardif_neigh_node *neigh);
@@ -1430,6 +1432,9 @@ struct batadv_algo_neigh_ops {
 				     struct batadv_neigh_node *neigh2,
 				     struct batadv_hard_iface *if_outgoing2);
 	void (*print)(struct batadv_priv *priv, struct seq_file *seq);
+	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+		     struct batadv_priv *priv,
+		     struct batadv_hard_iface *hard_iface);
 };
 
 /**
@@ -1441,6 +1446,7 @@ struct batadv_algo_neigh_ops {
  * @del_if: ask the routing algorithm to apply the needed changes to the
  *  orig_node due to an hard-interface being removed from the mesh (optional)
  * @print: print the originator table (optional)
+ * @dump: dump originators to a netlink socket (optional)
  */
 struct batadv_algo_orig_ops {
 	void (*free)(struct batadv_orig_node *orig_node);
@@ -1449,6 +1455,9 @@ struct batadv_algo_orig_ops {
 		      int del_if_num);
 	void (*print)(struct batadv_priv *priv, struct seq_file *seq,
 		      struct batadv_hard_iface *hard_iface);
+	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+		     struct batadv_priv *priv,
+		     struct batadv_hard_iface *hard_iface);
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From 024f99cb4acc14dab5e55e1ecdf6aad31269ca98 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:40 +0200
Subject: batman-adv: add B.A.T.M.A.N. IV bat_{orig, neigh}_dump
 implementations

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven.eckelmann@open-mesh.com: Fix function parameter alignments,
add policy for attributes, fix includes, fix algo_ops integration]
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h |   4 +
 net/batman-adv/bat_iv_ogm.c     | 366 ++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/netlink.c        |   2 +
 3 files changed, 372 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 3f7a415f5e09..ba2359a1f464 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -85,6 +85,8 @@ enum batadv_tt_client_flags {
  * @BATADV_ATTR_TT_FLAGS: Translation table client flags
  * @BATADV_ATTR_FLAG_BEST: Flags indicating entry is the best
  * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
+ * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
+ * @BATADV_ATTR_TQ: TQ to neighbour
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -114,6 +116,8 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_TT_FLAGS,
 	BATADV_ATTR_FLAG_BEST,
 	BATADV_ATTR_LAST_SEEN_MSECS,
+	BATADV_ATTR_NEIGH_ADDRESS,
+	BATADV_ATTR_TQ,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index a40cdf273625..7a8c0f63e2ae 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -35,6 +35,7 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/pkt_sched.h>
 #include <linux/printk.h>
 #include <linux/random.h>
@@ -48,6 +49,9 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
 #include "bitarray.h"
@@ -55,6 +59,7 @@
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
+#include "netlink.h"
 #include "network-coding.h"
 #include "originator.h"
 #include "packet.h"
@@ -1947,6 +1952,235 @@ next:
 		seq_puts(seq, "No batman nodes in range ...\n");
 }
 
+/**
+ * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a
+ *  given outgoing interface.
+ * @neigh_node: Neighbour of interest
+ * @if_outgoing: Outgoing interface of interest
+ * @tq_avg: Pointer of where to store the TQ average
+ *
+ * Return: False if no average TQ available, otherwise true.
+ */
+static bool
+batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
+			       struct batadv_hard_iface *if_outgoing,
+			       u8 *tq_avg)
+{
+	struct batadv_neigh_ifinfo *n_ifinfo;
+
+	n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+	if (!n_ifinfo)
+		return false;
+
+	*tq_avg = n_ifinfo->bat_iv.tq_avg;
+	batadv_neigh_ifinfo_put(n_ifinfo);
+
+	return true;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a
+ *  message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+				 struct batadv_priv *bat_priv,
+				 struct batadv_hard_iface *if_outgoing,
+				 struct batadv_orig_node *orig_node,
+				 struct batadv_neigh_node *neigh_node,
+				 bool best)
+{
+	void *hdr;
+	u8 tq_avg;
+	unsigned int last_seen_msecs;
+
+	last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+	if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg))
+		return 0;
+
+	if (if_outgoing != BATADV_IF_DEFAULT &&
+	    if_outgoing != neigh_node->if_incoming)
+		return 0;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+		    orig_node->orig) ||
+	    nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+		    neigh_node->addr) ||
+	    nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+			neigh_node->if_incoming->net_dev->ifindex) ||
+	    nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+			last_seen_msecs))
+		goto nla_put_failure;
+
+	if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			      struct batadv_priv *bat_priv,
+			      struct batadv_hard_iface *if_outgoing,
+			      struct batadv_orig_node *orig_node, int *sub_s)
+{
+	struct batadv_neigh_node *neigh_node_best;
+	struct batadv_neigh_node *neigh_node;
+	int sub = 0;
+	bool best;
+	u8 tq_avg_best;
+
+	neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+	if (!neigh_node_best)
+		goto out;
+
+	if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing,
+					    &tq_avg_best))
+		goto out;
+
+	if (tq_avg_best == 0)
+		goto out;
+
+	hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+		if (sub++ < *sub_s)
+			continue;
+
+		best = (neigh_node == neigh_node_best);
+
+		if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq,
+						     bat_priv, if_outgoing,
+						     orig_node, neigh_node,
+						     best)) {
+			batadv_neigh_node_put(neigh_node_best);
+
+			*sub_s = sub - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+ out:
+	if (neigh_node_best)
+		batadv_neigh_node_put(neigh_node_best);
+
+	*sub_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a
+ *  message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			       struct batadv_priv *bat_priv,
+			       struct batadv_hard_iface *if_outgoing,
+			       struct hlist_head *head, int *idx_s, int *sub)
+{
+	struct batadv_orig_node *orig_node;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv,
+						  if_outgoing, orig_node,
+						  sub)) {
+			rcu_read_unlock();
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+	rcu_read_unlock();
+
+	*idx_s = 0;
+	*sub = 0;
+	return 0;
+}
+
+/**
+ * batadv_iv_ogm_orig_dump - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+			struct batadv_priv *bat_priv,
+			struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_hashtable *hash = bat_priv->orig_hash;
+	struct hlist_head *head;
+	int bucket = cb->args[0];
+	int idx = cb->args[1];
+	int sub = cb->args[2];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_iv_ogm_orig_dump_bucket(msg, portid,
+						   cb->nlh->nlmsg_seq,
+						   bat_priv, if_outgoing, head,
+						   &idx, &sub))
+			break;
+
+		bucket++;
+	}
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+	cb->args[2] = sub;
+}
+
 /**
  * batadv_iv_hardif_neigh_print - print a single hop neighbour node
  * @seq: neighbour table seq_file struct
@@ -2043,6 +2277,136 @@ out:
 	return ret;
 }
 
+/**
+ * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to be dumped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+			       struct batadv_hardif_neigh_node *hardif_neigh)
+{
+	void *hdr;
+	unsigned int last_seen_msecs;
+
+	last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+		    hardif_neigh->addr) ||
+	    nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+			hardif_neigh->if_incoming->net_dev->ifindex) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+			last_seen_msecs))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface
+ *  into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @hard_iface: Hard interface to dump the neighbours for
+ * @idx_s: Number of entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+				struct batadv_priv *bat_priv,
+				struct batadv_hard_iface *hard_iface,
+				int *idx_s)
+{
+	struct batadv_hardif_neigh_node *hardif_neigh;
+	int idx = 0;
+
+	hlist_for_each_entry_rcu(hardif_neigh,
+				 &hard_iface->neigh_list, list) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq,
+						   hardif_neigh)) {
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+	*idx_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @single_hardif: Limit dump to this hard interfaace
+ */
+static void
+batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+			 struct batadv_priv *bat_priv,
+			 struct batadv_hard_iface *single_hardif)
+{
+	struct batadv_hard_iface *hard_iface;
+	int i_hardif = 0;
+	int i_hardif_s = cb->args[0];
+	int idx = cb->args[1];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	rcu_read_lock();
+	if (single_hardif) {
+		if (i_hardif_s == 0) {
+			if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+							    cb->nlh->nlmsg_seq,
+							    bat_priv,
+							    single_hardif,
+							    &idx) == 0)
+				i_hardif++;
+		}
+	} else {
+		list_for_each_entry_rcu(hard_iface, &batadv_hardif_list,
+					list) {
+			if (hard_iface->soft_iface != bat_priv->soft_iface)
+				continue;
+
+			if (i_hardif++ < i_hardif_s)
+				continue;
+
+			if (batadv_iv_ogm_neigh_dump_hardif(msg, portid,
+							    cb->nlh->nlmsg_seq,
+							    bat_priv,
+							    hard_iface, &idx)) {
+				i_hardif--;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	cb->args[0] = i_hardif;
+	cb->args[1] = idx;
+}
+
 /**
  * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
  * @neigh1: the first neighbor object of the comparison
@@ -2330,9 +2694,11 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
 		.cmp = batadv_iv_ogm_neigh_cmp,
 		.is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
 		.print = batadv_iv_neigh_print,
+		.dump = batadv_iv_ogm_neigh_dump,
 	},
 	.orig = {
 		.print = batadv_iv_ogm_orig_print,
+		.dump = batadv_iv_ogm_orig_dump,
 		.free = batadv_iv_ogm_orig_free,
 		.add_if = batadv_iv_ogm_orig_add_if,
 		.del_if = batadv_iv_ogm_orig_del_if,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 8469fc4ec5a3..0c7940cc1968 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -84,6 +84,8 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_TT_FLAGS]		= { .type = NLA_U32 },
 	[BATADV_ATTR_FLAG_BEST]		= { .type = NLA_FLAG },
 	[BATADV_ATTR_LAST_SEEN_MSECS]	= { .type = NLA_U32 },
+	[BATADV_ATTR_NEIGH_ADDRESS]	= { .len = ETH_ALEN },
+	[BATADV_ATTR_TQ]		= { .type = NLA_U8 },
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From f02a478f518ee5690f279c8c2d3a6222143a7b20 Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 3 Jul 2016 13:31:41 +0200
Subject: batman-adv: add B.A.T.M.A.N. V bat_{orig, neigh}_dump implementations

Dump the algo V originators and neighbours.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven@narfation.org: Fix includes, fix algo_ops integration]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h |   2 +
 net/batman-adv/bat_v.c          | 340 ++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/netlink.c        |   1 +
 3 files changed, 343 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index ba2359a1f464..2e2747fb1311 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -87,6 +87,7 @@ enum batadv_tt_client_flags {
  * @BATADV_ATTR_LAST_SEEN_MSECS: Time in milliseconds since last seen
  * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
  * @BATADV_ATTR_TQ: TQ to neighbour
+ * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -118,6 +119,7 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_LAST_SEEN_MSECS,
 	BATADV_ATTR_NEIGH_ADDRESS,
 	BATADV_ATTR_TQ,
+	BATADV_ATTR_THROUGHPUT,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 1d777b171366..9dccfaf32115 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -22,17 +22,22 @@
 #include <linux/bug.h>
 #include <linux/cache.h>
 #include <linux/errno.h>
+#include <linux/if_ether.h>
 #include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/kref.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
 #include "bat_v_elp.h"
@@ -42,9 +47,12 @@
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
+#include "netlink.h"
 #include "originator.h"
 #include "packet.h"
 
+struct sk_buff;
+
 static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
 {
 	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
@@ -205,6 +213,138 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
 		seq_puts(seq, "No batman nodes in range ...\n");
 }
 
+/**
+ * batadv_v_neigh_dump_neigh - Dump a neighbour into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @hardif_neigh: Neighbour to dump
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
+			  struct batadv_hardif_neigh_node *hardif_neigh)
+{
+	void *hdr;
+	unsigned int last_seen_msecs;
+	u32 throughput;
+
+	last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen);
+	throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+	throughput = throughput * 100;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+			  BATADV_CMD_GET_NEIGHBORS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+		    hardif_neigh->addr) ||
+	    nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+			hardif_neigh->if_incoming->net_dev->ifindex) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+			last_seen_msecs) ||
+	    nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_v_neigh_dump_hardif - Dump the  neighbours of a hard interface  into
+ *  a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @hard_iface: The hard interface to be dumped
+ * @idx_s: Entries to be skipped
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
+			   struct batadv_priv *bat_priv,
+			   struct batadv_hard_iface *hard_iface,
+			   int *idx_s)
+{
+	struct batadv_hardif_neigh_node *hardif_neigh;
+	int idx = 0;
+
+	hlist_for_each_entry_rcu(hardif_neigh,
+				 &hard_iface->neigh_list, list) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) {
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+	*idx_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_v_neigh_dump - Dump the neighbours of a hard interface  into a
+ *  message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @single_hardif: Limit dumping to this hard interface
+ */
+static void
+batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
+		    struct batadv_priv *bat_priv,
+		    struct batadv_hard_iface *single_hardif)
+{
+	struct batadv_hard_iface *hard_iface;
+	int i_hardif = 0;
+	int i_hardif_s = cb->args[0];
+	int idx = cb->args[1];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	rcu_read_lock();
+	if (single_hardif) {
+		if (i_hardif_s == 0) {
+			if (batadv_v_neigh_dump_hardif(msg, portid,
+						       cb->nlh->nlmsg_seq,
+						       bat_priv, single_hardif,
+						       &idx) == 0)
+				i_hardif++;
+		}
+	} else {
+		list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+			if (hard_iface->soft_iface != bat_priv->soft_iface)
+				continue;
+
+			if (i_hardif++ < i_hardif_s)
+				continue;
+
+			if (batadv_v_neigh_dump_hardif(msg, portid,
+						       cb->nlh->nlmsg_seq,
+						       bat_priv, hard_iface,
+						       &idx)) {
+				i_hardif--;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	cb->args[0] = i_hardif;
+	cb->args[1] = idx;
+}
+
 /**
  * batadv_v_orig_print - print the originator table
  * @bat_priv: the bat priv with all the soft interface information
@@ -272,6 +412,204 @@ next:
 		seq_puts(seq, "No batman nodes in range ...\n");
 }
 
+/**
+ * batadv_v_orig_dump_subentry - Dump an originator subentry into a
+ *  message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @neigh_node: Single hops neighbour
+ * @best: Is the best originator
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_priv *bat_priv,
+			    struct batadv_hard_iface *if_outgoing,
+			    struct batadv_orig_node *orig_node,
+			    struct batadv_neigh_node *neigh_node,
+			    bool best)
+{
+	struct batadv_neigh_ifinfo *n_ifinfo;
+	unsigned int last_seen_msecs;
+	u32 throughput;
+	void *hdr;
+
+	n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+	if (!n_ifinfo)
+		return 0;
+
+	throughput = n_ifinfo->bat_v.throughput * 100;
+
+	batadv_neigh_ifinfo_put(n_ifinfo);
+
+	last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen);
+
+	if (if_outgoing != BATADV_IF_DEFAULT &&
+	    if_outgoing != neigh_node->if_incoming)
+		return 0;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
+			  BATADV_CMD_GET_ORIGINATORS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) ||
+	    nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
+		    neigh_node->addr) ||
+	    nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+			neigh_node->if_incoming->net_dev->ifindex) ||
+	    nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
+			last_seen_msecs))
+		goto nla_put_failure;
+
+	if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * batadv_v_orig_dump_entry - Dump an originator entry into a message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @orig_node: Originator to dump
+ * @sub_s: Number of sub entries to skip
+ *
+ * This function assumes the caller holds rcu_read_lock().
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			 struct batadv_priv *bat_priv,
+			 struct batadv_hard_iface *if_outgoing,
+			 struct batadv_orig_node *orig_node, int *sub_s)
+{
+	struct batadv_neigh_node *neigh_node_best;
+	struct batadv_neigh_node *neigh_node;
+	int sub = 0;
+	bool best;
+
+	neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing);
+	if (!neigh_node_best)
+		goto out;
+
+	hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+		if (sub++ < *sub_s)
+			continue;
+
+		best = (neigh_node == neigh_node_best);
+
+		if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv,
+						if_outgoing, orig_node,
+						neigh_node, best)) {
+			batadv_neigh_node_put(neigh_node_best);
+
+			*sub_s = sub - 1;
+			return -EMSGSIZE;
+		}
+	}
+
+ out:
+	if (neigh_node_best)
+		batadv_neigh_node_put(neigh_node_best);
+
+	*sub_s = 0;
+	return 0;
+}
+
+/**
+ * batadv_v_orig_dump_bucket - Dump an originator bucket into a
+ *  message
+ * @msg: Netlink message to dump into
+ * @portid: Port making netlink request
+ * @seq: Sequence number of netlink message
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ * @head: Bucket to be dumped
+ * @idx_s: Number of entries to be skipped
+ * @sub: Number of sub entries to be skipped
+ *
+ * Return: Error code, or 0 on success
+ */
+static int
+batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			  struct batadv_priv *bat_priv,
+			  struct batadv_hard_iface *if_outgoing,
+			  struct hlist_head *head, int *idx_s, int *sub)
+{
+	struct batadv_orig_node *orig_node;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+		if (idx++ < *idx_s)
+			continue;
+
+		if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv,
+					     if_outgoing, orig_node, sub)) {
+			rcu_read_unlock();
+			*idx_s = idx - 1;
+			return -EMSGSIZE;
+		}
+	}
+	rcu_read_unlock();
+
+	*idx_s = 0;
+	*sub = 0;
+	return 0;
+}
+
+/**
+ * batadv_v_orig_dump - Dump the originators into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the soft interface information
+ * @if_outgoing: Limit dump to entries with this outgoing interface
+ */
+static void
+batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
+		   struct batadv_priv *bat_priv,
+		   struct batadv_hard_iface *if_outgoing)
+{
+	struct batadv_hashtable *hash = bat_priv->orig_hash;
+	struct hlist_head *head;
+	int bucket = cb->args[0];
+	int idx = cb->args[1];
+	int sub = cb->args[2];
+	int portid = NETLINK_CB(cb->skb).portid;
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_v_orig_dump_bucket(msg, portid,
+					      cb->nlh->nlmsg_seq,
+					      bat_priv, if_outgoing, head, &idx,
+					      &sub))
+			break;
+
+		bucket++;
+	}
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+	cb->args[2] = sub;
+}
+
 static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
 			      struct batadv_hard_iface *if_outgoing1,
 			      struct batadv_neigh_node *neigh2,
@@ -573,9 +911,11 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 		.cmp = batadv_v_neigh_cmp,
 		.is_similar_or_better = batadv_v_neigh_is_sob,
 		.print = batadv_v_neigh_print,
+		.dump = batadv_v_neigh_dump,
 	},
 	.orig = {
 		.print = batadv_v_orig_print,
+		.dump = batadv_v_orig_dump,
 	},
 	.gw = {
 		.store_sel_class = batadv_v_store_sel_class,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 0c7940cc1968..025f2ec2b27e 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -86,6 +86,7 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_LAST_SEEN_MSECS]	= { .type = NLA_U32 },
 	[BATADV_ATTR_NEIGH_ADDRESS]	= { .len = ETH_ALEN },
 	[BATADV_ATTR_TQ]		= { .type = NLA_U8 },
+	[BATADV_ATTR_THROUGHPUT]	= { .type = NLA_U32 },
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From d7129dafcb71adfd1a166d0477ce0f564cf410d5 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 3 Jul 2016 13:31:42 +0200
Subject: batman-adv: netlink: add gateway table queries

Add BATADV_CMD_GET_GATEWAYS commands, using handlers bat_gw_dump in
batadv_algo_ops. Will always return -EOPNOTSUPP for now, as no
implementations exist yet.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h |  8 ++++++
 net/batman-adv/gateway_client.c | 59 +++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/gateway_client.h |  2 ++
 net/batman-adv/netlink.c        | 10 +++++++
 net/batman-adv/types.h          |  3 +++
 5 files changed, 82 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 2e2747fb1311..a13fc09e8192 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -88,6 +88,9 @@ enum batadv_tt_client_flags {
  * @BATADV_ATTR_NEIGH_ADDRESS: Neighbour MAC address
  * @BATADV_ATTR_TQ: TQ to neighbour
  * @BATADV_ATTR_THROUGHPUT: Estimated throughput to Neighbour
+ * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
+ * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
+ * @BATADV_ATTR_ROUTER: Gateway router MAC address
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -120,6 +123,9 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_NEIGH_ADDRESS,
 	BATADV_ATTR_TQ,
 	BATADV_ATTR_THROUGHPUT,
+	BATADV_ATTR_BANDWIDTH_UP,
+	BATADV_ATTR_BANDWIDTH_DOWN,
+	BATADV_ATTR_ROUTER,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
@@ -139,6 +145,7 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_GET_TRANSTABLE_GLOBAL Query list of global translations
  * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
  * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
+ * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -153,6 +160,7 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_TRANSTABLE_GLOBAL,
 	BATADV_CMD_GET_ORIGINATORS,
 	BATADV_CMD_GET_NEIGHBORS,
+	BATADV_CMD_GET_GATEWAYS,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index a77a17939f1e..c2928c2287b8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -20,6 +20,7 @@
 
 #include <linux/atomic.h>
 #include <linux/byteorder/generic.h>
+#include <linux/errno.h>
 #include <linux/etherdevice.h>
 #include <linux/fs.h>
 #include <linux/if_ether.h>
@@ -31,6 +32,7 @@
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
@@ -39,13 +41,17 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/udp.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "gateway_common.h"
 #include "hard-interface.h"
 #include "log.h"
+#include "netlink.h"
 #include "originator.h"
 #include "packet.h"
 #include "routing.h"
+#include "soft-interface.h"
 #include "sysfs.h"
 #include "translation-table.h"
 
@@ -500,6 +506,59 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
 	return 0;
 }
 
+/**
+ * batadv_gw_dump - Dump gateways into a message
+ * @msg: Netlink message to dump into
+ * @cb: Control block containing additional options
+ *
+ * Return: Error code, or length of message
+ */
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct batadv_hard_iface *primary_if = NULL;
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_priv *bat_priv;
+	int ifindex;
+	int ret;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh,
+					     BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (!bat_priv->algo_ops->gw.dump) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	bat_priv->algo_ops->gw.dump(msg, cb, bat_priv);
+
+	ret = msg->len;
+
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
+
 /**
  * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message
  * @skb: the packet to check
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 6b40432aa1ed..859166d03561 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 
 struct batadv_tvlv_gateway_data;
+struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
@@ -43,6 +44,7 @@ void batadv_gw_node_put(struct batadv_gw_node *gw_node);
 struct batadv_gw_node *
 batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
 int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
 bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
 enum batadv_dhcp_recipient
 batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 025f2ec2b27e..c68ccb03634d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -38,6 +38,7 @@
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
+#include "gateway_client.h"
 #include "hard-interface.h"
 #include "originator.h"
 #include "soft-interface.h"
@@ -87,6 +88,9 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_NEIGH_ADDRESS]	= { .len = ETH_ALEN },
 	[BATADV_ATTR_TQ]		= { .type = NLA_U8 },
 	[BATADV_ATTR_THROUGHPUT]	= { .type = NLA_U32 },
+	[BATADV_ATTR_BANDWIDTH_UP]	= { .type = NLA_U32 },
+	[BATADV_ATTR_BANDWIDTH_DOWN]	= { .type = NLA_U32 },
+	[BATADV_ATTR_ROUTER]		= { .len = ETH_ALEN },
 };
 
 /**
@@ -570,6 +574,12 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_hardif_neigh_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_GATEWAYS,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_gw_dump,
+	},
 };
 
 /**
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 968023a61598..b5f01a36ec34 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1469,6 +1469,7 @@ struct batadv_algo_orig_ops {
  * @is_eligible: check if a newly discovered GW is a potential candidate for
  *  the election as best GW (optional)
  * @print: print the gateway table (optional)
+ * @dump: dump gateways to a netlink socket (optional)
  */
 struct batadv_algo_gw_ops {
 	ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
@@ -1480,6 +1481,8 @@ struct batadv_algo_gw_ops {
 			    struct batadv_orig_node *curr_gw_orig,
 			    struct batadv_orig_node *orig_node);
 	void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
+	void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
+		     struct batadv_priv *priv);
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From 04f3f5bf1883fbe0acba5c1fc698cf5cedebc5c5 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 3 Jul 2016 13:31:45 +0200
Subject: batman-adv: add B.A.T.M.A.N. Dump BLA claims via netlink

Dump the list of bridge loop avoidance claims via the netlink socket.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[sven.eckelmann@open-mesh.com: add policy for attributes, fix includes, fix
soft_iface reference leak]
Signed-off-by: Sven Eckelmann <sven.eckelmann@open-mesh.com>
[sw@simonwunderlich.de: fix kerneldoc, fix error reporting]
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h        |  12 +++
 net/batman-adv/bridge_loop_avoidance.c | 169 +++++++++++++++++++++++++++++++++
 net/batman-adv/bridge_loop_avoidance.h |  10 +-
 net/batman-adv/netlink.c               |  12 +++
 4 files changed, 202 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index a13fc09e8192..96b37ab2e840 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -91,6 +91,11 @@ enum batadv_tt_client_flags {
  * @BATADV_ATTR_BANDWIDTH_UP: Reported uplink bandwidth
  * @BATADV_ATTR_BANDWIDTH_DOWN: Reported downlink bandwidth
  * @BATADV_ATTR_ROUTER: Gateway router MAC address
+ * @BATADV_ATTR_BLA_OWN: Flag indicating own originator
+ * @BATADV_ATTR_BLA_ADDRESS: Bridge loop avoidance claim MAC address
+ * @BATADV_ATTR_BLA_VID: BLA VLAN ID
+ * @BATADV_ATTR_BLA_BACKBONE: BLA gateway originator MAC address
+ * @BATADV_ATTR_BLA_CRC: BLA CRC
  * @__BATADV_ATTR_AFTER_LAST: internal use
  * @NUM_BATADV_ATTR: total number of batadv_nl_attrs available
  * @BATADV_ATTR_MAX: highest attribute number currently defined
@@ -126,6 +131,11 @@ enum batadv_nl_attrs {
 	BATADV_ATTR_BANDWIDTH_UP,
 	BATADV_ATTR_BANDWIDTH_DOWN,
 	BATADV_ATTR_ROUTER,
+	BATADV_ATTR_BLA_OWN,
+	BATADV_ATTR_BLA_ADDRESS,
+	BATADV_ATTR_BLA_VID,
+	BATADV_ATTR_BLA_BACKBONE,
+	BATADV_ATTR_BLA_CRC,
 	/* add attributes above here, update the policy in netlink.c */
 	__BATADV_ATTR_AFTER_LAST,
 	NUM_BATADV_ATTR = __BATADV_ATTR_AFTER_LAST,
@@ -146,6 +156,7 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_GET_ORIGINATORS: Query list of originators
  * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
  * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
+ * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -161,6 +172,7 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_ORIGINATORS,
 	BATADV_CMD_GET_NEIGHBORS,
 	BATADV_CMD_GET_GATEWAYS,
+	BATADV_CMD_GET_BLA_CLAIM,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index c75ef648f0fd..aafa88f3e98d 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -35,6 +35,7 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
@@ -45,12 +46,18 @@
 #include <linux/string.h>
 #include <linux/workqueue.h>
 #include <net/arp.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
+#include "netlink.h"
 #include "originator.h"
 #include "packet.h"
+#include "soft-interface.h"
 #include "sysfs.h"
 #include "translation-table.h"
 
@@ -2051,6 +2058,168 @@ out:
 	return 0;
 }
 
+/**
+ * batadv_bla_claim_dump_entry - dump one entry of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @claim: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_hard_iface *primary_if,
+			    struct batadv_bla_claim *claim)
+{
+	u8 *primary_addr = primary_if->net_dev->dev_addr;
+	u16 backbone_crc;
+	bool is_own;
+	void *hdr;
+	int ret = -EINVAL;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM);
+	if (!hdr) {
+		ret = -ENOBUFS;
+		goto out;
+	}
+
+	is_own = batadv_compare_eth(claim->backbone_gw->orig,
+				    primary_addr);
+
+	spin_lock_bh(&claim->backbone_gw->crc_lock);
+	backbone_crc = claim->backbone_gw->crc;
+	spin_unlock_bh(&claim->backbone_gw->crc_lock);
+
+	if (is_own)
+		if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+			genlmsg_cancel(msg, hdr);
+			goto out;
+		}
+
+	if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) ||
+	    nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) ||
+	    nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+		    claim->backbone_gw->orig) ||
+	    nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+			backbone_crc)) {
+		genlmsg_cancel(msg, hdr);
+		goto out;
+	}
+
+	genlmsg_end(msg, hdr);
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/**
+ * batadv_bla_claim_dump_bucket - dump one bucket of the claim table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			     struct batadv_hard_iface *primary_if,
+			     struct hlist_head *head, int *idx_skip)
+{
+	struct batadv_bla_claim *claim;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(claim, head, hash_entry) {
+		if (idx++ < *idx_skip)
+			continue;
+		if (batadv_bla_claim_dump_entry(msg, portid, seq,
+						primary_if, claim)) {
+			*idx_skip = idx - 1;
+			goto unlock;
+		}
+	}
+
+	*idx_skip = idx;
+unlock:
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * batadv_bla_claim_dump - dump claim table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct batadv_hard_iface *primary_if = NULL;
+	int portid = NETLINK_CB(cb->skb).portid;
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_hashtable *hash;
+	struct batadv_priv *bat_priv;
+	int bucket = cb->args[0];
+	struct hlist_head *head;
+	int idx = cb->args[1];
+	int ifindex;
+	int ret = 0;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh,
+					     BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+	hash = bat_priv->bla.claim_hash;
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_bla_claim_dump_bucket(msg, portid,
+						 cb->nlh->nlmsg_seq,
+						 primary_if, head, &idx))
+			break;
+		bucket++;
+	}
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+
+	ret = msg->len;
+
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
+
 /**
  * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
  *  file
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 0f01daeb359e..a80b9e96f28e 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 
 struct net_device;
+struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
@@ -35,6 +36,7 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
 			       struct batadv_orig_node *orig_node,
 			       int hdr_size);
 int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
 int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
 					     void *offset);
 bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
@@ -47,7 +49,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
 void batadv_bla_status_update(struct net_device *net_dev);
 int batadv_bla_init(struct batadv_priv *bat_priv);
 void batadv_bla_free(struct batadv_priv *bat_priv);
-
+int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
 #define BATADV_BLA_CRC_INIT	0
 #else /* ifdef CONFIG_BATMAN_ADV_BLA */
 
@@ -112,6 +114,12 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv)
 {
 }
 
+static inline int batadv_bla_claim_dump(struct sk_buff *msg,
+					struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* ifdef CONFIG_BATMAN_ADV_BLA */
 
 #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index c68ccb03634d..b33675cbaecf 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -38,6 +38,7 @@
 #include <uapi/linux/batman_adv.h>
 
 #include "bat_algo.h"
+#include "bridge_loop_avoidance.h"
 #include "gateway_client.h"
 #include "hard-interface.h"
 #include "originator.h"
@@ -91,6 +92,11 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_BANDWIDTH_UP]	= { .type = NLA_U32 },
 	[BATADV_ATTR_BANDWIDTH_DOWN]	= { .type = NLA_U32 },
 	[BATADV_ATTR_ROUTER]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_OWN]		= { .type = NLA_FLAG },
+	[BATADV_ATTR_BLA_ADDRESS]	= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_VID]		= { .type = NLA_U16 },
+	[BATADV_ATTR_BLA_BACKBONE]	= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_CRC]		= { .type = NLA_U16 },
 };
 
 /**
@@ -580,6 +586,12 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_gw_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_BLA_CLAIM,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_bla_claim_dump,
+	},
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From ea4152e1171604f325f1a5f080190823a0edbc1f Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Sun, 3 Jul 2016 13:31:47 +0200
Subject: batman-adv: add backbone table netlink support

Dump the list of bridge loop avoidance backbones via the netlink socket.

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 include/uapi/linux/batman_adv.h        |   2 +
 net/batman-adv/bridge_loop_avoidance.c | 164 +++++++++++++++++++++++++++++++++
 net/batman-adv/bridge_loop_avoidance.h |   7 ++
 net/batman-adv/netlink.c               |   7 ++
 4 files changed, 180 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 96b37ab2e840..734fe83ab645 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -157,6 +157,7 @@ enum batadv_nl_attrs {
  * @BATADV_CMD_GET_NEIGHBORS: Query list of neighbours
  * @BATADV_CMD_GET_GATEWAYS: Query list of gateways
  * @BATADV_CMD_GET_BLA_CLAIM: Query list of bridge loop avoidance claims
+ * @BATADV_CMD_GET_BLA_BACKBONE: Query list of bridge loop avoidance backbones
  * @__BATADV_CMD_AFTER_LAST: internal use
  * @BATADV_CMD_MAX: highest used command number
  */
@@ -173,6 +174,7 @@ enum batadv_nl_commands {
 	BATADV_CMD_GET_NEIGHBORS,
 	BATADV_CMD_GET_GATEWAYS,
 	BATADV_CMD_GET_BLA_CLAIM,
+	BATADV_CMD_GET_BLA_BACKBONE,
 	/* add new commands above here */
 	__BATADV_CMD_AFTER_LAST,
 	BATADV_CMD_MAX = __BATADV_CMD_AFTER_LAST - 1
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index aafa88f3e98d..35ed1d32bab5 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -2283,3 +2283,167 @@ out:
 		batadv_hardif_put(primary_if);
 	return 0;
 }
+
+/**
+ * batadv_bla_backbone_dump_entry - dump one entry of the backbone table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @backbone_gw: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			       struct batadv_hard_iface *primary_if,
+			       struct batadv_bla_backbone_gw *backbone_gw)
+{
+	u8 *primary_addr = primary_if->net_dev->dev_addr;
+	u16 backbone_crc;
+	bool is_own;
+	int msecs;
+	void *hdr;
+	int ret = -EINVAL;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE);
+	if (!hdr) {
+		ret = -ENOBUFS;
+		goto out;
+	}
+
+	is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
+
+	spin_lock_bh(&backbone_gw->crc_lock);
+	backbone_crc = backbone_gw->crc;
+	spin_unlock_bh(&backbone_gw->crc_lock);
+
+	msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime);
+
+	if (is_own)
+		if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) {
+			genlmsg_cancel(msg, hdr);
+			goto out;
+		}
+
+	if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN,
+		    backbone_gw->orig) ||
+	    nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) ||
+	    nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
+			backbone_crc) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
+		genlmsg_cancel(msg, hdr);
+		goto out;
+	}
+
+	genlmsg_end(msg, hdr);
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/**
+ * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table
+ * to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @primary_if: primary interface
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: always 0.
+ */
+static int
+batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+				struct batadv_hard_iface *primary_if,
+				struct hlist_head *head, int *idx_skip)
+{
+	struct batadv_bla_backbone_gw *backbone_gw;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+		if (idx++ < *idx_skip)
+			continue;
+		if (batadv_bla_backbone_dump_entry(msg, portid, seq,
+						   primary_if, backbone_gw)) {
+			*idx_skip = idx - 1;
+			goto unlock;
+		}
+	}
+
+	*idx_skip = idx;
+unlock:
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * batadv_bla_backbone_dump - dump backbone table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct batadv_hard_iface *primary_if = NULL;
+	int portid = NETLINK_CB(cb->skb).portid;
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_hashtable *hash;
+	struct batadv_priv *bat_priv;
+	int bucket = cb->args[0];
+	struct hlist_head *head;
+	int idx = cb->args[1];
+	int ifindex;
+	int ret = 0;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh,
+					     BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+	hash = bat_priv->bla.backbone_hash;
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_bla_backbone_dump_bucket(msg, portid,
+						    cb->nlh->nlmsg_seq,
+						    primary_if, head, &idx))
+			break;
+		bucket++;
+	}
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+
+	ret = msg->len;
+
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index a80b9e96f28e..1ae93e46fb98 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -39,6 +39,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
 int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
 int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
 					     void *offset);
+int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
 bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
 				    unsigned short vid);
 bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
@@ -120,6 +121,12 @@ static inline int batadv_bla_claim_dump(struct sk_buff *msg,
 	return -EOPNOTSUPP;
 }
 
+static inline int batadv_bla_backbone_dump(struct sk_buff *msg,
+					   struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* ifdef CONFIG_BATMAN_ADV_BLA */
 
 #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 464de9d05135..c3f68167591d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -600,6 +600,13 @@ static struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_bla_claim_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_BLA_BACKBONE,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_bla_backbone_dump,
+	},
+
 };
 
 /**
-- 
cgit v1.2.3-71-gd317


From ab10dccb11608b96b43b557c12a5ad867723e503 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Tue, 9 Aug 2016 12:38:24 +0800
Subject: rps: Inspect PPTP encapsulated by GRE to get flow hash

The PPTP is encapsulated by GRE header with that GRE_VERSION bits
must contain one. But current GRE RPS needs the GRE_VERSION must be
zero. So RPS does not work for PPTP traffic.

In my test environment, there are four MIPS cores, and all traffic
are passed through by PPTP. As a result, only one core is 100% busy
while other three cores are very idle. After this patch, the usage
of four cores are balanced well.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Reviewed-by: Philip Prindeville <philipp@redfish-solutions.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pptp.c         |  36 +------------
 include/net/gre.h              |  10 +++-
 include/net/pptp.h             |  40 +++++++++++++++
 include/uapi/linux/if_tunnel.h |   7 ++-
 net/core/flow_dissector.c      | 113 ++++++++++++++++++++++++++++-------------
 5 files changed, 135 insertions(+), 71 deletions(-)
 create mode 100644 include/net/pptp.h

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index ae0905ed4a32..3e68dbc0af7f 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -37,6 +37,7 @@
 #include <net/icmp.h>
 #include <net/route.h>
 #include <net/gre.h>
+#include <net/pptp.h>
 
 #include <linux/uaccess.h>
 
@@ -53,41 +54,6 @@ static struct proto pptp_sk_proto __read_mostly;
 static const struct ppp_channel_ops pptp_chan_ops;
 static const struct proto_ops pptp_ops;
 
-#define PPP_LCP_ECHOREQ 0x09
-#define PPP_LCP_ECHOREP 0x0A
-#define SC_RCV_BITS	(SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
-
-#define MISSING_WINDOW 20
-#define WRAPPED(curseq, lastseq)\
-	((((curseq) & 0xffffff00) == 0) &&\
-	(((lastseq) & 0xffffff00) == 0xffffff00))
-
-#define PPTP_GRE_PROTO  0x880B
-#define PPTP_GRE_VER    0x1
-
-#define PPTP_GRE_FLAG_C	0x80
-#define PPTP_GRE_FLAG_R	0x40
-#define PPTP_GRE_FLAG_K	0x20
-#define PPTP_GRE_FLAG_S	0x10
-#define PPTP_GRE_FLAG_A	0x80
-
-#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C)
-#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R)
-#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K)
-#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S)
-#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A)
-
-#define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header))
-struct pptp_gre_header {
-	u8  flags;
-	u8  ver;
-	__be16 protocol;
-	__be16 payload_len;
-	__be16 call_id;
-	__be32 seq;
-	__be32 ack;
-} __packed;
-
 static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr)
 {
 	struct pppox_sock *sock;
diff --git a/include/net/gre.h b/include/net/gre.h
index 7a54a31d1d4c..8962e1e68449 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -7,7 +7,15 @@
 struct gre_base_hdr {
 	__be16 flags;
 	__be16 protocol;
-};
+} __packed;
+
+struct gre_full_hdr {
+	struct gre_base_hdr fixed_header;
+	__be16 csum;
+	__be16 reserved1;
+	__be32 key;
+	__be32 seq;
+} __packed;
 #define GRE_HEADER_SECTION 4
 
 #define GREPROTO_CISCO		0
diff --git a/include/net/pptp.h b/include/net/pptp.h
new file mode 100644
index 000000000000..301d3e2ba1f9
--- /dev/null
+++ b/include/net/pptp.h
@@ -0,0 +1,40 @@
+#ifndef _NET_PPTP_H
+#define _NET_PPTP_H
+
+#define PPP_LCP_ECHOREQ 0x09
+#define PPP_LCP_ECHOREP 0x0A
+#define SC_RCV_BITS     (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
+
+#define MISSING_WINDOW 20
+#define WRAPPED(curseq, lastseq)\
+	((((curseq) & 0xffffff00) == 0) &&\
+	(((lastseq) & 0xffffff00) == 0xffffff00))
+
+#define PPTP_GRE_PROTO  0x880B
+#define PPTP_GRE_VER    0x1
+
+#define PPTP_GRE_FLAG_C 0x80
+#define PPTP_GRE_FLAG_R 0x40
+#define PPTP_GRE_FLAG_K 0x20
+#define PPTP_GRE_FLAG_S 0x10
+#define PPTP_GRE_FLAG_A 0x80
+
+#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C)
+#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R)
+#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K)
+#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S)
+#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A)
+
+#define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header))
+struct pptp_gre_header {
+	u8  flags;
+	u8  ver;
+	__be16 protocol;
+	__be16 payload_len;
+	__be16 call_id;
+	__be32 seq;
+	__be32 ack;
+} __packed;
+
+
+#endif
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 1046f5515174..60dbb200de60 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -24,9 +24,14 @@
 #define GRE_SEQ		__cpu_to_be16(0x1000)
 #define GRE_STRICT	__cpu_to_be16(0x0800)
 #define GRE_REC		__cpu_to_be16(0x0700)
-#define GRE_FLAGS	__cpu_to_be16(0x00F8)
+#define GRE_ACK		__cpu_to_be16(0x0080)
+#define GRE_FLAGS	__cpu_to_be16(0x0078)
 #define GRE_VERSION	__cpu_to_be16(0x0007)
 
+#define GRE_VERSION_1	__cpu_to_be16(0x0001)
+#define GRE_PROTO_PPP	__cpu_to_be16(0x880b)
+#define GRE_PPTP_KEY_MASK	__cpu_to_be32(0xffff)
+
 struct ip_tunnel_parm {
 	char			name[IFNAMSIZ];
 	int			link;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 61ad43f61c5e..91028ae2fb01 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -6,6 +6,8 @@
 #include <linux/if_vlan.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/gre.h>
+#include <net/pptp.h>
 #include <linux/igmp.h>
 #include <linux/icmp.h>
 #include <linux/sctp.h>
@@ -338,32 +340,42 @@ mpls:
 ip_proto_again:
 	switch (ip_proto) {
 	case IPPROTO_GRE: {
-		struct gre_hdr {
-			__be16 flags;
-			__be16 proto;
-		} *hdr, _hdr;
+		struct gre_base_hdr *hdr, _hdr;
+		u16 gre_ver;
+		int offset = 0;
 
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
 		if (!hdr)
 			goto out_bad;
-		/*
-		 * Only look inside GRE if version zero and no
-		 * routing
-		 */
-		if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
+
+		/* Only look inside GRE without routing */
+		if (hdr->flags & GRE_ROUTING)
 			break;
 
-		proto = hdr->proto;
-		nhoff += 4;
+		/* Only look inside GRE for version 0 and 1 */
+		gre_ver = ntohs(hdr->flags & GRE_VERSION);
+		if (gre_ver > 1)
+			break;
+
+		proto = hdr->protocol;
+		if (gre_ver) {
+			/* Version1 must be PPTP, and check the flags */
+			if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+				break;
+		}
+
+		offset += sizeof(struct gre_base_hdr);
+
 		if (hdr->flags & GRE_CSUM)
-			nhoff += 4;
+			offset += sizeof(((struct gre_full_hdr *)0)->csum) +
+				  sizeof(((struct gre_full_hdr *)0)->reserved1);
+
 		if (hdr->flags & GRE_KEY) {
 			const __be32 *keyid;
 			__be32 _keyid;
 
-			keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+			keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid),
 						     data, hlen, &_keyid);
-
 			if (!keyid)
 				goto out_bad;
 
@@ -372,32 +384,65 @@ ip_proto_again:
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_GRE_KEYID,
 								      target_container);
-				key_keyid->keyid = *keyid;
+				if (gre_ver == 0)
+					key_keyid->keyid = *keyid;
+				else
+					key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
 			}
-			nhoff += 4;
+			offset += sizeof(((struct gre_full_hdr *)0)->key);
 		}
+
 		if (hdr->flags & GRE_SEQ)
-			nhoff += 4;
-		if (proto == htons(ETH_P_TEB)) {
-			const struct ethhdr *eth;
-			struct ethhdr _eth;
-
-			eth = __skb_header_pointer(skb, nhoff,
-						   sizeof(_eth),
-						   data, hlen, &_eth);
-			if (!eth)
+			offset += sizeof(((struct pptp_gre_header *)0)->seq);
+
+		if (gre_ver == 0) {
+			if (proto == htons(ETH_P_TEB)) {
+				const struct ethhdr *eth;
+				struct ethhdr _eth;
+
+				eth = __skb_header_pointer(skb, nhoff + offset,
+							   sizeof(_eth),
+							   data, hlen, &_eth);
+				if (!eth)
+					goto out_bad;
+				proto = eth->h_proto;
+				offset += sizeof(*eth);
+
+				/* Cap headers that we access via pointers at the
+				 * end of the Ethernet header as our maximum alignment
+				 * at that point is only 2 bytes.
+				 */
+				if (NET_IP_ALIGN)
+					hlen = (nhoff + offset);
+			}
+		} else { /* version 1, must be PPTP */
+			u8 _ppp_hdr[PPP_HDRLEN];
+			u8 *ppp_hdr;
+
+			if (hdr->flags & GRE_ACK)
+				offset += sizeof(((struct pptp_gre_header *)0)->ack);
+
+			ppp_hdr = skb_header_pointer(skb, nhoff + offset,
+						     sizeof(_ppp_hdr), _ppp_hdr);
+			if (!ppp_hdr)
 				goto out_bad;
-			proto = eth->h_proto;
-			nhoff += sizeof(*eth);
-
-			/* Cap headers that we access via pointers at the
-			 * end of the Ethernet header as our maximum alignment
-			 * at that point is only 2 bytes.
-			 */
-			if (NET_IP_ALIGN)
-				hlen = nhoff;
+
+			switch (PPP_PROTOCOL(ppp_hdr)) {
+			case PPP_IP:
+				proto = htons(ETH_P_IP);
+				break;
+			case PPP_IPV6:
+				proto = htons(ETH_P_IPV6);
+				break;
+			default:
+				/* Could probably catch some more like MPLS */
+				break;
+			}
+
+			offset += PPP_HDRLEN;
 		}
 
+		nhoff += offset;
 		key_control->flags |= FLOW_DIS_ENCAPSULATION;
 		if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
 			goto out_good;
-- 
cgit v1.2.3-71-gd317


From cb1b69b0b15b2897daeba8674c14c85a23a3347f Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Thu, 11 Aug 2016 18:02:07 +0200
Subject: netfilter: nf_tables: add hash expression

This patch adds a new hash expression, this provides jhash support but
this can be extended to support for other hash functions. The modulus
and seed already comes embedded into this new expression.

Use case example:

	... meta mark set hash ip saddr mod 10

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  20 +++++
 net/netfilter/Kconfig                    |   6 ++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_hash.c                 | 136 +++++++++++++++++++++++++++++++
 4 files changed, 163 insertions(+)
 create mode 100644 net/netfilter/nft_hash.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 01751faccaf8..6ce0a6dd0889 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -723,6 +723,26 @@ enum nft_meta_keys {
 	NFT_META_PRANDOM,
 };
 
+/**
+ * enum nft_hash_attributes - nf_tables hash expression netlink attributes
+ *
+ * @NFTA_HASH_SREG: source register (NLA_U32)
+ * @NFTA_HASH_DREG: destination register (NLA_U32)
+ * @NFTA_HASH_LEN: source data length (NLA_U32)
+ * @NFTA_HASH_MODULUS: modulus value (NLA_U32)
+ * @NFTA_HASH_SEED: seed value (NLA_U32)
+ */
+enum nft_hash_attributes {
+	NFTA_HASH_UNSPEC,
+	NFTA_HASH_SREG,
+	NFTA_HASH_DREG,
+	NFTA_HASH_LEN,
+	NFTA_HASH_MODULUS,
+	NFTA_HASH_SEED,
+	__NFTA_HASH_MAX,
+};
+#define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
+
 /**
  * enum nft_meta_attributes - nf_tables meta expression netlink attributes
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e5740e108a0b..9cfaa00c79b2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -563,6 +563,12 @@ config NFT_COMPAT
 	  x_tables match/target extensions over the nf_tables
 	  framework.
 
+config NFT_HASH
+	tristate "Netfilter nf_tables hash module"
+	help
+	  This option adds the "hash" expression that you can use to perform
+	  a hash operation on registers.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 101fb859203c..1106ccde215c 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_NFT_COUNTER)	+= nft_counter.o
 obj-$(CONFIG_NFT_LOG)		+= nft_log.o
 obj-$(CONFIG_NFT_MASQ)		+= nft_masq.o
 obj-$(CONFIG_NFT_REDIR)		+= nft_redir.o
+obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
new file mode 100644
index 000000000000..b82ff29b3f5f
--- /dev/null
+++ b/net/netfilter/nft_hash.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <linux/jhash.h>
+
+struct nft_hash {
+	enum nft_registers      sreg:8;
+	enum nft_registers      dreg:8;
+	u8			len;
+	u32			modulus;
+	u32			seed;
+};
+
+static void nft_hash_eval(const struct nft_expr *expr,
+			  struct nft_regs *regs,
+			  const struct nft_pktinfo *pkt)
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	const void *data = &regs->data[priv->sreg];
+
+	regs->data[priv->dreg] =
+		reciprocal_scale(jhash(data, priv->len, priv->seed),
+				 priv->modulus);
+}
+
+const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
+	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
+	[NFTA_HASH_LEN]		= { .type = NLA_U32 },
+	[NFTA_HASH_MODULUS]	= { .type = NLA_U32 },
+	[NFTA_HASH_SEED]	= { .type = NLA_U32 },
+};
+
+static int nft_hash_init(const struct nft_ctx *ctx,
+			 const struct nft_expr *expr,
+			 const struct nlattr * const tb[])
+{
+	struct nft_hash *priv = nft_expr_priv(expr);
+	u32 len;
+
+	if (!tb[NFTA_HASH_SREG] ||
+	    !tb[NFTA_HASH_DREG] ||
+	    !tb[NFTA_HASH_LEN]  ||
+	    !tb[NFTA_HASH_SEED] ||
+	    !tb[NFTA_HASH_MODULUS])
+		return -EINVAL;
+
+	priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
+	priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
+
+	len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN]));
+	if (len == 0 || len > U8_MAX)
+		return -ERANGE;
+
+	priv->len = len;
+
+	priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
+	if (priv->modulus <= 1)
+		return -ERANGE;
+
+	priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
+
+	return nft_validate_register_load(priv->sreg, len) &&
+	       nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_hash_dump(struct sk_buff *skb,
+			 const struct nft_expr *expr)
+{
+	const struct nft_hash *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_HASH_LEN, priv->len))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_HASH_MODULUS, priv->modulus))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_HASH_SEED, priv->seed))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_hash_type;
+static const struct nft_expr_ops nft_hash_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_hash)),
+	.eval		= nft_hash_eval,
+	.init		= nft_hash_init,
+	.dump		= nft_hash_dump,
+};
+
+static struct nft_expr_type nft_hash_type __read_mostly = {
+	.name		= "hash",
+	.ops		= &nft_hash_ops,
+	.policy		= nft_hash_policy,
+	.maxattr	= NFTA_HASH_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_hash_module_init(void)
+{
+	return nft_register_expr(&nft_hash_type);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+	nft_unregister_expr(&nft_hash_type);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("hash");
-- 
cgit v1.2.3-71-gd317


From 300d8b93d73533411a98362350fe7d1795f1a044 Mon Sep 17 00:00:00 2001
From: Appana Durga Kedareswara Rao <appana.durga.rao@xilinx.com>
Date: Wed, 10 Aug 2016 11:20:06 +0530
Subject: net: Add mask for Control register 10Mbps speed

This patch adds mask for the Control register
10Mbps speed.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/mii.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h
index 237fac4bc17b..15d8510cdae0 100644
--- a/include/uapi/linux/mii.h
+++ b/include/uapi/linux/mii.h
@@ -48,6 +48,7 @@
 #define BMCR_SPEED100		0x2000	/* Select 100Mbps              */
 #define BMCR_LOOPBACK		0x4000	/* TXD loopback bits           */
 #define BMCR_RESET		0x8000	/* Reset to default state      */
+#define BMCR_SPEED10		0x0000	/* Select 10Mbps               */
 
 /* Basic mode status register. */
 #define BMSR_ERCAP		0x0001	/* Ext-reg capability          */
-- 
cgit v1.2.3-71-gd317


From 60d20f9195b260bdf0ac10c275ae9f6016f9c069 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Fri, 12 Aug 2016 08:56:52 -0700
Subject: bpf: Add bpf_current_task_under_cgroup helper

This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 +++++++++++
 kernel/bpf/arraymap.c    |  2 +-
 kernel/bpf/verifier.c    |  4 +++-
 kernel/trace/bpf_trace.c | 30 ++++++++++++++++++++++++++++++
 4 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fec6056..bea0c4e2830a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_probe_write_user,
 
+	/**
+	 * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task
+	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+	 * @index: index of the cgroup in the bpf_map
+	 * Return:
+	 *   == 0 current failed the cgroup2 descendant test
+	 *   == 1 current succeeded the cgroup2 descendant test
+	 *    < 0 error
+	 */
+	BPF_FUNC_current_task_under_cgroup,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650d7aeb..a2ac051c342f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 				     struct file *map_file /* not used */,
 				     int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69ac199..d504722ebfa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_in_cgroup &&
+		    func_id != BPF_FUNC_current_task_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
+	case BPF_FUNC_current_task_under_cgroup:
 	case BPF_FUNC_skb_in_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438fdb029..6b794d6669a7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,6 +376,34 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
+static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *)(long)r1;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct cgroup *cgrp;
+	u32 idx = (u32)r2;
+
+	if (unlikely(in_interrupt()))
+		return -EINVAL;
+
+	if (unlikely(idx >= array->map.max_entries))
+		return -E2BIG;
+
+	cgrp = READ_ONCE(array->ptrs[idx]);
+	if (unlikely(!cgrp))
+		return -EAGAIN;
+
+	return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+	.func           = bpf_current_task_under_cgroup,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_CONST_MAP_PTR,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -407,6 +435,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_probe_write_user:
 		return bpf_get_probe_write_proto();
+	case BPF_FUNC_current_task_under_cgroup:
+		return &bpf_current_task_under_cgroup_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-71-gd317


From 03459345bc00da70a35fa39bcfcf13d779097074 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Sat, 13 Aug 2016 00:30:48 +0800
Subject: pptp: Refactor the struct and macros of PPTP codes

1. Use struct gre_base_hdr directly in pptp_gre_header instead of
duplicated members;
2. Use existing macros like GRE_KEY, GRE_SEQ, and so on instead of
duplicated macros defined by PPTP;
3. Add new macros like GRE_IS_ACK/SEQ and so on instead of
PPTP_GRE_IS_A/S and so on;

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Reviewed-by: Philip Prindeville <philipp@redfish-solutions.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pptp.c         | 28 +++++++++++++---------------
 include/net/pptp.h             | 19 +------------------
 include/uapi/linux/if_tunnel.h | 12 ++++++++++--
 3 files changed, 24 insertions(+), 35 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 3e68dbc0af7f..1951b1085cb8 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -206,16 +206,14 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	skb_push(skb, header_len);
 	hdr = (struct pptp_gre_header *)(skb->data);
 
-	hdr->flags       = PPTP_GRE_FLAG_K;
-	hdr->ver         = PPTP_GRE_VER;
-	hdr->protocol    = htons(PPTP_GRE_PROTO);
-	hdr->call_id     = htons(opt->dst_addr.call_id);
+	hdr->gre_hd.flags = GRE_KEY | GRE_VERSION_1 | GRE_SEQ;
+	hdr->gre_hd.protocol = GRE_PROTO_PPP;
+	hdr->call_id = htons(opt->dst_addr.call_id);
 
-	hdr->flags      |= PPTP_GRE_FLAG_S;
-	hdr->seq         = htonl(++opt->seq_sent);
+	hdr->seq = htonl(++opt->seq_sent);
 	if (opt->ack_sent != seq_recv)	{
 		/* send ack with this message */
-		hdr->ver |= PPTP_GRE_FLAG_A;
+		hdr->gre_hd.flags |= GRE_ACK;
 		hdr->ack  = htonl(seq_recv);
 		opt->ack_sent = seq_recv;
 	}
@@ -278,7 +276,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb)
 	headersize  = sizeof(*header);
 
 	/* test if acknowledgement present */
-	if (PPTP_GRE_IS_A(header->ver)) {
+	if (GRE_IS_ACK(header->gre_hd.flags)) {
 		__u32 ack;
 
 		if (!pskb_may_pull(skb, headersize))
@@ -286,7 +284,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb)
 		header = (struct pptp_gre_header *)(skb->data);
 
 		/* ack in different place if S = 0 */
-		ack = PPTP_GRE_IS_S(header->flags) ? header->ack : header->seq;
+		ack = GRE_IS_SEQ(header->gre_hd.flags) ? header->ack : header->seq;
 
 		ack = ntohl(ack);
 
@@ -299,7 +297,7 @@ static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb)
 		headersize -= sizeof(header->ack);
 	}
 	/* test if payload present */
-	if (!PPTP_GRE_IS_S(header->flags))
+	if (!GRE_IS_SEQ(header->gre_hd.flags))
 		goto drop;
 
 	payload_len = ntohs(header->payload_len);
@@ -360,11 +358,11 @@ static int pptp_rcv(struct sk_buff *skb)
 
 	header = (struct pptp_gre_header *)skb->data;
 
-	if (ntohs(header->protocol) != PPTP_GRE_PROTO || /* PPTP-GRE protocol for PPTP */
-		PPTP_GRE_IS_C(header->flags) ||                /* flag C should be clear */
-		PPTP_GRE_IS_R(header->flags) ||                /* flag R should be clear */
-		!PPTP_GRE_IS_K(header->flags) ||               /* flag K should be set */
-		(header->flags&0xF) != 0)                      /* routing and recursion ctrl = 0 */
+	if (header->gre_hd.protocol != GRE_PROTO_PPP || /* PPTP-GRE protocol for PPTP */
+		GRE_IS_CSUM(header->gre_hd.flags) ||    /* flag CSUM should be clear */
+		GRE_IS_ROUTING(header->gre_hd.flags) || /* flag ROUTING should be clear */
+		!GRE_IS_KEY(header->gre_hd.flags) ||    /* flag KEY should be set */
+		(header->gre_hd.flags & GRE_FLAGS))     /* flag Recursion Ctrl should be clear */
 		/* if invalid, discard this packet */
 		goto drop;
 
diff --git a/include/net/pptp.h b/include/net/pptp.h
index 301d3e2ba1f9..92e9f1fe2628 100644
--- a/include/net/pptp.h
+++ b/include/net/pptp.h
@@ -10,26 +10,9 @@
 	((((curseq) & 0xffffff00) == 0) &&\
 	(((lastseq) & 0xffffff00) == 0xffffff00))
 
-#define PPTP_GRE_PROTO  0x880B
-#define PPTP_GRE_VER    0x1
-
-#define PPTP_GRE_FLAG_C 0x80
-#define PPTP_GRE_FLAG_R 0x40
-#define PPTP_GRE_FLAG_K 0x20
-#define PPTP_GRE_FLAG_S 0x10
-#define PPTP_GRE_FLAG_A 0x80
-
-#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C)
-#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R)
-#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K)
-#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S)
-#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A)
-
 #define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header))
 struct pptp_gre_header {
-	u8  flags;
-	u8  ver;
-	__be16 protocol;
+	struct gre_base_hdr gre_hd;
 	__be16 payload_len;
 	__be16 call_id;
 	__be32 seq;
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 60dbb200de60..361b9f0f20d7 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -28,8 +28,16 @@
 #define GRE_FLAGS	__cpu_to_be16(0x0078)
 #define GRE_VERSION	__cpu_to_be16(0x0007)
 
-#define GRE_VERSION_1	__cpu_to_be16(0x0001)
-#define GRE_PROTO_PPP	__cpu_to_be16(0x880b)
+#define GRE_IS_CSUM(f)		((f) & GRE_CSUM)
+#define GRE_IS_ROUTING(f)	((f) & GRE_ROUTING)
+#define GRE_IS_KEY(f)		((f) & GRE_KEY)
+#define GRE_IS_SEQ(f)		((f) & GRE_SEQ)
+#define GRE_IS_STRICT(f)	((f) & GRE_STRICT)
+#define GRE_IS_REC(f)		((f) & GRE_REC)
+#define GRE_IS_ACK(f)		((f) & GRE_ACK)
+
+#define GRE_VERSION_1		__cpu_to_be16(0x0001)
+#define GRE_PROTO_PPP		__cpu_to_be16(0x880b)
 #define GRE_PPTP_KEY_MASK	__cpu_to_be32(0xffff)
 
 struct ip_tunnel_parm {
-- 
cgit v1.2.3-71-gd317


From 9399ae9a6cb28ebac78216f715ace3b42f1c2132 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Wed, 17 Aug 2016 13:36:13 +0300
Subject: net_sched: flower: Add vlan support

Enhance flower to support 802.1Q vlan protocol classification.
Currently, the supported fields are vlan_id and vlan_priority.

Example:

	# add a flower filter with vlan id and priority classification
	tc filter add dev ens4f0 protocol 802.1Q parent ffff: \
		flower \
		indev ens4f0 \
		vlan_ethtype ipv4 \
		vlan_id 100 \
		vlan_prio 3 \
	action vlan pop

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  3 ++
 net/sched/cls_flower.c       | 70 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index d1c1ccaba787..51b5b247fb5a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -428,6 +428,9 @@ enum {
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
 
 	TCA_FLOWER_FLAGS,
+	TCA_FLOWER_KEY_VLAN_ID,
+	TCA_FLOWER_KEY_VLAN_PRIO,
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 0080fc073019..1e11e57e6947 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -28,6 +28,7 @@ struct fl_flow_key {
 	struct flow_dissector_key_control control;
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
+	struct flow_dissector_key_vlan vlan;
 	struct flow_dissector_key_addrs ipaddrs;
 	union {
 		struct flow_dissector_key_ipv4_addrs ipv4;
@@ -293,6 +294,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_TCP_DST]	= { .type = NLA_U16 },
 	[TCA_FLOWER_KEY_UDP_SRC]	= { .type = NLA_U16 },
 	[TCA_FLOWER_KEY_UDP_DST]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_VLAN_ID]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_VLAN_PRIO]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_VLAN_ETH_TYPE]	= { .type = NLA_U16 },
+
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -308,9 +313,29 @@ static void fl_set_key_val(struct nlattr **tb,
 		memcpy(mask, nla_data(tb[mask_type]), len);
 }
 
+static void fl_set_key_vlan(struct nlattr **tb,
+			    struct flow_dissector_key_vlan *key_val,
+			    struct flow_dissector_key_vlan *key_mask)
+{
+#define VLAN_PRIORITY_MASK	0x7
+
+	if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+		key_val->vlan_id =
+			nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+		key_mask->vlan_id = VLAN_VID_MASK;
+	}
+	if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+		key_val->vlan_priority =
+			nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+			VLAN_PRIORITY_MASK;
+		key_mask->vlan_priority = VLAN_PRIORITY_MASK;
+	}
+}
+
 static int fl_set_key(struct net *net, struct nlattr **tb,
 		      struct fl_flow_key *key, struct fl_flow_key *mask)
 {
+	__be16 ethertype;
 #ifdef CONFIG_NET_CLS_IND
 	if (tb[TCA_FLOWER_INDEV]) {
 		int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -328,9 +353,19 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 		       mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
 		       sizeof(key->eth.src));
 
-	fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
-		       &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
-		       sizeof(key->basic.n_proto));
+	if (tb[TCA_FLOWER_KEY_ETH_TYPE])
+		ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
+
+	if (ethertype == htons(ETH_P_8021Q)) {
+		fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
+		fl_set_key_val(tb, &key->basic.n_proto,
+			       TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+			       &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+			       sizeof(key->basic.n_proto));
+	} else {
+		key->basic.n_proto = ethertype;
+		mask->basic.n_proto = cpu_to_be16(~0);
+	}
 
 	if (key->basic.n_proto == htons(ETH_P_IP) ||
 	    key->basic.n_proto == htons(ETH_P_IPV6)) {
@@ -438,6 +473,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
 			     FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_PORTS, tp);
+	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+			     FLOW_DISSECTOR_KEY_VLAN, vlan);
 
 	skb_flow_dissector_init(&head->dissector, keys, cnt);
 }
@@ -666,6 +703,29 @@ static int fl_dump_key_val(struct sk_buff *skb,
 	return 0;
 }
 
+static int fl_dump_key_vlan(struct sk_buff *skb,
+			    struct flow_dissector_key_vlan *vlan_key,
+			    struct flow_dissector_key_vlan *vlan_mask)
+{
+	int err;
+
+	if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
+		return 0;
+	if (vlan_mask->vlan_id) {
+		err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
+				  vlan_key->vlan_id);
+		if (err)
+			return err;
+	}
+	if (vlan_mask->vlan_priority) {
+		err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
+				 vlan_key->vlan_priority);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		   struct sk_buff *skb, struct tcmsg *t)
 {
@@ -710,6 +770,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			    &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
 			    sizeof(key->basic.n_proto)))
 		goto nla_put_failure;
+
+	if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
+		goto nla_put_failure;
+
 	if ((key->basic.n_proto == htons(ETH_P_IP) ||
 	     key->basic.n_proto == htons(ETH_P_IPV6)) &&
 	    fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
-- 
cgit v1.2.3-71-gd317


From 956af37102b515512331a03c35c958b2a1d8dd87 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Wed, 17 Aug 2016 13:36:14 +0300
Subject: net_sched: act_vlan: Add priority option

The current vlan push action supports only vid and protocol options.
Add priority option.

Example script that adds vlan push action with vid and
priority:

tc filter add dev veth0 protocol ip parent ffff: \
	   flower \
	   	indev veth0 \
	   action vlan push id 100 priority 5

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_vlan.h        |  1 +
 include/uapi/linux/tc_act/tc_vlan.h |  1 +
 net/sched/act_vlan.c                | 13 +++++++++++--
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index e29f52e8bdf1..6b835889ea30 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -20,6 +20,7 @@ struct tcf_vlan {
 	int			tcfv_action;
 	u16			tcfv_push_vid;
 	__be16			tcfv_push_proto;
+	u8			tcfv_push_prio;
 };
 #define to_vlan(a) ((struct tcf_vlan *)a)
 
diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h
index 31151ff6264f..be72b6e3843b 100644
--- a/include/uapi/linux/tc_act/tc_vlan.h
+++ b/include/uapi/linux/tc_act/tc_vlan.h
@@ -29,6 +29,7 @@ enum {
 	TCA_VLAN_PUSH_VLAN_ID,
 	TCA_VLAN_PUSH_VLAN_PROTOCOL,
 	TCA_VLAN_PAD,
+	TCA_VLAN_PUSH_VLAN_PRIORITY,
 	__TCA_VLAN_MAX,
 };
 #define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 691409de3e1a..59a8d3150ae2 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -43,7 +43,8 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 			goto drop;
 		break;
 	case TCA_VLAN_ACT_PUSH:
-		err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid);
+		err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid |
+				    (v->tcfv_push_prio << VLAN_PRIO_SHIFT));
 		if (err)
 			goto drop;
 		break;
@@ -65,6 +66,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 	[TCA_VLAN_PARMS]		= { .len = sizeof(struct tc_vlan) },
 	[TCA_VLAN_PUSH_VLAN_ID]		= { .type = NLA_U16 },
 	[TCA_VLAN_PUSH_VLAN_PROTOCOL]	= { .type = NLA_U16 },
+	[TCA_VLAN_PUSH_VLAN_PRIORITY]	= { .type = NLA_U8 },
 };
 
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
@@ -78,6 +80,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	int action;
 	__be16 push_vid = 0;
 	__be16 push_proto = 0;
+	u8 push_prio = 0;
 	bool exists = false;
 	int ret = 0, err;
 
@@ -123,6 +126,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 		} else {
 			push_proto = htons(ETH_P_8021Q);
 		}
+
+		if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY])
+			push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]);
 		break;
 	default:
 		if (exists)
@@ -150,6 +156,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 
 	v->tcfv_action = action;
 	v->tcfv_push_vid = push_vid;
+	v->tcfv_push_prio = push_prio;
 	v->tcfv_push_proto = push_proto;
 
 	v->tcf_action = parm->action;
@@ -181,7 +188,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 	if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
 	    (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
 	     nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
-			  v->tcfv_push_proto)))
+			  v->tcfv_push_proto) ||
+	     (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY,
+					      v->tcfv_push_prio))))
 		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &v->tcf_tm);
-- 
cgit v1.2.3-71-gd317


From 61ba1a2da9693b88bf5f2bb8e7a99a29cd139122 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 17 Aug 2016 12:53:10 +0200
Subject: net: bridge: export vlan flags with the stats

Use one of the vlan xstats padding fields to export the vlan flags. This is
needed in order to be able to distinguish between master (bridge) and port
vlan entries in user-space when dumping the bridge vlan stats.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 2 +-
 net/bridge/br_netlink.c        | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index c186f64fffca..ab92bca6d448 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -140,7 +140,7 @@ struct bridge_vlan_xstats {
 	__u64 tx_bytes;
 	__u64 tx_packets;
 	__u16 vid;
-	__u16 pad1;
+	__u16 flags;
 	__u32 pad2;
 };
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 493ab9b3d51a..872d4c0deb59 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1321,6 +1321,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
 				continue;
 			memset(&vxi, 0, sizeof(vxi));
 			vxi.vid = v->vid;
+			vxi.flags = v->flags;
 			br_vlan_get_stats(v, &stats);
 			vxi.rx_bytes = stats.rx_bytes;
 			vxi.rx_packets = stats.rx_packets;
-- 
cgit v1.2.3-71-gd317


From b34040227be7da760cc72ef3c807e0985e7f0f16 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Thu, 18 Aug 2016 10:33:52 +0200
Subject: tipc: add peer removal functionality

Add TIPC_NL_PEER_REMOVE netlink command. This command can remove
an offline peer node from the internal data structures.

This will be supported by the tipc user space tool in iproute2.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |  1 +
 net/tipc/net.h                    |  2 ++
 net/tipc/netlink.c                |  5 ++++
 net/tipc/node.c                   | 63 +++++++++++++++++++++++++++++++++++++++
 net/tipc/node.h                   |  1 +
 5 files changed, 72 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index 5f3f6d09fb79..bcb65ef725f6 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -59,6 +59,7 @@ enum {
 	TIPC_NL_MON_SET,
 	TIPC_NL_MON_GET,
 	TIPC_NL_MON_PEER_GET,
+	TIPC_NL_PEER_REMOVE,
 
 	__TIPC_NL_CMD_MAX,
 	TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 77a7a118911d..c7c254902873 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -39,6 +39,8 @@
 
 #include <net/genetlink.h>
 
+extern const struct nla_policy tipc_nl_net_policy[];
+
 int tipc_net_start(struct net *net, u32 addr);
 
 void tipc_net_stop(struct net *net);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index a84daec0afe9..2718de667828 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -238,6 +238,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
 		.dumpit	= tipc_nl_node_dump_monitor_peer,
 		.policy = tipc_nl_policy,
 	},
+	{
+		.cmd	= TIPC_NL_PEER_REMOVE,
+		.doit	= tipc_nl_peer_rm,
+		.policy = tipc_nl_policy,
+	}
 };
 
 int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 21974191e425..7e8b75fd1a02 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1553,6 +1553,69 @@ discard:
 	kfree_skb(skb);
 }
 
+int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
+	struct tipc_node *peer;
+	u32 addr;
+	int err;
+	int i;
+
+	/* We identify the peer by its net */
+	if (!info->attrs[TIPC_NLA_NET])
+		return -EINVAL;
+
+	err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX,
+			       info->attrs[TIPC_NLA_NET],
+			       tipc_nl_net_policy);
+	if (err)
+		return err;
+
+	if (!attrs[TIPC_NLA_NET_ADDR])
+		return -EINVAL;
+
+	addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
+
+	if (in_own_node(net, addr))
+		return -ENOTSUPP;
+
+	spin_lock_bh(&tn->node_list_lock);
+	peer = tipc_node_find(net, addr);
+	if (!peer) {
+		spin_unlock_bh(&tn->node_list_lock);
+		return -ENXIO;
+	}
+
+	tipc_node_write_lock(peer);
+	if (peer->state != SELF_DOWN_PEER_DOWN &&
+	    peer->state != SELF_DOWN_PEER_LEAVING) {
+		tipc_node_write_unlock(peer);
+		err = -EBUSY;
+		goto err_out;
+	}
+
+	for (i = 0; i < MAX_BEARERS; i++) {
+		struct tipc_link_entry *le = &peer->links[i];
+
+		if (le->link) {
+			kfree(le->link);
+			le->link = NULL;
+			peer->link_cnt--;
+		}
+	}
+	tipc_node_write_unlock(peer);
+	tipc_node_delete(peer);
+
+	err = 0;
+err_out:
+	tipc_node_put(peer);
+	spin_unlock_bh(&tn->node_list_lock);
+
+	return err;
+}
+
 int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int err;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index d69fdfcc0ec9..4578b34c7dca 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -77,6 +77,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info);
 
 int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
-- 
cgit v1.2.3-71-gd317


From 5293efe62df81908f2e90c9820c7edcc8e61f5e9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 18 Aug 2016 01:00:39 +0200
Subject: bpf: add bpf_skb_change_tail helper

This work adds a bpf_skb_change_tail() helper for tc BPF programs. The
basic idea is to expand or shrink the skb in a controlled manner. The
eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(),
bpf_lX_csum_replace() and others rather than passing a raw buffer for
writing here.

bpf_skb_change_tail() is really a slow path helper and intended for
replies with f.e. ICMP control messages. Concept is similar to other
helpers like bpf_skb_change_proto() helper to keep the helper without
protocol specifics and let the BPF program mangle the remaining parts.
A flags field has been added and is reserved for now should we extend
the helper in future.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h   |  43 +++++++++++++++++++-
 include/uapi/linux/bpf.h |  11 ++++++
 net/core/filter.c        | 100 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 150 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0f665cb26b50..7047448e8129 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
 
 int ___pskb_trim(struct sk_buff *skb, unsigned int len);
 
-static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
+static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
 {
 	if (unlikely(skb_is_nonlinear(skb))) {
 		WARN_ON(1);
@@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
 	skb_set_tail_pointer(skb, len);
 }
 
+static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
+{
+	__skb_set_length(skb, len);
+}
+
 void skb_trim(struct sk_buff *skb, unsigned int len);
 
 static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
@@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
 	BUG_ON(err);
 }
 
+static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
+{
+	unsigned int diff = len - skb->len;
+
+	if (skb_tailroom(skb) < diff) {
+		int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb),
+					   GFP_ATOMIC);
+		if (ret)
+			return ret;
+	}
+	__skb_set_length(skb, len);
+	return 0;
+}
+
 /**
  *	skb_orphan - orphan a buffer
  *	@skb: buffer to orphan
@@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 	return __pskb_trim(skb, len);
 }
 
+static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->ip_summed = CHECKSUM_NONE;
+	__skb_trim(skb, len);
+	return 0;
+}
+
+static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->ip_summed = CHECKSUM_NONE;
+	return __skb_grow(skb, len);
+}
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
 }
 
+static inline void skb_gso_reset(struct sk_buff *skb)
+{
+	skb_shinfo(skb)->gso_size = 0;
+	skb_shinfo(skb)->gso_segs = 0;
+	skb_shinfo(skb)->gso_type = 0;
+}
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb);
 
 static inline bool skb_warn_if_lro(const struct sk_buff *skb)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 866d53c33298..e4c5a1baa993 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -386,6 +386,17 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_current_task_under_cgroup,
 
+	/**
+	 * bpf_skb_change_tail(skb, len, flags)
+	 * The helper will resize the skb to the given new size,
+	 * to be used f.e. with control messages.
+	 * @skb: pointer to skb
+	 * @len: new skb length
+	 * @flags: reserved
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_change_tail,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 58b5e6dd25fe..abf546d96b6b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1350,14 +1350,18 @@ struct bpf_scratchpad {
 
 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+					  unsigned int write_len)
+{
+	return skb_ensure_writable(skb, write_len);
+}
+
 static inline int bpf_try_make_writable(struct sk_buff *skb,
 					unsigned int write_len)
 {
-	int err;
+	int err = __bpf_try_make_writable(skb, write_len);
 
-	err = skb_ensure_writable(skb, write_len);
 	bpf_compute_data_end(skb);
-
 	return err;
 }
 
@@ -1992,6 +1996,92 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
+{
+	u32 min_len = skb_network_offset(skb);
+
+	if (skb_transport_header_was_set(skb))
+		min_len = skb_transport_offset(skb);
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		min_len = skb_checksum_start_offset(skb) +
+			  skb->csum_offset + sizeof(__sum16);
+	return min_len;
+}
+
+static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+{
+	return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+	       65536;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+	unsigned int old_len = skb->len;
+	int ret;
+
+	ret = __skb_grow_rcsum(skb, new_len);
+	if (!ret)
+		memset(skb->data + old_len, 0, new_len - old_len);
+	return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+	return __skb_trim_rcsum(skb, new_len);
+}
+
+static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *)(long) r1;
+	u32 max_len = __bpf_skb_max_len(skb);
+	u32 min_len = __bpf_skb_min_len(skb);
+	u32 new_len = (u32) r2;
+	int ret;
+
+	if (unlikely(flags || new_len > max_len || new_len < min_len))
+		return -EINVAL;
+	if (skb->encapsulation)
+		return -ENOTSUPP;
+
+	/* The basic idea of this helper is that it's performing the
+	 * needed work to either grow or trim an skb, and eBPF program
+	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
+	 * bpf_lX_csum_replace() and others rather than passing a raw
+	 * buffer here. This one is a slow path helper and intended
+	 * for replies with control messages.
+	 *
+	 * Like in bpf_skb_change_proto(), we want to keep this rather
+	 * minimal and without protocol specifics so that we are able
+	 * to separate concerns as in bpf_skb_store_bytes() should only
+	 * be the one responsible for writing buffers.
+	 *
+	 * It's really expected to be a slow path operation here for
+	 * control message replies, so we're implicitly linearizing,
+	 * uncloning and drop offloads from the skb by this.
+	 */
+	ret = __bpf_try_make_writable(skb, skb->len);
+	if (!ret) {
+		if (new_len > skb->len)
+			ret = bpf_skb_grow_rcsum(skb, new_len);
+		else if (new_len < skb->len)
+			ret = bpf_skb_trim_rcsum(skb, new_len);
+		if (!ret && skb_is_gso(skb))
+			skb_gso_reset(skb);
+	}
+
+	bpf_compute_data_end(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+	.func		= bpf_skb_change_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push)
@@ -2002,6 +2092,8 @@ bool bpf_helper_changes_skb_data(void *func)
 		return true;
 	if (func == bpf_skb_change_proto)
 		return true;
+	if (func == bpf_skb_change_tail)
+		return true;
 	if (func == bpf_l3_csum_replace)
 		return true;
 	if (func == bpf_l4_csum_replace)
@@ -2368,6 +2460,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_change_proto_proto;
 	case BPF_FUNC_skb_change_type:
 		return &bpf_skb_change_type_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
-- 
cgit v1.2.3-71-gd317


From 3d2f30a1df907e3ef4175121f0d21456630a72aa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 18 Aug 2016 01:46:06 +0200
Subject: netfilter: nf_tables: add quota expression

This patch adds the quota expression. This new stateful expression
integrate easily into the dynset expression to build 'hashquota' flow
tables.

Arguably, we could use instead "counter bytes > 1000" instead, but this
approach has several problems:

1) We only support for one single stateful expression in dynamic set
   definitions, and the expression above is a composite of two
   expressions: get counter + comparison.

2) We would need to restore the packed counter representation (that we
   used to have) based on seqlock to synchronize this, since per-cpu is
   not suitable for this.

So instead of bloating the counter expression back with the seqlock
representation and extending the existing set infrastructure to make it
more complex for the composite described above, let's follow the more
simple approach of adding a quota expression that we can plug into our
existing infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  19 +++++
 net/netfilter/Kconfig                    |   6 ++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_quota.c                | 121 +++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100644 net/netfilter/nft_quota.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 6ce0a6dd0889..784fbf15ab3d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -900,6 +900,25 @@ enum nft_queue_attributes {
 #define NFT_QUEUE_FLAG_CPU_FANOUT	0x02 /* use current CPU (no hashing) */
 #define NFT_QUEUE_FLAG_MASK		0x03
 
+enum nft_quota_flags {
+	NFT_QUOTA_F_INV		= (1 << 0),
+};
+
+/**
+ * enum nft_quota_attributes - nf_tables quota expression netlink attributes
+ *
+ * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U16)
+ * @NFTA_QUOTA_FLAGS: flags (NLA_U32)
+ */
+enum nft_quota_attributes {
+	NFTA_QUOTA_UNSPEC,
+	NFTA_QUOTA_BYTES,
+	NFTA_QUOTA_FLAGS,
+	NFTA_QUOTA_PAD,
+	__NFTA_QUOTA_MAX
+};
+#define NFTA_QUOTA_MAX		(__NFTA_QUOTA_MAX - 1)
+
 /**
  * enum nft_reject_types - nf_tables reject expression reject types
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9cfaa00c79b2..29a8078deafa 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -542,6 +542,12 @@ config NFT_QUEUE
 	  This is required if you intend to use the userspace queueing
 	  infrastructure (also known as NFQUEUE) from nftables.
 
+config NFT_QUOTA
+	tristate "Netfilter nf_tables quota module"
+	help
+	  This option adds the "quota" expression that you can use to match
+	  enforce bytes quotas.
+
 config NFT_REJECT
 	default m if NETFILTER_ADVANCED=n
 	tristate "Netfilter nf_tables reject support"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 1106ccde215c..0fc42df19b8c 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 obj-$(CONFIG_NFT_QUEUE)		+= nft_queue.o
+obj-$(CONFIG_NFT_QUOTA)		+= nft_quota.o
 obj-$(CONFIG_NFT_REJECT) 	+= nft_reject.o
 obj-$(CONFIG_NFT_REJECT_INET)	+= nft_reject_inet.o
 obj-$(CONFIG_NFT_SET_RBTREE)	+= nft_set_rbtree.o
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
new file mode 100644
index 000000000000..6eafbf987ed9
--- /dev/null
+++ b/net/netfilter/nft_quota.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_quota {
+	u64		quota;
+	bool		invert;
+	atomic64_t	remain;
+};
+
+static inline long nft_quota(struct nft_quota *priv,
+			     const struct nft_pktinfo *pkt)
+{
+	return atomic64_sub_return(pkt->skb->len, &priv->remain);
+}
+
+static void nft_quota_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
+{
+	struct nft_quota *priv = nft_expr_priv(expr);
+
+	if (nft_quota(priv, pkt) < 0 && !priv->invert)
+		regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
+	[NFTA_QUOTA_BYTES]	= { .type = NLA_U64 },
+	[NFTA_QUOTA_FLAGS]	= { .type = NLA_U32 },
+};
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
+{
+	struct nft_quota *priv = nft_expr_priv(expr);
+	u32 flags = 0;
+	u64 quota;
+
+	if (!tb[NFTA_QUOTA_BYTES])
+		return -EINVAL;
+
+	quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES]));
+	if (quota > S64_MAX)
+		return -EOVERFLOW;
+
+	if (tb[NFTA_QUOTA_FLAGS]) {
+		flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
+		if (flags & ~NFT_QUOTA_F_INV)
+			return -EINVAL;
+	}
+
+	priv->quota = quota;
+	priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
+	atomic64_set(&priv->remain, quota);
+
+	return 0;
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_quota *priv = nft_expr_priv(expr);
+	u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+
+	if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
+			 NFTA_QUOTA_PAD) ||
+	    nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_quota_type;
+static const struct nft_expr_ops nft_quota_ops = {
+	.type		= &nft_quota_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_quota)),
+	.eval		= nft_quota_eval,
+	.init		= nft_quota_init,
+	.dump		= nft_quota_dump,
+};
+
+static struct nft_expr_type nft_quota_type __read_mostly = {
+	.name		= "quota",
+	.ops		= &nft_quota_ops,
+	.policy		= nft_quota_policy,
+	.maxattr	= NFTA_QUOTA_MAX,
+	.flags		= NFT_EXPR_STATEFUL,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_quota_module_init(void)
+{
+        return nft_register_expr(&nft_quota_type);
+}
+
+static void __exit nft_quota_module_exit(void)
+{
+        nft_unregister_expr(&nft_quota_type);
+}
+
+module_init(nft_quota_module_init);
+module_exit(nft_quota_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("quota");
-- 
cgit v1.2.3-71-gd317


From 91dbc6be0a62d3bcea98287734d593610aed507d Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Thu, 18 Aug 2016 12:13:13 +0200
Subject: netfilter: nf_tables: add number generator expression

This patch adds the numgen expression that allows us to generated
incremental and random numbers, this generator is bound to a upper limit
that is specified by userspace.

This expression is useful to distribute packets in a round-robin fashion
as well as randomly.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  24 ++++
 net/netfilter/Kconfig                    |   6 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_numgen.c               | 192 +++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 net/netfilter/nft_numgen.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 784fbf15ab3d..8c9d6ff70ec0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1121,4 +1121,28 @@ enum nft_trace_types {
 	__NFT_TRACETYPE_MAX
 };
 #define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1)
+
+/**
+ * enum nft_ng_attributes - nf_tables number generator expression netlink attributes
+ *
+ * @NFTA_NG_DREG: destination register (NLA_U32)
+ * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32)
+ * @NFTA_NG_TYPE: operation type (NLA_U32)
+ */
+enum nft_ng_attributes {
+	NFTA_NG_UNSPEC,
+	NFTA_NG_DREG,
+	NFTA_NG_UNTIL,
+	NFTA_NG_TYPE,
+	__NFTA_NG_MAX
+};
+#define NFTA_NG_MAX	(__NFTA_NG_MAX - 1)
+
+enum nft_ng_types {
+	NFT_NG_INCREMENTAL,
+	NFT_NG_RANDOM,
+	__NFT_NG_MAX
+};
+#define NFT_NG_MAX	(__NFT_NG_MAX - 1)
+
 #endif /* _LINUX_NF_TABLES_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 29a8078deafa..e8d56d9a4df2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -474,6 +474,12 @@ config NFT_META
 	  This option adds the "meta" expression that you can use to match and
 	  to set packet metainformation such as the packet mark.
 
+config NFT_NUMGEN
+	tristate "Netfilter nf_tables number generator module"
+	help
+	  This option adds the number generator expression used to perform
+	  incremental counting and random numbers bound to a upper limit.
+
 config NFT_CT
 	depends on NF_CONNTRACK
 	tristate "Netfilter nf_tables conntrack module"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0fc42df19b8c..0c8581100ac6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_NF_TABLES_NETDEV)	+= nf_tables_netdev.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
+obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
new file mode 100644
index 000000000000..176e26d5bbd0
--- /dev/null
+++ b/net/netfilter/nft_numgen.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/static_key.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
+
+struct nft_ng_inc {
+	enum nft_registers      dreg:8;
+	u32			until;
+	atomic_t		counter;
+};
+
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	u32 nval, oval;
+
+	do {
+		oval = atomic_read(&priv->counter);
+		nval = (oval + 1 < priv->until) ? oval + 1 : 0;
+	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+
+	memcpy(&regs->data[priv->dreg], &priv->counter, sizeof(u32));
+}
+
+static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
+	[NFTA_NG_DREG]		= { .type = NLA_U32 },
+	[NFTA_NG_UNTIL]		= { .type = NLA_U32 },
+	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
+};
+
+static int nft_ng_inc_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
+	if (priv->until == 0)
+		return -ERANGE;
+
+	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+	atomic_set(&priv->counter, 0);
+
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
+		       u32 until, enum nft_ng_types type)
+{
+	if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_NG_UNTIL, until))
+		goto nla_put_failure;
+	if (nft_dump_register(skb, NFTA_NG_TYPE, type))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL);
+}
+
+struct nft_ng_random {
+	enum nft_registers      dreg:8;
+	u32			until;
+};
+
+static void nft_ng_random_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_random *priv = nft_expr_priv(expr);
+	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+
+	regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
+						  priv->until);
+}
+
+static int nft_ng_random_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_ng_random *priv = nft_expr_priv(expr);
+
+	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
+	if (priv->until == 0)
+		return -ERANGE;
+
+	prandom_init_once(&nft_numgen_prandom_state);
+
+	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
+
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, sizeof(u32));
+}
+
+static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM);
+}
+
+static struct nft_expr_type nft_ng_type;
+static const struct nft_expr_ops nft_ng_inc_ops = {
+	.type		= &nft_ng_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+	.eval		= nft_ng_inc_eval,
+	.init		= nft_ng_inc_init,
+	.dump		= nft_ng_inc_dump,
+};
+
+static const struct nft_expr_ops nft_ng_random_ops = {
+	.type		= &nft_ng_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
+	.eval		= nft_ng_random_eval,
+	.init		= nft_ng_random_init,
+	.dump		= nft_ng_random_dump,
+};
+
+static const struct nft_expr_ops *
+nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+	u32 type;
+
+	if (!tb[NFTA_NG_DREG]	||
+	    !tb[NFTA_NG_UNTIL]	||
+	    !tb[NFTA_NG_TYPE])
+		return ERR_PTR(-EINVAL);
+
+	type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE]));
+
+	switch (type) {
+	case NFT_NG_INCREMENTAL:
+		return &nft_ng_inc_ops;
+	case NFT_NG_RANDOM:
+		return &nft_ng_random_ops;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_ng_type __read_mostly = {
+	.name		= "numgen",
+	.select_ops	= &nft_ng_select_ops,
+	.policy		= nft_ng_policy,
+	.maxattr	= NFTA_NG_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_ng_module_init(void)
+{
+	return nft_register_expr(&nft_ng_type);
+}
+
+static void __exit nft_ng_module_exit(void)
+{
+	nft_unregister_expr(&nft_ng_type);
+}
+
+module_init(nft_ng_module_init);
+module_exit(nft_ng_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("numgen");
-- 
cgit v1.2.3-71-gd317


From b43f95695124baba9f7c48e247fc8fd212a984d9 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Mon, 22 Aug 2016 11:37:36 +0200
Subject: netfilter: nf_tables: typo in trace attribute definition

Should be attributes, instead of attibutes, for consistency with other
definitions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 8c9d6ff70ec0..8a63f22b365d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1090,7 +1090,7 @@ enum nft_gen_attributes {
  * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32)
  * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32)
  */
-enum nft_trace_attibutes {
+enum nft_trace_attributes {
 	NFTA_TRACE_UNSPEC,
 	NFTA_TRACE_TABLE,
 	NFTA_TRACE_CHAIN,
-- 
cgit v1.2.3-71-gd317


From a52e95abf772b43c9226e9a72d3c1353903ba96f Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Wed, 24 Aug 2016 15:46:26 +0900
Subject: net: diag: allow socket bytecode filters to match socket marks

This allows a privileged process to filter by socket mark when
dumping sockets via INET_DIAG_BY_FAMILY. This is useful on
systems that use mark-based routing such as Android.

The ability to filter socket marks requires CAP_NET_ADMIN, which
is consistent with other privileged operations allowed by the
SOCK_DIAG interface such as the ability to destroy sockets and
the ability to inspect BPF filters attached to packet sockets.

Tested: https://android-review.googlesource.com/261350
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h |  6 ++++++
 net/ipv4/inet_diag.c           | 36 +++++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index abbd1dc5d683..5581206a08ae 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -73,6 +73,7 @@ enum {
 	INET_DIAG_BC_S_COND,
 	INET_DIAG_BC_D_COND,
 	INET_DIAG_BC_DEV_COND,   /* u32 ifindex */
+	INET_DIAG_BC_MARK_COND,
 };
 
 struct inet_diag_hostcond {
@@ -82,6 +83,11 @@ struct inet_diag_hostcond {
 	__be32	addr[0];
 };
 
+struct inet_diag_markcond {
+	__u32 mark;
+	__u32 mask;
+};
+
 /* Base info structure. It contains socket identity (addrs/ports/cookie)
  * and, alas, the information shown by netstat. */
 struct inet_diag_msg {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a89b68c6a934..abfbe492ebfe 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -45,6 +45,7 @@ struct inet_diag_entry {
 	u16 family;
 	u16 userlocks;
 	u32 ifindex;
+	u32 mark;
 };
 
 static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -580,6 +581,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
 				yes = 0;
 			break;
 		}
+		case INET_DIAG_BC_MARK_COND: {
+			struct inet_diag_markcond *cond;
+
+			cond = (struct inet_diag_markcond *)(op + 1);
+			if ((entry->mark & cond->mask) != cond->mark)
+				yes = 0;
+			break;
+		}
 		}
 
 		if (yes) {
@@ -624,6 +633,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
 	entry.dport = ntohs(inet->inet_dport);
 	entry.ifindex = sk->sk_bound_dev_if;
 	entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+	if (sk_fullsock(sk))
+		entry.mark = sk->sk_mark;
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
+		entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+	else
+		entry.mark = 0;
 
 	return inet_diag_bc_run(bc, &entry);
 }
@@ -706,8 +721,17 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op,
 	return true;
 }
 
-static int inet_diag_bc_audit(const struct nlattr *attr)
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
+			   int *min_len)
 {
+	*min_len += sizeof(struct inet_diag_markcond);
+	return len >= *min_len;
+}
+
+static int inet_diag_bc_audit(const struct nlattr *attr,
+			      const struct sk_buff *skb)
+{
+	bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
 	const void *bytecode, *bc;
 	int bytecode_len, len;
 
@@ -738,6 +762,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr)
 			if (!valid_port_comparison(bc, len, &min_len))
 				return -EINVAL;
 			break;
+		case INET_DIAG_BC_MARK_COND:
+			if (!net_admin)
+				return -EPERM;
+			if (!valid_markcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
 		case INET_DIAG_BC_AUTO:
 		case INET_DIAG_BC_JMP:
 		case INET_DIAG_BC_NOP:
@@ -1030,7 +1060,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 			attr = nlmsg_find_attr(nlh, hdrlen,
 					       INET_DIAG_REQ_BYTECODE);
-			err = inet_diag_bc_audit(attr);
+			err = inet_diag_bc_audit(attr, skb);
 			if (err)
 				return err;
 		}
@@ -1061,7 +1091,7 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
 
 			attr = nlmsg_find_attr(h, hdrlen,
 					       INET_DIAG_REQ_BYTECODE);
-			err = inet_diag_bc_audit(attr);
+			err = inet_diag_bc_audit(attr, skb);
 			if (err)
 				return err;
 		}
-- 
cgit v1.2.3-71-gd317


From 72145a68e4ee116533df49af4b87aca0aacc179c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 24 Aug 2016 09:01:23 -0700
Subject: tcp: md5: add LINUX_MIB_TCPMD5FAILURE counter

Adds SNMP counter for drops caused by MD5 mismatches.

The current syslog might help, but a counter is more precise and helps
monitoring.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_ipv4.c       | 1 +
 net/ipv6/tcp_ipv6.c       | 1 +
 4 files changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 25a9ad8bcef1..e7a31f830690 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -235,6 +235,7 @@ enum
 	LINUX_MIB_TCPSPURIOUSRTOS,		/* TCPSpuriousRTOs */
 	LINUX_MIB_TCPMD5NOTFOUND,		/* TCPMD5NotFound */
 	LINUX_MIB_TCPMD5UNEXPECTED,		/* TCPMD5Unexpected */
+	LINUX_MIB_TCPMD5FAILURE,		/* TCPMD5Failure */
 	LINUX_MIB_SACKSHIFTED,
 	LINUX_MIB_SACKMERGED,
 	LINUX_MIB_SACKSHIFTFALLBACK,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 9f665b63a927..1ed015e4bc79 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -257,6 +257,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
 	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
 	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+	SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE),
 	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
 	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
 	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 436d978c6c39..ad41e8ecf796 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1169,6 +1169,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
 				      NULL, skb);
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
 				     &iph->saddr, ntohs(th->source),
 				     &iph->daddr, ntohs(th->dest),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ac0ed7bda406..e4f55683af31 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -671,6 +671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
 				      NULL, skb);
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
 		net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
 				     genhash ? "failed" : "mismatch",
 				     &ip6h->saddr, ntohs(th->source),
-- 
cgit v1.2.3-71-gd317


From ef20cd4dd1633987bcf46ac34ace2c8af212361f Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Fri, 26 Aug 2016 10:52:53 +0200
Subject: tipc: introduce UDP replicast

This patch introduces UDP replicast. A concept where we emulate
multicast by sending multiple unicast messages to configured peers.

The purpose of replicast is mainly to be able to use TIPC in cloud
environments where IP multicast is disabled. Using replicas to unicast
multicast messages is costly as we have to copy each skb and send the
copies individually.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |   1 +
 net/tipc/bearer.c                 |  44 ++++++++++++++
 net/tipc/bearer.h                 |   1 +
 net/tipc/netlink.c                |   5 ++
 net/tipc/udp_media.c              | 118 ++++++++++++++++++++++++++++++++++----
 net/tipc/udp_media.h              |  44 ++++++++++++++
 6 files changed, 201 insertions(+), 12 deletions(-)
 create mode 100644 net/tipc/udp_media.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index bcb65ef725f6..b15664c36219 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -60,6 +60,7 @@ enum {
 	TIPC_NL_MON_GET,
 	TIPC_NL_MON_PEER_GET,
 	TIPC_NL_PEER_REMOVE,
+	TIPC_NL_BEARER_ADD,
 
 	__TIPC_NL_CMD_MAX,
 	TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 28056fa8f77a..d7b442dd6669 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -42,6 +42,7 @@
 #include "monitor.h"
 #include "bcast.h"
 #include "netlink.h"
+#include "udp_media.h"
 
 #define MAX_ADDR_STR 60
 
@@ -897,6 +898,49 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info)
+{
+	int err;
+	char *name;
+	struct tipc_bearer *b;
+	struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+	struct net *net = sock_net(skb->sk);
+
+	if (!info->attrs[TIPC_NLA_BEARER])
+		return -EINVAL;
+
+	err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+			       info->attrs[TIPC_NLA_BEARER],
+			       tipc_nl_bearer_policy);
+	if (err)
+		return err;
+
+	if (!attrs[TIPC_NLA_BEARER_NAME])
+		return -EINVAL;
+	name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+	rtnl_lock();
+	b = tipc_bearer_find(net, name);
+	if (!b) {
+		rtnl_unlock();
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+	if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) {
+		err = tipc_udp_nl_bearer_add(b,
+					     attrs[TIPC_NLA_BEARER_UDP_OPTS]);
+		if (err) {
+			rtnl_unlock();
+			return err;
+		}
+	}
+#endif
+	rtnl_unlock();
+
+	return 0;
+}
+
 int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 {
 	int err;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 83a9abbfe32c..78892e2f53e3 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -181,6 +181,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info);
 
 int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 2718de667828..3122f21ca979 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -160,6 +160,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
 		.dumpit	= tipc_nl_bearer_dump,
 		.policy = tipc_nl_policy,
 	},
+	{
+		.cmd	= TIPC_NL_BEARER_ADD,
+		.doit	= tipc_nl_bearer_add,
+		.policy = tipc_nl_policy,
+	},
 	{
 		.cmd	= TIPC_NL_BEARER_SET,
 		.doit	= tipc_nl_bearer_set,
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index b8ec1a18241e..6b938cc15daf 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -49,6 +49,7 @@
 #include "core.h"
 #include "bearer.h"
 #include "netlink.h"
+#include "msg.h"
 
 /* IANA assigned UDP port */
 #define UDP_PORT_DEFAULT	6118
@@ -70,6 +71,13 @@ struct udp_media_addr {
 	};
 };
 
+/* struct udp_replicast - container for UDP remote addresses */
+struct udp_replicast {
+	struct udp_media_addr addr;
+	struct rcu_head rcu;
+	struct list_head list;
+};
+
 /**
  * struct udp_bearer - ip/udp bearer data structure
  * @bearer:	associated generic tipc bearer
@@ -82,6 +90,7 @@ struct udp_bearer {
 	struct socket *ubsock;
 	u32 ifindex;
 	struct work_struct work;
+	struct udp_replicast rcast;
 };
 
 static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr)
@@ -203,29 +212,75 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
 {
 	struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
 	struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value;
+	struct udp_replicast *rcast;
 	struct udp_bearer *ub;
 	int err = 0;
 
 	if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
 		err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
 		if (err)
-			goto tx_error;
+			goto out;
 	}
 
 	skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
 	ub = rcu_dereference_rtnl(b->media_ptr);
 	if (!ub) {
 		err = -ENODEV;
-		goto tx_error;
+		goto out;
 	}
 
-	return tipc_udp_xmit(net, skb, ub, src, dst);
+	if (!addr->broadcast || list_empty(&ub->rcast.list))
+		return tipc_udp_xmit(net, skb, ub, src, dst);
 
-tx_error:
+	/* Replicast, send an skb to each configured IP address */
+	list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
+		struct sk_buff *_skb;
+
+		_skb = pskb_copy(skb, GFP_ATOMIC);
+		if (!_skb) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr);
+		if (err) {
+			kfree_skb(_skb);
+			goto out;
+		}
+	}
+	err = 0;
+out:
 	kfree_skb(skb);
 	return err;
 }
 
+static int tipc_udp_rcast_add(struct tipc_bearer *b,
+			      struct udp_media_addr *addr)
+{
+	struct udp_replicast *rcast;
+	struct udp_bearer *ub;
+
+	ub = rcu_dereference_rtnl(b->media_ptr);
+	if (!ub)
+		return -ENODEV;
+
+	rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC);
+	if (!rcast)
+		return -ENOMEM;
+
+	memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));
+
+	if (ntohs(addr->proto) == ETH_P_IP)
+		pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (ntohs(addr->proto) == ETH_P_IPV6)
+		pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
+#endif
+
+	list_add_rcu(&rcast->list, &ub->rcast.list);
+	return 0;
+}
+
 /* tipc_udp_recv - read data from bearer socket */
 static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
 {
@@ -320,6 +375,32 @@ static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr,
 	return -EADDRNOTAVAIL;
 }
 
+int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr)
+{
+	int err;
+	struct udp_media_addr addr = {0};
+	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
+	struct udp_media_addr *dst;
+
+	if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy))
+		return -EINVAL;
+
+	if (!opts[TIPC_NLA_UDP_REMOTE])
+		return -EINVAL;
+
+	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL);
+	if (err)
+		return err;
+
+	dst = (struct udp_media_addr *)&b->bcast_addr.value;
+	if (tipc_udp_is_mcast_addr(dst)) {
+		pr_err("Can't add remote ip to TIPC UDP multicast bearer\n");
+		return -EINVAL;
+	}
+
+	return tipc_udp_rcast_add(b, &addr);
+}
+
 /**
  * tipc_udp_enable - callback to create a new udp bearer instance
  * @net:	network namespace
@@ -334,7 +415,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 {
 	int err = -EINVAL;
 	struct udp_bearer *ub;
-	struct udp_media_addr *remote;
+	struct udp_media_addr remote = {0};
 	struct udp_media_addr local = {0};
 	struct udp_port_cfg udp_conf = {0};
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
@@ -344,6 +425,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	if (!ub)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&ub->rcast.list);
+
 	if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
 		goto err;
 
@@ -362,9 +445,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	if (err)
 		goto err;
 
-	remote = (struct udp_media_addr *)&b->bcast_addr.value;
-	memset(remote, 0, sizeof(struct udp_media_addr));
-	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], remote, NULL);
+	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL);
 	if (err)
 		goto err;
 
@@ -409,10 +490,17 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	tuncfg.encap_destroy = NULL;
 	setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
 
-	if (tipc_udp_is_mcast_addr(remote)) {
-		if (enable_mcast(ub, remote))
-			goto err;
-	}
+	/**
+	 * The bcast media address port is used for all peers and the ip
+	 * is used if it's a multicast address.
+	 */
+	memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
+	if (tipc_udp_is_mcast_addr(&remote))
+		err = enable_mcast(ub, &remote);
+	else
+		err = tipc_udp_rcast_add(b, &remote);
+	if (err)
+		goto err;
 
 	return 0;
 err:
@@ -424,6 +512,12 @@ err:
 static void cleanup_bearer(struct work_struct *work)
 {
 	struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
+	struct udp_replicast *rcast, *tmp;
+
+	list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
+		list_del_rcu(&rcast->list);
+		kfree_rcu(rcast, rcu);
+	}
 
 	if (ub->ubsock)
 		udp_tunnel_sock_release(ub->ubsock);
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
new file mode 100644
index 000000000000..4dcb54880aa6
--- /dev/null
+++ b/net/tipc/udp_media.h
@@ -0,0 +1,44 @@
+/*
+ * net/tipc/udp_media.h: Include file for UDP bearer media
+ *
+ * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+#ifndef _TIPC_UDP_MEDIA_H
+#define _TIPC_UDP_MEDIA_H
+
+int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
+
+#endif
+#endif
-- 
cgit v1.2.3-71-gd317


From fdb3accc2c15fabc2b687b2819da9167027c61b6 Mon Sep 17 00:00:00 2001
From: Richard Alpe <richard.alpe@ericsson.com>
Date: Fri, 26 Aug 2016 10:52:55 +0200
Subject: tipc: add the ability to get UDP options via netlink

Add UDP bearer options to netlink bearer get message. This is used by
the tipc user space tool to display UDP options.

The UDP bearer information is passed using either a sockaddr_in or
sockaddr_in6 structs. This means the user space receiver should
intermediately store the retrieved data in a large enough struct
(sockaddr_strage) before casting to the proper IP version type.

Signed-off-by: Richard Alpe <richard.alpe@ericsson.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |  2 ++
 net/tipc/bearer.c                 |  8 +++++
 net/tipc/udp_media.c              | 61 +++++++++++++++++++++++++++++++++++++++
 net/tipc/udp_media.h              |  1 +
 4 files changed, 72 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index b15664c36219..f9edd20fe9ba 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -61,6 +61,7 @@ enum {
 	TIPC_NL_MON_PEER_GET,
 	TIPC_NL_PEER_REMOVE,
 	TIPC_NL_BEARER_ADD,
+	TIPC_NL_UDP_GET_REMOTEIP,
 
 	__TIPC_NL_CMD_MAX,
 	TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1
@@ -100,6 +101,7 @@ enum {
 	TIPC_NLA_UDP_UNSPEC,
 	TIPC_NLA_UDP_LOCAL,		/* sockaddr_storage */
 	TIPC_NLA_UDP_REMOTE,		/* sockaddr_storage */
+	TIPC_NLA_UDP_MULTI_REMOTEIP,	/* flag */
 
 	__TIPC_NLA_UDP_MAX,
 	TIPC_NLA_UDP_MAX = __TIPC_NLA_UDP_MAX - 1
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index d7b442dd6669..975dbeb60ab0 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -712,6 +712,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
 		goto prop_msg_full;
 
 	nla_nest_end(msg->skb, prop);
+
+#ifdef CONFIG_TIPC_MEDIA_UDP
+	if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) {
+		if (tipc_udp_nl_add_bearer_data(msg, bearer))
+			goto attr_msg_full;
+	}
+#endif
+
 	nla_nest_end(msg->skb, attrs);
 	genlmsg_end(msg->skb, hdr);
 
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 6ece3c9ccf82..a6cdd9895653 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -401,6 +401,67 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
 	return err;
 }
 
+static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
+				  struct udp_media_addr *addr, int nla_t)
+{
+	if (ntohs(addr->proto) == ETH_P_IP) {
+		struct sockaddr_in ip4;
+
+		ip4.sin_family = AF_INET;
+		ip4.sin_port = addr->port;
+		ip4.sin_addr.s_addr = addr->ipv4.s_addr;
+		if (nla_put(skb, nla_t, sizeof(ip4), &ip4))
+			return -EMSGSIZE;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (ntohs(addr->proto) == ETH_P_IPV6) {
+		struct sockaddr_in6 ip6;
+
+		ip6.sin6_family = AF_INET6;
+		ip6.sin6_port  = addr->port;
+		memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));
+		if (nla_put(skb, nla_t, sizeof(ip6), &ip6))
+			return -EMSGSIZE;
+#endif
+	}
+
+	return 0;
+}
+
+int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
+{
+	struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
+	struct udp_media_addr *dst;
+	struct udp_bearer *ub;
+	struct nlattr *nest;
+
+	ub = rcu_dereference_rtnl(b->media_ptr);
+	if (!ub)
+		return -ENODEV;
+
+	nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS);
+	if (!nest)
+		goto msg_full;
+
+	if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL))
+		goto msg_full;
+
+	dst = (struct udp_media_addr *)&b->bcast_addr.value;
+	if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE))
+		goto msg_full;
+
+	if (!list_empty(&ub->rcast.list)) {
+		if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP))
+			goto msg_full;
+	}
+
+	nla_nest_end(msg->skb, nest);
+	return 0;
+msg_full:
+	nla_nest_cancel(msg->skb, nest);
+	return -EMSGSIZE;
+}
+
 /**
  * tipc_parse_udp_addr - build udp media address from netlink data
  * @nlattr:	netlink attribute containing sockaddr storage aligned address
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 4dcb54880aa6..c06326a134db 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -39,6 +39,7 @@
 #define _TIPC_UDP_MEDIA_H
 
 int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
+int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
 
 #endif
 #endif
-- 
cgit v1.2.3-71-gd317


From 5711a98221443aec54c4c81ee98c6ae46acccb65 Mon Sep 17 00:00:00 2001
From: Vidya Sagar Ravipati <vidya@cumulusnetworks.com>
Date: Fri, 26 Aug 2016 01:25:50 -0700
Subject: net: ethtool: add support for 1000BaseX and missing 10G link modes

This patch enhances ethtool link mode bitmap to include
missing interface modes for 1G/10G speeds

Changes:
1000baseX is the mode introduced to cover all 1G Fiber cases.
All modes under 1000BaseX i.e. 1000BASE-SX, 1000BASE-LX, 1000BASE-LX10
and 1000BASE-BX10 are not explicitly defined at this moment.
10G CR,SR,LR and ER link modes are included for 10G speed..

Issue:
ethtool on  1G/10G SFP port reports Base-T
as this port supports 1000baseX,10G CR, SR and LR modes.

root@tor-02$ ethtool swp1
Settings for swp1:
        Supported ports: [ FIBRE ]
        Supported link modes:   1000baseT/Full
                                10000baseT/Full
        Supported pause frame use: Symmetric Receive-only
        Supports auto-negotiation: Yes
        Advertised link modes:  1000baseT/Full
        Advertised pause frame use: No
        Advertised auto-negotiation: No
        Speed: 10000Mb/s
        Duplex: Full
        Port: FIBRE
        PHYAD: 0
        Transceiver: external
        Auto-negotiation: off
        Current message level: 0x00000000 (0)

        Link detected: yes

After fix:
root@tor-02$ ethtool swp1
Settings for swp1:
        Supported ports: [ FIBRE ]
        Supported link modes:   1000baseX/Full
                                10000baseCR/Full
                                10000baseSR/Full
                                10000baseLR/Full
                                10000baseER/Full
        Supported pause frame use: Symmetric Receive-only
        Supports auto-negotiation: Yes
        Advertised link modes:  1000baseT/Full
        Advertised pause frame use: No
        Advertised auto-negotiation: No
        Speed: 10000Mb/s
        Duplex: Full
        Port: FIBRE
        PHYAD: 0
        Transceiver: external
        Auto-negotiation: off
        Current message level: 0x00000000 (0)
        Link detected: yes

Signed-off-by: Vidya Sagar Ravipati <vidya@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index b8f38e84d93a..099a4200732c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1362,7 +1362,14 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT	= 37,
 	ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT	= 38,
 	ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT	= 39,
-	ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT         = 40,
+	ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT		= 40,
+	ETHTOOL_LINK_MODE_1000baseX_Full_BIT	= 41,
+	ETHTOOL_LINK_MODE_10000baseCR_Full_BIT	= 42,
+	ETHTOOL_LINK_MODE_10000baseSR_Full_BIT	= 43,
+	ETHTOOL_LINK_MODE_10000baseLR_Full_BIT	= 44,
+	ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT	= 45,
+	ETHTOOL_LINK_MODE_10000baseER_Full_BIT	= 46,
+
 
 	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
 	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
@@ -1371,7 +1378,7 @@ enum ethtool_link_mode_bit_indices {
 	 */
 
 	__ETHTOOL_LINK_MODE_LAST
-	  = ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT,
+	  = ETHTOOL_LINK_MODE_10000baseER_Full_BIT,
 };
 
 #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
-- 
cgit v1.2.3-71-gd317


From b6cb5ac8331b6bcfe9ce38c7f7f58db6e1d6270a Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 31 Aug 2016 15:36:52 +0200
Subject: net: bridge: add per-port multicast flood flag

Add a per-port flag to control the unknown multicast flood, similar to the
unknown unicast flood flag and break a few long lines in the netlink flag
exports.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    |  1 +
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_forward.c      |  3 +++
 net/bridge/br_if.c           |  2 +-
 net/bridge/br_netlink.c      | 12 +++++++++---
 net/bridge/br_sysfs_if.c     |  1 +
 6 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index dcb89e3515db..c6587c01d951 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -45,6 +45,7 @@ struct br_ip_list {
 #define BR_PROXYARP		BIT(8)
 #define BR_LEARNING_SYNC	BIT(9)
 #define BR_PROXYARP_WIFI	BIT(10)
+#define BR_MCAST_FLOOD		BIT(11)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a1b5202c5f6b..9bf3aecfe05b 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -318,6 +318,7 @@ enum {
 	IFLA_BRPORT_FLUSH,
 	IFLA_BRPORT_MULTICAST_ROUTER,
 	IFLA_BRPORT_PAD,
+	IFLA_BRPORT_MCAST_FLOOD,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 5de854ed3340..7cb41aee4c82 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -186,6 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 		/* Do not flood unicast traffic to ports that turn it off */
 		if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
 			continue;
+		if (pkt_type == BR_PKT_MULTICAST &&
+		    !(p->flags & BR_MCAST_FLOOD))
+			continue;
 
 		/* Do not flood to ports that enable proxy ARP */
 		if (p->flags & BR_PROXYARP)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 1da3221845f1..ed0dd3340084 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
-	p->flags = BR_LEARNING | BR_FLOOD;
+	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD;
 	br_init_port(p);
 	br_set_state(p, BR_STATE_DISABLED);
 	br_stp_port_timer_init(p);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 190a5bc00f4a..e99037c6f7b7 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,10 +169,15 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 	    nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
 	    nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
 	    nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_PROTECT,
+		       !!(p->flags & BR_ROOT_BLOCK)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE,
+		       !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD,
+		       !!(p->flags & BR_FLOOD)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
+		       !!(p->flags & BR_MCAST_FLOOD)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
 		       !!(p->flags & BR_PROXYARP_WIFI)) ||
@@ -630,6 +635,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
 	br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
 	br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
+	br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
 	br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 1e04d4d44273..e657258e1f2c 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(learning, BR_LEARNING);
 BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
 BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
 BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
+BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
-- 
cgit v1.2.3-71-gd317


From 0515e5999a466dfe6e1924f460da599bb6821487 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 1 Sep 2016 18:37:22 -0700
Subject: bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type

Introduce BPF_PROG_TYPE_PERF_EVENT programs that can be attached to
HW and SW perf events (PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE
correspondingly in uapi/linux/perf_event.h)

The program visible context meta structure is
struct bpf_perf_event_data {
    struct pt_regs regs;
     __u64 sample_period;
};
which is accessible directly from the program:
int bpf_prog(struct bpf_perf_event_data *ctx)
{
  ... ctx->sample_period ...
  ... ctx->regs.ip ...
}

The bpf verifier rewrites the accesses into kernel internal
struct bpf_perf_event_data_kern which allows changing
struct perf_sample_data without affecting bpf programs.
New fields can be added to the end of struct bpf_perf_event_data
in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h          |  5 +++
 include/uapi/linux/Kbuild           |  1 +
 include/uapi/linux/bpf.h            |  1 +
 include/uapi/linux/bpf_perf_event.h | 18 +++++++++++
 kernel/trace/bpf_trace.c            | 61 +++++++++++++++++++++++++++++++++++++
 5 files changed, 86 insertions(+)
 create mode 100644 include/uapi/linux/bpf_perf_event.h

(limited to 'include/uapi/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2b6b43cc0dd5..97bfe62f30d7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -788,6 +788,11 @@ struct perf_output_handle {
 	int				page;
 };
 
+struct bpf_perf_event_data_kern {
+	struct pt_regs *regs;
+	struct perf_sample_data *data;
+};
+
 #ifdef CONFIG_CGROUP_PERF
 
 /*
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 185f8ea2702f..d0352a971ebd 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -71,6 +71,7 @@ header-y += binfmts.h
 header-y += blkpg.h
 header-y += blktrace_api.h
 header-y += bpf_common.h
+header-y += bpf_perf_event.h
 header-y += bpf.h
 header-y += bpqether.h
 header-y += bsg.h
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e4c5a1baa993..f896dfac4ac0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
 	BPF_PROG_TYPE_XDP,
+	BPF_PROG_TYPE_PERF_EVENT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h
new file mode 100644
index 000000000000..067427259820
--- /dev/null
+++ b/include/uapi/linux/bpf_perf_event.h
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__
+#define _UAPI__LINUX_BPF_PERF_EVENT_H__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct bpf_perf_event_data {
+	struct pt_regs regs;
+	__u64 sample_period;
+};
+
+#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ad35213b8405..d3869b03d9fe 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -8,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/bpf.h>
+#include <linux/bpf_perf_event.h>
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
@@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
 	.type	= BPF_PROG_TYPE_TRACEPOINT,
 };
 
+static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+				    enum bpf_reg_type *reg_type)
+{
+	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
+		if (size != sizeof(u64))
+			return false;
+	} else {
+		if (size != sizeof(long))
+			return false;
+	}
+	return true;
+}
+
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+				      int src_reg, int ctx_off,
+				      struct bpf_insn *insn_buf,
+				      struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_perf_event_data, sample_period):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)),
+				      dst_reg, src_reg,
+				      offsetof(struct bpf_perf_event_data_kern, data));
+		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+				      offsetof(struct perf_sample_data, period));
+		break;
+	default:
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)),
+				      dst_reg, src_reg,
+				      offsetof(struct bpf_perf_event_data_kern, regs));
+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)),
+				      dst_reg, dst_reg, ctx_off);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static const struct bpf_verifier_ops perf_event_prog_ops = {
+	.get_func_proto		= tp_prog_func_proto,
+	.is_valid_access	= pe_prog_is_valid_access,
+	.convert_ctx_access	= pe_prog_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list perf_event_tl = {
+	.ops	= &perf_event_prog_ops,
+	.type	= BPF_PROG_TYPE_PERF_EVENT,
+};
+
 static int __init register_kprobe_prog_ops(void)
 {
 	bpf_register_prog_type(&kprobe_tl);
 	bpf_register_prog_type(&tracepoint_tl);
+	bpf_register_prog_type(&perf_event_tl);
 	return 0;
 }
 late_initcall(register_kprobe_prog_ops);
-- 
cgit v1.2.3-71-gd317


From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 25 Aug 2016 23:08:11 +0800
Subject: netfilter: gre: Use consistent GRE_* macros instead of ones defined
 by netfilter.

There are already some GRE_* macros in kernel, so it is unnecessary
to define these macros. And remove some useless macros

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_proto_gre.h | 22 ++--------------------
 include/uapi/linux/if_tunnel.h                   |  1 +
 net/ipv4/netfilter/nf_nat_proto_gre.c            |  4 ++--
 net/netfilter/nf_conntrack_proto_gre.c           |  4 ++--
 4 files changed, 7 insertions(+), 24 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h
index df78dc2b5524..0189747f2691 100644
--- a/include/linux/netfilter/nf_conntrack_proto_gre.h
+++ b/include/linux/netfilter/nf_conntrack_proto_gre.h
@@ -1,29 +1,11 @@
 #ifndef _CONNTRACK_PROTO_GRE_H
 #define _CONNTRACK_PROTO_GRE_H
 #include <asm/byteorder.h>
+#include <net/gre.h>
+#include <net/pptp.h>
 
 /* GRE PROTOCOL HEADER */
 
-/* GRE Version field */
-#define GRE_VERSION_1701	0x0
-#define GRE_VERSION_PPTP	0x1
-
-/* GRE Protocol field */
-#define GRE_PROTOCOL_PPTP	0x880B
-
-/* GRE Flags */
-#define GRE_FLAG_C		0x80
-#define GRE_FLAG_R		0x40
-#define GRE_FLAG_K		0x20
-#define GRE_FLAG_S		0x10
-#define GRE_FLAG_A		0x80
-
-#define GRE_IS_C(f)	((f)&GRE_FLAG_C)
-#define GRE_IS_R(f)	((f)&GRE_FLAG_R)
-#define GRE_IS_K(f)	((f)&GRE_FLAG_K)
-#define GRE_IS_S(f)	((f)&GRE_FLAG_S)
-#define GRE_IS_A(f)	((f)&GRE_FLAG_A)
-
 /* GRE is a mess: Four different standards */
 struct gre_hdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 9865c8caedde..fb7337d6b985 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -39,6 +39,7 @@
 #define GRE_IS_REC(f)		((f) & GRE_REC)
 #define GRE_IS_ACK(f)		((f) & GRE_ACK)
 
+#define GRE_VERSION_0		__cpu_to_be16(0x0000)
 #define GRE_VERSION_1		__cpu_to_be16(0x0001)
 #define GRE_PROTO_PPP		__cpu_to_be16(0x880b)
 #define GRE_PPTP_KEY_MASK	__cpu_to_be32(0xffff)
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 9414923f1e15..93198d71dbb6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -104,11 +104,11 @@ gre_manip_pkt(struct sk_buff *skb,
 	if (maniptype != NF_NAT_MANIP_DST)
 		return true;
 	switch (greh->version) {
-	case GRE_VERSION_1701:
+	case ntohs(GRE_VERSION_0):
 		/* We do not currently NAT any GREv0 packets.
 		 * Try to behave like "nf_nat_proto_unknown" */
 		break;
-	case GRE_VERSION_PPTP:
+	case ntohs(GRE_VERSION_1):
 		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
 		pgreh->call_id = tuple->dst.u.gre.key;
 		break;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index a96451a7af20..deb239a014e4 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -200,7 +200,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 
 	/* first only delinearize old RFC1701 GRE header */
 	grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
-	if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+	if (!grehdr || grehdr->version != ntohs(GRE_VERSION_1)) {
 		/* try to behave like "nf_conntrack_proto_generic" */
 		tuple->src.u.all = 0;
 		tuple->dst.u.all = 0;
@@ -212,7 +212,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	if (!pgrehdr)
 		return true;
 
-	if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
+	if (grehdr->protocol != GRE_PROTO_PPP) {
 		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
 		return false;
 	}
-- 
cgit v1.2.3-71-gd317


From 0d9932b2875f568d679f2af33ce610da3903ac11 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Fri, 2 Sep 2016 15:05:57 +0200
Subject: netfilter: nft_numgen: rename until attribute by modulus

The _until_ attribute is renamed to _modulus_ as the behaviour is similar to
other expresions with number limits (ex. nft_hash).

Renaming is possible because there isn't a kernel release yet with these
changes.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nft_numgen.c               | 30 +++++++++++++++---------------
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 28ce01d79707..24161e25576d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1126,13 +1126,13 @@ enum nft_trace_types {
  * enum nft_ng_attributes - nf_tables number generator expression netlink attributes
  *
  * @NFTA_NG_DREG: destination register (NLA_U32)
- * @NFTA_NG_UNTIL: source value to increment the counter until reset (NLA_U32)
+ * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
 	NFTA_NG_DREG,
-	NFTA_NG_UNTIL,
+	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
 	__NFTA_NG_MAX
 };
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 294745ecb0fc..f51a3ede3932 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -21,7 +21,7 @@ static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
 
 struct nft_ng_inc {
 	enum nft_registers      dreg:8;
-	u32			until;
+	u32			modulus;
 	atomic_t		counter;
 };
 
@@ -34,7 +34,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 
 	do {
 		oval = atomic_read(&priv->counter);
-		nval = (oval + 1 < priv->until) ? oval + 1 : 0;
+		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
 	memcpy(&regs->data[priv->dreg], &priv->counter, sizeof(u32));
@@ -42,7 +42,7 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_DREG]		= { .type = NLA_U32 },
-	[NFTA_NG_UNTIL]		= { .type = NLA_U32 },
+	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
 };
 
@@ -52,8 +52,8 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
-	if (priv->until == 0)
+	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+	if (priv->modulus == 0)
 		return -ERANGE;
 
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
@@ -64,11 +64,11 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
-		       u32 until, enum nft_ng_types type)
+		       u32 modulus, enum nft_ng_types type)
 {
 	if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
 		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NG_UNTIL, htonl(until)))
+	if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus)))
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
 		goto nla_put_failure;
@@ -83,12 +83,12 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_INCREMENTAL);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL);
 }
 
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
-	u32			until;
+	u32			modulus;
 };
 
 static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -99,7 +99,7 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
 	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
 
 	regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
-						  priv->until);
+						  priv->modulus);
 }
 
 static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -108,8 +108,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	priv->until = ntohl(nla_get_be32(tb[NFTA_NG_UNTIL]));
-	if (priv->until == 0)
+	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
+	if (priv->modulus == 0)
 		return -ERANGE;
 
 	prandom_init_once(&nft_numgen_prandom_state);
@@ -124,7 +124,7 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->until, NFT_NG_RANDOM);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM);
 }
 
 static struct nft_expr_type nft_ng_type;
@@ -149,8 +149,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
 	u32 type;
 
-	if (!tb[NFTA_NG_DREG]	||
-	    !tb[NFTA_NG_UNTIL]	||
+	if (!tb[NFTA_NG_DREG]	 ||
+	    !tb[NFTA_NG_MODULUS] ||
 	    !tb[NFTA_NG_TYPE])
 		return ERR_PTR(-EINVAL);
 
-- 
cgit v1.2.3-71-gd317


From d545caca827b65aab557a9e9dcdcf1e5a3823c2d Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Thu, 8 Sep 2016 00:42:25 +0900
Subject: net: inet: diag: expose the socket mark to privileged processes.

This adds the capability for a process that has CAP_NET_ADMIN on
a socket to see the socket mark in socket dumps.

Commit a52e95abf772 ("net: diag: allow socket bytecode filters to
match socket marks") recently gave privileged processes the
ability to filter socket dumps based on mark. This patch is
complementary: it ensures that the mark is also passed to
userspace in the socket's netlink attributes.  It is useful for
tools like ss which display information about sockets.

Tested: https://android-review.googlesource.com/270210
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_diag.h      |  4 ++--
 include/uapi/linux/inet_diag.h |  1 +
 net/ipv4/inet_diag.c           | 49 ++++++++++++++++++++++++++++--------------
 net/ipv4/udp_diag.c            | 10 +++++----
 net/sctp/sctp_diag.c           | 20 +++++++++++------
 5 files changed, 56 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index feb04ea20f11..65da430e260f 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, const struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns,
 		      u32 pid, u32 seq, u16 nlmsg_flags,
-		      const struct nlmsghdr *unlh);
+		      const struct nlmsghdr *unlh, bool net_admin);
 void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb,
 			 struct netlink_callback *cb,
 			 const struct inet_diag_req_v2 *r,
@@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk);
 
 int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 			     struct inet_diag_msg *r, int ext,
-			     struct user_namespace *user_ns);
+			     struct user_namespace *user_ns, bool net_admin);
 
 extern int  inet_diag_register(const struct inet_diag_handler *handler);
 extern void inet_diag_unregister(const struct inet_diag_handler *handler);
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 5581206a08ae..b5c366f87b3e 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -123,6 +123,7 @@ enum {
 	INET_DIAG_LOCALS,
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
+	INET_DIAG_MARK,
 	__INET_DIAG_MAX,
 };
 
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abfbe492ebfe..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void)
 		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
 		+ nla_total_size(1) /* INET_DIAG_TOS */
 		+ nla_total_size(1) /* INET_DIAG_TCLASS */
+		+ nla_total_size(4) /* INET_DIAG_MARK */
 		+ nla_total_size(sizeof(struct inet_diag_meminfo))
 		+ nla_total_size(sizeof(struct inet_diag_msg))
 		+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void)
 
 int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 			     struct inet_diag_msg *r, int ext,
-			     struct user_namespace *user_ns)
+			     struct user_namespace *user_ns,
+			     bool net_admin)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 
@@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+		goto errout;
+
 	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
@@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, const struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns,
 		      u32 portid, u32 seq, u16 nlmsg_flags,
-		      const struct nlmsghdr *unlh)
+		      const struct nlmsghdr *unlh,
+		      bool net_admin)
 {
 	const struct tcp_congestion_ops *ca_ops;
 	const struct inet_diag_handler *handler;
@@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	r->idiag_timer = 0;
 	r->idiag_retrans = 0;
 
-	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
 		goto errout;
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
 			      const struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
 			      u32 portid, u32 seq, u16 nlmsg_flags,
-			      const struct nlmsghdr *unlh)
+			      const struct nlmsghdr *unlh,
+			      bool net_admin)
 {
-	return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
-				 user_ns, portid, seq, nlmsg_flags, unlh);
+	return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+				 portid, seq, nlmsg_flags, unlh, net_admin);
 }
 
 static int inet_twsk_diag_fill(struct sock *sk,
@@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
 
 static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 			      u32 portid, u32 seq, u16 nlmsg_flags,
-			      const struct nlmsghdr *unlh)
+			      const struct nlmsghdr *unlh, bool net_admin)
 {
+	struct request_sock *reqsk = inet_reqsk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	long tmo;
@@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 	inet_diag_msg_common_fill(r, sk);
 	r->idiag_state = TCP_SYN_RECV;
 	r->idiag_timer = 1;
-	r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+	r->idiag_retrans = reqsk->num_retrans;
 
 	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
 		     offsetof(struct sock, sk_cookie));
@@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 	r->idiag_uid	= 0;
 	r->idiag_inode	= 0;
 
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+				     inet_rsk(reqsk)->ir_mark))
+		return -EMSGSIZE;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 }
@@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct inet_diag_req_v2 *r,
 			struct user_namespace *user_ns,
 			u32 portid, u32 seq, u16 nlmsg_flags,
-			const struct nlmsghdr *unlh)
+			const struct nlmsghdr *unlh, bool net_admin)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 
 	if (sk->sk_state == TCP_NEW_SYN_RECV)
 		return inet_req_diag_fill(sk, skb, portid, seq,
-					  nlmsg_flags, unlh);
+					  nlmsg_flags, unlh, net_admin);
 
 	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
-				  nlmsg_flags, unlh);
+				  nlmsg_flags, unlh, net_admin);
 }
 
 struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
 	err = sk_diag_fill(sk, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
-			   nlh->nlmsg_seq, 0, nlh);
+			   nlh->nlmsg_seq, 0, nlh,
+			   netlink_net_capable(in_skb, CAP_NET_ADMIN));
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
@@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
 			      struct sk_buff *skb,
 			      struct netlink_callback *cb,
 			      const struct inet_diag_req_v2 *r,
-			      const struct nlattr *bc)
+			      const struct nlattr *bc,
+			      bool net_admin)
 {
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
@@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
 	return inet_csk_diag_fill(sk, skb, r,
 				  sk_user_ns(NETLINK_CB(cb->skb).sk),
 				  NETLINK_CB(cb->skb).portid,
-				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+				  net_admin);
 }
 
 static void twsk_build_assert(void)
@@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 	struct net *net = sock_net(skb->sk);
 	int i, num, s_i, s_num;
 	u32 idiag_states = r->idiag_states;
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 				    cb->args[3] > 0)
 					goto next_listen;
 
-				if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+				if (inet_csk_diag_dump(sk, skb, cb, r,
+						       bc, net_admin) < 0) {
 					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
@@ -948,7 +965,7 @@ skip_listen_ht:
 					   sk_user_ns(NETLINK_CB(cb->skb).sk),
 					   NETLINK_CB(cb->skb).portid,
 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
-					   cb->nlh);
+					   cb->nlh, net_admin);
 			if (res < 0) {
 				spin_unlock_bh(lock);
 				goto done;
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 58b79c0c0d69..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@
 static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 			struct netlink_callback *cb,
 			const struct inet_diag_req_v2 *req,
-			struct nlattr *bc)
+			struct nlattr *bc, bool net_admin)
 {
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 	return inet_sk_diag_fill(sk, NULL, skb, req,
 			sk_user_ns(NETLINK_CB(cb->skb).sk),
 			NETLINK_CB(cb->skb).portid,
-			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
 }
 
 static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
-			   nlh->nlmsg_seq, 0, nlh);
+			   nlh->nlmsg_seq, 0, nlh,
+			   netlink_net_capable(in_skb, CAP_NET_ADMIN));
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(rep);
@@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 		     struct netlink_callback *cb,
 		     const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct net *net = sock_net(skb->sk);
 	int num, s_num, slot, s_slot;
 
@@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 			    r->id.idiag_dport)
 				goto next;
 
-			if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
 				spin_unlock_bh(&hslot->lock);
 				goto done;
 			}
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
index f3508aa75815..807158e32f5f 100644
--- a/net/sctp/sctp_diag.c
+++ b/net/sctp/sctp_diag.c
@@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
 			       const struct inet_diag_req_v2 *req,
 			       struct user_namespace *user_ns,
 			       int portid, u32 seq, u16 nlmsg_flags,
-			       const struct nlmsghdr *unlh)
+			       const struct nlmsghdr *unlh,
+			       bool net_admin)
 {
 	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
 	struct list_head *addr_list;
@@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
 		r->idiag_retrans = 0;
 	}
 
-	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
 		goto errout;
 
 	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
@@ -203,6 +204,7 @@ struct sctp_comm_param {
 	struct netlink_callback *cb;
 	const struct inet_diag_req_v2 *r;
 	const struct nlmsghdr *nlh;
+	bool net_admin;
 };
 
 static size_t inet_assoc_attr_size(struct sctp_association *asoc)
@@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
 		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
 		+ nla_total_size(1) /* INET_DIAG_TOS */
 		+ nla_total_size(1) /* INET_DIAG_TCLASS */
+		+ nla_total_size(4) /* INET_DIAG_MARK */
 		+ nla_total_size(addrlen * asoc->peer.transport_count)
 		+ nla_total_size(addrlen * addrcnt)
 		+ nla_total_size(sizeof(struct inet_diag_meminfo))
@@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
 	err = inet_sctp_diag_fill(sk, assoc, rep, req,
 				  sk_user_ns(NETLINK_CB(in_skb).sk),
 				  NETLINK_CB(in_skb).portid,
-				  nlh->nlmsg_seq, 0, nlh);
+				  nlh->nlmsg_seq, 0, nlh,
+				  commp->net_admin);
 	release_sock(sk);
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
@@ -310,7 +314,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
 					sk_user_ns(NETLINK_CB(cb->skb).sk),
 					NETLINK_CB(cb->skb).portid,
 					cb->nlh->nlmsg_seq,
-					NLM_F_MULTI, cb->nlh) < 0) {
+					NLM_F_MULTI, cb->nlh,
+					commp->net_admin) < 0) {
 			cb->args[3] = 1;
 			err = 2;
 			goto release;
@@ -320,7 +325,8 @@ static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
 		if (inet_sctp_diag_fill(sk, assoc, skb, r,
 					sk_user_ns(NETLINK_CB(cb->skb).sk),
 					NETLINK_CB(cb->skb).portid,
-					cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
+					cb->nlh->nlmsg_seq, 0, cb->nlh,
+					commp->net_admin) < 0) {
 			err = 2;
 			goto release;
 		}
@@ -375,7 +381,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
 				sk_user_ns(NETLINK_CB(cb->skb).sk),
 				NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
-				cb->nlh) < 0) {
+				cb->nlh, commp->net_admin) < 0) {
 		err = 2;
 		goto out;
 	}
@@ -412,6 +418,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
 		.skb = in_skb,
 		.r = req,
 		.nlh = nlh,
+		.net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
 	};
 
 	if (req->sdiag_family == AF_INET) {
@@ -447,6 +454,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		.skb = skb,
 		.cb = cb,
 		.r = r,
+		.net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
 	};
 
 	/* eps hashtable dumps
-- 
cgit v1.2.3-71-gd317


From 8c146bb9d59aa2ac45222171916ece186c4b3943 Mon Sep 17 00:00:00 2001
From: Thomas F Herbert <thomasfherbert@gmail.com>
Date: Wed, 7 Sep 2016 12:56:57 -0400
Subject: openvswitch: 802.1ad uapi changes.

openvswitch: Add support for 8021.AD

Change the description of the VLAN tpid field.

Signed-off-by: Thomas F Herbert <thomasfherbert@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 54c3b4f4aceb..59ed3992c760 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -605,13 +605,13 @@ struct ovs_action_push_mpls {
  * @vlan_tci: Tag control identifier (TCI) to push.  The CFI bit must be set
  * (but it will not be set in the 802.1Q header that is pushed).
  *
- * The @vlan_tpid value is typically %ETH_P_8021Q.  The only acceptable TPID
- * values are those that the kernel module also parses as 802.1Q headers, to
- * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN
- * from having surprising results.
+ * The @vlan_tpid value is typically %ETH_P_8021Q or %ETH_P_8021AD.
+ * The only acceptable TPID values are those that the kernel module also parses
+ * as 802.1Q or 802.1AD headers, to prevent %OVS_ACTION_ATTR_PUSH_VLAN followed
+ * by %OVS_ACTION_ATTR_POP_VLAN from having surprising results.
  */
 struct ovs_action_push_vlan {
-	__be16 vlan_tpid;	/* 802.1Q TPID. */
+	__be16 vlan_tpid;	/* 802.1Q or 802.1ad TPID. */
 	__be16 vlan_tci;	/* 802.1Q TCI (VLAN ID and priority). */
 };
 
@@ -721,9 +721,10 @@ enum ovs_nat_attr {
  * is copied from the value to the packet header field, rest of the bits are
  * left unchanged.  The non-masked value bits must be passed in as zeroes.
  * Masking is not supported for the %OVS_KEY_ATTR_TUNNEL attribute.
- * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the
- * packet.
- * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet.
+ * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q or 802.1ad header
+ * onto the packet.
+ * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q or 802.1ad header
+ * from the packet.
  * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in
  * the nested %OVS_SAMPLE_ATTR_* attributes.
  * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the
-- 
cgit v1.2.3-71-gd317


From 34a3d4b2d1f1b7c81af79f6f93a6cef4c3a0f54a Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@tricolour.ca>
Date: Thu, 8 Sep 2016 13:55:56 -0400
Subject: xfrm: fix header file comment reference to struct
 xfrm_replay_state_esn

Reported-by: Paul Wouters <paul@nohats.ca>
Signed-off-by: Richard Guy Briggs <rgb@tricolour.ca>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 143338978b48..1fc62b239f1b 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -298,7 +298,7 @@ enum xfrm_attr_type_t {
 	XFRMA_ALG_AUTH_TRUNC,	/* struct xfrm_algo_auth */
 	XFRMA_MARK,		/* struct xfrm_mark */
 	XFRMA_TFCPAD,		/* __u32 */
-	XFRMA_REPLAY_ESN_VAL,	/* struct xfrm_replay_esn */
+	XFRMA_REPLAY_ESN_VAL,	/* struct xfrm_replay_state_esn */
 	XFRMA_SA_EXTRA_FLAGS,	/* __u32 */
 	XFRMA_PROTO,		/* __u8 */
 	XFRMA_ADDRESS_FILTER,	/* struct xfrm_address_filter */
-- 
cgit v1.2.3-71-gd317


From bc3103f1ed405de587fa43d8b0671e615505a700 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Thu, 8 Sep 2016 16:23:47 +0300
Subject: net/sched: cls_flower: Classify packet in ip tunnels

Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of shared
vxlan device named 'vxlan0'. To forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0' (after metadata is released):

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
      enc_src_ip 11.11.0.2 \
      enc_dst_ip 11.11.0.1 \
      enc_key_id 11 \
      dst_ip 11.11.11.1 \
    action tunnel_key release \
    action mirred egress redirect dev vnet0

The action tunnel_key, will be introduced in the next patch in this
series.

Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  11 +++++
 net/sched/cls_flower.c       | 100 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 51b5b247fb5a..f9c287c67eae 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -431,6 +431,17 @@ enum {
 	TCA_FLOWER_KEY_VLAN_ID,
 	TCA_FLOWER_KEY_VLAN_PRIO,
 	TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+
+	TCA_FLOWER_KEY_ENC_KEY_ID,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index cf9ad5b50889..b084b2aab2d7 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,9 +23,13 @@
 #include <net/ip.h>
 #include <net/flow_dissector.h>
 
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
 struct fl_flow_key {
 	int	indev_ifindex;
 	struct flow_dissector_key_control control;
+	struct flow_dissector_key_control enc_control;
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
 	struct flow_dissector_key_vlan vlan;
@@ -35,6 +39,11 @@ struct fl_flow_key {
 		struct flow_dissector_key_ipv6_addrs ipv6;
 	};
 	struct flow_dissector_key_ports tp;
+	struct flow_dissector_key_keyid enc_key_id;
+	union {
+		struct flow_dissector_key_ipv4_addrs enc_ipv4;
+		struct flow_dissector_key_ipv6_addrs enc_ipv6;
+	};
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -124,11 +133,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct cls_fl_filter *f;
 	struct fl_flow_key skb_key;
 	struct fl_flow_key skb_mkey;
+	struct ip_tunnel_info *info;
 
 	if (!atomic_read(&head->ht.nelems))
 		return -1;
 
 	fl_clear_masked_range(&skb_key, &head->mask);
+
+	info = skb_tunnel_info(skb);
+	if (info) {
+		struct ip_tunnel_key *key = &info->key;
+
+		switch (ip_tunnel_info_af(info)) {
+		case AF_INET:
+			skb_key.enc_ipv4.src = key->u.ipv4.src;
+			skb_key.enc_ipv4.dst = key->u.ipv4.dst;
+			break;
+		case AF_INET6:
+			skb_key.enc_ipv6.src = key->u.ipv6.src;
+			skb_key.enc_ipv6.dst = key->u.ipv6.dst;
+			break;
+		}
+
+		skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
+	}
+
 	skb_key.indev_ifindex = skb->skb_iif;
 	/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
 	 * so do it rather here.
@@ -297,7 +326,15 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_VLAN_ID]	= { .type = NLA_U16 },
 	[TCA_FLOWER_KEY_VLAN_PRIO]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_VLAN_ETH_TYPE]	= { .type = NLA_U16 },
-
+	[TCA_FLOWER_KEY_ENC_KEY_ID]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_IPV4_SRC]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_IPV4_DST]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_IPV6_SRC]	= { .len = sizeof(struct in6_addr) },
+	[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
+	[TCA_FLOWER_KEY_ENC_IPV6_DST]	= { .len = sizeof(struct in6_addr) },
+	[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -409,6 +446,40 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       sizeof(key->tp.dst));
 	}
 
+	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
+	    tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
+		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		fl_set_key_val(tb, &key->enc_ipv4.src,
+			       TCA_FLOWER_KEY_ENC_IPV4_SRC,
+			       &mask->enc_ipv4.src,
+			       TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+			       sizeof(key->enc_ipv4.src));
+		fl_set_key_val(tb, &key->enc_ipv4.dst,
+			       TCA_FLOWER_KEY_ENC_IPV4_DST,
+			       &mask->enc_ipv4.dst,
+			       TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+			       sizeof(key->enc_ipv4.dst));
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] ||
+	    tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) {
+		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+		fl_set_key_val(tb, &key->enc_ipv6.src,
+			       TCA_FLOWER_KEY_ENC_IPV6_SRC,
+			       &mask->enc_ipv6.src,
+			       TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+			       sizeof(key->enc_ipv6.src));
+		fl_set_key_val(tb, &key->enc_ipv6.dst,
+			       TCA_FLOWER_KEY_ENC_IPV6_DST,
+			       &mask->enc_ipv6.dst,
+			       TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+			       sizeof(key->enc_ipv6.dst));
+	}
+
+	fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+		       &mask->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID,
+		       sizeof(key->enc_key_id.keyid));
+
 	return 0;
 }
 
@@ -821,6 +892,33 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 				  sizeof(key->tp.dst))))
 		goto nla_put_failure;
 
+	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
+	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
+			    TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
+			    TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+			    sizeof(key->enc_ipv4.src)) ||
+	     fl_dump_key_val(skb, &key->enc_ipv4.dst,
+			     TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst,
+			     TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+			     sizeof(key->enc_ipv4.dst))))
+		goto nla_put_failure;
+	else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
+		 (fl_dump_key_val(skb, &key->enc_ipv6.src,
+			    TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src,
+			    TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+			    sizeof(key->enc_ipv6.src)) ||
+		 fl_dump_key_val(skb, &key->enc_ipv6.dst,
+				 TCA_FLOWER_KEY_ENC_IPV6_DST,
+				 &mask->enc_ipv6.dst,
+				 TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+			    sizeof(key->enc_ipv6.dst))))
+		goto nla_put_failure;
+
+	if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+			    &mask->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID,
+			    sizeof(key->enc_key_id)))
+		goto nla_put_failure;
+
 	nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags);
 
 	if (tcf_exts_dump(skb, &f->exts))
-- 
cgit v1.2.3-71-gd317


From d0f6dd8a914f42c6f1a3a8c08caa16559d3d9a1b Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Thu, 8 Sep 2016 16:23:48 +0300
Subject: net/sched: Introduce act_tunnel_key

This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from a such a device.

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and it's arguments:

$ tc filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action tunnel_key set \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_tunnel_key.h        |  30 +++
 include/uapi/linux/tc_act/tc_tunnel_key.h |  41 ++++
 net/sched/Kconfig                         |  11 +
 net/sched/Makefile                        |   1 +
 net/sched/act_tunnel_key.c                | 351 ++++++++++++++++++++++++++++++
 5 files changed, 434 insertions(+)
 create mode 100644 include/net/tc_act/tc_tunnel_key.h
 create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h
 create mode 100644 net/sched/act_tunnel_key.c

(limited to 'include/uapi/linux')

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..253f8da6c2a6
--- /dev/null
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NET_TC_TUNNEL_KEY_H
+#define __NET_TC_TUNNEL_KEY_H
+
+#include <net/act_api.h>
+
+struct tcf_tunnel_key_params {
+	struct rcu_head		rcu;
+	int			tcft_action;
+	int			action;
+	struct metadata_dst     *tcft_enc_metadata;
+};
+
+struct tcf_tunnel_key {
+	struct tc_action	      common;
+	struct tcf_tunnel_key_params __rcu *params;
+};
+
+#define to_tunnel_key(a) ((struct tcf_tunnel_key *)a)
+
+#endif /* __NET_TC_TUNNEL_KEY_H */
diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..890106ff16e6
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET	    1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+	tc_gen;
+	int t_action;
+};
+
+enum {
+	TCA_TUNNEL_KEY_UNSPEC,
+	TCA_TUNNEL_KEY_TM,
+	TCA_TUNNEL_KEY_PARMS,
+	TCA_TUNNEL_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_KEY_ID,	/* be64 */
+	TCA_TUNNEL_KEY_PAD,
+	__TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b3b94c..72e3426fa48f 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -761,6 +761,17 @@ config NET_ACT_IFE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_ife.
 
+config NET_ACT_TUNNEL_KEY
+        tristate "IP tunnel metadata manipulation"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to set/release ip tunnel metadata.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_tunnel_key.
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5a9d95..b9d046b9535a 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 000000000000..dceff7412dc3
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_tunnel_key.h>
+
+#define TUNNEL_KEY_TAB_MASK     15
+
+static int tunnel_key_net_id;
+static struct tc_action_ops act_tunnel_key_ops;
+
+static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params;
+	int action;
+
+	rcu_read_lock();
+
+	params = rcu_dereference(t->params);
+
+	tcf_lastuse_update(&t->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+	action = params->action;
+
+	switch (params->tcft_action) {
+	case TCA_TUNNEL_KEY_ACT_RELEASE:
+		skb_dst_drop(skb);
+		break;
+	case TCA_TUNNEL_KEY_ACT_SET:
+		skb_dst_drop(skb);
+		skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
+		break;
+	default:
+		WARN_ONCE(1, "Bad tunnel_key action %d.\n",
+			  params->tcft_action);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return action;
+}
+
+static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
+	[TCA_TUNNEL_KEY_PARMS]	    = { .len = sizeof(struct tc_tunnel_key) },
+	[TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) },
+	[TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) },
+	[TCA_TUNNEL_KEY_ENC_KEY_ID]   = { .type = NLA_U32 },
+};
+
+static int tunnel_key_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a,
+			   int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+	struct tcf_tunnel_key_params *params_old;
+	struct tcf_tunnel_key_params *params_new;
+	struct metadata_dst *metadata = NULL;
+	struct tc_tunnel_key *parm;
+	struct tcf_tunnel_key *t;
+	bool exists = false;
+	__be64 key_id;
+	int ret = 0;
+	int err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TUNNEL_KEY_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	switch (parm->t_action) {
+	case TCA_TUNNEL_KEY_ACT_RELEASE:
+		break;
+	case TCA_TUNNEL_KEY_ACT_SET:
+		if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+
+		if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
+		    tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
+			__be32 saddr;
+			__be32 daddr;
+
+			saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+			daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+
+			metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+						    TUNNEL_KEY, key_id, 0);
+		} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
+			   tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
+			struct in6_addr saddr;
+			struct in6_addr daddr;
+
+			saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+			daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+
+			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
+						      TUNNEL_KEY, key_id, 0);
+		}
+
+		if (!metadata) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
+		break;
+	default:
+		goto err_out;
+	}
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_tunnel_key_ops, bind, true);
+		if (ret)
+			return ret;
+
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	t = to_tunnel_key(*a);
+
+	ASSERT_RTNL();
+	params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+	if (unlikely(!params_new)) {
+		if (ret == ACT_P_CREATED)
+			tcf_hash_release(*a, bind);
+		return -ENOMEM;
+	}
+
+	params_old = rtnl_dereference(t->params);
+
+	params_new->action = parm->action;
+	params_new->tcft_action = parm->t_action;
+	params_new->tcft_enc_metadata = metadata;
+
+	rcu_assign_pointer(t->params, params_new);
+
+	if (params_old)
+		kfree_rcu(params_old, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+
+	return ret;
+
+err_out:
+	if (exists)
+		tcf_hash_release(*a, bind);
+	return ret;
+}
+
+static void tunnel_key_release(struct tc_action *a, int bind)
+{
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params;
+
+	rcu_read_lock();
+	params = rcu_dereference(t->params);
+
+	if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+		dst_release(&params->tcft_enc_metadata->dst);
+
+	kfree_rcu(params, rcu);
+
+	rcu_read_unlock();
+}
+
+static int tunnel_key_dump_addresses(struct sk_buff *skb,
+				     const struct ip_tunnel_info *info)
+{
+	unsigned short family = ip_tunnel_info_af(info);
+
+	if (family == AF_INET) {
+		__be32 saddr = info->key.u.ipv4.src;
+		__be32 daddr = info->key.u.ipv4.dst;
+
+		if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) &&
+		    !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr))
+			return 0;
+	}
+
+	if (family == AF_INET6) {
+		const struct in6_addr *saddr6 = &info->key.u.ipv6.src;
+		const struct in6_addr *daddr6 = &info->key.u.ipv6.dst;
+
+		if (!nla_put_in6_addr(skb,
+				      TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) &&
+		    !nla_put_in6_addr(skb,
+				      TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6))
+			return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params;
+	struct tc_tunnel_key opt = {
+		.index    = t->tcf_index,
+		.refcnt   = t->tcf_refcnt - ref,
+		.bindcnt  = t->tcf_bindcnt - bind,
+	};
+	struct tcf_t tm;
+	int ret = -1;
+
+	rcu_read_lock();
+	params = rcu_dereference(t->params);
+
+	opt.t_action = params->tcft_action;
+	opt.action = params->action;
+
+	if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+		struct ip_tunnel_key *key =
+			&params->tcft_enc_metadata->u.tun_info.key;
+		__be32 key_id = tunnel_id_to_key32(key->tun_id);
+
+		if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+		    tunnel_key_dump_addresses(skb,
+					      &params->tcft_enc_metadata->u.tun_info))
+			goto nla_put_failure;
+	}
+
+	tcf_tm_dump(&tm, &t->tcf_tm);
+	if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
+			  &tm, TCA_TUNNEL_KEY_PAD))
+		goto nla_put_failure;
+
+	ret = skb->len;
+	goto out;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+out:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_tunnel_key_ops = {
+	.kind		=	"tunnel_key",
+	.type		=	TCA_ACT_TUNNEL_KEY,
+	.owner		=	THIS_MODULE,
+	.act		=	tunnel_key_act,
+	.dump		=	tunnel_key_dump,
+	.init		=	tunnel_key_init,
+	.cleanup	=	tunnel_key_release,
+	.walk		=	tunnel_key_walker,
+	.lookup		=	tunnel_key_search,
+	.size		=	sizeof(struct tcf_tunnel_key),
+};
+
+static __net_init int tunnel_key_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+	return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK);
+}
+
+static void __net_exit tunnel_key_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations tunnel_key_net_ops = {
+	.init = tunnel_key_init_net,
+	.exit = tunnel_key_exit_net,
+	.id   = &tunnel_key_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init tunnel_key_init_module(void)
+{
+	return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+static void __exit tunnel_key_cleanup_module(void)
+{
+	tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+module_init(tunnel_key_init_module);
+module_exit(tunnel_key_cleanup_module);
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-71-gd317


From 70ca767ea1b2748f45e96192400e515dddbe517c Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 6 Sep 2016 08:44:19 +0200
Subject: netfilter: nft_hash: Add hash offset value

Add support to pass through an offset to the hash value. With this
feature, the sysadmin is able to generate a hash with a given
offset value.

Example:

	meta mark set jhash ip saddr mod 2 seed 0xabcd offset 100

This option generates marks according to the source address from 100 to
101.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_hash.c                 | 17 +++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 24161e25576d..8c653bbd1ead 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -731,6 +731,7 @@ enum nft_meta_keys {
  * @NFTA_HASH_LEN: source data length (NLA_U32)
  * @NFTA_HASH_MODULUS: modulus value (NLA_U32)
  * @NFTA_HASH_SEED: seed value (NLA_U32)
+ * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32)
  */
 enum nft_hash_attributes {
 	NFTA_HASH_UNSPEC,
@@ -739,6 +740,7 @@ enum nft_hash_attributes {
 	NFTA_HASH_LEN,
 	NFTA_HASH_MODULUS,
 	NFTA_HASH_SEED,
+	NFTA_HASH_OFFSET,
 	__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 764251d31e46..bd12f7a801c2 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -23,6 +23,7 @@ struct nft_hash {
 	u8			len;
 	u32			modulus;
 	u32			seed;
+	u32			offset;
 };
 
 static void nft_hash_eval(const struct nft_expr *expr,
@@ -31,10 +32,10 @@ static void nft_hash_eval(const struct nft_expr *expr,
 {
 	struct nft_hash *priv = nft_expr_priv(expr);
 	const void *data = &regs->data[priv->sreg];
+	u32 h;
 
-	regs->data[priv->dreg] =
-		reciprocal_scale(jhash(data, priv->len, priv->seed),
-				 priv->modulus);
+	h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+	regs->data[priv->dreg] = h + priv->offset;
 }
 
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
@@ -59,6 +60,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	    !tb[NFTA_HASH_MODULUS])
 		return -EINVAL;
 
+	if (tb[NFTA_HASH_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
+
 	priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
 	priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
 
@@ -72,6 +76,9 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	if (priv->modulus <= 1)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < U32_MAX)
+		return -EOVERFLOW;
+
 	priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED]));
 
 	return nft_validate_register_load(priv->sreg, len) &&
@@ -94,7 +101,9 @@ static int nft_hash_dump(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed)))
 		goto nla_put_failure;
-
+	if (priv->offset != 0)
+		if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset)))
+			goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3-71-gd317


From dbd2be0646e3239022630c426cbceefa15714bca Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Sep 2016 12:22:18 +0200
Subject: netfilter: nft_dynset: allow to invert match criteria

The dynset expression matches if we can fit a new entry into the set.
If there is no room for it, then it breaks the rule evaluation.

This patch introduces the inversion flag so you can add rules to
explicitly drop packets that don't fit into the set. For example:

 # nft filter input flow table xyz size 4 { ip saddr timeout 120s counter } overflow drop

This is useful to provide a replacement for connlimit.

For the rule above, every new entry uses the IPv4 address as key in the
set, this entry gets a timeout of 120 seconds that gets refresh on every
packet seen. If we get new flow and our set already contains 4 entries
already, then this packet is dropped.

You can already express this in positive logic, assuming default policy
to drop:

 # nft filter input flow table xyz size 4 { ip saddr timeout 10s counter } accept

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  6 ++++++
 net/netfilter/nft_dynset.c               | 20 +++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 8c653bbd1ead..bc0eb6a1066d 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -575,6 +575,10 @@ enum nft_dynset_ops {
 	NFT_DYNSET_OP_UPDATE,
 };
 
+enum nft_dynset_flags {
+	NFT_DYNSET_F_INV	= (1 << 0),
+};
+
 /**
  * enum nft_dynset_attributes - dynset expression attributes
  *
@@ -585,6 +589,7 @@ enum nft_dynset_ops {
  * @NFTA_DYNSET_SREG_DATA: source register of the data (NLA_U32)
  * @NFTA_DYNSET_TIMEOUT: timeout value for the new element (NLA_U64)
  * @NFTA_DYNSET_EXPR: expression (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_DYNSET_FLAGS: flags (NLA_U32)
  */
 enum nft_dynset_attributes {
 	NFTA_DYNSET_UNSPEC,
@@ -596,6 +601,7 @@ enum nft_dynset_attributes {
 	NFTA_DYNSET_TIMEOUT,
 	NFTA_DYNSET_EXPR,
 	NFTA_DYNSET_PAD,
+	NFTA_DYNSET_FLAGS,
 	__NFTA_DYNSET_MAX,
 };
 #define NFTA_DYNSET_MAX		(__NFTA_DYNSET_MAX - 1)
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 0af26699bf04..e3b83c31da2e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -22,6 +22,7 @@ struct nft_dynset {
 	enum nft_dynset_ops		op:8;
 	enum nft_registers		sreg_key:8;
 	enum nft_registers		sreg_data:8;
+	bool				invert;
 	u64				timeout;
 	struct nft_expr			*expr;
 	struct nft_set_binding		binding;
@@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 
 		if (sexpr != NULL)
 			sexpr->ops->eval(sexpr, regs, pkt);
+
+		if (priv->invert)
+			regs->verdict.code = NFT_BREAK;
 		return;
 	}
 out:
-	regs->verdict.code = NFT_BREAK;
+	if (!priv->invert)
+		regs->verdict.code = NFT_BREAK;
 }
 
 static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
@@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
 	[NFTA_DYNSET_SREG_DATA]	= { .type = NLA_U32 },
 	[NFTA_DYNSET_TIMEOUT]	= { .type = NLA_U64 },
 	[NFTA_DYNSET_EXPR]	= { .type = NLA_NESTED },
+	[NFTA_DYNSET_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int nft_dynset_init(const struct nft_ctx *ctx,
@@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	    tb[NFTA_DYNSET_SREG_KEY] == NULL)
 		return -EINVAL;
 
+	if (tb[NFTA_DYNSET_FLAGS]) {
+		u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
+
+		if (flags & ~NFT_DYNSET_F_INV)
+			return -EINVAL;
+		if (flags & NFT_DYNSET_F_INV)
+			priv->invert = true;
+	}
+
 	set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME],
 				   genmask);
 	if (IS_ERR(set)) {
@@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx,
 static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_dynset *priv = nft_expr_priv(expr);
+	u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
 
 	if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
 		goto nla_put_failure;
@@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		goto nla_put_failure;
 	if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3-71-gd317


From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 11 Sep 2016 22:55:53 +0200
Subject: netfilter: conntrack: remove packet hotpath stats

These counters sit in hot path and do show up in perf, this is especially
true for 'found' and 'searched' which get incremented for every packet
processed.

Information like

searched=212030105
new=623431
found=333613
delete=623327

does not seem too helpful nowadays:

- on busy systems found and searched will overflow every few hours
(these are 32bit integers), other more busy ones every few days.

- for debugging there are better methods, such as iptables' trace target,
the conntrack log sysctls.  Nowadays we also have perf tool.

This removes packet path stat counters except those that
are expected to be 0 (or close to 0) on a normal system, e.g.
'insert_failed' (race happened) or 'invalid' (proto tracker rejects).

The insert stat is retained for the ctnetlink case.
The found stat is retained for the tuple-is-taken check when NAT has to
determine if it needs to pick a different source address.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_common.h      |  4 ----
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |  8 ++++----
 net/netfilter/nf_conntrack_core.c                  | 14 ++------------
 net/netfilter/nf_conntrack_netlink.c               |  6 +-----
 net/netfilter/nf_conntrack_standalone.c            |  8 ++++----
 5 files changed, 11 insertions(+), 29 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 275505792664..1d1ef4e20512 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -4,13 +4,9 @@
 #include <uapi/linux/netfilter/nf_conntrack_common.h>
 
 struct ip_conntrack_stat {
-	unsigned int searched;
 	unsigned int found;
-	unsigned int new;
 	unsigned int invalid;
 	unsigned int ignore;
-	unsigned int delete;
-	unsigned int delete_list;
 	unsigned int insert;
 	unsigned int insert_failed;
 	unsigned int drop;
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 9df789709abe..6deb8867c5fc 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -231,13 +231,13 @@ enum ctattr_secctx {
 
 enum ctattr_stats_cpu {
 	CTA_STATS_UNSPEC,
-	CTA_STATS_SEARCHED,
+	CTA_STATS_SEARCHED,	/* no longer used */
 	CTA_STATS_FOUND,
-	CTA_STATS_NEW,
+	CTA_STATS_NEW,		/* no longer used */
 	CTA_STATS_INVALID,
 	CTA_STATS_IGNORE,
-	CTA_STATS_DELETE,
-	CTA_STATS_DELETE_LIST,
+	CTA_STATS_DELETE,	/* no longer used */
+	CTA_STATS_DELETE_LIST,	/* no longer used */
 	CTA_STATS_INSERT,
 	CTA_STATS_INSERT_FAILED,
 	CTA_STATS_DROP,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac1db4019d5c..8d1ddb9b63ed 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -379,7 +379,6 @@ static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
 	struct nf_conn *ct = (struct nf_conn *)nfct;
-	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_l4proto *l4proto;
 
 	pr_debug("destroy_conntrack(%p)\n", ct);
@@ -406,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
-	NF_CT_STAT_INC(net, delete);
 	local_bh_enable();
 
 	if (ct->master)
@@ -438,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
 
 	nf_ct_add_to_dying_list(ct);
 
-	NF_CT_STAT_INC(net, delete_list);
 	local_bh_enable();
 }
 
@@ -529,11 +526,8 @@ begin:
 		if (nf_ct_is_dying(ct))
 			continue;
 
-		if (nf_ct_key_equal(h, tuple, zone, net)) {
-			NF_CT_STAT_INC_ATOMIC(net, found);
+		if (nf_ct_key_equal(h, tuple, zone, net))
 			return h;
-		}
-		NF_CT_STAT_INC_ATOMIC(net, searched);
 	}
 	/*
 	 * if the nulls value we got at the end of this lookup is
@@ -798,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
 	nf_conntrack_double_unlock(hash, reply_hash);
-	NF_CT_STAT_INC(net, insert);
 	local_bh_enable();
 
 	help = nfct_help(ct);
@@ -857,7 +850,6 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			rcu_read_unlock();
 			return 1;
 		}
-		NF_CT_STAT_INC_ATOMIC(net, searched);
 	}
 
 	if (get_nulls_value(n) != hash) {
@@ -1177,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		}
 		spin_unlock(&nf_conntrack_expect_lock);
 	}
-	if (!exp) {
+	if (!exp)
 		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
-		NF_CT_STAT_INC(net, new);
-	}
 
 	/* Now it is inserted into the unconfirmed list, bump refcount */
 	nf_conntrack_get(&ct->ct_general);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c052b712c49f..27540455dc62 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1984,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
 	nfmsg->version      = NFNETLINK_V0;
 	nfmsg->res_id	    = htons(cpu);
 
-	if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
-	    nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
-	    nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+	if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
 	    nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
 	    nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
-	    nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
-	    nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
 	    nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
 	    nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
 				htonl(st->insert_failed)) ||
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 3d9a316a3c77..7d52f8401afd 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -352,13 +352,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
 			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
 		   nr_conntracks,
-		   st->searched,
+		   0,
 		   st->found,
-		   st->new,
+		   0,
 		   st->invalid,
 		   st->ignore,
-		   st->delete,
-		   st->delete_list,
+		   0,
+		   0,
 		   st->insert,
 		   st->insert_failed,
 		   st->drop,
-- 
cgit v1.2.3-71-gd317


From 86da71b57383d40993cb90baafb3735cffe5d800 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 12 Sep 2016 20:13:09 -0400
Subject: net_sched: Introduce skbmod action

This action is intended to be an upgrade from a usability perspective
from pedit (as well as operational debugability).
Compare this:

sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action pedit munge offset -14 u8 set 0x02 \
munge offset -13 u8 set 0x15 \
munge offset -12 u8 set 0x15 \
munge offset -11 u8 set 0x15 \
munge offset -10 u16 set 0x1515 \
pipe

to:

sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action skbmod dmac 02:15:15:15:15:15

Also try to do a MAC address swap with pedit or worse
try to debug a policy with destination mac, source mac and
etherype. Then make few rules out of those and you'll get my point.

In the future common use cases on pedit can be migrated to this action
(as an example different fields in ip v4/6, transports like tcp/udp/sctp
etc). For this first cut, this allows modifying basic ethernet header.

The most important ethernet use case at the moment is when redirecting or
mirroring packets to a remote machine. The dst mac address needs a re-write
so that it doesnt get dropped or confuse an interconnecting (learning) switch
or dropped by a target machine (which looks at the dst mac). And at times
when flipping back the packet a swap of the MAC addresses is needed.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbmod.h        |  30 ++++
 include/uapi/linux/tc_act/tc_skbmod.h |  39 +++++
 net/sched/Kconfig                     |  11 ++
 net/sched/Makefile                    |   1 +
 net/sched/act_skbmod.c                | 301 ++++++++++++++++++++++++++++++++++
 5 files changed, 382 insertions(+)
 create mode 100644 include/net/tc_act/tc_skbmod.h
 create mode 100644 include/uapi/linux/tc_act/tc_skbmod.h
 create mode 100644 net/sched/act_skbmod.c

(limited to 'include/uapi/linux')

diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h
new file mode 100644
index 000000000000..644a2116b47b
--- /dev/null
+++ b/include/net/tc_act/tc_skbmod.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Jamal Hadi Salim
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#ifndef __NET_TC_SKBMOD_H
+#define __NET_TC_SKBMOD_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_skbmod.h>
+
+struct tcf_skbmod_params {
+	struct rcu_head	rcu;
+	u64	flags; /*up to 64 types of operations; extend if needed */
+	u8	eth_dst[ETH_ALEN];
+	u16	eth_type;
+	u8	eth_src[ETH_ALEN];
+};
+
+struct tcf_skbmod {
+	struct tc_action	common;
+	struct tcf_skbmod_params __rcu *skbmod_p;
+};
+#define to_skbmod(a) ((struct tcf_skbmod *)a)
+
+#endif /* __NET_TC_SKBMOD_H */
diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h
new file mode 100644
index 000000000000..10fc07da6c69
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_skbmod.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Jamal Hadi Salim
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#ifndef __LINUX_TC_SKBMOD_H
+#define __LINUX_TC_SKBMOD_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_SKBMOD 15
+
+#define SKBMOD_F_DMAC	0x1
+#define SKBMOD_F_SMAC	0x2
+#define SKBMOD_F_ETYPE	0x4
+#define SKBMOD_F_SWAPMAC 0x8
+
+struct tc_skbmod {
+	tc_gen;
+	__u64 flags;
+};
+
+enum {
+	TCA_SKBMOD_UNSPEC,
+	TCA_SKBMOD_TM,
+	TCA_SKBMOD_PARMS,
+	TCA_SKBMOD_DMAC,
+	TCA_SKBMOD_SMAC,
+	TCA_SKBMOD_ETYPE,
+	TCA_SKBMOD_PAD,
+	__TCA_SKBMOD_MAX
+};
+#define TCA_SKBMOD_MAX (__TCA_SKBMOD_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 72e3426fa48f..7795d5a3f79a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -749,6 +749,17 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_SKBMOD
+        tristate "skb data modification action"
+        depends on NET_CLS_ACT
+        ---help---
+         Say Y here to allow modification of skb data
+
+         If unsure, say N.
+
+         To compile this code as a module, choose M here: the
+         module will be called act_skbmod.
+
 config NET_ACT_IFE
         tristate "Inter-FE action based on IETF ForCES InterFE LFB"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index b9d046b9535a..148ae0d5ac2c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_SKBMOD)	+= act_skbmod.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 000000000000..e7d96381c908
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,301 @@
+/*
+ * net/sched/act_skbmod.c  skb data modifier
+ *
+ * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbmod.h>
+#include <net/tc_act/tc_skbmod.h>
+
+#define SKBMOD_TAB_MASK     15
+
+static int skbmod_net_id;
+static struct tc_action_ops act_skbmod_ops;
+
+#define MAX_EDIT_LEN ETH_HLEN
+static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_skbmod *d = to_skbmod(a);
+	int action;
+	struct tcf_skbmod_params *p;
+	u64 flags;
+	int err;
+
+	tcf_lastuse_update(&d->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+	/* XXX: if you are going to edit more fields beyond ethernet header
+	 * (example when you add IP header replacement or vlan swap)
+	 * then MAX_EDIT_LEN needs to change appropriately
+	*/
+	err = skb_ensure_writable(skb, MAX_EDIT_LEN);
+	if (unlikely(err)) { /* best policy is to drop on the floor */
+		qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+		return TC_ACT_SHOT;
+	}
+
+	rcu_read_lock();
+	action = READ_ONCE(d->tcf_action);
+	if (unlikely(action == TC_ACT_SHOT)) {
+		qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+		rcu_read_unlock();
+		return action;
+	}
+
+	p = rcu_dereference(d->skbmod_p);
+	flags = p->flags;
+	if (flags & SKBMOD_F_DMAC)
+		ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
+	if (flags & SKBMOD_F_SMAC)
+		ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
+	if (flags & SKBMOD_F_ETYPE)
+		eth_hdr(skb)->h_proto = p->eth_type;
+	rcu_read_unlock();
+
+	if (flags & SKBMOD_F_SWAPMAC) {
+		u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
+		/*XXX: I am sure we can come up with more efficient swapping*/
+		ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
+		ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
+		ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr);
+	}
+
+	return action;
+}
+
+static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
+	[TCA_SKBMOD_PARMS]		= { .len = sizeof(struct tc_skbmod) },
+	[TCA_SKBMOD_DMAC]		= { .len = ETH_ALEN },
+	[TCA_SKBMOD_SMAC]		= { .len = ETH_ALEN },
+	[TCA_SKBMOD_ETYPE]		= { .type = NLA_U16 },
+};
+
+static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a,
+			   int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
+	struct tcf_skbmod_params *p, *p_old;
+	struct tc_skbmod *parm;
+	struct tcf_skbmod *d;
+	bool exists = false;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	u16 eth_type = 0;
+	u32 lflags = 0;
+	int ret = 0, err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_SKBMOD_PARMS])
+		return -EINVAL;
+
+	if (tb[TCA_SKBMOD_DMAC]) {
+		daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
+		lflags |= SKBMOD_F_DMAC;
+	}
+
+	if (tb[TCA_SKBMOD_SMAC]) {
+		saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
+		lflags |= SKBMOD_F_SMAC;
+	}
+
+	if (tb[TCA_SKBMOD_ETYPE]) {
+		eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
+		lflags |= SKBMOD_F_ETYPE;
+	}
+
+	parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+	if (parm->flags & SKBMOD_F_SWAPMAC)
+		lflags = SKBMOD_F_SWAPMAC;
+
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	if (!lflags)
+		return -EINVAL;
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_skbmod_ops, bind, true);
+		if (ret)
+			return ret;
+
+		ret = ACT_P_CREATED;
+	} else {
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	d = to_skbmod(*a);
+
+	ASSERT_RTNL();
+	p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
+	if (unlikely(!p)) {
+		if (ovr)
+			tcf_hash_release(*a, bind);
+		return -ENOMEM;
+	}
+
+	p->flags = lflags;
+	d->tcf_action = parm->action;
+
+	p_old = rtnl_dereference(d->skbmod_p);
+
+	if (ovr)
+		spin_lock_bh(&d->tcf_lock);
+
+	if (lflags & SKBMOD_F_DMAC)
+		ether_addr_copy(p->eth_dst, daddr);
+	if (lflags & SKBMOD_F_SMAC)
+		ether_addr_copy(p->eth_src, saddr);
+	if (lflags & SKBMOD_F_ETYPE)
+		p->eth_type = htons(eth_type);
+
+	rcu_assign_pointer(d->skbmod_p, p);
+	if (ovr)
+		spin_unlock_bh(&d->tcf_lock);
+
+	if (p_old)
+		kfree_rcu(p_old, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+	return ret;
+}
+
+static void tcf_skbmod_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_skbmod *d = to_skbmod(a);
+	struct tcf_skbmod_params  *p;
+
+	p = rcu_dereference_protected(d->skbmod_p, 1);
+	kfree_rcu(p, rcu);
+}
+
+static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	struct tcf_skbmod *d = to_skbmod(a);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_skbmod_params  *p = rtnl_dereference(d->skbmod_p);
+	struct tc_skbmod opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+	};
+	struct tcf_t t;
+
+	opt.flags  = p->flags;
+	if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if ((p->flags & SKBMOD_F_DMAC) &&
+	    nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst))
+		goto nla_put_failure;
+	if ((p->flags & SKBMOD_F_SMAC) &&
+	    nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src))
+		goto nla_put_failure;
+	if ((p->flags & SKBMOD_F_ETYPE) &&
+	    nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type)))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &d->tcf_tm);
+	if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
+		goto nla_put_failure;
+
+	return skb->len;
+nla_put_failure:
+	rcu_read_unlock();
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbmod_ops = {
+	.kind		=	"skbmod",
+	.type		=	TCA_ACT_SKBMOD,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_skbmod_run,
+	.dump		=	tcf_skbmod_dump,
+	.init		=	tcf_skbmod_init,
+	.cleanup	=	tcf_skbmod_cleanup,
+	.walk		=	tcf_skbmod_walker,
+	.lookup		=	tcf_skbmod_search,
+	.size		=	sizeof(struct tcf_skbmod),
+};
+
+static __net_init int skbmod_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK);
+}
+
+static void __net_exit skbmod_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbmod_net_ops = {
+	.init = skbmod_init_net,
+	.exit = skbmod_exit_net,
+	.id   = &skbmod_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
+MODULE_DESCRIPTION("SKB data mod-ing");
+MODULE_LICENSE("GPL");
+
+static int __init skbmod_init_module(void)
+{
+	return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+static void __exit skbmod_cleanup_module(void)
+{
+	tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+module_init(skbmod_init_module);
+module_exit(skbmod_cleanup_module);
-- 
cgit v1.2.3-71-gd317


From aa72d708373dacfa690960b336543b867784b350 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 15 Sep 2016 15:28:22 +0300
Subject: net/sched: cls_flower: Support masking for matching on tcp/udp ports

Add the definitions for src/dst udp/tcp port masks and use
them when setting && dumping the relevant keys.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  4 ++++
 net/sched/cls_flower.c       | 20 ++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index f9c287c67eae..60ea2a084880 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -442,6 +442,10 @@ enum {
 	TCA_FLOWER_KEY_ENC_IPV6_DST,	/* struct in6_addr */
 	TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
 
+	TCA_FLOWER_KEY_TCP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_TCP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_DST_MASK,	/* be16 */
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b084b2aab2d7..027523c82797 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -335,6 +335,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
 	[TCA_FLOWER_KEY_ENC_IPV6_DST]	= { .len = sizeof(struct in6_addr) },
 	[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+	[TCA_FLOWER_KEY_TCP_SRC_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_TCP_DST_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_UDP_SRC_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_UDP_DST_MASK]	= { .type = NLA_U16 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -432,17 +436,17 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 
 	if (key->basic.ip_proto == IPPROTO_TCP) {
 		fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
-			       &mask->tp.src, TCA_FLOWER_UNSPEC,
+			       &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
 			       sizeof(key->tp.src));
 		fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
-			       &mask->tp.dst, TCA_FLOWER_UNSPEC,
+			       &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
 			       sizeof(key->tp.dst));
 	} else if (key->basic.ip_proto == IPPROTO_UDP) {
 		fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
-			       &mask->tp.src, TCA_FLOWER_UNSPEC,
+			       &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
 			       sizeof(key->tp.src));
 		fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
-			       &mask->tp.dst, TCA_FLOWER_UNSPEC,
+			       &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
 			       sizeof(key->tp.dst));
 	}
 
@@ -877,18 +881,18 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 
 	if (key->basic.ip_proto == IPPROTO_TCP &&
 	    (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
-			     &mask->tp.src, TCA_FLOWER_UNSPEC,
+			     &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
 			     sizeof(key->tp.src)) ||
 	     fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
-			     &mask->tp.dst, TCA_FLOWER_UNSPEC,
+			     &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
 			     sizeof(key->tp.dst))))
 		goto nla_put_failure;
 	else if (key->basic.ip_proto == IPPROTO_UDP &&
 		 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
-				  &mask->tp.src, TCA_FLOWER_UNSPEC,
+				  &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
 				  sizeof(key->tp.src)) ||
 		  fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
-				  &mask->tp.dst, TCA_FLOWER_UNSPEC,
+				  &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
 				  sizeof(key->tp.dst))))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3-71-gd317


From 37a6c1512314d2439ef7136d773d5a470e0996b9 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 15 Sep 2016 15:28:24 +0300
Subject: net/sched: cls_flower: Specify vlan attributes format in the UAPI
 header

Specify the format (size and endianess) for the vlan attributes.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 60ea2a084880..8915b61bbf83 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -428,9 +428,9 @@ enum {
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
 
 	TCA_FLOWER_FLAGS,
-	TCA_FLOWER_KEY_VLAN_ID,
-	TCA_FLOWER_KEY_VLAN_PRIO,
-	TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
 
 	TCA_FLOWER_KEY_ENC_KEY_ID,	/* be32 */
 	TCA_FLOWER_KEY_ENC_IPV4_SRC,	/* be32 */
-- 
cgit v1.2.3-71-gd317


From cfc7381b3002756b1dcada32979e942aa3126e31 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 15 Sep 2016 13:00:29 -0700
Subject: ip_tunnel: add collect_md mode to IPIP tunnel

Similar to gre, vxlan, geneve tunnels allow IPIP tunnels to
operate in 'collect metadata' mode.
bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away.
ovs can use it as well in the future (once appropriate ovs-vport
abstractions and user apis are added).
Note that just like in other tunnels we cannot cache the dst,
since tunnel_info metadata can be different for every packet.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h       |  2 ++
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/ip_tunnel.c           | 76 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ipip.c                | 35 +++++++++++++++----
 4 files changed, 108 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index e598c639aa6f..59557c07904b 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -255,6 +255,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+		       const u8 proto);
 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 9865c8caedde..18d5dc13985d 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -73,6 +73,7 @@ enum {
 	IFLA_IPTUN_ENCAP_FLAGS,
 	IFLA_IPTUN_ENCAP_SPORT,
 	IFLA_IPTUN_ENCAP_DPORT,
+	IFLA_IPTUN_COLLECT_METADATA,
 	__IFLA_IPTUN_MAX,
 };
 #define IFLA_IPTUN_MAX	(__IFLA_IPTUN_MAX - 1)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/udp.h>
+#include <net/dst_metadata.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
 
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	u32 headroom = sizeof(struct iphdr);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	const struct iphdr *inner_iph;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	__be16 df = 0;
+	u8 tos, ttl;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
+		goto tx_error;
+	key = &tun_info->key;
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	tos = key->tos;
+	if (tos == 1) {
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = inner_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+	}
+	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+			 RT_TOS(tos), tunnel->parms.link);
+	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+		goto tx_error;
+	rt = ip_route_output_key(tunnel->net, &fl4);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = key->ttl;
+	if (ttl == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+		df = htons(IP_DF);
+	else if (skb->protocol == htons(ETH_P_IP))
+		df = inner_iph->frag_off & htons(IP_DF);
+	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+	if (headroom > dev->needed_headroom)
+		dev->needed_headroom = headroom;
+
+	if (skb_cow_head(skb, dev->needed_headroom)) {
+		ip_rt_put(rt);
+		goto tx_dropped;
+	}
+	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+	return;
+tx_error:
+	dev->stats.tx_errors++;
+	goto kfree;
+tx_dropped:
+	dev->stats.tx_dropped++;
+kfree:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, u8 protocol)
 {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..c9392589c415 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/dst_metadata.h>
 
 static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	struct metadata_dst *tun_dst = NULL;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 			tpi = &ipip_tpi;
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
-		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->collect_md) {
+			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+			if (!tun_dst)
+				return 0;
+		}
+		return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 	}
 
 	return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 
 	skb_set_inner_ipproto(skb, ipproto);
 
-	ip_tunnel_xmit(skb, dev, tiph, ipproto);
+	if (tunnel->collect_md)
+		ip_md_tunnel_xmit(skb, dev, ipproto);
+	else
+		ip_tunnel_xmit(skb, dev, tiph, ipproto);
 	return NETDEV_TX_OK;
 
 tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
 }
 
 static void ipip_netlink_parms(struct nlattr *data[],
-			       struct ip_tunnel_parm *parms)
+			       struct ip_tunnel_parm *parms, bool *collect_md)
 {
 	memset(parms, 0, sizeof(*parms));
 
 	parms->iph.version = 4;
 	parms->iph.protocol = IPPROTO_IPIP;
 	parms->iph.ihl = 5;
+	*collect_md = false;
 
 	if (!data)
 		return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
 
 	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
 		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_IPTUN_COLLECT_METADATA])
+		*collect_md = true;
 }
 
 /* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
+	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
 
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
-		struct ip_tunnel *t = netdev_priv(dev);
 		int err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
 			return err;
 	}
 
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &t->collect_md);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
 
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 {
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	bool collect_md;
 
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
 		struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
 
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &collect_md);
+	if (collect_md)
+		return -EINVAL;
 
 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_IPTUN_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_IPTUN_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
 
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			tunnel->encap.flags))
 		goto nla_put_failure;
 
+	if (tunnel->collect_md)
+		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+			goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
-- 
cgit v1.2.3-71-gd317


From 69ae6ad2ff37911903a90256e216d7e7ae460002 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Fri, 16 Sep 2016 15:05:37 +0200
Subject: net: core: Add offload stats to if_stats_msg

Add a nested attribute of offload stats to if_stats_msg
named IFLA_STATS_LINK_OFFLOAD_XSTATS.
Under it, add SW stats, meaning stats only per packets that went via
slowpath to the cpu, named IFLA_OFFLOAD_XSTATS_CPU_HIT.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |   9 ++++
 net/core/rtnetlink.c         | 111 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9bf3aecfe05b..2351776a724f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -826,6 +826,7 @@ enum {
 	IFLA_STATS_LINK_64,
 	IFLA_STATS_LINK_XSTATS,
 	IFLA_STATS_LINK_XSTATS_SLAVE,
+	IFLA_STATS_LINK_OFFLOAD_XSTATS,
 	__IFLA_STATS_MAX,
 };
 
@@ -845,6 +846,14 @@ enum {
 };
 #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
 
+/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */
+enum {
+	IFLA_OFFLOAD_XSTATS_UNSPEC,
+	IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */
+	__IFLA_OFFLOAD_XSTATS_MAX
+};
+#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1)
+
 /* XDP section */
 
 enum {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 937e459bdaa9..0dbae4244a89 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3577,6 +3577,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
 	       (!idxattr || idxattr == attrid);
 }
 
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return sizeof(struct rtnl_link_stats64);
+	}
+
+	return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+				  int *prividx)
+{
+	struct nlattr *attr = NULL;
+	int attr_id, size;
+	void *attr_data;
+	int err;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return -ENODATA;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (attr_id < *prividx)
+			continue;
+
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		if (!size)
+			continue;
+
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+
+		attr = nla_reserve_64bit(skb, attr_id, size,
+					 IFLA_OFFLOAD_XSTATS_UNSPEC);
+		if (!attr)
+			goto nla_put_failure;
+
+		attr_data = nla_data(attr);
+		memset(attr_data, 0, size);
+		err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+							     attr_data);
+		if (err)
+			goto get_offload_stats_failure;
+	}
+
+	if (!attr)
+		return -ENODATA;
+
+	*prividx = 0;
+	return 0;
+
+nla_put_failure:
+	err = -EMSGSIZE;
+get_offload_stats_failure:
+	*prividx = attr_id;
+	return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+	int nla_size = 0;
+	int attr_id;
+	int size;
+
+	if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+	      dev->netdev_ops->ndo_get_offload_stats))
+		return 0;
+
+	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+			continue;
+		size = rtnl_get_offload_stats_attr_size(attr_id);
+		nla_size += nla_total_size_64bit(size);
+	}
+
+	if (nla_size != 0)
+		nla_size += nla_total_size(0);
+
+	return nla_size;
+}
+
 static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 			       int type, u32 pid, u32 seq, u32 change,
 			       unsigned int flags, unsigned int filter_mask,
@@ -3586,6 +3671,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 	struct nlmsghdr *nlh;
 	struct nlattr *attr;
 	int s_prividx = *prividx;
+	int err;
 
 	ASSERT_RTNL();
 
@@ -3614,8 +3700,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
 
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS);
@@ -3639,8 +3723,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		if (master)
 			ops = master->rtnl_link_ops;
 		if (ops && ops->fill_linkxstats) {
-			int err;
-
 			*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
 			attr = nla_nest_start(skb,
 					      IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3655,6 +3737,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+			     *idxattr)) {
+		*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+		attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+		if (!attr)
+			goto nla_put_failure;
+
+		err = rtnl_get_offload_stats(skb, dev, prividx);
+		if (err == -ENODATA)
+			nla_nest_cancel(skb, attr);
+		else
+			nla_nest_end(skb, attr);
+
+		if (err && err != -ENODATA)
+			goto nla_put_failure;
+		*idxattr = 0;
+	}
+
 	nlmsg_end(skb, nlh);
 
 	return 0;
@@ -3708,6 +3808,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
 		}
 	}
 
+	if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+		size += rtnl_get_offload_stats_size(dev);
+
 	return size;
 }
 
-- 
cgit v1.2.3-71-gd317


From 4fbae7d83c98c30efcf0a2a2ac55fbb75ef5a1a5 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Fri, 16 Sep 2016 12:59:19 -0700
Subject: ipvlan: Introduce l3s mode

In a typical IPvlan L3 setup where master is in default-ns and
each slave is into different (slave) ns. In this setup egress
packet processing for traffic originating from slave-ns will
hit all NF_HOOKs in slave-ns as well as default-ns. However same
is not true for ingress processing. All these NF_HOOKs are
hit only in the slave-ns skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.

This patch makes use of the l3_rcv() (added as part of l3mdev
enhancements) to perform input route lookup on RX packets without
changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN
to change the skb->dev just before handing over skb to L4.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
CC: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig                 |  1 +
 drivers/net/ipvlan/ipvlan.h         |  6 +++
 drivers/net/ipvlan/ipvlan_core.c    | 94 +++++++++++++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_main.c    | 87 +++++++++++++++++++++++++++++++---
 include/uapi/linux/if_link.h        |  1 +
 6 files changed, 188 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index 14422f8fcdc4..24196cef7c91 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
 	There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | l3 | l3s }
 
 	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
 
@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that instance will be
 used before packets are queued on the outbound device. In this mode the slaves
 will not receive nor can send multicast / broadcast traffic.
 
+4.3 L3S mode:
+	This is very similar to the L3 mode except that iptables (conn-tracking)
+works in this mode and hence it is L3-symmetric (L3s). This will have slightly less
+performance but that shouldn't matter since you are choosing this mode over plain-L3
+mode to make conn-tracking work.
 
 5. What to choose (macvlan vs. ipvlan)?
 	These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c5415b05ea9..8768a625350d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
     tristate "IP-VLAN support"
     depends on INET
     depends on IPV6
+    depends on NET_L3_MASTER_DEV
     ---help---
       This allows one to create virtual devices off of a main interface
       and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 695a5dc9ace3..7e0732f5ea07 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -23,11 +23,13 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/inetdevice.h>
+#include <linux/netfilter.h>
 #include <net/ip.h>
 #include <net/ip6_route.h>
 #include <net/rtnetlink.h>
 #include <net/route.h>
 #include <net/addrconf.h>
+#include <net/l3mdev.h>
 
 #define IPVLAN_DRV	"ipvlan"
 #define IPV_DRV_VER	"0.1"
@@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
 				   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+			      u16 proto);
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+			     const struct nf_hook_state *state);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b5f9511d819e..b4e990743e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 	case IPVLAN_MODE_L2:
 		return ipvlan_xmit_mode_l2(skb, dev);
 	case IPVLAN_MODE_L3:
+	case IPVLAN_MODE_L3S:
 		return ipvlan_xmit_mode_l3(skb, dev);
 	}
 
@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 		return ipvlan_handle_mode_l2(pskb, port);
 	case IPVLAN_MODE_L3:
 		return ipvlan_handle_mode_l3(pskb, port);
+	case IPVLAN_MODE_L3S:
+		return RX_HANDLER_PASS;
 	}
 
 	/* Should not reach here */
@@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
+
+static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	struct ipvl_addr *addr = NULL;
+	struct ipvl_port *port;
+	void *lyr3h;
+	int addr_type;
+
+	if (!dev || !netif_is_ipvlan_port(dev))
+		goto out;
+
+	port = ipvlan_port_get_rcu(dev);
+	if (!port || port->mode != IPVLAN_MODE_L3S)
+		goto out;
+
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+out:
+	return addr;
+}
+
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+			      u16 proto)
+{
+	struct ipvl_addr *addr;
+	struct net_device *sdev;
+
+	addr = ipvlan_skb_to_addr(skb, dev);
+	if (!addr)
+		goto out;
+
+	sdev = addr->master->dev;
+	switch (proto) {
+	case AF_INET:
+	{
+		int err;
+		struct iphdr *ip4h = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr,
+					   ip4h->tos, sdev);
+		if (unlikely(err))
+			goto out;
+		break;
+	}
+	case AF_INET6:
+	{
+		struct dst_entry *dst;
+		struct ipv6hdr *ip6h = ipv6_hdr(skb);
+		int flags = RT6_LOOKUP_F_HAS_SADDR;
+		struct flowi6 fl6 = {
+			.flowi6_iif   = sdev->ifindex,
+			.daddr        = ip6h->daddr,
+			.saddr        = ip6h->saddr,
+			.flowlabel    = ip6_flowinfo(ip6h),
+			.flowi6_mark  = skb->mark,
+			.flowi6_proto = ip6h->nexthdr,
+		};
+
+		skb_dst_drop(skb);
+		dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags);
+		skb_dst_set(skb, dst);
+		break;
+	}
+	default:
+		break;
+	}
+
+out:
+	return skb;
+}
+
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+			     const struct nf_hook_state *state)
+{
+	struct ipvl_addr *addr;
+	unsigned int len;
+
+	addr = ipvlan_skb_to_addr(skb, skb->dev);
+	if (!addr)
+		goto out;
+
+	skb->dev = addr->master->dev;
+	len = skb->len + ETH_HLEN;
+	ipvlan_count_rx(addr->master, len, true, false);
+out:
+	return NF_ACCEPT;
+}
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 18b4e8c7f68a..f442eb366863 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -9,24 +9,87 @@
 
 #include "ipvlan.h"
 
+static u32 ipvl_nf_hook_refcnt = 0;
+
+static struct nf_hook_ops ipvl_nfops[] __read_mostly = {
+	{
+		.hook     = ipvlan_nf_input,
+		.pf       = NFPROTO_IPV4,
+		.hooknum  = NF_INET_LOCAL_IN,
+		.priority = INT_MAX,
+	},
+	{
+		.hook     = ipvlan_nf_input,
+		.pf       = NFPROTO_IPV6,
+		.hooknum  = NF_INET_LOCAL_IN,
+		.priority = INT_MAX,
+	},
+};
+
+static struct l3mdev_ops ipvl_l3mdev_ops __read_mostly = {
+	.l3mdev_l3_rcv = ipvlan_l3_rcv,
+};
+
 static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
 {
 	ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
 }
 
-static void ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
+static int ipvlan_register_nf_hook(void)
+{
+	int err = 0;
+
+	if (!ipvl_nf_hook_refcnt) {
+		err = _nf_register_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
+		if (!err)
+			ipvl_nf_hook_refcnt = 1;
+	} else {
+		ipvl_nf_hook_refcnt++;
+	}
+
+	return err;
+}
+
+static void ipvlan_unregister_nf_hook(void)
+{
+	WARN_ON(!ipvl_nf_hook_refcnt);
+
+	ipvl_nf_hook_refcnt--;
+	if (!ipvl_nf_hook_refcnt)
+		_nf_unregister_hooks(ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
+}
+
+static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
 {
 	struct ipvl_dev *ipvlan;
+	struct net_device *mdev = port->dev;
+	int err = 0;
 
+	ASSERT_RTNL();
 	if (port->mode != nval) {
+		if (nval == IPVLAN_MODE_L3S) {
+			/* New mode is L3S */
+			err = ipvlan_register_nf_hook();
+			if (!err) {
+				mdev->l3mdev_ops = &ipvl_l3mdev_ops;
+				mdev->priv_flags |= IFF_L3MDEV_MASTER;
+			} else
+				return err;
+		} else if (port->mode == IPVLAN_MODE_L3S) {
+			/* Old mode was L3S */
+			mdev->priv_flags &= ~IFF_L3MDEV_MASTER;
+			ipvlan_unregister_nf_hook();
+			mdev->l3mdev_ops = NULL;
+		}
 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
-			if (nval == IPVLAN_MODE_L3)
+			if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S)
 				ipvlan->dev->flags |= IFF_NOARP;
 			else
 				ipvlan->dev->flags &= ~IFF_NOARP;
 		}
 		port->mode = nval;
 	}
+	return err;
 }
 
 static int ipvlan_port_create(struct net_device *dev)
@@ -74,6 +137,11 @@ static void ipvlan_port_destroy(struct net_device *dev)
 	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
 
 	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
+	if (port->mode == IPVLAN_MODE_L3S) {
+		dev->priv_flags &= ~IFF_L3MDEV_MASTER;
+		ipvlan_unregister_nf_hook();
+		dev->l3mdev_ops = NULL;
+	}
 	netdev_rx_handler_unregister(dev);
 	cancel_work_sync(&port->wq);
 	__skb_queue_purge(&port->backlog);
@@ -132,7 +200,8 @@ static int ipvlan_open(struct net_device *dev)
 	struct net_device *phy_dev = ipvlan->phy_dev;
 	struct ipvl_addr *addr;
 
-	if (ipvlan->port->mode == IPVLAN_MODE_L3)
+	if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
+	    ipvlan->port->mode == IPVLAN_MODE_L3S)
 		dev->flags |= IFF_NOARP;
 	else
 		dev->flags &= ~IFF_NOARP;
@@ -372,13 +441,14 @@ static int ipvlan_nl_changelink(struct net_device *dev,
 {
 	struct ipvl_dev *ipvlan = netdev_priv(dev);
 	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+	int err = 0;
 
 	if (data && data[IFLA_IPVLAN_MODE]) {
 		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
 
-		ipvlan_set_port_mode(port, nmode);
+		err = ipvlan_set_port_mode(port, nmode);
 	}
-	return 0;
+	return err;
 }
 
 static size_t ipvlan_nl_getsize(const struct net_device *dev)
@@ -473,10 +543,13 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 		unregister_netdevice(dev);
 		return err;
 	}
+	err = ipvlan_set_port_mode(port, mode);
+	if (err) {
+		unregister_netdevice(dev);
+		return err;
+	}
 
 	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
-	ipvlan_set_port_mode(port, mode);
-
 	netif_stacked_transfer_operstate(phy_dev, dev);
 	return 0;
 }
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 2351776a724f..7ec9e99d5491 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -464,6 +464,7 @@ enum {
 enum ipvlan_mode {
 	IPVLAN_MODE_L2 = 0,
 	IPVLAN_MODE_L3,
+	IPVLAN_MODE_L3S,
 	IPVLAN_MODE_MAX
 };
 
-- 
cgit v1.2.3-71-gd317


From 408fbc22ef1efb00dd896acd00e9f7d9b641e047 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 18 Sep 2016 07:31:43 -0400
Subject: net sched ife action: Introduce skb tcindex metadata encap decap

Sample use case of how this is encoded:
user space via tuntap (or a connected VM/Machine/container)
encodes the tcindex TLV.

Sample use case of decoding:
IFE action decodes it and the skb->tc_index is then used to classify.
So something like this for encoded ICMP packets:

.. first decode then reclassify... skb->tcindex will be set
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xbeef \
u32 match u32 0 0 flowid 1:1 \
action ife decode reclassify

...next match the decode icmp packet...
sudo $TC filter add dev $ETH parent ffff: prio 4 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action continue

... last classify it using the tcindex classifier and do someaction..
sudo $TC filter add dev $ETH parent ffff: prio 5 protocol ip \
handle 0x11 tcindex classid 1:1 \
action blah..

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_ife.h |  3 +-
 net/sched/Kconfig                  |  5 +++
 net/sched/Makefile                 |  1 +
 net/sched/act_meta_skbtcindex.c    | 79 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 net/sched/act_meta_skbtcindex.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
index 4ece02a77b9a..cd18360eca24 100644
--- a/include/uapi/linux/tc_act/tc_ife.h
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -32,8 +32,9 @@ enum {
 #define IFE_META_HASHID 2
 #define	IFE_META_PRIO 3
 #define	IFE_META_QMAP 4
+#define	IFE_META_TCINDEX 5
 /*Can be overridden at runtime by module option*/
-#define	__IFE_META_MAX 5
+#define	__IFE_META_MAX 6
 #define IFE_META_MAX (__IFE_META_MAX - 1)
 
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7795d5a3f79a..87956a768d1b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -793,6 +793,11 @@ config NET_IFE_SKBPRIO
         depends on NET_ACT_IFE
         ---help---
 
+config NET_IFE_SKBTCINDEX
+        tristate "Support to encoding decoding skb tcindex on IFE action"
+        depends on NET_ACT_IFE
+        ---help---
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 148ae0d5ac2c..4bdda3634e0b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NET_ACT_SKBMOD)	+= act_skbmod.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
+obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
 obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c
new file mode 100644
index 000000000000..3b35774ce890
--- /dev/null
+++ b/net/sched/act_meta_skbtcindex.c
@@ -0,0 +1,79 @@
+/*
+ * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2016)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/rtnetlink.h>
+
+static int skbtcindex_encode(struct sk_buff *skb, void *skbdata,
+			     struct tcf_meta_info *e)
+{
+	u32 ifetc_index = skb->tc_index;
+
+	return ife_encode_meta_u16(ifetc_index, skbdata, e);
+}
+
+static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len)
+{
+	u16 ifetc_index = *(u16 *)data;
+
+	skb->tc_index = ntohs(ifetc_index);
+	return 0;
+}
+
+static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+	return ife_check_meta_u16(skb->tc_index, e);
+}
+
+static struct tcf_meta_ops ife_skbtcindex_ops = {
+	.metaid = IFE_META_TCINDEX,
+	.metatype = NLA_U16,
+	.name = "tc_index",
+	.synopsis = "skb tc_index 16 bit metadata",
+	.check_presence = skbtcindex_check,
+	.encode = skbtcindex_encode,
+	.decode = skbtcindex_decode,
+	.get = ife_get_meta_u16,
+	.alloc = ife_alloc_meta_u16,
+	.release = ife_release_meta_gen,
+	.validate = ife_validate_meta_u16,
+	.owner = THIS_MODULE,
+};
+
+static int __init ifetc_index_init_module(void)
+{
+	return register_ife_op(&ife_skbtcindex_ops);
+}
+
+static void __exit ifetc_index_cleanup_module(void)
+{
+	unregister_ife_op(&ife_skbtcindex_ops);
+}
+
+module_init(ifetc_index_init_module);
+module_exit(ifetc_index_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2016)");
+MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX);
-- 
cgit v1.2.3-71-gd317


From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 20 Sep 2016 00:26:13 +0200
Subject: bpf: direct packet write and access for helpers for clsact progs

This work implements direct packet access for helpers and direct packet
write in a similar fashion as already available for XDP types via commits
4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and
6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a
complementary feature to the already available direct packet read for tc
(cls/act) programs.

For enabling this, we need to introduce two helpers, bpf_skb_pull_data()
and bpf_csum_update(). The first is generally needed for both, read and
write, because they would otherwise only be limited to the current linear
skb head. Usually, when the data_end test fails, programs just bail out,
or, in the direct read case, use bpf_skb_load_bytes() as an alternative
to overcome this limitation. If such data sits in non-linear parts, we
can just pull them in once with the new helper, retest and eventually
access them.

At the same time, this also makes sure the skb is uncloned, which is, of
course, a necessary condition for direct write. As this needs to be an
invariant for the write part only, the verifier detects writes and adds
a prologue that is calling bpf_skb_pull_data() to effectively unclone the
skb from the very beginning in case it is indeed cloned. The heuristic
makes use of a similar trick that was done in 233577a22089 ("net: filter:
constify detection of pkt_type_offset"). This comes at zero cost for other
programs that do not use the direct write feature. Should a program use
this feature only sparsely and has read access for the most parts with,
for example, drop return codes, then such write action can be delegated
to a tail called program for mitigating this cost of potential uncloning
to a late point in time where it would have been paid similarly with the
bpf_skb_store_bytes() as well. Advantage of direct write is that the
writes are inlined whereas the helper cannot make any length assumptions
and thus needs to generate a call to memcpy() also for small sizes, as well
as cost of helper call itself with sanity checks are avoided. Plus, when
direct read is already used, we don't need to cache or perform rechecks
on the data boundaries (due to verifier invalidating previous checks for
helpers that change skb->data), so more complex programs using rewrites
can benefit from switching to direct read plus write.

For direct packet access to helpers, we save the otherwise needed copy into
a temp struct sitting on stack memory when use-case allows. Both facilities
are enabled via may_access_direct_pkt_data() in verifier. For now, we limit
this to map helpers and csum_diff, and can successively enable other helpers
where we find it makes sense. Helpers that definitely cannot be allowed for
this are those part of bpf_helper_changes_skb_data() since they can change
underlying data, and those that write into memory as this could happen for
packet typed args when still cloned. bpf_csum_update() helper accommodates
for the fact that we need to fixup checksum_complete when using direct write
instead of bpf_skb_store_bytes(), meaning the programs can use available
helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(),
csum_block_add(), csum_block_sub() equivalents in eBPF together with the
new helper. A usage example will be provided for iproute2's examples/bpf/
directory.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   4 +-
 include/linux/skbuff.h   |  14 ++++-
 include/uapi/linux/bpf.h |  21 ++++++++
 kernel/bpf/helpers.c     |   3 ++
 kernel/bpf/verifier.c    |  54 ++++++++++++++-----
 net/core/filter.c        | 134 +++++++++++++++++++++++++++++++++++++++++------
 6 files changed, 196 insertions(+), 34 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a904f63f8c1..5691fdc83819 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_return_type {
 struct bpf_func_proto {
 	u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 	bool gpl_only;
+	bool pkt_access;
 	enum bpf_return_type ret_type;
 	enum bpf_arg_type arg1_type;
 	enum bpf_arg_type arg2_type;
@@ -151,7 +152,8 @@ struct bpf_verifier_ops {
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
 				enum bpf_reg_type *reg_type);
-
+	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
+			    const struct bpf_prog *prog);
 	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
 				  int src_reg, int ctx_off,
 				  struct bpf_insn *insn, struct bpf_prog *prog);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c5662f05bda..c6dab3f7457c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -676,13 +676,23 @@ struct sk_buff {
 	 */
 	kmemcheck_bitfield_begin(flags1);
 	__u16			queue_mapping;
+
+/* if you move cloned around you also must adapt those constants */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define CLONED_MASK	(1 << 7)
+#else
+#define CLONED_MASK	1
+#endif
+#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)
+
+	__u8			__cloned_offset[0];
 	__u8			cloned:1,
 				nohdr:1,
 				fclone:2,
 				peeked:1,
 				head_frag:1,
-				xmit_more:1;
-	/* one bit hole */
+				xmit_more:1,
+				__unused:1; /* one bit hole */
 	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f896dfac4ac0..e07432b9f8b8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -398,6 +398,27 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_change_tail,
 
+	/**
+	 * bpf_skb_pull_data(skb, len)
+	 * The helper will pull in non-linear data in case the
+	 * skb is non-linear and not all of len are part of the
+	 * linear section. Only needed for read/write with direct
+	 * packet access.
+	 * @skb: pointer to skb
+	 * @len: len to make read/writeable
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_pull_data,
+
+	/**
+	 * bpf_csum_update(skb, csum)
+	 * Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
+	 * @skb: pointer to skb
+	 * @csum: csum to add
+	 * Return: csum on success or negative error
+	 */
+	BPF_FUNC_csum_update,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a5b8bf8cfcfd..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
 const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 	.func		= bpf_map_lookup_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
@@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 const struct bpf_func_proto bpf_map_update_elem_proto = {
 	.func		= bpf_map_update_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
@@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
 const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.func		= bpf_map_delete_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc138f34e38c..3a75ee3bdcd1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -196,6 +196,7 @@ struct verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
+	bool seen_direct_write;
 };
 
 #define BPF_COMPLEXITY_LIMIT_INSNS	65536
@@ -204,6 +205,7 @@ struct verifier_env {
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
+	bool pkt_access;
 	int regno;
 	int access_size;
 };
@@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 
 #define MAX_PACKET_OFF 0xffff
 
-static bool may_write_pkt_data(enum bpf_prog_type type)
+static bool may_access_direct_pkt_data(struct verifier_env *env,
+				       const struct bpf_call_arg_meta *meta)
 {
-	switch (type) {
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+		if (meta)
+			return meta->pkt_access;
+
+		env->seen_direct_write = true;
 		return true;
 	default:
 		return false;
@@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) {
-		verbose("helper access to the packet is not allowed for clsact\n");
+	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
 
@@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id)
 	changes_data = bpf_helper_changes_skb_data(fn->func);
 
 	memset(&meta, 0, sizeof(meta));
+	meta.pkt_access = fn->pkt_access;
 
 	/* We only support one arg being in raw mode at the moment, which
 	 * is sufficient for the helper functions we have right now.
@@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
  */
 static int convert_ctx_accesses(struct verifier_env *env)
 {
-	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
-	struct bpf_insn insn_buf[16];
+	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i;
+	int i, insn_cnt, cnt;
 
-	if (!env->prog->aux->ops->convert_ctx_access)
+	if (ops->gen_prologue) {
+		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+					env->prog);
+		if (cnt >= ARRAY_SIZE(insn_buf)) {
+			verbose("bpf verifier is misconfigured\n");
+			return -EINVAL;
+		} else if (cnt) {
+			new_prog = bpf_patch_insn_single(env->prog, 0,
+							 insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+			env->prog = new_prog;
+		}
+	}
+
+	if (!ops->convert_ctx_access)
 		return 0;
 
+	insn_cnt = env->prog->len;
+	insn = env->prog->insnsi;
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		u32 insn_delta, cnt;
+		u32 insn_delta;
 
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
@@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			continue;
 		}
 
-		cnt = env->prog->aux->ops->
-			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					   insn->off, insn_buf, env->prog);
+		cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+					      insn->off, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index 298b146b47e7..0920c2ac1d00 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 	return err;
 }
 
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+	return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
 static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
 {
 	if (skb_at_tc_ingress(skb))
@@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_STACK_SIZE,
 };
 
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+	/* Idea is the following: should the needed direct read/write
+	 * test fail during runtime, we can pull in more data and redo
+	 * again, since implicitly, we invalidate previous checks here.
+	 *
+	 * Or, since we know how much we need to make read/writeable,
+	 * this can be done once at the program beginning for direct
+	 * access case. By this we overcome limitations of only current
+	 * headroom being accessible.
+	 */
+	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+	.func		= bpf_skb_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
 	   u64, from, u64, to, u64, flags)
 {
@@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
 static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.func		= bpf_csum_diff,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_STACK,
 	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+	/* The interface is to be used in combination with bpf_csum_diff()
+	 * for direct packet writes. csum rotation for alignment as well
+	 * as emulating csum_sub() can be done from the eBPF program.
+	 */
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		return (skb->csum = csum_add(skb->csum, csum));
+
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+	.func		= bpf_csum_update,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
 {
 	return dev_forward_skb(dev, skb);
@@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 {
 	struct net_device *dev;
+	struct sk_buff *clone;
+	int ret;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return -EINVAL;
@@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 	if (unlikely(!dev))
 		return -EINVAL;
 
-	skb = skb_clone(skb, GFP_ATOMIC);
-	if (unlikely(!skb))
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (unlikely(!clone))
 		return -ENOMEM;
 
-	bpf_push_mac_rcsum(skb);
+	/* For direct write, we need to keep the invariant that the skbs
+	 * we're dealing with need to be uncloned. Should uncloning fail
+	 * here, we need to free the just generated clone to unclone once
+	 * again.
+	 */
+	ret = bpf_try_make_head_writable(skb);
+	if (unlikely(ret)) {
+		kfree_skb(clone);
+		return -ENOMEM;
+	}
+
+	bpf_push_mac_rcsum(clone);
 
 	return flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+	       __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
 }
 
 static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 
 bool bpf_helper_changes_skb_data(void *func)
 {
-	if (func == bpf_skb_vlan_push)
-		return true;
-	if (func == bpf_skb_vlan_pop)
-		return true;
-	if (func == bpf_skb_store_bytes)
-		return true;
-	if (func == bpf_skb_change_proto)
-		return true;
-	if (func == bpf_skb_change_tail)
-		return true;
-	if (func == bpf_l3_csum_replace)
-		return true;
-	if (func == bpf_l4_csum_replace)
+	if (func == bpf_skb_vlan_push ||
+	    func == bpf_skb_vlan_pop ||
+	    func == bpf_skb_store_bytes ||
+	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_tail ||
+	    func == bpf_skb_pull_data ||
+	    func == bpf_l3_csum_replace ||
+	    func == bpf_l4_csum_replace)
 		return true;
 
 	return false;
@@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
 	case BPF_FUNC_csum_diff:
 		return &bpf_csum_diff_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
@@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+			       const struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	if (!direct_write)
+		return 0;
+
+	/* if (!skb->cloned)
+	 *       goto start;
+	 *
+	 * (Fast-path, otherwise approximation that we might be
+	 *  a clone, do the rest in helper.)
+	 */
+	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+	/* ret = bpf_skb_pull_data(skb, 0); */
+	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			       BPF_FUNC_skb_pull_data);
+	/* if (!ret)
+	 *      goto restore;
+	 * return TC_ACT_SHOT;
+	 */
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+	*insn++ = BPF_EXIT_INSN();
+
+	/* restore: */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+	/* start: */
+	*insn++ = prog->insnsi[0];
+
+	return insn - insn_buf;
+}
+
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
 				       enum bpf_reg_type *reg_type)
@@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
 };
 
 static const struct bpf_verifier_ops xdp_ops = {
-- 
cgit v1.2.3-71-gd317


From 77879147a3481babffd7e368d977ab682545a6bd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Sep 2016 23:39:11 -0400
Subject: net_sched: sch_fq: add low_rate_threshold parameter

This commit adds to the fq module a low_rate_threshold parameter to
insert a delay after all packets if the socket requests a pacing rate
below the threshold.

This helps achieve more precise control of the sending rate with
low-rate paths, especially policers. The basic issue is that if a
congestion control module detects a policer at a certain rate, it may
want fq to be able to shape to that policed rate. That way the sender
can avoid policer drops by having the packets arrive at the policer at
or just under the policed rate.

The default threshold of 550Kbps was chosen analytically so that for
policers or links at 500Kbps or 512Kbps fq would very likely invoke
this mechanism, even if the pacing rate was briefly slightly above the
available bandwidth. This value was then empirically validated with
two years of production testing on YouTube video servers.

Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 ++
 net/sched/sch_fq.c             | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 2382eed50278..f8e39dbaa781 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -792,6 +792,8 @@ enum {
 
 	TCA_FQ_ORPHAN_MASK,	/* mask applied to orphaned skb hashes */
 
+	TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */
+
 	__TCA_FQ_MAX
 };
 
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index dc52cc10d6ed..5dd929cc1423 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
 	u32		orphan_mask;	/* mask for orphaned skb */
+	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
 	u8		rate_enable;
 	u8		fq_trees_log;
@@ -433,7 +434,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
-	u32 rate;
+	u32 rate, plen;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
@@ -482,7 +483,7 @@ begin:
 	prefetch(&skb->end);
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (f->credit > 0 || !q->rate_enable)
+	if (!q->rate_enable)
 		goto out;
 
 	/* Do not pace locally generated ack packets */
@@ -493,8 +494,15 @@ begin:
 	if (skb->sk)
 		rate = min(skb->sk->sk_pacing_rate, rate);
 
+	if (rate <= q->low_rate_threshold) {
+		f->credit = 0;
+		plen = qdisc_pkt_len(skb);
+	} else {
+		plen = max(qdisc_pkt_len(skb), q->quantum);
+		if (f->credit > 0)
+			goto out;
+	}
 	if (rate != ~0U) {
-		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
 		u64 len = (u64)plen * NSEC_PER_SEC;
 
 		if (likely(rate))
@@ -662,6 +670,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
+	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt)
@@ -716,6 +725,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_FQ_FLOW_MAX_RATE])
 		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
 
+	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
+		q->low_rate_threshold =
+			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
+
 	if (tb[TCA_FQ_RATE_ENABLE]) {
 		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
 
@@ -781,6 +794,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
+	q->low_rate_threshold	= 550000 / 8;
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
@@ -811,6 +825,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
+			q->low_rate_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3-71-gd317


From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 19 Sep 2016 23:39:16 -0400
Subject: tcp: export data delivery rate

This commit export two new fields in struct tcp_info:

  tcpi_delivery_rate: The most recent goodput, as measured by
    tcp_rate_gen(). If the socket is limited by the sending
    application (e.g., no data to send), it reports the highest
    measurement instead of the most recent. The unit is bytes per
    second (like other rate fields in tcp_info).

  tcpi_delivery_rate_app_limited: A boolean indicating if the goodput
    was measured when the socket's throughput was limited by the
    sending application.

This delivery rate information can be useful for applications that
want to know the current throughput the TCP connection is seeing,
e.g. adaptive bitrate video streaming. It can also be very useful for
debugging or troubleshooting.

Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  5 ++++-
 include/uapi/linux/tcp.h |  3 +++
 net/ipv4/tcp.c           | 11 ++++++++++-
 net/ipv4/tcp_rate.c      | 12 +++++++++++-
 4 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fdcd00ffcb66..a17ae7b85218 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -213,7 +213,8 @@ struct tcp_sock {
 		u8 reord;    /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
-	u8	unused;
+	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+		unused:7;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
@@ -271,6 +272,8 @@ struct tcp_sock {
 	u32	app_limited;	/* limited until "delivered" reaches this val */
 	struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
 	struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
+	u32	rate_delivered;    /* saved rate sample: packets delivered */
+	u32	rate_interval_us;  /* saved rate sample: time elapsed */
 
  	u32	rcv_wnd;	/* Current receiver window		*/
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 482898fc433a..73ac0db487f8 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -167,6 +167,7 @@ struct tcp_info {
 	__u8	tcpi_backoff;
 	__u8	tcpi_options;
 	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
+	__u8	tcpi_delivery_rate_app_limited:1;
 
 	__u32	tcpi_rto;
 	__u32	tcpi_ato;
@@ -211,6 +212,8 @@ struct tcp_info {
 	__u32	tcpi_min_rtt;
 	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
 	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
+
+	__u64   tcpi_delivery_rate;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2250f891f931..f253e5019d22 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2712,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 now = tcp_time_stamp;
+	u32 now = tcp_time_stamp, intv;
 	unsigned int start;
 	int notsent_bytes;
 	u64 rate64;
@@ -2802,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
 	info->tcpi_data_segs_in = tp->data_segs_in;
 	info->tcpi_data_segs_out = tp->data_segs_out;
+
+	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
+	rate = READ_ONCE(tp->rate_delivered);
+	intv = READ_ONCE(tp->rate_interval_us);
+	if (rate && intv) {
+		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
+		do_div(rate64, intv);
+		put_unaligned(rate64, &info->tcpi_delivery_rate);
+	}
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 52ff84be59ab..9be1581a5a08 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -149,12 +149,22 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 	 * for connections suffer heavy or prolonged losses.
 	 */
 	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
-		rs->interval_us = -1;
 		if (!rs->is_retrans)
 			pr_debug("tcp rate: %ld %d %u %u %u\n",
 				 rs->interval_us, rs->delivered,
 				 inet_csk(sk)->icsk_ca_state,
 				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+		rs->interval_us = -1;
+		return;
+	}
+
+	/* Record the last non-app-limited or the highest app-limited bw */
+	if (!rs->is_app_limited ||
+	    ((u64)rs->delivered * tp->rate_interval_us >=
+	     (u64)tp->rate_delivered * rs->interval_us)) {
+		tp->rate_delivered = rs->delivered;
+		tp->rate_interval_us = rs->interval_us;
+		tp->rate_app_limited = rs->is_app_limited;
 	}
 }
 
-- 
cgit v1.2.3-71-gd317


From 0f8782ea14974ce992618b55f0c041ef43ed0b78 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Mon, 19 Sep 2016 23:39:23 -0400
Subject: tcp_bbr: add BBR congestion control

This commit implements a new TCP congestion control algorithm: BBR
(Bottleneck Bandwidth and RTT). A detailed description of BBR will be
published in ACM Queue, Vol. 14 No. 5, September-October 2016, as
"BBR: Congestion-Based Congestion Control".

BBR has significantly increased throughput and reduced latency for
connections on Google's internal backbone networks and google.com and
YouTube Web servers.

BBR requires only changes on the sender side, not in the network or
the receiver side. Thus it can be incrementally deployed on today's
Internet, or in datacenters.

The Internet has predominantly used loss-based congestion control
(largely Reno or CUBIC) since the 1980s, relying on packet loss as the
signal to slow down. While this worked well for many years, loss-based
congestion control is unfortunately out-dated in today's networks. On
today's Internet, loss-based congestion control causes the infamous
bufferbloat problem, often causing seconds of needless queuing delay,
since it fills the bloated buffers in many last-mile links. On today's
high-speed long-haul links using commodity switches with shallow
buffers, loss-based congestion control has abysmal throughput because
it over-reacts to losses caused by transient traffic bursts.

In 1981 Kleinrock and Gale showed that the optimal operating point for
a network maximizes delivered bandwidth while minimizing delay and
loss, not only for single connections but for the network as a
whole. Finding that optimal operating point has been elusive, since
any single network measurement is ambiguous: network measurements are
the result of both bandwidth and propagation delay, and those two
cannot be measured simultaneously.

While it is impossible to disambiguate any single bandwidth or RTT
measurement, a connection's behavior over time tells a clearer
story. BBR uses a measurement strategy designed to resolve this
ambiguity. It combines these measurements with a robust servo loop
using recent control systems advances to implement a distributed
congestion control algorithm that reacts to actual congestion, not
packet loss or transient queue delay, and is designed to converge with
high probability to a point near the optimal operating point.

In a nutshell, BBR creates an explicit model of the network pipe by
sequentially probing the bottleneck bandwidth and RTT. On the arrival
of each ACK, BBR derives the current delivery rate of the last round
trip, and feeds it through a windowed max-filter to estimate the
bottleneck bandwidth. Conversely it uses a windowed min-filter to
estimate the round trip propagation delay. The max-filtered bandwidth
and min-filtered RTT estimates form BBR's model of the network pipe.

Using its model, BBR sets control parameters to govern sending
behavior. The primary control is the pacing rate: BBR applies a gain
multiplier to transmit faster or slower than the observed bottleneck
bandwidth. The conventional congestion window (cwnd) is now the
secondary control; the cwnd is set to a small multiple of the
estimated BDP (bandwidth-delay product) in order to allow full
utilization and bandwidth probing while bounding the potential amount
of queue at the bottleneck.

When a BBR connection starts, it enters STARTUP mode and applies a
high gain to perform an exponential search to quickly probe the
bottleneck bandwidth (doubling its sending rate each round trip, like
slow start). However, instead of continuing until it fills up the
buffer (i.e. a loss), or until delay or ACK spacing reaches some
threshold (like Hystart), it uses its model of the pipe to estimate
when that pipe is full: it estimates the pipe is full when it notices
the estimated bandwidth has stopped growing. At that point it exits
STARTUP and enters DRAIN mode, where it reduces its pacing rate to
drain the queue it estimates it has created.

Then BBR enters steady state. In steady state, PROBE_BW mode cycles
between first pacing faster to probe for more bandwidth, then pacing
slower to drain any queue that created if no more bandwidth was
available, and then cruising at the estimated bandwidth to utilize the
pipe without creating excess queue. Occasionally, on an as-needed
basis, it sends significantly slower to probe for RTT (PROBE_RTT
mode).

BBR has been fully deployed on Google's wide-area backbone networks
and we're experimenting with BBR on Google.com and YouTube on a global
scale.  Replacing CUBIC with BBR has resulted in significant
improvements in network latency and application (RPC, browser, and
video) metrics. For more details please refer to our upcoming ACM
Queue publication.

Example performance results, to illustrate the difference between BBR
and CUBIC:

Resilience to random loss (e.g. from shallow buffers):
  Consider a netperf TCP_STREAM test lasting 30 secs on an emulated
  path with a 10Gbps bottleneck, 100ms RTT, and 1% packet loss
  rate. CUBIC gets 3.27 Mbps, and BBR gets 9150 Mbps (2798x higher).

Low latency with the bloated buffers common in today's last-mile links:
  Consider a netperf TCP_STREAM test lasting 120 secs on an emulated
  path with a 10Mbps bottleneck, 40ms RTT, and 1000-packet bottleneck
  buffer. Both fully utilize the bottleneck bandwidth, but BBR
  achieves this with a median RTT 25x lower (43 ms instead of 1.09
  secs).

Our long-term goal is to improve the congestion control algorithms
used on the Internet. We are hopeful that BBR can help advance the
efforts toward this goal, and motivate the community to do further
research.

Test results, performance evaluations, feedback, and BBR-related
discussions are very welcome in the public e-mail list for BBR:

  https://groups.google.com/forum/#!forum/bbr-dev

NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing
enabled, since pacing is integral to the BBR design and
implementation. BBR without pacing would not function properly, and
may incur unnecessary high packet loss rates.

Signed-off-by: Van Jacobson <vanj@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h |  13 +
 net/ipv4/Kconfig               |  18 +
 net/ipv4/Makefile              |   1 +
 net/ipv4/tcp_bbr.c             | 896 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 928 insertions(+)
 create mode 100644 net/ipv4/tcp_bbr.c

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index b5c366f87b3e..509cd961068d 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -124,6 +124,7 @@ enum {
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
 	INET_DIAG_MARK,
+	INET_DIAG_BBRINFO,
 	__INET_DIAG_MAX,
 };
 
@@ -157,8 +158,20 @@ struct tcp_dctcp_info {
 	__u32	dctcp_ab_tot;
 };
 
+/* INET_DIAG_BBRINFO */
+
+struct tcp_bbr_info {
+	/* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
+	__u32	bbr_bw_lo;		/* lower 32 bits of bw */
+	__u32	bbr_bw_hi;		/* upper 32 bits of bw */
+	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
+	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
+	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+};
+
 union tcp_cc_info {
 	struct tcpvegas_info	vegas;
 	struct tcp_dctcp_info	dctcp;
+	struct tcp_bbr_info	bbr;
 };
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
 	  D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 	  delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
 
+config TCP_CONG_BBR
+	tristate "BBR TCP"
+	default n
+	---help---
+
+	BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+	maximize network utilization and minimize queues. It builds an explicit
+	model of the the bottleneck delivery rate and path round-trip
+	propagation delay. It tolerates packet loss and delay unrelated to
+	congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+	modem links. It can coexist with flows that use loss-based congestion
+	control, and can operate with shallow buffers, deep buffers,
+	bufferbloat, policers, or AQM schemes that do not provide a delay
+	signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
 	config DEFAULT_CDG
 		bool "CDG" if TCP_CONG_CDG=y
 
+	config DEFAULT_BBR
+		bool "BBR" if TCP_CONG_BBR=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9cfff1a0bf71..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
new file mode 100644
index 000000000000..0ea66c2c9344
--- /dev/null
+++ b/net/ipv4/tcp_bbr.c
@@ -0,0 +1,896 @@
+/* Bottleneck Bandwidth and RTT (BBR) congestion control
+ *
+ * BBR congestion control computes the sending rate based on the delivery
+ * rate (throughput) estimated from ACKs. In a nutshell:
+ *
+ *   On each ACK, update our model of the network path:
+ *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
+ *      min_rtt = windowed_min(rtt, 10 seconds)
+ *   pacing_rate = pacing_gain * bottleneck_bandwidth
+ *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
+ *
+ * The core algorithm does not react directly to packet losses or delays,
+ * although BBR may adjust the size of next send per ACK when loss is
+ * observed, or adjust the sending rate if it estimates there is a
+ * traffic policer, in order to keep the drop rate reasonable.
+ *
+ * BBR is described in detail in:
+ *   "BBR: Congestion-Based Congestion Control",
+ *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
+ *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
+ *
+ * There is a public e-mail list for discussing BBR development and testing:
+ *   https://groups.google.com/forum/#!forum/bbr-dev
+ *
+ * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
+ * since pacing is integral to the BBR design and implementation.
+ * BBR without pacing would not function properly, and may incur unnecessary
+ * high packet loss rates.
+ */
+#include <linux/module.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/random.h>
+#include <linux/win_minmax.h>
+
+/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
+ * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
+ * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
+ * Since the minimum window is >=4 packets, the lower bound isn't
+ * an issue. The upper bound isn't an issue with existing technologies.
+ */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
+#define BBR_UNIT (1 << BBR_SCALE)
+
+/* BBR has the following modes for deciding how fast to send: */
+enum bbr_mode {
+	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
+	BBR_DRAIN,	/* drain any queue created during startup */
+	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
+	BBR_PROBE_RTT,	/* cut cwnd to min to probe min_rtt */
+};
+
+/* BBR congestion control block */
+struct bbr {
+	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
+	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
+	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
+	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
+	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
+	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
+	struct skb_mstamp cycle_mstamp;  /* time of this cycle phase start */
+	u32     mode:3,		     /* current bbr_mode in state machine */
+		prev_ca_state:3,     /* CA state on previous ACK */
+		packet_conservation:1,  /* use packet conservation? */
+		restore_cwnd:1,	     /* decided to revert cwnd to old value */
+		round_start:1,	     /* start of packet-timed tx->ack round? */
+		tso_segs_goal:7,     /* segments we want in each skb we send */
+		idle_restart:1,	     /* restarting after idle? */
+		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
+		unused:5,
+		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
+		lt_rtt_cnt:7,	     /* round trips in long-term interval */
+		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
+	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
+	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
+	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
+	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
+	u32	pacing_gain:10,	/* current gain for setting pacing rate */
+		cwnd_gain:10,	/* current gain for setting cwnd */
+		full_bw_cnt:3,	/* number of rounds without large bw gains */
+		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		unused_b:6;
+	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
+	u32	full_bw;	/* recent bw, to estimate if pipe is full */
+};
+
+#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
+
+/* Window length of bw filter (in rounds): */
+static const int bbr_bw_rtts = CYCLE_LEN + 2;
+/* Window length of min_rtt filter (in sec): */
+static const u32 bbr_min_rtt_win_sec = 10;
+/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
+static const u32 bbr_probe_rtt_mode_ms = 200;
+/* Skip TSO below the following bandwidth (bits/sec): */
+static const int bbr_min_tso_rate = 1200000;
+
+/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+ * that will allow a smoothly increasing pacing rate that will double each RTT
+ * and send the same number of packets per RTT that an un-paced, slow-starting
+ * Reno or CUBIC flow would:
+ */
+static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
+/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+ * the queue created in BBR_STARTUP in a single round:
+ */
+static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
+static const int bbr_cwnd_gain  = BBR_UNIT * 2;
+/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
+static const int bbr_pacing_gain[] = {
+	BBR_UNIT * 5 / 4,	/* probe for more available bw */
+	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
+	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+};
+/* Randomize the starting gain cycling phase over N phases: */
+static const u32 bbr_cycle_rand = 7;
+
+/* Try to keep at least this many packets in flight, if things go smoothly. For
+ * smooth functioning, a sliding window protocol ACKing every other packet
+ * needs at least 4 packets in flight:
+ */
+static const u32 bbr_cwnd_min_target = 4;
+
+/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* If bw has increased significantly (1.25x), there may be more bw available: */
+static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
+/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
+static const u32 bbr_full_bw_cnt = 3;
+
+/* "long-term" ("LT") bandwidth estimator parameters... */
+/* The minimum number of rounds in an LT bw sampling interval: */
+static const u32 bbr_lt_intvl_min_rtts = 4;
+/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
+static const u32 bbr_lt_loss_thresh = 50;
+/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
+static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
+/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
+static const u32 bbr_lt_bw_diff = 4000 / 8;
+/* If we estimate we're policed, use lt_bw for this many round trips: */
+static const u32 bbr_lt_bw_max_rtts = 48;
+
+/* Do we estimate that STARTUP filled the pipe? */
+static bool bbr_full_bw_reached(const struct sock *sk)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+}
+
+/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+static u32 bbr_max_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return minmax_get(&bbr->bw);
+}
+
+/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
+static u32 bbr_bw(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+}
+
+/* Return rate in bytes per second, optionally with a gain.
+ * The order here is chosen carefully to avoid overflow of u64. This should
+ * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
+ */
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+{
+	rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+	rate *= gain;
+	rate >>= BBR_SCALE;
+	rate *= USEC_PER_SEC;
+	return rate >> BW_SCALE;
+}
+
+/* Pace using current bw estimate and a gain factor. In order to help drive the
+ * network toward lower queues while maintaining high utilization and low
+ * latency, the average pacing rate aims to be slightly (~1%) lower than the
+ * estimated bandwidth. This is an important aspect of the design. In this
+ * implementation this slightly lower pacing rate is achieved implicitly by not
+ * including link-layer headers in the packet size used for the pacing rate.
+ */
+static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 rate = bw;
+
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
+		sk->sk_pacing_rate = rate;
+}
+
+/* Return count of segments we want in the skbs we send, or 0 for default. */
+static u32 bbr_tso_segs_goal(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return bbr->tso_segs_goal;
+}
+
+static void bbr_set_tso_segs_goal(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 min_segs;
+
+	min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+	bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
+				 0x7FU);
+}
+
+/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
+static void bbr_save_cwnd(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
+		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
+	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
+		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+}
+
+static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (event == CA_EVENT_TX_START && tp->app_limited) {
+		bbr->idle_restart = 1;
+		/* Avoid pointless buffer overflows: pace at est. bw if we don't
+		 * need more speed (we're restarting from idle and app-limited).
+		 */
+		if (bbr->mode == BBR_PROBE_BW)
+			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+	}
+}
+
+/* Find target cwnd. Right-size the cwnd based on min RTT and the
+ * estimated bottleneck bandwidth:
+ *
+ * cwnd = bw * min_rtt * gain = BDP * gain
+ *
+ * The key factor, gain, controls the amount of queue. While a small gain
+ * builds a smaller queue, it becomes more vulnerable to noise in RTT
+ * measurements (e.g., delayed ACKs or other ACK compression effects). This
+ * noise may cause BBR to under-estimate the rate.
+ *
+ * To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cwnd;
+	u64 w;
+
+	/* If we've never had a valid RTT sample, cap cwnd at the initial
+	 * default. This should only happen when the connection is not using TCP
+	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
+	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
+	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+	 */
+	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
+		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/
+
+	w = (u64)bw * bbr->min_rtt_us;
+
+	/* Apply a gain to the given value, then remove the BW_SCALE shift. */
+	cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	cwnd += 3 * bbr->tso_segs_goal;
+
+	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
+	cwnd = (cwnd + 1) & ~1U;
+
+	return cwnd;
+}
+
+/* An optimization in BBR to reduce losses: On the first round of recovery, we
+ * follow the packet conservation principle: send P packets per P packets acked.
+ * After that, we slow-start and send at most 2*P packets per P packets acked.
+ * After recovery finishes, or upon undo, we restore the cwnd we had when
+ * recovery started (capped by the target cwnd based on estimated BDP).
+ *
+ * TODO(ycheng/ncardwell): implement a rate-based approach.
+ */
+static bool bbr_set_cwnd_to_recover_or_restore(
+	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
+	u32 cwnd = tp->snd_cwnd;
+
+	/* An ACK for P pkts should release at most 2*P packets. We do this
+	 * in two steps. First, here we deduct the number of lost packets.
+	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
+	 */
+	if (rs->losses > 0)
+		cwnd = max_t(s32, cwnd - rs->losses, 1);
+
+	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
+		/* Starting 1st round of Recovery, so do packet conservation. */
+		bbr->packet_conservation = 1;
+		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
+		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
+		cwnd = tcp_packets_in_flight(tp) + acked;
+	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
+		/* Exiting loss recovery; restore cwnd saved before recovery. */
+		bbr->restore_cwnd = 1;
+		bbr->packet_conservation = 0;
+	}
+	bbr->prev_ca_state = state;
+
+	if (bbr->restore_cwnd) {
+		/* Restore cwnd after exiting loss recovery or PROBE_RTT. */
+		cwnd = max(cwnd, bbr->prior_cwnd);
+		bbr->restore_cwnd = 0;
+	}
+
+	if (bbr->packet_conservation) {
+		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
+		return true;	/* yes, using packet conservation */
+	}
+	*new_cwnd = cwnd;
+	return false;
+}
+
+/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
+ * has drawn us down below target), or snap down to target if we're above it.
+ */
+static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
+			 u32 acked, u32 bw, int gain)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cwnd = 0, target_cwnd = 0;
+
+	if (!acked)
+		return;
+
+	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
+		goto done;
+
+	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
+	target_cwnd = bbr_target_cwnd(sk, bw, gain);
+	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
+		cwnd = min(cwnd + acked, target_cwnd);
+	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
+		cwnd = cwnd + acked;
+	cwnd = max(cwnd, bbr_cwnd_min_target);
+
+done:
+	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
+	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
+		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+}
+
+/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
+static bool bbr_is_next_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_full_length =
+		skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
+		bbr->min_rtt_us;
+	u32 inflight, bw;
+
+	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
+	 * use the pipe without increasing the queue.
+	 */
+	if (bbr->pacing_gain == BBR_UNIT)
+		return is_full_length;		/* just use wall clock time */
+
+	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
+	bw = bbr_max_bw(sk);
+
+	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
+	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
+	 * small (e.g. on a LAN). We do not persist if packets are lost, since
+	 * a path with small buffers may not hold that much.
+	 */
+	if (bbr->pacing_gain > BBR_UNIT)
+		return is_full_length &&
+			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
+			 inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+
+	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
+	 * probing didn't find more bw. If inflight falls to match BDP then we
+	 * estimate queue is drained; persisting would underutilize the pipe.
+	 */
+	return is_full_length ||
+		inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+}
+
+static void bbr_advance_cycle_phase(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
+	bbr->cycle_mstamp = tp->delivered_mstamp;
+	bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+}
+
+/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
+static void bbr_update_cycle_phase(struct sock *sk,
+				   const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
+	    bbr_is_next_cycle_phase(sk, rs))
+		bbr_advance_cycle_phase(sk);
+}
+
+static void bbr_reset_startup_mode(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->mode = BBR_STARTUP;
+	bbr->pacing_gain = bbr_high_gain;
+	bbr->cwnd_gain	 = bbr_high_gain;
+}
+
+static void bbr_reset_probe_bw_mode(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->mode = BBR_PROBE_BW;
+	bbr->pacing_gain = BBR_UNIT;
+	bbr->cwnd_gain = bbr_cwnd_gain;
+	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
+}
+
+static void bbr_reset_mode(struct sock *sk)
+{
+	if (!bbr_full_bw_reached(sk))
+		bbr_reset_startup_mode(sk);
+	else
+		bbr_reset_probe_bw_mode(sk);
+}
+
+/* Start a new long-term sampling interval. */
+static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
+	bbr->lt_last_delivered = tp->delivered;
+	bbr->lt_last_lost = tp->lost;
+	bbr->lt_rtt_cnt = 0;
+}
+
+/* Completely reset long-term bandwidth sampling. */
+static void bbr_reset_lt_bw_sampling(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->lt_bw = 0;
+	bbr->lt_use_bw = 0;
+	bbr->lt_is_sampling = false;
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Long-term bw sampling interval is done. Estimate whether we're policed. */
+static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 diff;
+
+	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
+		/* Is new bw close to the lt_bw from the previous interval? */
+		diff = abs(bw - bbr->lt_bw);
+		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
+		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
+		     bbr_lt_bw_diff)) {
+			/* All criteria are met; estimate we're policed. */
+			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
+			bbr->lt_use_bw = 1;
+			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
+			bbr->lt_rtt_cnt = 0;
+			return;
+		}
+	}
+	bbr->lt_bw = bw;
+	bbr_reset_lt_bw_sampling_interval(sk);
+}
+
+/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
+ * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
+ * explicitly models their policed rate, to reduce unnecessary losses. We
+ * estimate that we're policed if we see 2 consecutive sampling intervals with
+ * consistent throughput and high packet loss. If we think we're being policed,
+ * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+ */
+static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 lost, delivered;
+	u64 bw;
+	s32 t;
+
+	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
+		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
+		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
+			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
+			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
+		}
+		return;
+	}
+
+	/* Wait for the first loss before sampling, to let the policer exhaust
+	 * its tokens and estimate the steady-state rate allowed by the policer.
+	 * Starting samples earlier includes bursts that over-estimate the bw.
+	 */
+	if (!bbr->lt_is_sampling) {
+		if (!rs->losses)
+			return;
+		bbr_reset_lt_bw_sampling_interval(sk);
+		bbr->lt_is_sampling = true;
+	}
+
+	/* To avoid underestimates, reset sampling if we run out of data. */
+	if (rs->is_app_limited) {
+		bbr_reset_lt_bw_sampling(sk);
+		return;
+	}
+
+	if (bbr->round_start)
+		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
+	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
+		return;		/* sampling interval needs to be longer */
+	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
+		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
+		return;
+	}
+
+	/* End sampling interval when a packet is lost, so we estimate the
+	 * policer tokens were exhausted. Stopping the sampling before the
+	 * tokens are exhausted under-estimates the policed rate.
+	 */
+	if (!rs->losses)
+		return;
+
+	/* Calculate packets lost and delivered in sampling interval. */
+	lost = tp->lost - bbr->lt_last_lost;
+	delivered = tp->delivered - bbr->lt_last_delivered;
+	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
+	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
+		return;
+
+	/* Find average delivery rate in this sampling interval. */
+	t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
+	if (t < 1)
+		return;		/* interval is less than one jiffy, so wait */
+	t = jiffies_to_usecs(t);
+	/* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
+	if (t < 1) {
+		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
+		return;
+	}
+	bw = (u64)delivered * BW_UNIT;
+	do_div(bw, t);
+	bbr_lt_bw_interval_done(sk, bw);
+}
+
+/* Estimate the bandwidth based on how fast packets are delivered */
+static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+
+	bbr->round_start = 0;
+	if (rs->delivered < 0 || rs->interval_us <= 0)
+		return; /* Not a valid observation */
+
+	/* See if we've reached the next RTT */
+	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		bbr->next_rtt_delivered = tp->delivered;
+		bbr->rtt_cnt++;
+		bbr->round_start = 1;
+		bbr->packet_conservation = 0;
+	}
+
+	bbr_lt_bw_sampling(sk, rs);
+
+	/* Divide delivered by the interval to find a (lower bound) bottleneck
+	 * bandwidth sample. Delivered is in packets and interval_us in uS and
+	 * ratio will be <<1 for most connections. So delivered is first scaled.
+	 */
+	bw = (u64)rs->delivered * BW_UNIT;
+	do_div(bw, rs->interval_us);
+
+	/* If this sample is application-limited, it is likely to have a very
+	 * low delivered count that represents application behavior rather than
+	 * the available network rate. Such a sample could drag down estimated
+	 * bw, causing needless slow-down. Thus, to continue to send at the
+	 * last measured network rate, we filter out app-limited samples unless
+	 * they describe the path bw at least as well as our bw model.
+	 *
+	 * So the goal during app-limited phase is to proceed with the best
+	 * network rate no matter how long. We automatically leave this
+	 * phase when app writes faster than the network can deliver :)
+	 */
+	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
+		/* Incorporate new sample into our max bw filter. */
+		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+	}
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bw_thresh;
+
+	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
+		return;
+
+	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
+	if (bbr_max_bw(sk) >= bw_thresh) {
+		bbr->full_bw = bbr_max_bw(sk);
+		bbr->full_bw_cnt = 0;
+		return;
+	}
+	++bbr->full_bw_cnt;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_DRAIN;	/* drain queue we created */
+		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
+		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
+	}	/* fall through to check if in-flight is already small: */
+	if (bbr->mode == BBR_DRAIN &&
+	    tcp_packets_in_flight(tcp_sk(sk)) <=
+	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
+}
+
+/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
+ * periodically drain the bottleneck queue, to converge to measure the true
+ * min_rtt (unloaded propagation delay). This allows the flows to keep queues
+ * small (reducing queuing delay and packet loss) and achieve fairness among
+ * BBR flows.
+ *
+ * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
+ * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
+ * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
+ * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
+ * re-enter the previous mode. BBR uses 200ms to approximately bound the
+ * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
+ *
+ * Note that flows need only pay 2% if they are busy sending over the last 10
+ * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
+ * natural silences or low-rate periods within 10 seconds where the rate is low
+ * enough for long enough to drain its queue in the bottleneck. We pick up
+ * these min RTT measurements opportunistically with our min_rtt filter. :-)
+ */
+static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool filter_expired;
+
+	/* Track min RTT seen in the min_rtt_win_sec filter window: */
+	filter_expired = after(tcp_time_stamp,
+			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+	if (rs->rtt_us >= 0 &&
+	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
+		bbr->min_rtt_us = rs->rtt_us;
+		bbr->min_rtt_stamp = tcp_time_stamp;
+	}
+
+	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
+		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
+		bbr->pacing_gain = BBR_UNIT;
+		bbr->cwnd_gain = BBR_UNIT;
+		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
+		bbr->probe_rtt_done_stamp = 0;
+	}
+
+	if (bbr->mode == BBR_PROBE_RTT) {
+		/* Ignore low rate samples during this mode. */
+		tp->app_limited =
+			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+		/* Maintain min packets in flight for max(200 ms, 1 round). */
+		if (!bbr->probe_rtt_done_stamp &&
+		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+			bbr->probe_rtt_done_stamp = tcp_time_stamp +
+				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+			bbr->probe_rtt_round_done = 0;
+			bbr->next_rtt_delivered = tp->delivered;
+		} else if (bbr->probe_rtt_done_stamp) {
+			if (bbr->round_start)
+				bbr->probe_rtt_round_done = 1;
+			if (bbr->probe_rtt_round_done &&
+			    after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
+				bbr->min_rtt_stamp = tcp_time_stamp;
+				bbr->restore_cwnd = 1;  /* snap to prior_cwnd */
+				bbr_reset_mode(sk);
+			}
+		}
+	}
+	bbr->idle_restart = 0;
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+{
+	bbr_update_bw(sk, rs);
+	bbr_update_cycle_phase(sk, rs);
+	bbr_check_full_bw_reached(sk, rs);
+	bbr_check_drain(sk, rs);
+	bbr_update_min_rtt(sk, rs);
+}
+
+static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bw;
+
+	bbr_update_model(sk, rs);
+
+	bw = bbr_bw(sk);
+	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+	bbr_set_tso_segs_goal(sk);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+}
+
+static void bbr_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+
+	bbr->prior_cwnd = 0;
+	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
+	bbr->rtt_cnt = 0;
+	bbr->next_rtt_delivered = 0;
+	bbr->prev_ca_state = TCP_CA_Open;
+	bbr->packet_conservation = 0;
+
+	bbr->probe_rtt_done_stamp = 0;
+	bbr->probe_rtt_round_done = 0;
+	bbr->min_rtt_us = tcp_min_rtt(tp);
+	bbr->min_rtt_stamp = tcp_time_stamp;
+
+	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
+
+	/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+	bw = (u64)tp->snd_cwnd * BW_UNIT;
+	do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
+	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
+	bbr_set_pacing_rate(sk, bw, bbr_high_gain);
+
+	bbr->restore_cwnd = 0;
+	bbr->round_start = 0;
+	bbr->idle_restart = 0;
+	bbr->full_bw = 0;
+	bbr->full_bw_cnt = 0;
+	bbr->cycle_mstamp.v64 = 0;
+	bbr->cycle_idx = 0;
+	bbr_reset_lt_bw_sampling(sk);
+	bbr_reset_startup_mode(sk);
+}
+
+static u32 bbr_sndbuf_expand(struct sock *sk)
+{
+	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+	return 3;
+}
+
+/* In theory BBR does not need to undo the cwnd since it does not
+ * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+ */
+static u32 bbr_undo_cwnd(struct sock *sk)
+{
+	return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+static u32 bbr_ssthresh(struct sock *sk)
+{
+	bbr_save_cwnd(sk);
+	return TCP_INFINITE_SSTHRESH;	 /* BBR does not use ssthresh */
+}
+
+static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
+			   union tcp_cc_info *info)
+{
+	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
+	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct bbr *bbr = inet_csk_ca(sk);
+		u64 bw = bbr_bw(sk);
+
+		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
+		memset(&info->bbr, 0, sizeof(info->bbr));
+		info->bbr.bbr_bw_lo		= (u32)bw;
+		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
+		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
+		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
+		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
+		*attr = INET_DIAG_BBRINFO;
+		return sizeof(info->bbr);
+	}
+	return 0;
+}
+
+static void bbr_set_state(struct sock *sk, u8 new_state)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Loss) {
+		struct rate_sample rs = { .losses = 1 };
+
+		bbr->prev_ca_state = TCP_CA_Loss;
+		bbr->full_bw = 0;
+		bbr->round_start = 1;	/* treat RTO like end of a round */
+		bbr_lt_bw_sampling(sk, &rs);
+	}
+}
+
+static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
+	.flags		= TCP_CONG_NON_RESTRICTED,
+	.name		= "bbr",
+	.owner		= THIS_MODULE,
+	.init		= bbr_init,
+	.cong_control	= bbr_main,
+	.sndbuf_expand	= bbr_sndbuf_expand,
+	.undo_cwnd	= bbr_undo_cwnd,
+	.cwnd_event	= bbr_cwnd_event,
+	.ssthresh	= bbr_ssthresh,
+	.tso_segs_goal	= bbr_tso_segs_goal,
+	.get_info	= bbr_get_info,
+	.set_state	= bbr_set_state,
+};
+
+static int __init bbr_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
+}
+
+static void __exit bbr_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
+}
+
+module_init(bbr_register);
+module_exit(bbr_unregister);
+
+MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
+MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
+MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
+MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
-- 
cgit v1.2.3-71-gd317


From 0d01d45f1b251448590c710baa32f722e43c62c7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 21 Sep 2016 11:43:54 +0100
Subject: net: cls_bpf: limit hardware offload by software-only flag

Add cls_bpf support for the TCA_CLS_FLAGS_SKIP_HW flag.
Unlike U32 and flower cls_bpf already has some netlink
flags defined.  Create a new attribute to be able to use
the same flag values as the above.

Unlike U32 and flower reject unknown flags.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        |  1 +
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_bpf.c          | 22 ++++++++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 41e8071dff87..57af9f3032ff 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -498,6 +498,7 @@ struct tc_cls_bpf_offload {
 	struct bpf_prog *prog;
 	const char *name;
 	bool exts_integrated;
+	u32 gen_flags;
 };
 
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 8915b61bbf83..8fd715f806a2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -396,6 +396,7 @@ enum {
 	TCA_BPF_FD,
 	TCA_BPF_NAME,
 	TCA_BPF_FLAGS,
+	TCA_BPF_FLAGS_GEN,
 	__TCA_BPF_MAX,
 };
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6523c5b4c0a5..ebf01f7c1470 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
 MODULE_DESCRIPTION("TC BPF based classifier");
 
 #define CLS_BPF_NAME_LEN	256
+#define CLS_BPF_SUPPORTED_GEN_FLAGS		\
+	TCA_CLS_FLAGS_SKIP_HW
 
 struct cls_bpf_head {
 	struct list_head plist;
@@ -40,6 +42,7 @@ struct cls_bpf_prog {
 	struct tcf_result res;
 	bool exts_integrated;
 	bool offloaded;
+	u32 gen_flags;
 	struct tcf_exts exts;
 	u32 handle;
 	union {
@@ -55,6 +58,7 @@ struct cls_bpf_prog {
 static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
 	[TCA_BPF_CLASSID]	= { .type = NLA_U32 },
 	[TCA_BPF_FLAGS]		= { .type = NLA_U32 },
+	[TCA_BPF_FLAGS_GEN]	= { .type = NLA_U32 },
 	[TCA_BPF_FD]		= { .type = NLA_U32 },
 	[TCA_BPF_NAME]		= { .type = NLA_NUL_STRING,
 				    .len = CLS_BPF_NAME_LEN },
@@ -154,6 +158,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	bpf_offload.prog = prog->filter;
 	bpf_offload.name = prog->bpf_name;
 	bpf_offload.exts_integrated = prog->exts_integrated;
+	bpf_offload.gen_flags = prog->gen_flags;
 
 	return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
 					     tp->protocol, &offload);
@@ -167,14 +172,14 @@ static void cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	enum tc_clsbpf_command cmd;
 
 	if (oldprog && oldprog->offloaded) {
-		if (tc_should_offload(dev, tp, 0)) {
+		if (tc_should_offload(dev, tp, prog->gen_flags)) {
 			cmd = TC_CLSBPF_REPLACE;
 		} else {
 			obj = oldprog;
 			cmd = TC_CLSBPF_DESTROY;
 		}
 	} else {
-		if (!tc_should_offload(dev, tp, 0))
+		if (!tc_should_offload(dev, tp, prog->gen_flags))
 			return;
 		cmd = TC_CLSBPF_ADD;
 	}
@@ -370,6 +375,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 {
 	bool is_bpf, is_ebpf, have_exts = false;
 	struct tcf_exts exts;
+	u32 gen_flags = 0;
 	int ret;
 
 	is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
@@ -394,8 +400,17 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
 
 		have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT;
 	}
+	if (tb[TCA_BPF_FLAGS_GEN]) {
+		gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]);
+		if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS ||
+		    !tc_flags_valid(gen_flags)) {
+			ret = -EINVAL;
+			goto errout;
+		}
+	}
 
 	prog->exts_integrated = have_exts;
+	prog->gen_flags = gen_flags;
 
 	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
 		       cls_bpf_prog_from_efd(tb, prog, tp);
@@ -568,6 +583,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
 	if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags))
 		goto nla_put_failure;
+	if (prog->gen_flags &&
+	    nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags))
+		goto nla_put_failure;
 
 	nla_nest_end(skb, nest);
 
-- 
cgit v1.2.3-71-gd317


From 45a497f2d149a4a8061c61518a79d59f1f3034b2 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Mon, 19 Sep 2016 19:11:10 +0300
Subject: net/sched: act_vlan: Introduce TCA_VLAN_ACT_MODIFY vlan action

TCA_VLAN_ACT_MODIFY allows one to change an existing tag.

It accepts same attributes as TCA_VLAN_ACT_PUSH (protocol, id,
priority).
If packet is vlan tagged, then the tag gets overwritten according to
user specified attributes.

For example, this allows user to replace a tag's vid while preserving
its priority bits (as opposed to "action vlan pop pipe action vlan push").

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_vlan.h |  1 +
 net/sched/act_vlan.c                | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h
index be72b6e3843b..bddb272b843f 100644
--- a/include/uapi/linux/tc_act/tc_vlan.h
+++ b/include/uapi/linux/tc_act/tc_vlan.h
@@ -16,6 +16,7 @@
 
 #define TCA_VLAN_ACT_POP	1
 #define TCA_VLAN_ACT_PUSH	2
+#define TCA_VLAN_ACT_MODIFY	3
 
 struct tc_vlan {
 	tc_gen;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 59a8d3150ae2..a95c00b119da 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -30,6 +30,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_vlan *v = to_vlan(a);
 	int action;
 	int err;
+	u16 tci;
 
 	spin_lock(&v->tcf_lock);
 	tcf_lastuse_update(&v->tcf_tm);
@@ -48,6 +49,30 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
 		if (err)
 			goto drop;
 		break;
+	case TCA_VLAN_ACT_MODIFY:
+		/* No-op if no vlan tag (either hw-accel or in-payload) */
+		if (!skb_vlan_tagged(skb))
+			goto unlock;
+		/* extract existing tag (and guarantee no hw-accel tag) */
+		if (skb_vlan_tag_present(skb)) {
+			tci = skb_vlan_tag_get(skb);
+			skb->vlan_tci = 0;
+		} else {
+			/* in-payload vlan tag, pop it */
+			err = __skb_vlan_pop(skb, &tci);
+			if (err)
+				goto drop;
+		}
+		/* replace the vid */
+		tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid;
+		/* replace prio bits, if tcfv_push_prio specified */
+		if (v->tcfv_push_prio) {
+			tci &= ~VLAN_PRIO_MASK;
+			tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT;
+		}
+		/* put updated tci as hwaccel tag */
+		__vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci);
+		break;
 	default:
 		BUG();
 	}
@@ -102,6 +127,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	case TCA_VLAN_ACT_POP:
 		break;
 	case TCA_VLAN_ACT_PUSH:
+	case TCA_VLAN_ACT_MODIFY:
 		if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
 			if (exists)
 				tcf_hash_release(*a, bind);
@@ -185,7 +211,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 
-	if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
+	if ((v->tcfv_action == TCA_VLAN_ACT_PUSH ||
+	     v->tcfv_action == TCA_VLAN_ACT_MODIFY) &&
 	    (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
 	     nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL,
 			  v->tcfv_push_proto) ||
-- 
cgit v1.2.3-71-gd317


From 2b03bf732488a3c2e920afe22c03b82cb8477e28 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Tue, 13 Sep 2016 13:49:53 +0200
Subject: netfilter: nft_numgen: add number generation offset

Add support of an offset value for incremental counter and random. With
this option the sysadmin is able to start the counter to a certain value
and then apply the generated number.

Example:

	meta mark set numgen inc mod 2 offset 100

This will generate marks with the serie 100, 101, 100, 101, ...

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_numgen.c               | 32 ++++++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index bc0eb6a1066d..bcfb892ff148 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1136,12 +1136,14 @@ enum nft_trace_types {
  * @NFTA_NG_DREG: destination register (NLA_U32)
  * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
+ * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
 	NFTA_NG_DREG,
 	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
+	NFTA_NG_OFFSET,
 	__NFTA_NG_MAX
 };
 #define NFTA_NG_MAX	(__NFTA_NG_MAX - 1)
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index f173ebec30a7..55bc5ab78d4a 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -23,6 +23,7 @@ struct nft_ng_inc {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	atomic_t		counter;
+	u32			offset;
 };
 
 static void nft_ng_inc_eval(const struct nft_expr *expr,
@@ -37,13 +38,14 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
-	regs->data[priv->dreg] = nval;
+	regs->data[priv->dreg] = nval + priv->offset;
 }
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_DREG]		= { .type = NLA_U32 },
 	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
+	[NFTA_NG_OFFSET]	= { .type = NLA_U32 },
 };
 
 static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -52,10 +54,16 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_inc *priv = nft_expr_priv(expr);
 
+	if (tb[NFTA_NG_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
 	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
 	if (priv->modulus == 0)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < priv->offset)
+		return -EOVERFLOW;
+
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
 	atomic_set(&priv->counter, 0);
 
@@ -64,7 +72,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
-		       u32 modulus, enum nft_ng_types type)
+		       u32 modulus, enum nft_ng_types type, u32 offset)
 {
 	if (nft_dump_register(skb, NFTA_NG_DREG, dreg))
 		goto nla_put_failure;
@@ -72,6 +80,8 @@ static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type)))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset)))
+		goto nla_put_failure;
 
 	return 0;
 
@@ -83,12 +93,14 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_inc *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL,
+			   priv->offset);
 }
 
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
+	u32			offset;
 };
 
 static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -97,9 +109,10 @@ static void nft_ng_random_eval(const struct nft_expr *expr,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+	u32 val;
 
-	regs->data[priv->dreg] = reciprocal_scale(prandom_u32_state(state),
-						  priv->modulus);
+	val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
+	regs->data[priv->dreg] = val + priv->offset;
 }
 
 static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -108,10 +121,16 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
 
+	if (tb[NFTA_NG_OFFSET])
+		priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
+
 	priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS]));
 	if (priv->modulus == 0)
 		return -ERANGE;
 
+	if (priv->offset + priv->modulus - 1 < priv->offset)
+		return -EOVERFLOW;
+
 	prandom_init_once(&nft_numgen_prandom_state);
 
 	priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
@@ -124,7 +143,8 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
 
-	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM);
+	return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM,
+			   priv->offset);
 }
 
 static struct nft_expr_type nft_ng_type;
-- 
cgit v1.2.3-71-gd317


From 8061bb54436c19fd16b7c734a69ff60bac26e3e9 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Wed, 14 Sep 2016 23:41:46 +0800
Subject: netfilter: nft_queue: add _SREG_QNUM attr to select the queue number

Currently, the user can specify the queue numbers by _QUEUE_NUM and
_QUEUE_TOTAL attributes, this is enough in most situations.

But acctually, it is not very flexible, for example:
  tcp dport 80 mapped to queue0
  tcp dport 81 mapped to queue1
  tcp dport 82 mapped to queue2
In order to do this thing, we must add 3 nft rules, and more
mapping meant more rules ...

So take one register to select the queue number, then we can add one
simple rule to mapping queues, maybe like this:
  queue num tcp dport map { 80:0, 81:1, 82:2 ... }

Florian Westphal also proposed wider usage scenarios:
  queue num jhash ip saddr . ip daddr mod ...
  queue num meta cpu ...
  queue num meta mark ...

The last point is how to load a queue number from sreg, although we can
use *(u16*)&regs->data[reg] to load the queue number, just like nat expr
to load its l4port do.

But we will cooperate with hash expr, meta cpu, meta mark expr and so on.
They all store the result to u32 type, so cast it to u16 pointer and
dereference it will generate wrong result in the big endian system.

So just keep it simple, we treat queue number as u32 type, although u16
type is already enough.

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/netfilter/nft_queue.c                | 102 +++++++++++++++++++++++++++----
 2 files changed, 92 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index bcfb892ff148..1cf41dd838b2 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -894,12 +894,14 @@ enum nft_log_attributes {
  * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16)
  * @NFTA_QUEUE_TOTAL: number of queues to load balance packets on (NLA_U16)
  * @NFTA_QUEUE_FLAGS: various flags (NLA_U16)
+ * @NFTA_QUEUE_SREG_QNUM: source register of queue number (NLA_U32: nft_registers)
  */
 enum nft_queue_attributes {
 	NFTA_QUEUE_UNSPEC,
 	NFTA_QUEUE_NUM,
 	NFTA_QUEUE_TOTAL,
 	NFTA_QUEUE_FLAGS,
+	NFTA_QUEUE_SREG_QNUM,
 	__NFTA_QUEUE_MAX
 };
 #define NFTA_QUEUE_MAX		(__NFTA_QUEUE_MAX - 1)
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index d16d59959ff6..393d359a1889 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -22,9 +22,10 @@
 static u32 jhash_initval __read_mostly;
 
 struct nft_queue {
-	u16	queuenum;
-	u16	queues_total;
-	u16	flags;
+	enum nft_registers	sreg_qnum:8;
+	u16			queuenum;
+	u16			queues_total;
+	u16			flags;
 };
 
 static void nft_queue_eval(const struct nft_expr *expr,
@@ -54,26 +55,39 @@ static void nft_queue_eval(const struct nft_expr *expr,
 	regs->verdict.code = ret;
 }
 
+static void nft_queue_sreg_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_queue *priv = nft_expr_priv(expr);
+	u32 queue, ret;
+
+	queue = regs->data[priv->sreg_qnum];
+
+	ret = NF_QUEUE_NR(queue);
+	if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+		ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+	regs->verdict.code = ret;
+}
+
 static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
 	[NFTA_QUEUE_NUM]	= { .type = NLA_U16 },
 	[NFTA_QUEUE_TOTAL]	= { .type = NLA_U16 },
 	[NFTA_QUEUE_FLAGS]	= { .type = NLA_U16 },
+	[NFTA_QUEUE_SREG_QNUM]	= { .type = NLA_U32 },
 };
 
 static int nft_queue_init(const struct nft_ctx *ctx,
-			   const struct nft_expr *expr,
-			   const struct nlattr * const tb[])
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
 {
 	struct nft_queue *priv = nft_expr_priv(expr);
 	u32 maxid;
 
-	if (tb[NFTA_QUEUE_NUM] == NULL)
-		return -EINVAL;
-
-	init_hashrandom(&jhash_initval);
 	priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
 
-	if (tb[NFTA_QUEUE_TOTAL] != NULL)
+	if (tb[NFTA_QUEUE_TOTAL])
 		priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
 	else
 		priv->queues_total = 1;
@@ -85,11 +99,34 @@ static int nft_queue_init(const struct nft_ctx *ctx,
 	if (maxid > U16_MAX)
 		return -ERANGE;
 
-	if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+	if (tb[NFTA_QUEUE_FLAGS]) {
+		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+		if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static int nft_queue_sreg_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_queue *priv = nft_expr_priv(expr);
+	int err;
+
+	priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
+	err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_QUEUE_FLAGS]) {
 		priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
 		if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
 			return -EINVAL;
+		if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT)
+			return -EOPNOTSUPP;
 	}
+
 	return 0;
 }
 
@@ -108,6 +145,21 @@ nla_put_failure:
 	return -1;
 }
 
+static int
+nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_queue *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) ||
+	    nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_queue_type;
 static const struct nft_expr_ops nft_queue_ops = {
 	.type		= &nft_queue_type,
@@ -117,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = {
 	.dump		= nft_queue_dump,
 };
 
+static const struct nft_expr_ops nft_queue_sreg_ops = {
+	.type		= &nft_queue_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+	.eval		= nft_queue_sreg_eval,
+	.init		= nft_queue_sreg_init,
+	.dump		= nft_queue_sreg_dump,
+};
+
+static const struct nft_expr_ops *
+nft_queue_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM])
+		return ERR_PTR(-EINVAL);
+
+	init_hashrandom(&jhash_initval);
+
+	if (tb[NFTA_QUEUE_NUM])
+		return &nft_queue_ops;
+
+	if (tb[NFTA_QUEUE_SREG_QNUM])
+		return &nft_queue_sreg_ops;
+
+	return ERR_PTR(-EINVAL);
+}
+
 static struct nft_expr_type nft_queue_type __read_mostly = {
 	.name		= "queue",
-	.ops		= &nft_queue_ops,
+	.select_ops	= &nft_queue_select_ops,
 	.policy		= nft_queue_policy,
 	.maxattr	= NFTA_QUEUE_MAX,
 	.owner		= THIS_MODULE,
-- 
cgit v1.2.3-71-gd317


From fefa569a9d4bc4b7758c0fddd75bb0382c95da77 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 22 Sep 2016 08:58:55 -0700
Subject: net_sched: sch_fq: account for schedule/timers drifts

It looks like the following patch can make FQ very precise, even in VM
or stressed hosts. It matters at high pacing rates.

We take into account the difference between the time that was programmed
when last packet was sent, and current time (a drift of tens of usecs is
often observed)

Add an EWMA of the unthrottle latency to help diagnostics.

This latency is the difference between current time and oldest packet in
delayed RB-tree. This accounts for the high resolution timer latency,
but can be different under stress, as fq_check_throttled() can be
opportunistically be called from a dequeue() called after an enqueue()
for a different flow.

Tested:
// Start a 10Gbit flow
$ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr &

Before patch :
$ sar -n DEV 10 5 | grep eth0 | grep Average
Average:         eth0  17106.04 756876.84   1102.75 1119049.02      0.00      0.00      0.52

After patch :
$ sar -n DEV 10 5 | grep eth0 | grep Average
Average:         eth0  17867.00 800245.90   1151.77 1183172.12      0.00      0.00      0.52

A new iproute2 tc can output the 'unthrottle latency' :

$ tc -s qd sh dev eth0 | grep latency
  0 gc, 0 highprio, 32490767 throttled, 2382 ns latency

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  2 +-
 net/sched/sch_fq.c             | 21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f8e39dbaa781..df7451d35131 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -811,7 +811,7 @@ struct tc_fq_qd_stats {
 	__u32	flows;
 	__u32	inactive_flows;
 	__u32	throttled_flows;
-	__u32	pad;
+	__u32	unthrottle_latency_ns;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 5dd929cc1423..18e752439f6f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
 
 	struct rb_root	delayed;	/* for rate limited flows */
 	u64		time_next_delayed_flow;
+	unsigned long	unthrottle_latency_ns;
 
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
@@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 {
+	unsigned long sample;
 	struct rb_node *p;
 
 	if (q->time_next_delayed_flow > now)
 		return;
 
+	/* Update unthrottle latency EWMA.
+	 * This is cheap and can help diagnosing timer/latency problems.
+	 */
+	sample = (unsigned long)(now - q->time_next_delayed_flow);
+	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+	q->unthrottle_latency_ns += sample >> 3;
+
 	q->time_next_delayed_flow = ~0ULL;
 	while ((p = rb_first(&q->delayed)) != NULL) {
 		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
@@ -515,7 +524,12 @@ begin:
 			len = NSEC_PER_SEC;
 			q->stat_pkts_too_long++;
 		}
-
+		/* Account for schedule/timers drifts.
+		 * f->time_next_packet was set when prior packet was sent,
+		 * and current time (@now) can be too late by tens of us.
+		 */
+		if (f->time_next_packet)
+			len -= min(len/2, now - f->time_next_packet);
 		f->time_next_packet = now + len;
 	}
 out:
@@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
 	q->flow_refill_delay	= msecs_to_jiffies(40);
 	q->flow_max_rate	= ~0U;
+	q->time_next_delayed_flow = ~0ULL;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
 	q->old_flows.first	= NULL;
@@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.flows		  = q->flows;
 	st.inactive_flows	  = q->inactive_flows;
 	st.throttled_flows	  = q->throttled_flows;
-	st.pad			  = 0;
-
+	st.unthrottle_latency_ns  = min_t(unsigned long,
+					  q->unthrottle_latency_ns, ~0U);
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
-- 
cgit v1.2.3-71-gd317


From 7a4b28c6cc9ffac50f791b99cc7e46106436e5d8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 23 Sep 2016 01:28:37 +0200
Subject: bpf: add helper to invalidate hash

Add a small helper that complements 36bbef52c7eb ("bpf: direct packet
write and access for helpers for clsact progs") for invalidating the
current skb->hash after mangling on headers via direct packet write.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  7 +++++++
 net/core/filter.c        | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e07432b9f8b8..f09c70b97eca 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -419,6 +419,13 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_csum_update,
 
+	/**
+	 * bpf_set_hash_invalid(skb)
+	 * Invalidate current skb>hash.
+	 * @skb: pointer to skb
+	 */
+	BPF_FUNC_set_hash_invalid,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index acf84fbfb043..00351cdf7d0c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1777,6 +1777,22 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+	/* After all direct packet write, this can be used once for
+	 * triggering a lazy recalc on next skb_get_hash() invocation.
+	 */
+	skb_clear_hash(skb);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+	.func		= bpf_set_hash_invalid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
 	   u16, vlan_tci)
 {
@@ -2534,6 +2550,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_route_realm_proto;
 	case BPF_FUNC_get_hash_recalc:
 		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
-- 
cgit v1.2.3-71-gd317


From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Thu, 22 Sep 2016 12:11:15 +0300
Subject: net: Update API for VF vlan protocol 802.1ad support

Introduce new rtnl UAPI that exposes a list of vlans per VF, giving
the ability for user-space application to specify it for the VF, as an
option to support 802.1ad.
We adjusted IP Link tool to support this option.

For future use cases, the new UAPI supports multiple vlans. For now we
limit the list size to a single vlan in kernel.
Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward
compatibility with older versions of IP Link tool.

Add a vlan protocol parameter to the ndo_set_vf_vlan callback.
We kept 802.1Q as the drivers' default vlan protocol.
Suitable ip link tool command examples:
  Set vf vlan protocol 802.1ad:
    ip link set eth0 vf 1 vlan 100 proto 802.1ad
  Set vf to VST (802.1Q) mode:
    ip link set eth0 vf 1 vlan 100 proto 802.1Q
  Or by omitting the new parameter
    ip link set eth0 vf 1 vlan 100

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h    |  3 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c  |  9 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c    |  6 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h    |  2 +-
 drivers/net/ethernet/emulex/benet/be_main.c        |  6 +-
 drivers/net/ethernet/intel/fm10k/fm10k.h           |  2 +-
 drivers/net/ethernet/intel/fm10k/fm10k_iov.c       |  6 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  4 +-
 drivers/net/ethernet/intel/igb/igb_main.c          |  9 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c     |  5 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h     |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  6 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  6 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h  |  2 +-
 .../net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c   |  5 +-
 drivers/net/ethernet/sfc/sriov.c                   |  5 +-
 drivers/net/ethernet/sfc/sriov.h                   |  2 +-
 include/linux/if_link.h                            |  1 +
 include/linux/netdevice.h                          |  6 +-
 include/uapi/linux/if_link.h                       | 19 ++++-
 net/core/rtnetlink.c                               | 80 ++++++++++++++++++----
 23 files changed, 161 insertions(+), 42 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index 0e68fadecfdb..243cb9748d35 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -492,7 +492,8 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 int bnx2x_get_vf_config(struct net_device *dev, int vf,
 			struct ifla_vf_info *ivi);
 int bnx2x_set_vf_mac(struct net_device *dev, int queue, u8 *mac);
-int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos);
+int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos,
+		      __be16 vlan_proto);
 
 /* select_queue callback */
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 6c586b045d1d..3f77d0863543 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -2521,7 +2521,8 @@ void bnx2x_pf_set_vfs_vlan(struct bnx2x *bp)
 	for_each_vf(bp, vfidx) {
 		bulletin = BP_VF_BULLETIN(bp, vfidx);
 		if (bulletin->valid_bitmap & (1 << VLAN_VALID))
-			bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0);
+			bnx2x_set_vf_vlan(bp->dev, vfidx, bulletin->vlan, 0,
+					  htons(ETH_P_8021Q));
 	}
 }
 
@@ -2781,7 +2782,8 @@ static int bnx2x_set_vf_vlan_filter(struct bnx2x *bp, struct bnx2x_virtf *vf,
 	return 0;
 }
 
-int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos)
+int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos,
+		      __be16 vlan_proto)
 {
 	struct pf_vf_bulletin_content *bulletin = NULL;
 	struct bnx2x *bp = netdev_priv(dev);
@@ -2796,6 +2798,9 @@ int bnx2x_set_vf_vlan(struct net_device *dev, int vfidx, u16 vlan, u8 qos)
 		return -EINVAL;
 	}
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	DP(BNX2X_MSG_IOV, "configuring VF %d with VLAN %d qos %d\n",
 	   vfidx, vlan, 0);
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 8be718508600..ec6cd18842c3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -174,7 +174,8 @@ int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac)
 	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 }
 
-int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos)
+int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos,
+		     __be16 vlan_proto)
 {
 	struct hwrm_func_cfg_input req = {0};
 	struct bnxt *bp = netdev_priv(dev);
@@ -185,6 +186,9 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos)
 	if (bp->hwrm_spec_code < 0x10201)
 		return -ENOTSUPP;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	rc = bnxt_vf_ndo_prep(bp, vf_id);
 	if (rc)
 		return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
index 0392670ab49c..1ab72e4820af 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
@@ -12,7 +12,7 @@
 
 int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *);
 int bnxt_set_vf_mac(struct net_device *, int, u8 *);
-int bnxt_set_vf_vlan(struct net_device *, int, u16, u8);
+int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16);
 int bnxt_set_vf_bw(struct net_device *, int, int, int);
 int bnxt_set_vf_link_state(struct net_device *, int, int);
 int bnxt_set_vf_spoofchk(struct net_device *, int, bool);
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 9a94840c5757..ac513e6627d1 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1895,7 +1895,8 @@ static int be_clear_vf_tvt(struct be_adapter *adapter, int vf)
 	return 0;
 }
 
-static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
+static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos,
+			  __be16 vlan_proto)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
@@ -1907,6 +1908,9 @@ static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
 	if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7)
 		return -EINVAL;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	if (vlan || qos) {
 		vlan |= qos << VLAN_PRIO_SHIFT;
 		status = be_set_vf_tvt(adapter, vf, vlan);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 67ff01aeb11a..4d19e46f7c55 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -507,7 +507,7 @@ int fm10k_iov_configure(struct pci_dev *pdev, int num_vfs);
 s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid);
 int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac);
 int fm10k_ndo_set_vf_vlan(struct net_device *netdev,
-			  int vf_idx, u16 vid, u8 qos);
+			  int vf_idx, u16 vid, u8 qos, __be16 vlan_proto);
 int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate,
 			int unused);
 int fm10k_ndo_get_vf_config(struct net_device *netdev,
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
index d9dec81f6b6d..5f4dac0d36ef 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
@@ -445,7 +445,7 @@ int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac)
 }
 
 int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid,
-			  u8 qos)
+			  u8 qos, __be16 vlan_proto)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_iov_data *iov_data = interface->iov_data;
@@ -460,6 +460,10 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid,
 	if (qos || (vid > (VLAN_VID_MASK - 1)))
 		return -EINVAL;
 
+	/* VF VLAN Protocol part to default is unsupported */
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	vf_info = &iov_data->vf_info[vf_idx];
 
 	/* exit if there is nothing to do */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index da3423561b3a..724d8740d4cc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2747,11 +2747,12 @@ error_param:
  * @vf_id: VF identifier
  * @vlan_id: mac address
  * @qos: priority setting
+ * @vlan_proto: vlan protocol
  *
  * program VF vlan id and/or qos
  **/
-int i40e_ndo_set_vf_port_vlan(struct net_device *netdev,
-			      int vf_id, u16 vlan_id, u8 qos)
+int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
+			      u16 vlan_id, u8 qos, __be16 vlan_proto)
 {
 	u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT);
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
@@ -2774,6 +2775,12 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev,
 		goto error_pvid;
 	}
 
+	if (vlan_proto != htons(ETH_P_8021Q)) {
+		dev_err(&pf->pdev->dev, "VF VLAN protocol is not supported\n");
+		ret = -EPROTONOSUPPORT;
+		goto error_pvid;
+	}
+
 	vf = &(pf->vf[vf_id]);
 	vsi = pf->vsi[vf->lan_vsi_idx];
 	if (!test_bit(I40E_VF_STAT_INIT, &vf->vf_states)) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 875174141451..4012d069939a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -129,8 +129,8 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf);
 
 /* VF configuration related iplink handlers */
 int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac);
-int i40e_ndo_set_vf_port_vlan(struct net_device *netdev,
-			      int vf_id, u16 vlan_id, u8 qos);
+int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
+			      u16 vlan_id, u8 qos, __be16 vlan_proto);
 int i40e_ndo_set_vf_bw(struct net_device *netdev, int vf_id, int min_tx_rate,
 		       int max_tx_rate);
 int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index af75eac5fa16..a83aa13a5bf4 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -169,7 +169,7 @@ static int igb_set_vf_mac(struct igb_adapter *, int, unsigned char *);
 static void igb_restore_vf_multicasts(struct igb_adapter *adapter);
 static int igb_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac);
 static int igb_ndo_set_vf_vlan(struct net_device *netdev,
-			       int vf, u16 vlan, u8 qos);
+			       int vf, u16 vlan, u8 qos, __be16 vlan_proto);
 static int igb_ndo_set_vf_bw(struct net_device *, int, int, int);
 static int igb_ndo_set_vf_spoofchk(struct net_device *netdev, int vf,
 				   bool setting);
@@ -6222,14 +6222,17 @@ static int igb_disable_port_vlan(struct igb_adapter *adapter, int vf)
 	return 0;
 }
 
-static int igb_ndo_set_vf_vlan(struct net_device *netdev,
-			       int vf, u16 vlan, u8 qos)
+static int igb_ndo_set_vf_vlan(struct net_device *netdev, int vf,
+			       u16 vlan, u8 qos, __be16 vlan_proto)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 
 	if ((vf >= adapter->vfs_allocated_count) || (vlan > 4095) || (qos > 7))
 		return -EINVAL;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	return (vlan || qos) ? igb_enable_port_vlan(adapter, vf, vlan, qos) :
 			       igb_disable_port_vlan(adapter, vf);
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 8618599dfd6f..b18590a995db 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1354,13 +1354,16 @@ static int ixgbe_disable_port_vlan(struct ixgbe_adapter *adapter, int vf)
 	return err;
 }
 
-int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
+int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
+			  u8 qos, __be16 vlan_proto)
 {
 	int err = 0;
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
 	if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7))
 		return -EINVAL;
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
 	if (vlan || qos) {
 		/* Check if there is already a port VLAN set, if so
 		 * we have to delete the old one first before we
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 47e65e2f886a..0c7977d27b71 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -43,7 +43,7 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter);
 void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter);
 int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int queue, u8 *mac);
 int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int queue, u16 vlan,
-			   u8 qos);
+			   u8 qos, __be16 vlan_proto);
 int ixgbe_link_mbps(struct ixgbe_adapter *adapter);
 int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate,
 			int max_tx_rate);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index a94f8a3f026c..132eeeafcdc4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2400,11 +2400,15 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
 	return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64);
 }
 
-static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos)
+static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+			       __be16 vlan_proto)
 {
 	struct mlx4_en_priv *en_priv = netdev_priv(dev);
 	struct mlx4_en_dev *mdev = en_priv->mdev;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c12792314be7..b58cfe37dead 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2917,11 +2917,15 @@ static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
 	return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac);
 }
 
-static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos)
+static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+			     __be16 vlan_proto)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1,
 					   vlan, qos);
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index cd23a2946db7..0e198fe89d1a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -100,7 +100,8 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
 #ifdef CONFIG_QED_SRIOV
-static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
+static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos,
+			    __be16 vlan_proto)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
 
@@ -109,6 +110,9 @@ static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos)
 		return -EINVAL;
 	}
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	DP_VERBOSE(edev, QED_MSG_IOV, "Setting Vlan 0x%04x to VF [%d]\n",
 		   vlan, vf);
 
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h
index 24061b9b92e8..5f327659efa7 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov.h
@@ -238,7 +238,7 @@ int qlcnic_sriov_set_vf_mac(struct net_device *, int, u8 *);
 int qlcnic_sriov_set_vf_tx_rate(struct net_device *, int, int, int);
 int qlcnic_sriov_get_vf_config(struct net_device *, int ,
 			       struct ifla_vf_info *);
-int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8);
+int qlcnic_sriov_set_vf_vlan(struct net_device *, int, u16, u8, __be16);
 int qlcnic_sriov_set_vf_spoofchk(struct net_device *, int, bool);
 #else
 static inline void qlcnic_sriov_pf_disable(struct qlcnic_adapter *adapter) {}
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c
index afd687e5e779..50eaafa3eaba 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c
@@ -1915,7 +1915,7 @@ int qlcnic_sriov_set_vf_tx_rate(struct net_device *netdev, int vf,
 }
 
 int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf,
-			     u16 vlan, u8 qos)
+			     u16 vlan, u8 qos, __be16 vlan_proto)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	struct qlcnic_sriov *sriov = adapter->ahw->sriov;
@@ -1928,6 +1928,9 @@ int qlcnic_sriov_set_vf_vlan(struct net_device *netdev, int vf,
 	if (vf >= sriov->num_vfs || qos > 7)
 		return -EINVAL;
 
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
 	if (vlan > MAX_VLAN_ID) {
 		netdev_err(netdev,
 			   "Invalid VLAN ID, allowed range is [0 - %d]\n",
diff --git a/drivers/net/ethernet/sfc/sriov.c b/drivers/net/ethernet/sfc/sriov.c
index 816c44689e67..9abcf4aded30 100644
--- a/drivers/net/ethernet/sfc/sriov.c
+++ b/drivers/net/ethernet/sfc/sriov.c
@@ -22,7 +22,7 @@ int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac)
 }
 
 int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan,
-			  u8 qos)
+			  u8 qos, __be16 vlan_proto)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
@@ -31,6 +31,9 @@ int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan,
 		    (qos & ~(VLAN_PRIO_MASK >> VLAN_PRIO_SHIFT)))
 			return -EINVAL;
 
+		if (vlan_proto != htons(ETH_P_8021Q))
+			return -EPROTONOSUPPORT;
+
 		return efx->type->sriov_set_vf_vlan(efx, vf_i, vlan, qos);
 	} else {
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/sfc/sriov.h b/drivers/net/ethernet/sfc/sriov.h
index 400df526586d..ba1762e7f216 100644
--- a/drivers/net/ethernet/sfc/sriov.h
+++ b/drivers/net/ethernet/sfc/sriov.h
@@ -16,7 +16,7 @@
 
 int efx_sriov_set_vf_mac(struct net_device *net_dev, int vf_i, u8 *mac);
 int efx_sriov_set_vf_vlan(struct net_device *net_dev, int vf_i, u16 vlan,
-			  u8 qos);
+			  u8 qos, __be16 vlan_proto);
 int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf_i,
 			      bool spoofchk);
 int efx_sriov_get_vf_config(struct net_device *net_dev, int vf_i,
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index f923d15b432c..0b17c585b5cd 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -25,5 +25,6 @@ struct ifla_vf_info {
 	__u32 max_tx_rate;
 	__u32 rss_query_en;
 	__u32 trusted;
+	__be16 vlan_proto;
 };
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 69f242c71865..1e8a5c734d72 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -946,7 +946,8 @@ struct netdev_xdp {
  *
  *	SR-IOV management functions.
  * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
- * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
+ * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
+ *			  u8 qos, __be16 proto);
  * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
  *			  int max_tx_rate);
  * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
@@ -1187,7 +1188,8 @@ struct net_device_ops {
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
 	int			(*ndo_set_vf_vlan)(struct net_device *dev,
-						   int queue, u16 vlan, u8 qos);
+						   int queue, u16 vlan,
+						   u8 qos, __be16 proto);
 	int			(*ndo_set_vf_rate)(struct net_device *dev,
 						   int vf, int min_tx_rate,
 						   int max_tx_rate);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 7ec9e99d5491..b4fba662cd32 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -619,7 +619,7 @@ enum {
 enum {
 	IFLA_VF_UNSPEC,
 	IFLA_VF_MAC,		/* Hardware queue specific attributes */
-	IFLA_VF_VLAN,
+	IFLA_VF_VLAN,		/* VLAN ID and QoS */
 	IFLA_VF_TX_RATE,	/* Max TX Bandwidth Allocation */
 	IFLA_VF_SPOOFCHK,	/* Spoof Checking on/off switch */
 	IFLA_VF_LINK_STATE,	/* link state enable/disable/auto switch */
@@ -631,6 +631,7 @@ enum {
 	IFLA_VF_TRUST,		/* Trust VF */
 	IFLA_VF_IB_NODE_GUID,	/* VF Infiniband node GUID */
 	IFLA_VF_IB_PORT_GUID,	/* VF Infiniband port GUID */
+	IFLA_VF_VLAN_LIST,	/* nested list of vlans, option for QinQ */
 	__IFLA_VF_MAX,
 };
 
@@ -647,6 +648,22 @@ struct ifla_vf_vlan {
 	__u32 qos;
 };
 
+enum {
+	IFLA_VF_VLAN_INFO_UNSPEC,
+	IFLA_VF_VLAN_INFO,	/* VLAN ID, QoS and VLAN protocol */
+	__IFLA_VF_VLAN_INFO_MAX,
+};
+
+#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1)
+#define MAX_VLAN_LIST_LEN 1
+
+struct ifla_vf_vlan_info {
+	__u32 vf;
+	__u32 vlan; /* 0 - 4095, 0 disables VLAN filter */
+	__u32 qos;
+	__be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */
+};
+
 struct ifla_vf_tx_rate {
 	__u32 vf;
 	__u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 0dbae4244a89..3ac8946bf244 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -843,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 		size += nla_total_size(num_vfs * sizeof(struct nlattr));
 		size += num_vfs *
 			(nla_total_size(sizeof(struct ifla_vf_mac)) +
-			 nla_total_size(sizeof(struct ifla_vf_vlan)) +
+			 nla_total_size(MAX_VLAN_LIST_LEN *
+					sizeof(struct nlattr)) +
+			 nla_total_size(MAX_VLAN_LIST_LEN *
+					sizeof(struct ifla_vf_vlan_info)) +
 			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
 			 nla_total_size(sizeof(struct ifla_vf_rate)) +
 			 nla_total_size(sizeof(struct ifla_vf_link_state)) +
@@ -1111,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 					       struct nlattr *vfinfo)
 {
 	struct ifla_vf_rss_query_en vf_rss_query_en;
+	struct nlattr *vf, *vfstats, *vfvlanlist;
 	struct ifla_vf_link_state vf_linkstate;
+	struct ifla_vf_vlan_info vf_vlan_info;
 	struct ifla_vf_spoofchk vf_spoofchk;
 	struct ifla_vf_tx_rate vf_tx_rate;
 	struct ifla_vf_stats vf_stats;
 	struct ifla_vf_trust vf_trust;
 	struct ifla_vf_vlan vf_vlan;
 	struct ifla_vf_rate vf_rate;
-	struct nlattr *vf, *vfstats;
 	struct ifla_vf_mac vf_mac;
 	struct ifla_vf_info ivi;
 
@@ -1135,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	 * IFLA_VF_LINK_STATE_AUTO which equals zero
 	 */
 	ivi.linkstate = 0;
+	/* VLAN Protocol by default is 802.1Q */
+	ivi.vlan_proto = htons(ETH_P_8021Q);
 	if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
 		return 0;
 
 	vf_mac.vf =
 		vf_vlan.vf =
+		vf_vlan_info.vf =
 		vf_rate.vf =
 		vf_tx_rate.vf =
 		vf_spoofchk.vf =
@@ -1150,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
 	vf_vlan.vlan = ivi.vlan;
 	vf_vlan.qos = ivi.qos;
+	vf_vlan_info.vlan = ivi.vlan;
+	vf_vlan_info.qos = ivi.qos;
+	vf_vlan_info.vlan_proto = ivi.vlan_proto;
 	vf_tx_rate.rate = ivi.max_tx_rate;
 	vf_rate.min_tx_rate = ivi.min_tx_rate;
 	vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1158,10 +1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	vf_rss_query_en.setting = ivi.rss_query_en;
 	vf_trust.setting = ivi.trusted;
 	vf = nla_nest_start(skb, IFLA_VF_INFO);
-	if (!vf) {
-		nla_nest_cancel(skb, vfinfo);
-		return -EMSGSIZE;
-	}
+	if (!vf)
+		goto nla_put_vfinfo_failure;
 	if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
 	    nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
 	    nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1177,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 		    &vf_rss_query_en) ||
 	    nla_put(skb, IFLA_VF_TRUST,
 		    sizeof(vf_trust), &vf_trust))
-		return -EMSGSIZE;
+		goto nla_put_vf_failure;
+	vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+	if (!vfvlanlist)
+		goto nla_put_vf_failure;
+	if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+		    &vf_vlan_info)) {
+		nla_nest_cancel(skb, vfvlanlist);
+		goto nla_put_vf_failure;
+	}
+	nla_nest_end(skb, vfvlanlist);
 	memset(&vf_stats, 0, sizeof(vf_stats));
 	if (dev->netdev_ops->ndo_get_vf_stats)
 		dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
 						&vf_stats);
 	vfstats = nla_nest_start(skb, IFLA_VF_STATS);
-	if (!vfstats) {
-		nla_nest_cancel(skb, vf);
-		nla_nest_cancel(skb, vfinfo);
-		return -EMSGSIZE;
-	}
+	if (!vfstats)
+		goto nla_put_vf_failure;
 	if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
 			      vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1199,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
 			      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
 	    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
-			      vf_stats.multicast, IFLA_VF_STATS_PAD))
-		return -EMSGSIZE;
+			      vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+		nla_nest_cancel(skb, vfstats);
+		goto nla_put_vf_failure;
+	}
 	nla_nest_end(skb, vfstats);
 	nla_nest_end(skb, vf);
 	return 0;
+
+nla_put_vf_failure:
+	nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+	nla_nest_cancel(skb, vfinfo);
+	return -EMSGSIZE;
 }
 
 static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1448,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
 static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_MAC]		= { .len = sizeof(struct ifla_vf_mac) },
 	[IFLA_VF_VLAN]		= { .len = sizeof(struct ifla_vf_vlan) },
+	[IFLA_VF_VLAN_LIST]     = { .type = NLA_NESTED },
 	[IFLA_VF_TX_RATE]	= { .len = sizeof(struct ifla_vf_tx_rate) },
 	[IFLA_VF_SPOOFCHK]	= { .len = sizeof(struct ifla_vf_spoofchk) },
 	[IFLA_VF_RATE]		= { .len = sizeof(struct ifla_vf_rate) },
@@ -1704,7 +1727,34 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 		err = -EOPNOTSUPP;
 		if (ops->ndo_set_vf_vlan)
 			err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
-						   ivv->qos);
+						   ivv->qos,
+						   htons(ETH_P_8021Q));
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[IFLA_VF_VLAN_LIST]) {
+		struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+		struct nlattr *attr;
+		int rem, len = 0;
+
+		err = -EOPNOTSUPP;
+		if (!ops->ndo_set_vf_vlan)
+			return err;
+
+		nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+			if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+			    nla_len(attr) < NLA_HDRLEN) {
+				return -EINVAL;
+			}
+			if (len >= MAX_VLAN_LIST_LEN)
+				return -EOPNOTSUPP;
+			ivvl[len] = nla_data(attr);
+
+			len++;
+		}
+		err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+					   ivvl[0]->qos, ivvl[0]->vlan_proto);
 		if (err < 0)
 			return err;
 	}
-- 
cgit v1.2.3-71-gd317


From 11d5f15723c9f39d7c131d0149d024c17dbef676 Mon Sep 17 00:00:00 2001
From: Vishwanath Pai <vpai@akamai.com>
Date: Thu, 22 Sep 2016 12:43:44 -0400
Subject: netfilter: xt_hashlimit: Create revision 2 to support higher pps
 rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Joshua Hunt <johunt@akamai.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  23 ++
 net/netfilter/xt_hashlimit.c                | 330 ++++++++++++++++++++++------
 2 files changed, 285 insertions(+), 68 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db90372f09c..3efc0ca18345 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE 10000
+#define XT_HASHLIMIT_SCALE_v2 1000000llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
 	__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+	__u64 avg;		/* Average secs between packets * scale */
+	__u64 burst;		/* Period multiplier for upper limit. */
+	__u32 mode;		/* bitmask of XT_HASHLIMIT_HASH_* */
+
+	/* user specified */
+	__u32 size;		/* how many buckets */
+	__u32 max;		/* max number of entries */
+	__u32 gc_interval;	/* gc interval */
+	__u32 expire;		/* when do entries expire? */
+
+	__u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
 	char name[IFNAMSIZ];
 	struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+	char name[NAME_MAX];
+	struct hashlimit_cfg2 cfg;
+
+	/* Used internally by the kernel */
+	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index e93d9e0a3f35..44a095ecc7b7 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
-		u_int32_t credit;
-		u_int32_t credit_cap, cost;
+		u_int64_t credit;
+		u_int64_t credit_cap, cost;
 	} rateinfo;
 	struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
 	u_int8_t family;
 	bool rnd_initialized;
 
-	struct hashlimit_cfg1 cfg;	/* config */
+	struct hashlimit_cfg2 cfg;	/* config */
 
 	/* used internally */
 	spinlock_t lock;		/* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
 	struct hlist_head hash[0];	/* hashtable itself */
 };
 
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+	if (revision == 1) {
+		struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+		to->mode = cfg->mode;
+		to->avg = cfg->avg;
+		to->burst = cfg->burst;
+		to->size = cfg->size;
+		to->max = cfg->max;
+		to->gc_interval = cfg->gc_interval;
+		to->expire = cfg->expire;
+		to->srcmask = cfg->srcmask;
+		to->dstmask = cfg->dstmask;
+	} else if (revision == 2) {
+		memcpy(to, from, sizeof(struct hashlimit_cfg2));
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static DEFINE_MUTEX(hashlimit_mutex);	/* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-			    u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+			 const char *name, u_int8_t family,
+			 struct xt_hashlimit_htable **out_hinfo,
+			 int revision)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
-	unsigned int size;
-	unsigned int i;
+	unsigned int size, i;
+	int ret;
 
-	if (minfo->cfg.size) {
-		size = minfo->cfg.size;
+	if (cfg->size) {
+		size = cfg->size;
 	} else {
 		size = (totalram_pages << PAGE_SHIFT) / 16384 /
 		       sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	                sizeof(struct list_head) * size);
 	if (hinfo == NULL)
 		return -ENOMEM;
-	minfo->hinfo = hinfo;
+	*out_hinfo = hinfo;
 
 	/* copy match config into hashtable config */
-	memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+	ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+
+	if (ret)
+		return ret;
+
 	hinfo->cfg.size = size;
 	if (hinfo->cfg.max == 0)
 		hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+	hinfo->name = kstrdup(name, GFP_KERNEL);
 	if (!hinfo->name) {
 		vfree(hinfo);
 		return -ENOMEM;
 	}
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde = proc_create_data(minfo->name, 0,
+	hinfo->pde = proc_create_data(name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		&dl_file_ops_v1, hinfo);
+		(revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
+		hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -399,6 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
 #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,8 +441,11 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
 #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
 
+#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
 #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
@@ -425,15 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 }
 
 /* Precision saver. */
-static u32 user2credits(u32 user)
+static u64 user2credits(u64 user, int revision)
 {
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
-		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) *\
-					HZ * CREDITS_PER_JIFFY_v1;
+	if (revision == 1) {
+		/* If multiplying would overflow... */
+		if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
+			/* Divide first. */
+			return (user / XT_HASHLIMIT_SCALE) *\
+						HZ * CREDITS_PER_JIFFY_v1;
+
+		return (user * HZ * CREDITS_PER_JIFFY_v1) \
+						/ XT_HASHLIMIT_SCALE;
+	} else {
+		if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+			return (user / XT_HASHLIMIT_SCALE_v2) *\
+						HZ * CREDITS_PER_JIFFY;
 
-	return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE;
+		return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+	}
 }
 
 static u32 user2credits_byte(u32 user)
@@ -443,10 +488,11 @@ static u32 user2credits_byte(u32 user)
 	return (u32) (us >> 32);
 }
 
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+			    u32 mode, int revision)
 {
 	unsigned long delta = now - dh->rateinfo.prev;
-	u32 cap;
+	u64 cap, cpj;
 
 	if (delta == 0)
 		return;
@@ -454,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 	dh->rateinfo.prev = now;
 
 	if (mode & XT_HASHLIMIT_BYTES) {
-		u32 tmp = dh->rateinfo.credit;
+		u64 tmp = dh->rateinfo.credit;
 		dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
 		cap = CREDITS_PER_JIFFY_BYTES * HZ;
 		if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -462,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 			return;
 		}
 	} else {
-		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
+		cpj = (revision == 1) ?
+			CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+		dh->rateinfo.credit += delta * cpj;
 		cap = dh->rateinfo.credit_cap;
 	}
 	if (dh->rateinfo.credit > cap)
@@ -470,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 }
 
 static void rateinfo_init(struct dsthash_ent *dh,
-			  struct xt_hashlimit_htable *hinfo)
+			  struct xt_hashlimit_htable *hinfo, int revision)
 {
 	dh->rateinfo.prev = jiffies;
 	if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -479,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
 		dh->rateinfo.credit_cap = hinfo->cfg.burst;
 	} else {
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-						   hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+						   hinfo->cfg.burst, revision);
+		dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
 		dh->rateinfo.credit_cap = dh->rateinfo.credit;
 	}
 }
@@ -604,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
+		    struct xt_hashlimit_htable *hinfo,
+		    const struct hashlimit_cfg2 *cfg, int revision)
 {
-	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
-	struct xt_hashlimit_htable *hinfo = info->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
 	struct dsthash_dst dst;
 	bool race = false;
-	u32 cost;
+	u64 cost;
 
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
@@ -627,18 +675,18 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		} else if (race) {
 			/* Already got an entry, update expiration timeout */
 			dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_recalc(dh, now, hinfo->cfg.mode);
+			rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 		} else {
 			dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_init(dh, hinfo);
+			rateinfo_init(dh, hinfo, revision);
 		}
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now, hinfo->cfg.mode);
+		rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 	}
 
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+	if (cfg->mode & XT_HASHLIMIT_BYTES)
 		cost = hashlimit_byte_cost(skb->len, dh);
 	else
 		cost = dh->rateinfo.cost;
@@ -648,70 +696,126 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		dh->rateinfo.credit -= cost;
 		spin_unlock(&dh->lock);
 		rcu_read_unlock_bh();
-		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+		return !(cfg->mode & XT_HASHLIMIT_INVERT);
 	}
 
 	spin_unlock(&dh->lock);
 	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
-	return info->cfg.mode & XT_HASHLIMIT_INVERT;
+	return cfg->mode & XT_HASHLIMIT_INVERT;
 
  hotdrop:
 	par->hotdrop = true;
 	return false;
 }
 
-static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+static bool
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+	struct hashlimit_cfg2 cfg = {};
+	int ret;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+
+	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+}
+
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
+				     struct xt_hashlimit_htable **hinfo,
+				     struct hashlimit_cfg2 *cfg,
+				     const char *name, int revision)
 {
 	struct net *net = par->net;
-	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	int ret;
 
-	if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
-		return -EINVAL;
-	if (info->name[sizeof(info->name)-1] != '\0')
+	if (cfg->gc_interval == 0 || cfg->expire == 0)
 		return -EINVAL;
 	if (par->family == NFPROTO_IPV4) {
-		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+		if (cfg->srcmask > 32 || cfg->dstmask > 32)
 			return -EINVAL;
 	} else {
-		if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+		if (cfg->srcmask > 128 || cfg->dstmask > 128)
 			return -EINVAL;
 	}
 
-	if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+	if (cfg->mode & ~XT_HASHLIMIT_ALL) {
 		pr_info("Unknown mode mask %X, kernel too old?\n",
-						info->cfg.mode);
+						cfg->mode);
 		return -EINVAL;
 	}
 
 	/* Check for overflow. */
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
-		if (user2credits_byte(info->cfg.avg) == 0) {
-			pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+	if (cfg->mode & XT_HASHLIMIT_BYTES) {
+		if (user2credits_byte(cfg->avg) == 0) {
+			pr_info("overflow, rate too high: %llu\n", cfg->avg);
 			return -EINVAL;
 		}
-	} else if (info->cfg.burst == 0 ||
-		    user2credits(info->cfg.avg * info->cfg.burst) <
-		    user2credits(info->cfg.avg)) {
-			pr_info("overflow, try lower: %u/%u\n",
-				info->cfg.avg, info->cfg.burst);
+	} else if (cfg->burst == 0 ||
+		    user2credits(cfg->avg * cfg->burst, revision) <
+		    user2credits(cfg->avg, revision)) {
+			pr_info("overflow, try lower: %llu/%llu\n",
+				cfg->avg, cfg->burst);
 			return -ERANGE;
 	}
 
 	mutex_lock(&hashlimit_mutex);
-	info->hinfo = htable_find_get(net, info->name, par->family);
-	if (info->hinfo == NULL) {
-		ret = htable_create_v1(net, info, par->family);
+	*hinfo = htable_find_get(net, name, par->family);
+	if (*hinfo == NULL) {
+		ret = htable_create(net, cfg, name, par->family,
+				    hinfo, revision);
 		if (ret < 0) {
 			mutex_unlock(&hashlimit_mutex);
 			return ret;
 		}
 	}
 	mutex_unlock(&hashlimit_mutex);
+
 	return 0;
 }
 
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct hashlimit_cfg2 cfg = {};
+	int ret;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_check_common(par, &info->hinfo,
+					 &cfg, info->name, 1);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
+					 info->name, 2);
+}
+
 static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -719,6 +823,13 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 	htable_put(info->hinfo);
 }
 
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	htable_put(info->hinfo);
+}
+
 static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name           = "hashlimit",
@@ -730,6 +841,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV4,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	{
 		.name           = "hashlimit",
@@ -741,6 +862,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV6,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #endif
 };
 
@@ -787,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
-			       struct seq_file *s)
+static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
+			 struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
-
-	spin_lock(&ent->lock);
-	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies, ht->cfg.mode);
-
 	switch (family) {
 	case NFPROTO_IPV4:
-		seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip.src,
 			   ntohs(ent->dst.src_port),
@@ -809,7 +934,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 		break;
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	case NFPROTO_IPV6:
-		seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip6.src,
 			   ntohs(ent->dst.src_port),
@@ -822,6 +947,34 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 	default:
 		BUG();
 	}
+}
+
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
+
+	dl_seq_print(ent, family, s);
+
+	spin_unlock(&ent->lock);
+	return seq_has_overflowed(s);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+			    struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+	dl_seq_print(ent, family, s);
+
 	spin_unlock(&ent->lock);
 	return seq_has_overflowed(s);
 }
@@ -840,6 +993,20 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket = (unsigned int *)v;
+	struct dsthash_ent *ent;
+
+	if (!hlist_empty(&htable->hash[*bucket])) {
+		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+			if (dl_seq_real_show(ent, htable->family, s))
+				return -1;
+	}
+	return 0;
+}
+
 static const struct seq_operations dl_seq_ops_v1 = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
@@ -847,6 +1014,13 @@ static const struct seq_operations dl_seq_ops_v1 = {
 	.show  = dl_seq_show_v1
 };
 
+static const struct seq_operations dl_seq_ops = {
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show
+};
+
 static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
 	int ret = seq_open(file, &dl_seq_ops_v1);
@@ -858,6 +1032,18 @@ static int dl_proc_open_v1(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &dl_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = PDE_DATA(inode);
+	}
+	return ret;
+}
+
 static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v1,
@@ -866,6 +1052,14 @@ static const struct file_operations dl_file_ops_v1 = {
 	.release = seq_release
 };
 
+static const struct file_operations dl_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
 static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
-- 
cgit v1.2.3-71-gd317


From 0f3cd9b3697708c86a825ae3cedabf7be6fd3e72 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Sep 2016 15:23:33 +0200
Subject: netfilter: nf_tables: add range expression

Inverse ranges != [a,b] are not currently possible because rules are
composites of && operations, and we need to express this:

	data < a || data > b

This patch adds a new range expression. Positive ranges can be already
through two cmp expressions:

	cmp(sreg, data, >=)
	cmp(sreg, data, <=)

This new range expression provides an alternative way to express this.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h   |   3 +
 include/uapi/linux/netfilter/nf_tables.h |  29 +++++++
 net/netfilter/Makefile                   |   3 +-
 net/netfilter/nf_tables_core.c           |   7 +-
 net/netfilter/nft_range.c                | 138 +++++++++++++++++++++++++++++++
 5 files changed, 178 insertions(+), 2 deletions(-)
 create mode 100644 net/netfilter/nft_range.c

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index a9060dd99db7..00f4f6b1b1ba 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -28,6 +28,9 @@ extern const struct nft_expr_ops nft_cmp_fast_ops;
 int nft_cmp_module_init(void);
 void nft_cmp_module_exit(void);
 
+int nft_range_module_init(void);
+void nft_range_module_exit(void);
+
 int nft_lookup_module_init(void);
 void nft_lookup_module_exit(void);
 
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 1cf41dd838b2..c6c4477c136b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -546,6 +546,35 @@ enum nft_cmp_attributes {
 };
 #define NFTA_CMP_MAX		(__NFTA_CMP_MAX - 1)
 
+/**
+ * enum nft_range_ops - nf_tables range operator
+ *
+ * @NFT_RANGE_EQ: equal
+ * @NFT_RANGE_NEQ: not equal
+ */
+enum nft_range_ops {
+	NFT_RANGE_EQ,
+	NFT_RANGE_NEQ,
+};
+
+/**
+ * enum nft_range_attributes - nf_tables range expression netlink attributes
+ *
+ * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers)
+ * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes)
+ * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes)
+ */
+enum nft_range_attributes {
+	NFTA_RANGE_UNSPEC,
+	NFTA_RANGE_SREG,
+	NFTA_RANGE_OP,
+	NFTA_RANGE_FROM_DATA,
+	NFTA_RANGE_TO_DATA,
+	__NFTA_RANGE_MAX
+};
+#define NFTA_RANGE_MAX		(__NFTA_RANGE_MAX - 1)
+
 enum nft_lookup_flags {
 	NFT_LOOKUP_F_INV = (1 << 0),
 };
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0c8581100ac6..c23c3c84416f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 
 # nf_tables
 nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
-nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o
 nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+nf_tables-objs += nft_lookup.o nft_dynset.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 67259cefef06..7c94ce0080d5 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -263,8 +263,13 @@ int __init nf_tables_core_module_init(void)
 	if (err < 0)
 		goto err7;
 
-	return 0;
+	err = nft_range_module_init();
+	if (err < 0)
+		goto err8;
 
+	return 0;
+err8:
+	nft_dynset_module_exit();
 err7:
 	nft_payload_module_exit();
 err6:
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
new file mode 100644
index 000000000000..c6d5358482d1
--- /dev/null
+++ b/net/netfilter/nft_range.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_range_expr {
+	struct nft_data		data_from;
+	struct nft_data		data_to;
+	enum nft_registers	sreg:8;
+	u8			len;
+	enum nft_range_ops	op:8;
+};
+
+static void nft_range_eval(const struct nft_expr *expr,
+			 struct nft_regs *regs,
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_range_expr *priv = nft_expr_priv(expr);
+	bool mismatch;
+	int d1, d2;
+
+	d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len);
+	d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len);
+	switch (priv->op) {
+	case NFT_RANGE_EQ:
+		mismatch = (d1 < 0 || d2 > 0);
+		break;
+	case NFT_RANGE_NEQ:
+		mismatch = (d1 >= 0 && d2 <= 0);
+		break;
+	}
+
+	if (mismatch)
+		regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
+	[NFTA_RANGE_SREG]		= { .type = NLA_U32 },
+	[NFTA_RANGE_OP]			= { .type = NLA_U32 },
+	[NFTA_RANGE_FROM_DATA]		= { .type = NLA_NESTED },
+	[NFTA_RANGE_TO_DATA]		= { .type = NLA_NESTED },
+};
+
+static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_range_expr *priv = nft_expr_priv(expr);
+	struct nft_data_desc desc_from, desc_to;
+	int err;
+
+	err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
+			    &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+	if (err < 0)
+		return err;
+
+	err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
+			    &desc_to, tb[NFTA_RANGE_TO_DATA]);
+	if (err < 0)
+		goto err1;
+
+	if (desc_from.len != desc_to.len) {
+		err = -EINVAL;
+		goto err2;
+	}
+
+	priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
+	err = nft_validate_register_load(priv->sreg, desc_from.len);
+	if (err < 0)
+		goto err2;
+
+	priv->op  = ntohl(nla_get_be32(tb[NFTA_RANGE_OP]));
+	priv->len = desc_from.len;
+	return 0;
+err2:
+	nft_data_uninit(&priv->data_to, desc_to.type);
+err1:
+	nft_data_uninit(&priv->data_from, desc_from.type);
+	return err;
+}
+
+static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_range_expr *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op)))
+		goto nla_put_failure;
+
+	if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from,
+			  NFT_DATA_VALUE, priv->len) < 0 ||
+	    nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to,
+			  NFT_DATA_VALUE, priv->len) < 0)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_range_type;
+static const struct nft_expr_ops nft_range_ops = {
+	.type		= &nft_range_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_range_expr)),
+	.eval		= nft_range_eval,
+	.init		= nft_range_init,
+	.dump		= nft_range_dump,
+};
+
+static struct nft_expr_type nft_range_type __read_mostly = {
+	.name		= "range",
+	.ops		= &nft_range_ops,
+	.policy		= nft_range_policy,
+	.maxattr	= NFTA_RANGE_MAX,
+	.owner		= THIS_MODULE,
+};
+
+int __init nft_range_module_init(void)
+{
+	return nft_register_expr(&nft_range_type);
+}
+
+void nft_range_module_exit(void)
+{
+	nft_unregister_expr(&nft_range_type);
+}
-- 
cgit v1.2.3-71-gd317


From ff107d27761ff4b644c82c209e004ec9c8fbbc22 Mon Sep 17 00:00:00 2001
From: Liping Zhang <liping.zhang@spreadtrum.com>
Date: Sun, 25 Sep 2016 16:35:56 +0800
Subject: netfilter: nft_log: complete NFTA_LOG_FLAGS attr support

NFTA_LOG_FLAGS attribute is already supported, but the related
NF_LOG_XXX flags are not exposed to the userspace. So we cannot
explicitly enable log flags to log uid, tcp sequence, ip options
and so on, i.e. such rule "nft add rule filter output log uid"
is not supported yet.

So move NF_LOG_XXX macro definitions to the uapi/../nf_log.h. In
order to keep consistent with other modules, change NF_LOG_MASK to
refer to all supported log flags. On the other hand, add a new
NF_LOG_DEFAULT_MASK to refer to the original default log flags.

Finally, if user specify the unsupported log flags or NFTA_LOG_GROUP
and NFTA_LOG_FLAGS are set at the same time, report EINVAL to the
userspace.

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h        | 11 +++--------
 include/uapi/linux/netfilter/nf_log.h | 12 ++++++++++++
 net/bridge/netfilter/ebt_log.c        |  2 +-
 net/ipv4/netfilter/ip_tables.c        |  2 +-
 net/ipv4/netfilter/nf_log_arp.c       |  2 +-
 net/ipv4/netfilter/nf_log_ipv4.c      |  4 ++--
 net/ipv6/netfilter/ip6_tables.c       |  2 +-
 net/ipv6/netfilter/nf_log_ipv6.c      |  4 ++--
 net/netfilter/nf_tables_core.c        |  2 +-
 net/netfilter/nft_log.c               |  9 ++++++++-
 10 files changed, 32 insertions(+), 18 deletions(-)
 create mode 100644 include/uapi/linux/netfilter/nf_log.h

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index ee07dc8b0a7b..309cd267be4f 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -2,15 +2,10 @@
 #define _NF_LOG_H
 
 #include <linux/netfilter.h>
+#include <linux/netfilter/nf_log.h>
 
-/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will
- * disappear once iptables is replaced with pkttables.  Please DO NOT use them
- * for any new code! */
-#define NF_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
-#define NF_LOG_TCPOPT		0x02	/* Log TCP options */
-#define NF_LOG_IPOPT		0x04	/* Log IP options */
-#define NF_LOG_UID		0x08	/* Log UID owning local socket */
-#define NF_LOG_MASK		0x0f
+/* Log tcp sequence, tcp options, ip options and uid owning local socket */
+#define NF_LOG_DEFAULT_MASK	0x0f
 
 /* This flag indicates that copy_len field in nf_loginfo is set */
 #define NF_LOG_F_COPY_LEN	0x1
diff --git a/include/uapi/linux/netfilter/nf_log.h b/include/uapi/linux/netfilter/nf_log.h
new file mode 100644
index 000000000000..8be21e02387d
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_log.h
@@ -0,0 +1,12 @@
+#ifndef _NETFILTER_NF_LOG_H
+#define _NETFILTER_NF_LOG_H
+
+#define NF_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
+#define NF_LOG_TCPOPT		0x02	/* Log TCP options */
+#define NF_LOG_IPOPT		0x04	/* Log IP options */
+#define NF_LOG_UID		0x08	/* Log UID owning local socket */
+#define NF_LOG_NFLOG		0x10	/* Unsupported, don't reuse */
+#define NF_LOG_MACDECODE	0x20	/* Decode MAC header */
+#define NF_LOG_MASK		0x2f
+
+#endif /* _NETFILTER_NF_LOG_H */
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 152300d164ac..9a11086ba6ff 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	if (loginfo->type == NF_LOG_TYPE_LOG)
 		bitmask = loginfo->u.log.logflags;
 	else
-		bitmask = NF_LOG_MASK;
+		bitmask = NF_LOG_DEFAULT_MASK;
 
 	if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
 	   htons(ETH_P_IP)) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545a3373..7c00ce90adb8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = 4,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 8945c2653814..b24795e2ee6d 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 20f225593a8b..5b571e1b5f15 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
@@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 	else
-		logflags = NF_LOG_MASK;
+		logflags = NF_LOG_DEFAULT_MASK;
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 552fac2f390a..55aacea24396 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = LOGLEVEL_WARNING,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index c1bcf699a23d..f6aee2895fee 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = {
 	.u = {
 		.log = {
 			.level	  = LOGLEVEL_NOTICE,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
@@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
 	else
-		logflags = NF_LOG_MASK;
+		logflags = NF_LOG_DEFAULT_MASK;
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 7c94ce0080d5..0dd5c695482f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = LOGLEVEL_WARNING,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 	        },
 	},
 };
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 24a73bb26e94..1b01404bb33f 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_LOG_LEVEL] != NULL &&
 	    tb[NFTA_LOG_GROUP] != NULL)
 		return -EINVAL;
-	if (tb[NFTA_LOG_GROUP] != NULL)
+	if (tb[NFTA_LOG_GROUP] != NULL) {
 		li->type = NF_LOG_TYPE_ULOG;
+		if (tb[NFTA_LOG_FLAGS] != NULL)
+			return -EINVAL;
+	}
 
 	nla = tb[NFTA_LOG_PREFIX];
 	if (nla != NULL) {
@@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		if (tb[NFTA_LOG_FLAGS] != NULL) {
 			li->u.log.logflags =
 				ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
+			if (li->u.log.logflags & ~NF_LOG_MASK) {
+				err = -EINVAL;
+				goto err1;
+			}
 		}
 		break;
 	case NF_LOG_TYPE_ULOG:
-- 
cgit v1.2.3-71-gd317


From 8564e38206de2ff005a27c8d7c2ce3869a44f0dd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 19 Sep 2016 09:44:44 +0200
Subject: cfg80211: add checks for beacon rate, extend to mesh

The previous commit added support for specifying the beacon rate
for AP mode. Add features checks to this, and extend it to also
support the rate configuration for mesh networks. For IBSS it's
not as simple due to joining etc., so that's not yet supported.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 +++-
 include/uapi/linux/nl80211.h | 17 +++++++++++++++-
 net/wireless/nl80211.c       | 46 +++++++++++++++++++++++++++++++++-----------
 3 files changed, 54 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e0949c8bc2d1..ed37304fa09d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -712,7 +712,7 @@ struct cfg80211_bitrate_mask {
  *	MAC address based access control
  * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
  *	networks.
- * @beacon_rate: masks for setting user configured beacon tx rate.
+ * @beacon_rate: bitrate to be used for beacons
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -1365,6 +1365,7 @@ struct mesh_config {
  * @beacon_interval: beacon interval to use
  * @mcast_rate: multicat rate for Mesh Node [6Mbps is the default for 802.11a]
  * @basic_rates: basic rates to use when creating the mesh
+ * @beacon_rate: bitrate to be used for beacons
  *
  * These parameters are fixed when the mesh is created.
  */
@@ -1385,6 +1386,7 @@ struct mesh_setup {
 	u16 beacon_interval;
 	int mcast_rate[NUM_NL80211_BANDS];
 	u32 basic_rates;
+	struct cfg80211_bitrate_mask beacon_rate;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 220694151434..ec10d1b2838f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1343,7 +1343,13 @@ enum nl80211_commands {
  *	enum nl80211_band value is used as the index (nla_type() of the nested
  *	data. If a band is not included, it will be configured to allow all
  *	rates based on negotiated supported rates information. This attribute
- *	is used with %NL80211_CMD_SET_TX_BITRATE_MASK.
+ *	is used with %NL80211_CMD_SET_TX_BITRATE_MASK and with starting AP,
+ *	and joining mesh networks (not IBSS yet). In the later case, it must
+ *	specify just a single bitrate, which is to be used for the beacon.
+ *	The driver must also specify support for this with the extended
+ *	features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY,
+ *	NL80211_EXT_FEATURE_BEACON_RATE_HT and
+ *	NL80211_EXT_FEATURE_BEACON_RATE_VHT.
  *
  * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain
  *	at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME.
@@ -4551,6 +4557,12 @@ enum nl80211_feature_flags {
  *	(if available).
  * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of
  *	channel dwell time.
+ * @NL80211_EXT_FEATURE_BEACON_RATE_LEGACY: Driver supports beacon rate
+ *	configuration (AP/mesh), supporting a legacy (non HT/VHT) rate.
+ * @NL80211_EXT_FEATURE_BEACON_RATE_HT: Driver supports beacon rate
+ *	configuration (AP/mesh) with HT rates.
+ * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate
+ *	configuration (AP/mesh) with VHT rates.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4562,6 +4574,9 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_SCAN_START_TIME,
 	NL80211_EXT_FEATURE_BSS_PARENT_TSF,
 	NL80211_EXT_FEATURE_SET_SCAN_DWELL,
+	NL80211_EXT_FEATURE_BEACON_RATE_LEGACY,
+	NL80211_EXT_FEATURE_BEACON_RATE_HT,
+	NL80211_EXT_FEATURE_BEACON_RATE_VHT,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a10484da60c0..b8441e60b0f6 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3569,13 +3569,12 @@ out:
 	return 0;
 }
 
-static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params)
+static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
+				   enum nl80211_band band,
+				   struct cfg80211_bitrate_mask *beacon_rate)
 {
-	u32 rate, count_ht, count_vht, i;
-	enum nl80211_band band;
-
-	band = params->chandef.chan->band;
-	rate = params->beacon_rate.control[band].legacy;
+	u32 count_ht, count_vht, i;
+	u32 rate = beacon_rate->control[band].legacy;
 
 	/* Allow only one rate */
 	if (hweight32(rate) > 1)
@@ -3583,9 +3582,9 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params)
 
 	count_ht = 0;
 	for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
-		if (hweight8(params->beacon_rate.control[band].ht_mcs[i]) > 1) {
+		if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) {
 			return -EINVAL;
-		} else if (params->beacon_rate.control[band].ht_mcs[i]) {
+		} else if (beacon_rate->control[band].ht_mcs[i]) {
 			count_ht++;
 			if (count_ht > 1)
 				return -EINVAL;
@@ -3596,9 +3595,9 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params)
 
 	count_vht = 0;
 	for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
-		if (hweight16(params->beacon_rate.control[band].vht_mcs[i]) > 1) {
+		if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) {
 			return -EINVAL;
-		} else if (params->beacon_rate.control[band].vht_mcs[i]) {
+		} else if (beacon_rate->control[band].vht_mcs[i]) {
 			count_vht++;
 			if (count_vht > 1)
 				return -EINVAL;
@@ -3610,6 +3609,19 @@ static int validate_beacon_tx_rate(struct cfg80211_ap_settings *params)
 	if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht))
 		return -EINVAL;
 
+	if (rate &&
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_BEACON_RATE_LEGACY))
+		return -EINVAL;
+	if (count_ht &&
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_BEACON_RATE_HT))
+		return -EINVAL;
+	if (count_vht &&
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_BEACON_RATE_VHT))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -3847,7 +3859,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		if (err)
 			return err;
 
-		err = validate_beacon_tx_rate(&params);
+		err = validate_beacon_tx_rate(rdev, params.chandef.chan->band,
+					      &params.beacon_rate);
 		if (err)
 			return err;
 	}
@@ -9406,6 +9419,17 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 			return err;
 	}
 
+	if (info->attrs[NL80211_ATTR_TX_RATES]) {
+		err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate);
+		if (err)
+			return err;
+
+		err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
+					      &setup.beacon_rate);
+		if (err)
+			return err;
+	}
+
 	return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
 }
 
-- 
cgit v1.2.3-71-gd317


From bd11f0741fa5a2c296629898ad07759dd12b35bb Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 27 Sep 2016 23:57:58 -0700
Subject: ipv6 addrconf: implement RFC7559 router solicitation backoff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements:
  https://tools.ietf.org/html/rfc7559

Backoff is performed according to RFC3315 section 14:
  https://tools.ietf.org/html/rfc3315#section-14

We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations
to a negative value meaning an unlimited number of retransmits,
and we make this the new default (inline with the RFC).

We also add a new setting:
  /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval
defaulting to 1 hour (per RFC recommendation).

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Acked-by: Erik Kline <ek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |  1 +
 include/net/addrconf.h    |  3 ++-
 include/net/if_inet6.h    |  1 +
 include/uapi/linux/ipv6.h |  1 +
 net/ipv6/addrconf.c       | 51 ++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 49 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c6dbcd84a2c7..7e9a789be5e0 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -18,6 +18,7 @@ struct ipv6_devconf {
 	__s32		dad_transmits;
 	__s32		rtr_solicits;
 	__s32		rtr_solicit_interval;
+	__s32		rtr_solicit_max_interval;
 	__s32		rtr_solicit_delay;
 	__s32		force_mld_version;
 	__s32		mldv1_unsolicited_report_interval;
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 9826d3a9464c..f2d072787947 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -1,8 +1,9 @@
 #ifndef _ADDRCONF_H
 #define _ADDRCONF_H
 
-#define MAX_RTR_SOLICITATIONS		3
+#define MAX_RTR_SOLICITATIONS		-1		/* unlimited */
 #define RTR_SOLICITATION_INTERVAL	(4*HZ)
+#define RTR_SOLICITATION_MAX_INTERVAL	(3600*HZ)	/* 1 hour */
 
 #define MIN_VALID_LIFETIME		(2*3600)	/* 2 hours */
 
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index 1c8b6820b694..515352c6280a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -201,6 +201,7 @@ struct inet6_dev {
 	struct ipv6_devstat	stats;
 
 	struct timer_list	rs_timer;
+	__s32			rs_interval;	/* in jiffies */
 	__u8			rs_probes;
 
 	__u8			addr_gen_mode;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 395876060f50..8c2772340c3f 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -177,6 +177,7 @@ enum {
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_KEEP_ADDR_ON_DOWN,
+	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 35d4baa55c9d..87183983724d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp)
 	return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
 }
 
+static inline s32 rfc3315_s14_backoff_init(s32 irt)
+{
+	/* multiply 'initial retransmission time' by 0.9 .. 1.1 */
+	u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+	do_div(tmp, 1000000);
+	return (s32)tmp;
+}
+
+static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
+{
+	/* multiply 'retransmission timeout' by 1.9 .. 2.1 */
+	u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+	do_div(tmp, 1000000);
+	if ((s32)tmp > mrt) {
+		/* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
+		tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+		do_div(tmp, 1000000);
+	}
+	return (s32)tmp;
+}
+
 #ifdef CONFIG_SYSCTL
 static int addrconf_sysctl_register(struct inet6_dev *idev);
 static void addrconf_sysctl_unregister(struct inet6_dev *idev);
@@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
+	.rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
 	.rtr_solicit_delay	= MAX_RTR_SOLICITATION_DELAY,
 	.use_tempaddr		= 0,
 	.temp_valid_lft		= TEMP_VALID_LIFETIME,
@@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
+	.rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
 	.rtr_solicit_delay	= MAX_RTR_SOLICITATION_DELAY,
 	.use_tempaddr		= 0,
 	.temp_valid_lft		= TEMP_VALID_LIFETIME,
@@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data)
 	if (idev->if_flags & IF_RA_RCVD)
 		goto out;
 
-	if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
+	if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
 		write_unlock(&idev->lock);
 		if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
 			ndisc_send_rs(dev, &lladdr,
@@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data)
 			goto put;
 
 		write_lock(&idev->lock);
+		idev->rs_interval = rfc3315_s14_backoff_update(
+			idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
 		/* The wait after the last probe can be shorter */
 		addrconf_mod_rs_timer(idev, (idev->rs_probes ==
 					     idev->cnf.rtr_solicits) ?
 				      idev->cnf.rtr_solicit_delay :
-				      idev->cnf.rtr_solicit_interval);
+				      idev->rs_interval);
 	} else {
 		/*
 		 * Note: we do not support deprecated "all on-link"
@@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
 	send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
 	send_rs = send_mld &&
 		  ipv6_accept_ra(ifp->idev) &&
-		  ifp->idev->cnf.rtr_solicits > 0 &&
+		  ifp->idev->cnf.rtr_solicits != 0 &&
 		  (dev->flags&IFF_LOOPBACK) == 0;
 	read_unlock_bh(&ifp->idev->lock);
 
@@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
 
 		write_lock_bh(&ifp->idev->lock);
 		spin_lock(&ifp->lock);
+		ifp->idev->rs_interval = rfc3315_s14_backoff_init(
+			ifp->idev->cnf.rtr_solicit_interval);
 		ifp->idev->rs_probes = 1;
 		ifp->idev->if_flags |= IF_RS_SENT;
-		addrconf_mod_rs_timer(ifp->idev,
-				      ifp->idev->cnf.rtr_solicit_interval);
+		addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
 		spin_unlock(&ifp->lock);
 		write_unlock_bh(&ifp->idev->lock);
 	}
@@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
 	array[DEVCONF_RTR_SOLICIT_INTERVAL] =
 		jiffies_to_msecs(cnf->rtr_solicit_interval);
+	array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
+		jiffies_to_msecs(cnf->rtr_solicit_max_interval);
 	array[DEVCONF_RTR_SOLICIT_DELAY] =
 		jiffies_to_msecs(cnf->rtr_solicit_delay);
 	array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
@@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
 		return -EINVAL;
 	if (!ipv6_accept_ra(idev))
 		return -EINVAL;
-	if (idev->cnf.rtr_solicits <= 0)
+	if (idev->cnf.rtr_solicits == 0)
 		return -EINVAL;
 
 	write_lock_bh(&idev->lock);
@@ -5128,8 +5156,10 @@ update_lft:
 
 	if (update_rs) {
 		idev->if_flags |= IF_RS_SENT;
+		idev->rs_interval = rfc3315_s14_backoff_init(
+			idev->cnf.rtr_solicit_interval);
 		idev->rs_probes = 1;
-		addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval);
+		addrconf_mod_rs_timer(idev, idev->rs_interval);
 	}
 
 	/* Well, that's kinda nasty ... */
@@ -5777,6 +5807,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "router_solicitation_max_interval",
+		.data		= &ipv6_devconf.rtr_solicit_max_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{
 		.procname	= "router_solicitation_delay",
 		.data		= &ipv6_devconf.rtr_solicit_delay,
-- 
cgit v1.2.3-71-gd317


From cb3b7d87652aeb37cfb5295a6157a3280dae10cb Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 20 Sep 2016 17:31:13 +0300
Subject: cfg80211: add start / stop NAN commands

This allows user space to start/stop NAN interface.
A NAN interface is like P2P device in a few aspects: it
doesn't have a netdev associated to it.
Add the new interface type and prevent operations that
can't be executed on NAN interface like scan.

Define several attributes that may be configured by user space
when starting NAN functionality (master preference and dual
band operation)

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 21 +++++++++-
 include/uapi/linux/nl80211.h | 47 +++++++++++++++++++++++
 net/mac80211/cfg.c           |  2 +
 net/mac80211/chan.c          |  3 ++
 net/mac80211/iface.c         |  4 ++
 net/mac80211/offchannel.c    |  1 +
 net/mac80211/rx.c            |  3 ++
 net/mac80211/util.c          |  1 +
 net/wireless/chan.c          |  2 +
 net/wireless/core.c          | 34 +++++++++++++++++
 net/wireless/core.h          |  3 ++
 net/wireless/mlme.c          |  1 +
 net/wireless/nl80211.c       | 91 ++++++++++++++++++++++++++++++++++++++++++--
 net/wireless/rdev-ops.h      | 20 ++++++++++
 net/wireless/trace.h         | 27 +++++++++++++
 net/wireless/util.c          |  6 ++-
 16 files changed, 260 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 68dca3d93b85..9898e1f883e2 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2313,6 +2313,19 @@ struct cfg80211_qos_map {
 	struct cfg80211_dscp_range up[8];
 };
 
+/**
+ * struct cfg80211_nan_conf - NAN configuration
+ *
+ * This struct defines NAN configuration parameters
+ *
+ * @master_pref: master preference (1 - 255)
+ * @dual: dual band operation mode, see &enum nl80211_nan_dual_band_conf
+ */
+struct cfg80211_nan_conf {
+	u8 master_pref;
+	u8 dual;
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -2601,6 +2614,8 @@ struct cfg80211_qos_map {
  *	and returning to the base channel for communication with the AP.
  * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both
  *	peers must be on the base channel when the call completes.
+ * @start_nan: Start the NAN interface.
+ * @stop_nan: Stop the NAN interface.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2866,6 +2881,9 @@ struct cfg80211_ops {
 	void	(*tdls_cancel_channel_switch)(struct wiphy *wiphy,
 					      struct net_device *dev,
 					      const u8 *addr);
+	int	(*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev,
+			     struct cfg80211_nan_conf *conf);
+	void	(*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev);
 };
 
 /*
@@ -3626,6 +3644,7 @@ struct cfg80211_cached_keys;
  *	beacons, 0 when not valid
  * @address: The address for this device, valid only if @netdev is %NULL
  * @p2p_started: true if this is a P2P Device that has been started
+ * @nan_started: true if this is a NAN interface that has been started
  * @cac_started: true if DFS channel availability check has been started
  * @cac_start_time: timestamp (jiffies) when the dfs state was entered.
  * @cac_time_ms: CAC time in ms
@@ -3657,7 +3676,7 @@ struct wireless_dev {
 
 	struct mutex mtx;
 
-	bool use_4addr, p2p_started;
+	bool use_4addr, p2p_started, nan_started;
 
 	u8 address[ETH_ALEN] __aligned(sizeof(u16));
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index ec10d1b2838f..98fd3ec8598d 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -838,6 +838,16 @@
  *	not running. The driver indicates the status of the scan through
  *	cfg80211_scan_done().
  *
+ * @NL80211_CMD_START_NAN: Start NAN operation, identified by its
+ *	%NL80211_ATTR_WDEV interface. This interface must have been previously
+ *	created with %NL80211_CMD_NEW_INTERFACE. After it has been started, the
+ *	NAN interface will create or join a cluster. This command must have a
+ *	valid %NL80211_ATTR_NAN_MASTER_PREF attribute and optional
+ *	%NL80211_ATTR_NAN_DUAL attributes.
+ *	After this command NAN functions can be added.
+ * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by
+ *	its %NL80211_ATTR_WDEV interface.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1026,6 +1036,9 @@ enum nl80211_commands {
 
 	NL80211_CMD_ABORT_SCAN,
 
+	NL80211_CMD_START_NAN,
+	NL80211_CMD_STOP_NAN,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1739,6 +1752,12 @@ enum nl80211_commands {
  *	regulatory indoor configuration would be owned by the netlink socket
  *	that configured the indoor setting, and the indoor operation would be
  *	cleared when the socket is closed.
+ *	If set during NAN interface creation, the interface will be destroyed
+ *	if the socket is closed just like any other interface. Moreover, only
+ *	the netlink socket that created the interface will be allowed to add
+ *	and remove functions. NAN notifications will be sent in unicast to that
+ *	socket. Without this attribute, any socket can add functions and the
+ *	notifications will be sent to the %NL80211_MCGRP_NAN multicast group.
  *
  * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is
  *	the TDLS link initiator.
@@ -1873,6 +1892,14 @@ enum nl80211_commands {
  * @NL80211_ATTR_MESH_PEER_AID: Association ID for the mesh peer (u16). This is
  *	used to pull the stored data for mesh peer in power save state.
  *
+ * @NL80211_ATTR_NAN_MASTER_PREF: the master preference to be used by
+ *	%NL80211_CMD_START_NAN. Its type is u8 and it can't be 0.
+ *	Also, values 1 and 255 are reserved for certification purposes and
+ *	should not be used during a normal device operation.
+ * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see
+ *	&enum nl80211_nan_dual_band_conf). This attribute is used with
+ *	%NL80211_CMD_START_NAN.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2267,6 +2294,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MESH_PEER_AID,
 
+	NL80211_ATTR_NAN_MASTER_PREF,
+	NL80211_ATTR_NAN_DUAL,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -2345,6 +2375,7 @@ enum nl80211_attrs {
  *	commands to create and destroy one
  * @NL80211_IF_TYPE_OCB: Outside Context of a BSS
  *	This mode corresponds to the MIB variable dot11OCBActivated=true
+ * @NL80211_IFTYPE_NAN: NAN device interface type (not a netdev)
  * @NL80211_IFTYPE_MAX: highest interface type number currently defined
  * @NUM_NL80211_IFTYPES: number of defined interface types
  *
@@ -2365,6 +2396,7 @@ enum nl80211_iftype {
 	NL80211_IFTYPE_P2P_GO,
 	NL80211_IFTYPE_P2P_DEVICE,
 	NL80211_IFTYPE_OCB,
+	NL80211_IFTYPE_NAN,
 
 	/* keep last */
 	NUM_NL80211_IFTYPES,
@@ -4870,4 +4902,19 @@ enum nl80211_bss_select_attr {
 	NL80211_BSS_SELECT_ATTR_MAX = __NL80211_BSS_SELECT_ATTR_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_nan_dual_band_conf - NAN dual band configuration
+ *
+ * Defines the NAN dual band mode of operation
+ *
+ * @NL80211_NAN_BAND_DEFAULT: device default mode
+ * @NL80211_NAN_BAND_2GHZ: 2.4GHz mode
+ * @NL80211_NAN_BAND_5GHZ: 5GHz mode
+  */
+enum nl80211_nan_dual_band_conf {
+	NL80211_NAN_BAND_DEFAULT	= 1 << 0,
+	NL80211_NAN_BAND_2GHZ		= 1 << 1,
+	NL80211_NAN_BAND_5GHZ		= 1 << 2,
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index e29ff5749944..a74027f887bc 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -257,6 +257,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
 	case NL80211_IFTYPE_WDS:
 	case NL80211_IFTYPE_MONITOR:
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 	case NL80211_IFTYPE_UNSPECIFIED:
 	case NUM_NL80211_IFTYPES:
 	case NL80211_IFTYPE_P2P_CLIENT:
@@ -2036,6 +2037,7 @@ static int ieee80211_scan(struct wiphy *wiphy,
 		     !(req->flags & NL80211_SCAN_FLAG_AP)))
 			return -EOPNOTSUPP;
 		break;
+	case NL80211_IFTYPE_NAN:
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 74142d07ad31..d035801569eb 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -274,6 +274,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
 				    ieee80211_get_max_required_bw(sdata));
 			break;
 		case NL80211_IFTYPE_P2P_DEVICE:
+		case NL80211_IFTYPE_NAN:
 			continue;
 		case NL80211_IFTYPE_ADHOC:
 		case NL80211_IFTYPE_WDS:
@@ -718,6 +719,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_P2P_DEVICE:
+		case NL80211_IFTYPE_NAN:
 			continue;
 		case NL80211_IFTYPE_STATION:
 			if (!sdata->u.mgd.associated)
@@ -980,6 +982,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
 	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 	case NUM_NL80211_IFTYPES:
 		WARN_ON(1);
 		break;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b0abddc714ef..e694ca2baad0 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -545,6 +545,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 	case NL80211_IFTYPE_ADHOC:
 	case NL80211_IFTYPE_P2P_DEVICE:
 	case NL80211_IFTYPE_OCB:
+	case NL80211_IFTYPE_NAN:
 		/* no special treatment */
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
@@ -660,6 +661,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 			break;
 		case NL80211_IFTYPE_WDS:
 		case NL80211_IFTYPE_P2P_DEVICE:
+		case NL80211_IFTYPE_NAN:
 			break;
 		default:
 			/* not reached */
@@ -948,6 +950,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 		/* relies on synchronize_rcu() below */
 		RCU_INIT_POINTER(local->p2p_sdata, NULL);
 		/* fall through */
+	case NL80211_IFTYPE_NAN:
 	default:
 		cancel_work_sync(&sdata->work);
 		/*
@@ -1457,6 +1460,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 		break;
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 		sdata->vif.bss_conf.bssid = sdata->vif.addr;
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 55a9c5b94ce1..75d5c960ce67 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -838,6 +838,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	case NL80211_IFTYPE_P2P_DEVICE:
 		need_offchan = true;
 		break;
+	case NL80211_IFTYPE_NAN:
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e796060b7c5e..c9489a86e6d6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3586,6 +3586,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
 		       ieee80211_is_probe_req(hdr->frame_control) ||
 		       ieee80211_is_probe_resp(hdr->frame_control) ||
 		       ieee80211_is_beacon(hdr->frame_control);
+	case NL80211_IFTYPE_NAN:
+		/* Currently no frames on NAN interface are allowed */
+		return false;
 	default:
 		break;
 	}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b6865d884487..2c78541f695c 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1975,6 +1975,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		case NL80211_IFTYPE_AP_VLAN:
 		case NL80211_IFTYPE_MONITOR:
 		case NL80211_IFTYPE_P2P_DEVICE:
+		case NL80211_IFTYPE_NAN:
 			/* nothing to do */
 			break;
 		case NL80211_IFTYPE_UNSPECIFIED:
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 0f506220a3bd..5497d022fada 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -372,6 +372,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_WDS:
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
 	case NUM_NL80211_IFTYPES:
@@ -946,6 +947,7 @@ cfg80211_get_chan_state(struct wireless_dev *wdev,
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_WDS:
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 		/* these interface types don't really have a channel */
 		return;
 	case NL80211_IFTYPE_UNSPECIFIED:
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 4911cd997b9a..013987243c0b 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -225,6 +225,23 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
 	}
 }
 
+void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
+		       struct wireless_dev *wdev)
+{
+	ASSERT_RTNL();
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN))
+		return;
+
+	if (!wdev->nan_started)
+		return;
+
+	rdev_stop_nan(rdev, wdev);
+	wdev->nan_started = false;
+
+	rdev->opencount--;
+}
+
 void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
@@ -242,6 +259,9 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
 		case NL80211_IFTYPE_P2P_DEVICE:
 			cfg80211_stop_p2p_device(rdev, wdev);
 			break;
+		case NL80211_IFTYPE_NAN:
+			cfg80211_stop_nan(rdev, wdev);
+			break;
 		default:
 			break;
 		}
@@ -537,6 +557,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy)
 				    c->limits[j].max > 1))
 				return -EINVAL;
 
+			/* Only a single NAN can be allowed */
+			if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
+				    c->limits[j].max > 1))
+				return -EINVAL;
+
 			cnt += c->limits[j].max;
 			/*
 			 * Don't advertise an unsupported type
@@ -579,6 +604,10 @@ int wiphy_register(struct wiphy *wiphy)
 		     !rdev->ops->tdls_cancel_channel_switch)))
 		return -EINVAL;
 
+	if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
+		    (!rdev->ops->start_nan || !rdev->ops->stop_nan)))
+		return -EINVAL;
+
 	/*
 	 * if a wiphy has unsupported modes for regulatory channel enforcement,
 	 * opt-out of enforcement checking
@@ -589,6 +618,7 @@ int wiphy_register(struct wiphy *wiphy)
 				       BIT(NL80211_IFTYPE_P2P_GO) |
 				       BIT(NL80211_IFTYPE_ADHOC) |
 				       BIT(NL80211_IFTYPE_P2P_DEVICE) |
+				       BIT(NL80211_IFTYPE_NAN) |
 				       BIT(NL80211_IFTYPE_AP_VLAN) |
 				       BIT(NL80211_IFTYPE_MONITOR)))
 		wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF;
@@ -916,6 +946,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev)
 		cfg80211_mlme_purge_registrations(wdev);
 		cfg80211_stop_p2p_device(rdev, wdev);
 		break;
+	case NL80211_IFTYPE_NAN:
+		cfg80211_stop_nan(rdev, wdev);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -979,6 +1012,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev,
 		/* must be handled by mac80211/driver, has no APIs */
 		break;
 	case NL80211_IFTYPE_P2P_DEVICE:
+	case NL80211_IFTYPE_NAN:
 		/* cannot happen, has no netdev */
 		break;
 	case NL80211_IFTYPE_AP_VLAN:
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 554f87d0f991..08d2e948c9ad 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -488,6 +488,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
 void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev);
 
+void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
+		       struct wireless_dev *wdev);
+
 #define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10
 
 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index d6abb0704db5..cbb48e26a871 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -634,6 +634,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			 * fall through, P2P device only supports
 			 * public action frames
 			 */
+		case NL80211_IFTYPE_NAN:
 		default:
 			err = -EOPNOTSUPP;
 			break;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b8441e60b0f6..9e9fb37087fc 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -409,6 +409,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 		.len = VHT_MUMIMO_GROUPS_DATA_LEN
 	},
 	[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
+	[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
+	[NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
 };
 
 /* policy for the key attributes */
@@ -934,6 +936,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
 	case NL80211_IFTYPE_UNSPECIFIED:
 	case NL80211_IFTYPE_OCB:
 	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_NAN:
 	case NL80211_IFTYPE_P2P_DEVICE:
 	case NL80211_IFTYPE_WDS:
 	case NUM_NL80211_IFTYPES:
@@ -2819,7 +2822,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 	    !(rdev->wiphy.interface_modes & (1 << type)))
 		return -EOPNOTSUPP;
 
-	if ((type == NL80211_IFTYPE_P2P_DEVICE ||
+	if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN ||
 	     rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) &&
 	    info->attrs[NL80211_ATTR_MAC]) {
 		nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC],
@@ -2875,9 +2878,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		       wdev->mesh_id_up_len);
 		wdev_unlock(wdev);
 		break;
+	case NL80211_IFTYPE_NAN:
 	case NL80211_IFTYPE_P2P_DEVICE:
 		/*
-		 * P2P Device doesn't have a netdev, so doesn't go
+		 * P2P Device and NAN do not have a netdev, so don't go
 		 * through the netdev notifier and must be added here
 		 */
 		mutex_init(&wdev->mtx);
@@ -6434,6 +6438,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 
 	wiphy = &rdev->wiphy;
 
+	if (wdev->iftype == NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
 	if (!rdev->ops->scan)
 		return -EOPNOTSUPP;
 
@@ -8977,6 +8984,7 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_P2P_DEVICE:
 		break;
+	case NL80211_IFTYPE_NAN:
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -9022,6 +9030,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 	case NL80211_IFTYPE_MESH_POINT:
 	case NL80211_IFTYPE_P2P_GO:
 		break;
+	case NL80211_IFTYPE_NAN:
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -9138,6 +9147,7 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_P2P_DEVICE:
 		break;
+	case NL80211_IFTYPE_NAN:
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -10504,6 +10514,58 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+	struct cfg80211_nan_conf conf = {};
+	int err;
+
+	if (wdev->iftype != NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
+	if (wdev->nan_started)
+		return -EEXIST;
+
+	if (rfkill_blocked(rdev->rfkill))
+		return -ERFKILL;
+
+	if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_NAN_DUAL])
+		return -EINVAL;
+
+	conf.master_pref =
+		nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
+	if (!conf.master_pref)
+		return -EINVAL;
+
+	conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+
+	err = rdev_start_nan(rdev, wdev, &conf);
+	if (err)
+		return err;
+
+	wdev->nan_started = true;
+	rdev->opencount++;
+
+	return 0;
+}
+
+static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+
+	if (wdev->iftype != NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
+	cfg80211_stop_nan(rdev, wdev);
+
+	return 0;
+}
+
 static int nl80211_get_protocol_features(struct sk_buff *skb,
 					 struct genl_info *info)
 {
@@ -11205,7 +11267,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
 
 			dev_hold(dev);
 		} else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
-			if (!wdev->p2p_started) {
+			if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE &&
+			    !wdev->p2p_started) {
+				if (rtnl)
+					rtnl_unlock();
+				return -ENETDOWN;
+			}
+			if (wdev->iftype == NL80211_IFTYPE_NAN &&
+			    !wdev->nan_started) {
 				if (rtnl)
 					rtnl_unlock();
 				return -ENETDOWN;
@@ -11838,6 +11907,22 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_START_NAN,
+		.doit = nl80211_start_nan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_STOP_NAN,
+		.doit = nl80211_stop_nan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL80211_CMD_SET_MCAST_RATE,
 		.doit = nl80211_set_mcast_rate,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 85ff30bee2b9..afb68a8428b9 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -887,6 +887,26 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int rdev_start_nan(struct cfg80211_registered_device *rdev,
+				 struct wireless_dev *wdev,
+				 struct cfg80211_nan_conf *conf)
+{
+	int ret;
+
+	trace_rdev_start_nan(&rdev->wiphy, wdev, conf);
+	ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
+static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
+				 struct wireless_dev *wdev)
+{
+	trace_rdev_stop_nan(&rdev->wiphy, wdev);
+	rdev->ops->stop_nan(&rdev->wiphy, wdev);
+	trace_rdev_return_void(&rdev->wiphy);
+}
+
 static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
 				   struct net_device *dev,
 				   struct cfg80211_acl_data *params)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 72b5255cefe2..5f3370f4c6a2 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1889,6 +1889,33 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
 	TP_ARGS(wiphy, wdev)
 );
 
+TRACE_EVENT(rdev_start_nan,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 struct cfg80211_nan_conf *conf),
+	TP_ARGS(wiphy, wdev, conf),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u8, master_pref)
+		__field(u8, dual);
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->master_pref = conf->master_pref;
+		__entry->dual = conf->dual;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+		  ", master preference: %u, dual: %d",
+		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
+		  __entry->dual)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+	TP_ARGS(wiphy, wdev)
+);
+
 TRACE_EVENT(rdev_set_mac_acl,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
 		 struct cfg80211_acl_data *params),
diff --git a/net/wireless/util.c b/net/wireless/util.c
index e02141d66b69..7a2d46b0058a 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1008,8 +1008,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 	if (otype == NL80211_IFTYPE_AP_VLAN)
 		return -EOPNOTSUPP;
 
-	/* cannot change into P2P device type */
-	if (ntype == NL80211_IFTYPE_P2P_DEVICE)
+	/* cannot change into P2P device or NAN */
+	if (ntype == NL80211_IFTYPE_P2P_DEVICE ||
+	    ntype == NL80211_IFTYPE_NAN)
 		return -EOPNOTSUPP;
 
 	if (!rdev->ops->change_virtual_intf ||
@@ -1088,6 +1089,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			/* not happening */
 			break;
 		case NL80211_IFTYPE_P2P_DEVICE:
+		case NL80211_IFTYPE_NAN:
 			WARN_ON(1);
 			break;
 		}
-- 
cgit v1.2.3-71-gd317


From a442b761b24b6886f9a4e2ff5f8cb4824c96526b Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 20 Sep 2016 17:31:15 +0300
Subject: cfg80211: add add_nan_func / del_nan_func

A NAN function can be either publish, subscribe or follow
up. Make all the necessary verifications and just pass the
request to the driver.
Allow the user space application that starts NAN to
forbid any other socket to add or remove functions.

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  91 +++++++++++
 include/uapi/linux/nl80211.h | 150 ++++++++++++++++++
 net/wireless/core.c          |   3 +-
 net/wireless/nl80211.c       | 369 +++++++++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      |  21 +++
 net/wireless/trace.h         |  39 +++++
 net/wireless/util.c          |  22 +++
 7 files changed, 694 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9898e1f883e2..2f35ccf6da83 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2326,6 +2326,73 @@ struct cfg80211_nan_conf {
 	u8 dual;
 };
 
+/**
+ * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter
+ *
+ * @filter: the content of the filter
+ * @len: the length of the filter
+ */
+struct cfg80211_nan_func_filter {
+	const u8 *filter;
+	u8 len;
+};
+
+/**
+ * struct cfg80211_nan_func - a NAN function
+ *
+ * @type: &enum nl80211_nan_function_type
+ * @service_id: the service ID of the function
+ * @publish_type: &nl80211_nan_publish_type
+ * @close_range: if true, the range should be limited. Threshold is
+ *	implementation specific.
+ * @publish_bcast: if true, the solicited publish should be broadcasted
+ * @subscribe_active: if true, the subscribe is active
+ * @followup_id: the instance ID for follow up
+ * @followup_reqid: the requestor instance ID for follow up
+ * @followup_dest: MAC address of the recipient of the follow up
+ * @ttl: time to live counter in DW.
+ * @serv_spec_info: Service Specific Info
+ * @serv_spec_info_len: Service Specific Info length
+ * @srf_include: if true, SRF is inclusive
+ * @srf_bf: Bloom Filter
+ * @srf_bf_len: Bloom Filter length
+ * @srf_bf_idx: Bloom Filter index
+ * @srf_macs: SRF MAC addresses
+ * @srf_num_macs: number of MAC addresses in SRF
+ * @rx_filters: rx filters that are matched with corresponding peer's tx_filter
+ * @tx_filters: filters that should be transmitted in the SDF.
+ * @num_rx_filters: length of &rx_filters.
+ * @num_tx_filters: length of &tx_filters.
+ * @instance_id: driver allocated id of the function.
+ * @cookie: unique NAN function identifier.
+ */
+struct cfg80211_nan_func {
+	enum nl80211_nan_function_type type;
+	u8 service_id[NL80211_NAN_FUNC_SERVICE_ID_LEN];
+	u8 publish_type;
+	bool close_range;
+	bool publish_bcast;
+	bool subscribe_active;
+	u8 followup_id;
+	u8 followup_reqid;
+	struct mac_address followup_dest;
+	u32 ttl;
+	const u8 *serv_spec_info;
+	u8 serv_spec_info_len;
+	bool srf_include;
+	const u8 *srf_bf;
+	u8 srf_bf_len;
+	u8 srf_bf_idx;
+	struct mac_address *srf_macs;
+	int srf_num_macs;
+	struct cfg80211_nan_func_filter *rx_filters;
+	struct cfg80211_nan_func_filter *tx_filters;
+	u8 num_tx_filters;
+	u8 num_rx_filters;
+	u8 instance_id;
+	u64 cookie;
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -2616,6 +2683,14 @@ struct cfg80211_nan_conf {
  *	peers must be on the base channel when the call completes.
  * @start_nan: Start the NAN interface.
  * @stop_nan: Stop the NAN interface.
+ * @add_nan_func: Add a NAN function. Returns negative value on failure.
+ *	On success @nan_func ownership is transferred to the driver and
+ *	it may access it outside of the scope of this function. The driver
+ *	should free the @nan_func when no longer needed by calling
+ *	cfg80211_free_nan_func().
+ *	On success the driver should assign an instance_id in the
+ *	provided @nan_func.
+ * @del_nan_func: Delete a NAN function.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2884,6 +2959,10 @@ struct cfg80211_ops {
 	int	(*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev,
 			     struct cfg80211_nan_conf *conf);
 	void	(*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev);
+	int	(*add_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
+				struct cfg80211_nan_func *nan_func);
+	void	(*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
+			       u64 cookie);
 };
 
 /*
@@ -3335,6 +3414,8 @@ struct wiphy_iftype_ext_capab {
  * @bss_select_support: bitmask indicating the BSS selection criteria supported
  *	by the driver in the .connect() callback. The bit position maps to the
  *	attribute indices defined in &enum nl80211_bss_select_attr.
+ *
+ * @cookie_counter: unique generic cookie counter, used to identify objects.
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
@@ -3464,6 +3545,8 @@ struct wiphy {
 
 	u32 bss_select_support;
 
+	u64 cookie_counter;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -5584,6 +5667,14 @@ wiphy_ext_feature_isset(struct wiphy *wiphy,
 	return (ft_byte & BIT(ftidx % 8)) != 0;
 }
 
+/**
+ * cfg80211_free_nan_func - free NAN function
+ * @f: NAN function that should be freed
+ *
+ * Frees all the NAN function and all it's allocated members.
+ */
+void cfg80211_free_nan_func(struct cfg80211_nan_func *f);
+
 /* ethtool helper */
 void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 98fd3ec8598d..e4935d963061 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -847,6 +847,21 @@
  *	After this command NAN functions can be added.
  * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by
  *	its %NL80211_ATTR_WDEV interface.
+ * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined
+ *	with %NL80211_ATTR_NAN_FUNC nested attribute. When called, this
+ *	operation returns the strictly positive and unique instance id
+ *	(%NL80211_ATTR_NAN_FUNC_INST_ID) and a cookie (%NL80211_ATTR_COOKIE)
+ *	of the function upon success.
+ *	Since instance ID's can be re-used, this cookie is the right
+ *	way to identify the function. This will avoid races when a termination
+ *	event is handled by the user space after it has already added a new
+ *	function that got the same instance id from the kernel as the one
+ *	which just terminated.
+ *	This cookie may be used in NAN events even before the command
+ *	returns, so userspace shouldn't process NAN events until it processes
+ *	the response to this command.
+ *	Look at %NL80211_ATTR_SOCKET_OWNER as well.
+ * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie.
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -1038,6 +1053,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_START_NAN,
 	NL80211_CMD_STOP_NAN,
+	NL80211_CMD_ADD_NAN_FUNCTION,
+	NL80211_CMD_DEL_NAN_FUNCTION,
 
 	/* add new commands above here */
 
@@ -1899,6 +1916,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see
  *	&enum nl80211_nan_dual_band_conf). This attribute is used with
  *	%NL80211_CMD_START_NAN.
+ * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See
+ *	&enum nl80211_nan_func_attributes for description of this nested
+ *	attribute.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2296,6 +2316,7 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_NAN_MASTER_PREF,
 	NL80211_ATTR_NAN_DUAL,
+	NL80211_ATTR_NAN_FUNC,
 
 	/* add attributes here, update the policy in nl80211.c */
 
@@ -4917,4 +4938,133 @@ enum nl80211_nan_dual_band_conf {
 	NL80211_NAN_BAND_5GHZ		= 1 << 2,
 };
 
+/**
+ * enum nl80211_nan_function_type - NAN function type
+ *
+ * Defines the function type of a NAN function
+ *
+ * @NL80211_NAN_FUNC_PUBLISH: function is publish
+ * @NL80211_NAN_FUNC_SUBSCRIBE: function is subscribe
+ * @NL80211_NAN_FUNC_FOLLOW_UP: function is follow-up
+ */
+enum nl80211_nan_function_type {
+	NL80211_NAN_FUNC_PUBLISH,
+	NL80211_NAN_FUNC_SUBSCRIBE,
+	NL80211_NAN_FUNC_FOLLOW_UP,
+
+	/* keep last */
+	__NL80211_NAN_FUNC_TYPE_AFTER_LAST,
+	NL80211_NAN_FUNC_MAX_TYPE = __NL80211_NAN_FUNC_TYPE_AFTER_LAST - 1,
+};
+
+/**
+ * enum nl80211_nan_publish_type - NAN publish tx type
+ *
+ * Defines how to send publish Service Discovery Frames
+ *
+ * @NL80211_NAN_SOLICITED_PUBLISH: publish function is solicited
+ * @NL80211_NAN_UNSOLICITED_PUBLISH: publish function is unsolicited
+ */
+enum nl80211_nan_publish_type {
+	NL80211_NAN_SOLICITED_PUBLISH = 1 << 0,
+	NL80211_NAN_UNSOLICITED_PUBLISH = 1 << 1,
+};
+
+#define NL80211_NAN_FUNC_SERVICE_ID_LEN 6
+#define NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN 0xff
+#define NL80211_NAN_FUNC_SRF_MAX_LEN 0xff
+
+/**
+ * enum nl80211_nan_func_attributes - NAN function attributes
+ * @__NL80211_NAN_FUNC_INVALID: invalid
+ * @NL80211_NAN_FUNC_TYPE: &enum nl80211_nan_function_type (u8).
+ * @NL80211_NAN_FUNC_SERVICE_ID: 6 bytes of the service ID hash as
+ *	specified in NAN spec. This is a binary attribute.
+ * @NL80211_NAN_FUNC_PUBLISH_TYPE: relevant if the function's type is
+ *	publish. Defines the transmission type for the publish Service Discovery
+ *	Frame, see &enum nl80211_nan_publish_type. Its type is u8.
+ * @NL80211_NAN_FUNC_PUBLISH_BCAST: relevant if the function is a solicited
+ *	publish. Should the solicited publish Service Discovery Frame be sent to
+ *	the NAN Broadcast address. This is a flag.
+ * @NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE: relevant if the function's type is
+ *	subscribe. Is the subscribe active. This is a flag.
+ * @NL80211_NAN_FUNC_FOLLOW_UP_ID: relevant if the function's type is follow up.
+ *	The instance ID for the follow up Service Discovery Frame. This is u8.
+ * @NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID: relevant if the function's type
+ *	is follow up. This is a u8.
+ *	The requestor instance ID for the follow up Service Discovery Frame.
+ * @NL80211_NAN_FUNC_FOLLOW_UP_DEST: the MAC address of the recipient of the
+ *	follow up Service Discovery Frame. This is a binary attribute.
+ * @NL80211_NAN_FUNC_CLOSE_RANGE: is this function limited for devices in a
+ *	close range. The range itself (RSSI) is defined by the device.
+ *	This is a flag.
+ * @NL80211_NAN_FUNC_TTL: strictly positive number of DWs this function should
+ *	stay active. If not present infinite TTL is assumed. This is a u32.
+ * @NL80211_NAN_FUNC_SERVICE_INFO: array of bytes describing the service
+ *	specific info. This is a binary attribute.
+ * @NL80211_NAN_FUNC_SRF: Service Receive Filter. This is a nested attribute.
+ *	See &enum nl80211_nan_srf_attributes.
+ * @NL80211_NAN_FUNC_RX_MATCH_FILTER: Receive Matching filter. This is a nested
+ *	attribute. It is a list of binary values.
+ * @NL80211_NAN_FUNC_TX_MATCH_FILTER: Transmit Matching filter. This is a
+ *	nested attribute. It is a list of binary values.
+ * @NL80211_NAN_FUNC_INSTANCE_ID: The instance ID of the function.
+ *	Its type is u8 and it cannot be 0.
+ * @NL80211_NAN_FUNC_TERM_REASON: NAN function termination reason.
+ *	See &enum nl80211_nan_func_term_reason.
+ *
+ * @NUM_NL80211_NAN_FUNC_ATTR: internal
+ * @NL80211_NAN_FUNC_ATTR_MAX: highest NAN function attribute
+ */
+enum nl80211_nan_func_attributes {
+	__NL80211_NAN_FUNC_INVALID,
+	NL80211_NAN_FUNC_TYPE,
+	NL80211_NAN_FUNC_SERVICE_ID,
+	NL80211_NAN_FUNC_PUBLISH_TYPE,
+	NL80211_NAN_FUNC_PUBLISH_BCAST,
+	NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE,
+	NL80211_NAN_FUNC_FOLLOW_UP_ID,
+	NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID,
+	NL80211_NAN_FUNC_FOLLOW_UP_DEST,
+	NL80211_NAN_FUNC_CLOSE_RANGE,
+	NL80211_NAN_FUNC_TTL,
+	NL80211_NAN_FUNC_SERVICE_INFO,
+	NL80211_NAN_FUNC_SRF,
+	NL80211_NAN_FUNC_RX_MATCH_FILTER,
+	NL80211_NAN_FUNC_TX_MATCH_FILTER,
+	NL80211_NAN_FUNC_INSTANCE_ID,
+	NL80211_NAN_FUNC_TERM_REASON,
+
+	/* keep last */
+	NUM_NL80211_NAN_FUNC_ATTR,
+	NL80211_NAN_FUNC_ATTR_MAX = NUM_NL80211_NAN_FUNC_ATTR - 1
+};
+
+/**
+ * enum nl80211_nan_srf_attributes - NAN Service Response filter attributes
+ * @__NL80211_NAN_SRF_INVALID: invalid
+ * @NL80211_NAN_SRF_INCLUDE: present if the include bit of the SRF set.
+ *	This is a flag.
+ * @NL80211_NAN_SRF_BF: Bloom Filter. Present if and only if
+ *	&NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary.
+ * @NL80211_NAN_SRF_BF_IDX: index of the Bloom Filter. Mandatory if
+ *	&NL80211_NAN_SRF_BF is present. This is a u8.
+ * @NL80211_NAN_SRF_MAC_ADDRS: list of MAC addresses for the SRF. Present if
+ *	and only if &NL80211_NAN_SRF_BF isn't present. This is a nested
+ *	attribute. Each nested attribute is a MAC address.
+ * @NUM_NL80211_NAN_SRF_ATTR: internal
+ * @NL80211_NAN_SRF_ATTR_MAX: highest NAN SRF attribute
+ */
+enum nl80211_nan_srf_attributes {
+	__NL80211_NAN_SRF_INVALID,
+	NL80211_NAN_SRF_INCLUDE,
+	NL80211_NAN_SRF_BF,
+	NL80211_NAN_SRF_BF_IDX,
+	NL80211_NAN_SRF_MAC_ADDRS,
+
+	/* keep last */
+	NUM_NL80211_NAN_SRF_ATTR,
+	NL80211_NAN_SRF_ATTR_MAX = NUM_NL80211_NAN_SRF_ATTR - 1,
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 013987243c0b..8201e6d7449e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -605,7 +605,8 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 
 	if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
-		    (!rdev->ops->start_nan || !rdev->ops->stop_nan)))
+		    (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
+		     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func)))
 		return -EINVAL;
 
 	/*
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9e9fb37087fc..0eca59ccd685 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -411,6 +411,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
 	[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
 	[NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
+	[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
 };
 
 /* policy for the key attributes */
@@ -504,6 +505,39 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
 	},
 };
 
+/* policy for NAN function attributes */
+static const struct nla_policy
+nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
+	[NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
+	[NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY,
+				    .len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
+	[NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
+	[NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG },
+	[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG },
+	[NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 },
+	[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 },
+	[NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { .len = ETH_ALEN },
+	[NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG },
+	[NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 },
+	[NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY,
+			.len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN },
+	[NL80211_NAN_FUNC_SRF] = { .type = NLA_NESTED },
+	[NL80211_NAN_FUNC_RX_MATCH_FILTER] = { .type = NLA_NESTED },
+	[NL80211_NAN_FUNC_TX_MATCH_FILTER] = { .type = NLA_NESTED },
+	[NL80211_NAN_FUNC_INSTANCE_ID] = { .type = NLA_U8 },
+	[NL80211_NAN_FUNC_TERM_REASON] = { .type = NLA_U8 },
+};
+
+/* policy for Service Response Filter attributes */
+static const struct nla_policy
+nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
+	[NL80211_NAN_SRF_INCLUDE] = { .type = NLA_FLAG },
+	[NL80211_NAN_SRF_BF] = { .type = NLA_BINARY,
+				 .len =  NL80211_NAN_FUNC_SRF_MAX_LEN },
+	[NL80211_NAN_SRF_BF_IDX] = { .type = NLA_U8 },
+	[NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
+};
+
 static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
 				     struct netlink_callback *cb,
 				     struct cfg80211_registered_device **rdev,
@@ -10566,6 +10600,325 @@ static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static int validate_nan_filter(struct nlattr *filter_attr)
+{
+	struct nlattr *attr;
+	int len = 0, n_entries = 0, rem;
+
+	nla_for_each_nested(attr, filter_attr, rem) {
+		len += nla_len(attr);
+		n_entries++;
+	}
+
+	if (len >= U8_MAX)
+		return -EINVAL;
+
+	return n_entries;
+}
+
+static int handle_nan_filter(struct nlattr *attr_filter,
+			     struct cfg80211_nan_func *func,
+			     bool tx)
+{
+	struct nlattr *attr;
+	int n_entries, rem, i;
+	struct cfg80211_nan_func_filter *filter;
+
+	n_entries = validate_nan_filter(attr_filter);
+	if (n_entries < 0)
+		return n_entries;
+
+	BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters));
+
+	filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL);
+	if (!filter)
+		return -ENOMEM;
+
+	i = 0;
+	nla_for_each_nested(attr, attr_filter, rem) {
+		filter[i].filter = kmemdup(nla_data(attr), nla_len(attr),
+					   GFP_KERNEL);
+		filter[i].len = nla_len(attr);
+		i++;
+	}
+	if (tx) {
+		func->num_tx_filters = n_entries;
+		func->tx_filters = filter;
+	} else {
+		func->num_rx_filters = n_entries;
+		func->rx_filters = filter;
+	}
+
+	return 0;
+}
+
+static int nl80211_nan_add_func(struct sk_buff *skb,
+				struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+	struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr;
+	struct cfg80211_nan_func *func;
+	struct sk_buff *msg = NULL;
+	void *hdr = NULL;
+	int err = 0;
+
+	if (wdev->iftype != NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
+	if (!wdev->nan_started)
+		return -ENOTCONN;
+
+	if (!info->attrs[NL80211_ATTR_NAN_FUNC])
+		return -EINVAL;
+
+	if (wdev->owner_nlportid &&
+	    wdev->owner_nlportid != info->snd_portid)
+		return -ENOTCONN;
+
+	err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX,
+			nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]),
+			nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]),
+			nl80211_nan_func_policy);
+	if (err)
+		return err;
+
+	func = kzalloc(sizeof(*func), GFP_KERNEL);
+	if (!func)
+		return -ENOMEM;
+
+	func->cookie = wdev->wiphy->cookie_counter++;
+
+	if (!tb[NL80211_NAN_FUNC_TYPE] ||
+	    nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+
+	func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]);
+
+	if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]),
+	       sizeof(func->service_id));
+
+	func->close_range =
+		nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]);
+
+	if (tb[NL80211_NAN_FUNC_SERVICE_INFO]) {
+		func->serv_spec_info_len =
+			nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]);
+		func->serv_spec_info =
+			kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]),
+				func->serv_spec_info_len,
+				GFP_KERNEL);
+		if (!func->serv_spec_info) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (tb[NL80211_NAN_FUNC_TTL])
+		func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]);
+
+	switch (func->type) {
+	case NL80211_NAN_FUNC_PUBLISH:
+		if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		func->publish_type =
+			nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]);
+		func->publish_bcast =
+			nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]);
+
+		if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) &&
+			func->publish_bcast) {
+			err = -EINVAL;
+			goto out;
+		}
+		break;
+	case NL80211_NAN_FUNC_SUBSCRIBE:
+		func->subscribe_active =
+			nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]);
+		break;
+	case NL80211_NAN_FUNC_FOLLOW_UP:
+		if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
+		    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		func->followup_id =
+			nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]);
+		func->followup_reqid =
+			nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]);
+		memcpy(func->followup_dest.addr,
+		       nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]),
+		       sizeof(func->followup_dest.addr));
+		if (func->ttl) {
+			err = -EINVAL;
+			goto out;
+		}
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (tb[NL80211_NAN_FUNC_SRF]) {
+		struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];
+
+		err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX,
+				nla_data(tb[NL80211_NAN_FUNC_SRF]),
+				nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL);
+		if (err)
+			goto out;
+
+		func->srf_include =
+			nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]);
+
+		if (srf_tb[NL80211_NAN_SRF_BF]) {
+			if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] ||
+			    !srf_tb[NL80211_NAN_SRF_BF_IDX]) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			func->srf_bf_len =
+				nla_len(srf_tb[NL80211_NAN_SRF_BF]);
+			func->srf_bf =
+				kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]),
+					func->srf_bf_len, GFP_KERNEL);
+			if (!func->srf_bf) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			func->srf_bf_idx =
+				nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]);
+		} else {
+			struct nlattr *attr, *mac_attr =
+				srf_tb[NL80211_NAN_SRF_MAC_ADDRS];
+			int n_entries, rem, i = 0;
+
+			if (!mac_attr) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			n_entries = validate_acl_mac_addrs(mac_attr);
+			if (n_entries <= 0) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			func->srf_num_macs = n_entries;
+			func->srf_macs =
+				kzalloc(sizeof(*func->srf_macs) * n_entries,
+					GFP_KERNEL);
+			if (!func->srf_macs) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			nla_for_each_nested(attr, mac_attr, rem)
+				memcpy(func->srf_macs[i++].addr, nla_data(attr),
+				       sizeof(*func->srf_macs));
+		}
+	}
+
+	if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) {
+		err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER],
+					func, true);
+		if (err)
+			goto out;
+	}
+
+	if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) {
+		err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER],
+					func, false);
+		if (err)
+			goto out;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+			     NL80211_CMD_ADD_NAN_FUNCTION);
+	/* This can't really happen - we just allocated 4KB */
+	if (WARN_ON(!hdr)) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = rdev_add_nan_func(rdev, wdev, func);
+out:
+	if (err < 0) {
+		cfg80211_free_nan_func(func);
+		nlmsg_free(msg);
+		return err;
+	}
+
+	/* propagate the instance id and cookie to userspace  */
+	if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie,
+			      NL80211_ATTR_PAD))
+		goto nla_put_failure;
+
+	func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
+	if (!func_attr)
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID,
+		       func->instance_id))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, func_attr);
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+static int nl80211_nan_del_func(struct sk_buff *skb,
+			       struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+	u64 cookie;
+
+	if (wdev->iftype != NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
+	if (!wdev->nan_started)
+		return -ENOTCONN;
+
+	if (!info->attrs[NL80211_ATTR_COOKIE])
+		return -EINVAL;
+
+	if (wdev->owner_nlportid &&
+	    wdev->owner_nlportid != info->snd_portid)
+		return -ENOTCONN;
+
+	cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
+
+	rdev_del_nan_func(rdev, wdev, cookie);
+
+	return 0;
+}
+
 static int nl80211_get_protocol_features(struct sk_buff *skb,
 					 struct genl_info *info)
 {
@@ -11923,6 +12276,22 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_ADD_NAN_FUNCTION,
+		.doit = nl80211_nan_add_func,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_NAN_FUNCTION,
+		.doit = nl80211_nan_del_func,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL80211_CMD_SET_MCAST_RATE,
 		.doit = nl80211_set_mcast_rate,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index afb68a8428b9..98c4c3bdcb11 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -907,6 +907,27 @@ static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int
+rdev_add_nan_func(struct cfg80211_registered_device *rdev,
+		  struct wireless_dev *wdev,
+		  struct cfg80211_nan_func *nan_func)
+{
+	int ret;
+
+	trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func);
+	ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
+static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
+				    struct wireless_dev *wdev, u64 cookie)
+{
+	trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie);
+	rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie);
+	trace_rdev_return_void(&rdev->wiphy);
+}
+
 static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
 				   struct net_device *dev,
 				   struct cfg80211_acl_data *params)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 5f3370f4c6a2..56089843d619 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1916,6 +1916,45 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
 	TP_ARGS(wiphy, wdev)
 );
 
+TRACE_EVENT(rdev_add_nan_func,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 const struct cfg80211_nan_func *func),
+	TP_ARGS(wiphy, wdev, func),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u8, func_type)
+		__field(u64, cookie)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->func_type = func->type;
+		__entry->cookie = func->cookie
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
+		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
+		  __entry->cookie)
+);
+
+TRACE_EVENT(rdev_del_nan_func,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 u64 cookie),
+	TP_ARGS(wiphy, wdev, cookie),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u64, cookie)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->cookie = cookie;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
+		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
+);
+
 TRACE_EVENT(rdev_set_mac_acl,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
 		 struct cfg80211_acl_data *params),
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 7a2d46b0058a..8edce22d1b93 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1762,6 +1762,28 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
 }
 EXPORT_SYMBOL(cfg80211_get_station);
 
+void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
+{
+	int i;
+
+	if (!f)
+		return;
+
+	kfree(f->serv_spec_info);
+	kfree(f->srf_bf);
+	kfree(f->srf_macs);
+	for (i = 0; i < f->num_rx_filters; i++)
+		kfree(f->rx_filters[i].filter);
+
+	for (i = 0; i < f->num_tx_filters; i++)
+		kfree(f->tx_filters[i].filter);
+
+	kfree(f->rx_filters);
+	kfree(f->tx_filters);
+	kfree(f);
+}
+EXPORT_SYMBOL(cfg80211_free_nan_func);
+
 /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
 /* Ethernet-II snap header (RFC1042 for most EtherTypes) */
 const unsigned char rfc1042_header[] __aligned(2) =
-- 
cgit v1.2.3-71-gd317


From a5a9dcf291e1e541243878eed2d73a74006fa1f1 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 20 Sep 2016 17:31:16 +0300
Subject: cfg80211: allow the user space to change current NAN configuration

Some NAN configuration paramaters may change during the operation of
the NAN device. For example, a user may want to update master preference
value when the device gets plugged/unplugged to the power.
Add API that allows to do so.

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 19 +++++++++++++++++++
 include/uapi/linux/nl80211.h | 11 +++++++++--
 net/wireless/nl80211.c       | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      | 17 +++++++++++++++++
 net/wireless/trace.h         | 24 ++++++++++++++++++++++++
 5 files changed, 111 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2f35ccf6da83..8574a57e19ba 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2326,6 +2326,18 @@ struct cfg80211_nan_conf {
 	u8 dual;
 };
 
+/**
+ * enum cfg80211_nan_conf_changes - indicates changed fields in NAN
+ * configuration
+ *
+ * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
+ * @CFG80211_NAN_CONF_CHANGED_DUAL: dual band operation
+ */
+enum cfg80211_nan_conf_changes {
+	CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
+	CFG80211_NAN_CONF_CHANGED_DUAL = BIT(1),
+};
+
 /**
  * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter
  *
@@ -2691,6 +2703,9 @@ struct cfg80211_nan_func {
  *	On success the driver should assign an instance_id in the
  *	provided @nan_func.
  * @del_nan_func: Delete a NAN function.
+ * @nan_change_conf: changes NAN configuration. The changed parameters must
+ *	be specified in @changes (using &enum cfg80211_nan_conf_changes);
+ *	All other parameters must be ignored.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2963,6 +2978,10 @@ struct cfg80211_ops {
 				struct cfg80211_nan_func *nan_func);
 	void	(*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
 			       u64 cookie);
+	int	(*nan_change_conf)(struct wiphy *wiphy,
+				   struct wireless_dev *wdev,
+				   struct cfg80211_nan_conf *conf,
+				   u32 changes);
 };
 
 /*
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e4935d963061..9c9c0c352873 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -862,6 +862,10 @@
  *	the response to this command.
  *	Look at %NL80211_ATTR_SOCKET_OWNER as well.
  * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie.
+ * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN
+ *	must be operational (%NL80211_CMD_START_NAN was executed).
+ *	It must contain at least one of the following attributes:
+ *	%NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL.
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -1055,6 +1059,7 @@ enum nl80211_commands {
 	NL80211_CMD_STOP_NAN,
 	NL80211_CMD_ADD_NAN_FUNCTION,
 	NL80211_CMD_DEL_NAN_FUNCTION,
+	NL80211_CMD_CHANGE_NAN_CONFIG,
 
 	/* add new commands above here */
 
@@ -1910,12 +1915,14 @@ enum nl80211_commands {
  *	used to pull the stored data for mesh peer in power save state.
  *
  * @NL80211_ATTR_NAN_MASTER_PREF: the master preference to be used by
- *	%NL80211_CMD_START_NAN. Its type is u8 and it can't be 0.
+ *	%NL80211_CMD_START_NAN and optionally with
+ *	%NL80211_CMD_CHANGE_NAN_CONFIG. Its type is u8 and it can't be 0.
  *	Also, values 1 and 255 are reserved for certification purposes and
  *	should not be used during a normal device operation.
  * @NL80211_ATTR_NAN_DUAL: NAN dual band operation config (see
  *	&enum nl80211_nan_dual_band_conf). This attribute is used with
- *	%NL80211_CMD_START_NAN.
+ *	%NL80211_CMD_START_NAN and optionally with
+ *	%NL80211_CMD_CHANGE_NAN_CONFIG.
  * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See
  *	&enum nl80211_nan_func_attributes for description of this nested
  *	attribute.
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0eca59ccd685..c0b5ae4af2d8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10919,6 +10919,40 @@ static int nl80211_nan_del_func(struct sk_buff *skb,
 	return 0;
 }
 
+static int nl80211_nan_change_config(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev = info->user_ptr[1];
+	struct cfg80211_nan_conf conf = {};
+	u32 changed = 0;
+
+	if (wdev->iftype != NL80211_IFTYPE_NAN)
+		return -EOPNOTSUPP;
+
+	if (!wdev->nan_started)
+		return -ENOTCONN;
+
+	if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
+		conf.master_pref =
+			nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
+		if (conf.master_pref <= 1 || conf.master_pref == 255)
+			return -EINVAL;
+
+		changed |= CFG80211_NAN_CONF_CHANGED_PREF;
+	}
+
+	if (info->attrs[NL80211_ATTR_NAN_DUAL]) {
+		conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]);
+		changed |= CFG80211_NAN_CONF_CHANGED_DUAL;
+	}
+
+	if (!changed)
+		return -EINVAL;
+
+	return rdev_nan_change_conf(rdev, wdev, &conf, changed);
+}
+
 static int nl80211_get_protocol_features(struct sk_buff *skb,
 					 struct genl_info *info)
 {
@@ -12292,6 +12326,14 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_CHANGE_NAN_CONFIG,
+		.doit = nl80211_nan_change_config,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 	{
 		.cmd = NL80211_CMD_SET_MCAST_RATE,
 		.doit = nl80211_set_mcast_rate,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 98c4c3bdcb11..11cf83c8ad4f 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -928,6 +928,23 @@ static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int
+rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
+		     struct wireless_dev *wdev,
+		     struct cfg80211_nan_conf *conf, u32 changes)
+{
+	int ret;
+
+	trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes);
+	if (rdev->ops->nan_change_conf)
+		ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf,
+						 changes);
+	else
+		ret = -ENOTSUPP;
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
 				   struct net_device *dev,
 				   struct cfg80211_acl_data *params)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 56089843d619..a3d0a91b1e09 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1911,6 +1911,30 @@ TRACE_EVENT(rdev_start_nan,
 		  __entry->dual)
 );
 
+TRACE_EVENT(rdev_nan_change_conf,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+		 struct cfg80211_nan_conf *conf, u32 changes),
+	TP_ARGS(wiphy, wdev, conf, changes),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(u8, master_pref)
+		__field(u8, dual);
+		__field(u32, changes);
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->master_pref = conf->master_pref;
+		__entry->dual = conf->dual;
+		__entry->changes = changes;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+		  ", master preference: %u, dual: %d, changes: %x",
+		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
+		  __entry->dual, __entry->changes)
+);
+
 DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
 	TP_ARGS(wiphy, wdev)
-- 
cgit v1.2.3-71-gd317


From 50bcd31d9992e99c231820f5276e70346cbfbc51 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 20 Sep 2016 17:31:17 +0300
Subject: cfg80211: provide a function to report a match for NAN

Provide a function the driver can call to report a match.
This will send the event to the user space.
If the NAN instance is tied to the owner, the notifications will be
sent to the socket that started the NAN interface only.

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 37 ++++++++++++++++++++
 include/uapi/linux/nl80211.h | 31 +++++++++++++++++
 net/wireless/nl80211.c       | 80 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8574a57e19ba..5481664b5389 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5694,6 +5694,43 @@ wiphy_ext_feature_isset(struct wiphy *wiphy,
  */
 void cfg80211_free_nan_func(struct cfg80211_nan_func *f);
 
+/**
+ * struct cfg80211_nan_match_params - NAN match parameters
+ * @type: the type of the function that triggered a match. If it is
+ *	 %NL80211_NAN_FUNC_SUBSCRIBE it means that we replied to a subscriber.
+ *	 If it is %NL80211_NAN_FUNC_PUBLISH, it means that we got a discovery
+ *	 result.
+ *	 If it is %NL80211_NAN_FUNC_FOLLOW_UP, we received a follow up.
+ * @inst_id: the local instance id
+ * @peer_inst_id: the instance id of the peer's function
+ * @addr: the MAC address of the peer
+ * @info_len: the length of the &info
+ * @info: the Service Specific Info from the peer (if any)
+ * @cookie: unique identifier of the corresponding function
+ */
+struct cfg80211_nan_match_params {
+	enum nl80211_nan_function_type type;
+	u8 inst_id;
+	u8 peer_inst_id;
+	const u8 *addr;
+	u8 info_len;
+	const u8 *info;
+	u64 cookie;
+};
+
+/**
+ * cfg80211_nan_match - report a match for a NAN function.
+ * @wdev: the wireless device reporting the match
+ * @match: match notification parameters
+ * @gfp: allocation flags
+ *
+ * This function reports that the a NAN function had a match. This
+ * can be a subscribe that had a match or a solicited publish that
+ * was sent. It can also be a follow up that was received.
+ */
+void cfg80211_nan_match(struct wireless_dev *wdev,
+			struct cfg80211_nan_match_params *match, gfp_t gfp);
+
 /* ethtool helper */
 void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 9c9c0c352873..995bf802d604 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -48,6 +48,7 @@
 #define NL80211_MULTICAST_GROUP_REG		"regulatory"
 #define NL80211_MULTICAST_GROUP_MLME		"mlme"
 #define NL80211_MULTICAST_GROUP_VENDOR		"vendor"
+#define NL80211_MULTICAST_GROUP_NAN		"nan"
 #define NL80211_MULTICAST_GROUP_TESTMODE	"testmode"
 
 /**
@@ -866,6 +867,9 @@
  *	must be operational (%NL80211_CMD_START_NAN was executed).
  *	It must contain at least one of the following attributes:
  *	%NL80211_ATTR_NAN_MASTER_PREF, %NL80211_ATTR_NAN_DUAL.
+ * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported.
+ *	This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and
+ *	%NL80211_ATTR_COOKIE.
  *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
@@ -1060,6 +1064,7 @@ enum nl80211_commands {
 	NL80211_CMD_ADD_NAN_FUNCTION,
 	NL80211_CMD_DEL_NAN_FUNCTION,
 	NL80211_CMD_CHANGE_NAN_CONFIG,
+	NL80211_CMD_NAN_MATCH,
 
 	/* add new commands above here */
 
@@ -1926,6 +1931,8 @@ enum nl80211_commands {
  * @NL80211_ATTR_NAN_FUNC: a function that can be added to NAN. See
  *	&enum nl80211_nan_func_attributes for description of this nested
  *	attribute.
+ * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute.
+ *	See &enum nl80211_nan_match_attributes.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2324,6 +2331,7 @@ enum nl80211_attrs {
 	NL80211_ATTR_NAN_MASTER_PREF,
 	NL80211_ATTR_NAN_DUAL,
 	NL80211_ATTR_NAN_FUNC,
+	NL80211_ATTR_NAN_MATCH,
 
 	/* add attributes here, update the policy in nl80211.c */
 
@@ -5074,4 +5082,27 @@ enum nl80211_nan_srf_attributes {
 	NL80211_NAN_SRF_ATTR_MAX = NUM_NL80211_NAN_SRF_ATTR - 1,
 };
 
+/**
+ * enum nl80211_nan_match_attributes - NAN match attributes
+ * @__NL80211_NAN_MATCH_INVALID: invalid
+ * @NL80211_NAN_MATCH_FUNC_LOCAL: the local function that had the
+ *	match. This is a nested attribute.
+ *	See &enum nl80211_nan_func_attributes.
+ * @NL80211_NAN_MATCH_FUNC_PEER: the peer function
+ *	that caused the match. This is a nested attribute.
+ *	See &enum nl80211_nan_func_attributes.
+ *
+ * @NUM_NL80211_NAN_MATCH_ATTR: internal
+ * @NL80211_NAN_MATCH_ATTR_MAX: highest NAN match attribute
+ */
+enum nl80211_nan_match_attributes {
+	__NL80211_NAN_MATCH_INVALID,
+	NL80211_NAN_MATCH_FUNC_LOCAL,
+	NL80211_NAN_MATCH_FUNC_PEER,
+
+	/* keep last */
+	NUM_NL80211_NAN_MATCH_ATTR,
+	NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c0b5ae4af2d8..0bbd9ed28318 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -56,6 +56,7 @@ enum nl80211_multicast_groups {
 	NL80211_MCGRP_REGULATORY,
 	NL80211_MCGRP_MLME,
 	NL80211_MCGRP_VENDOR,
+	NL80211_MCGRP_NAN,
 	NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
 };
 
@@ -65,6 +66,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = {
 	[NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG },
 	[NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME },
 	[NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR },
+	[NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN },
 #ifdef CONFIG_NL80211_TESTMODE
 	[NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE }
 #endif
@@ -10953,6 +10955,84 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
 	return rdev_nan_change_conf(rdev, wdev, &conf, changed);
 }
 
+void cfg80211_nan_match(struct wireless_dev *wdev,
+			struct cfg80211_nan_match_params *match, gfp_t gfp)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	struct nlattr *match_attr, *local_func_attr, *peer_func_attr;
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr))
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+					 wdev->netdev->ifindex)) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+			      NL80211_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie,
+			      NL80211_ATTR_PAD) ||
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr))
+		goto nla_put_failure;
+
+	match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH);
+	if (!match_attr)
+		goto nla_put_failure;
+
+	local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL);
+	if (!local_func_attr)
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, local_func_attr);
+
+	peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER);
+	if (!peer_func_attr)
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, NL80211_NAN_FUNC_TYPE, match->type) ||
+	    nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id))
+		goto nla_put_failure;
+
+	if (match->info && match->info_len &&
+	    nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len,
+		    match->info))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, peer_func_attr);
+	nla_nest_end(msg, match_attr);
+	genlmsg_end(msg, hdr);
+
+	if (!wdev->owner_nlportid)
+		genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+					msg, 0, NL80211_MCGRP_NAN, gfp);
+	else
+		genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
+				wdev->owner_nlportid);
+
+	return;
+
+nla_put_failure:
+	nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_nan_match);
+
 static int nl80211_get_protocol_features(struct sk_buff *skb,
 					 struct genl_info *info)
 {
-- 
cgit v1.2.3-71-gd317


From 368e5a7b4ecb71b3d347799cb9351b0dce5dec70 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Tue, 20 Sep 2016 17:31:18 +0300
Subject: cfg80211: Provide an API to report NAN function termination

Provide a function that reports NAN DE function termination. The function
may be terminated due to one of the following reasons: user request,
ttl expiration or failure.
If the NAN instance is tied to the owner, the notification will be
sent to the socket that started the NAN interface only

Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 16 ++++++++++++
 include/uapi/linux/nl80211.h | 18 +++++++++++++
 net/wireless/nl80211.c       | 60 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5481664b5389..fe78f02a242e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5731,6 +5731,22 @@ struct cfg80211_nan_match_params {
 void cfg80211_nan_match(struct wireless_dev *wdev,
 			struct cfg80211_nan_match_params *match, gfp_t gfp);
 
+/**
+ * cfg80211_nan_func_terminated - notify about NAN function termination.
+ *
+ * @wdev: the wireless device reporting the match
+ * @inst_id: the local instance id
+ * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
+ * @cookie: unique NAN function identifier
+ * @gfp: allocation flags
+ *
+ * This function reports that the a NAN function is terminated.
+ */
+void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
+				  u8 inst_id,
+				  enum nl80211_nan_func_term_reason reason,
+				  u64 cookie, gfp_t gfp);
+
 /* ethtool helper */
 void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 995bf802d604..56368e9b4622 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -863,6 +863,9 @@
  *	the response to this command.
  *	Look at %NL80211_ATTR_SOCKET_OWNER as well.
  * @NL80211_CMD_DEL_NAN_FUNCTION: Delete a NAN function by cookie.
+ *	This command is also used as a notification sent when a NAN function is
+ *	terminated. This will contain a %NL80211_ATTR_NAN_FUNC_INST_ID
+ *	and %NL80211_ATTR_COOKIE attributes.
  * @NL80211_CMD_CHANGE_NAN_CONFIG: Change current NAN configuration. NAN
  *	must be operational (%NL80211_CMD_START_NAN was executed).
  *	It must contain at least one of the following attributes:
@@ -4985,6 +4988,21 @@ enum nl80211_nan_publish_type {
 	NL80211_NAN_UNSOLICITED_PUBLISH = 1 << 1,
 };
 
+/**
+ * enum nl80211_nan_func_term_reason - NAN functions termination reason
+ *
+ * Defines termination reasons of a NAN function
+ *
+ * @NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST: requested by user
+ * @NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED: timeout
+ * @NL80211_NAN_FUNC_TERM_REASON_ERROR: errored
+ */
+enum nl80211_nan_func_term_reason {
+	NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST,
+	NL80211_NAN_FUNC_TERM_REASON_TTL_EXPIRED,
+	NL80211_NAN_FUNC_TERM_REASON_ERROR,
+};
+
 #define NL80211_NAN_FUNC_SERVICE_ID_LEN 6
 #define NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN 0xff
 #define NL80211_NAN_FUNC_SRF_MAX_LEN 0xff
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0bbd9ed28318..92eb6f0b9f3d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -11033,6 +11033,66 @@ nla_put_failure:
 }
 EXPORT_SYMBOL(cfg80211_nan_match);
 
+void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
+				  u8 inst_id,
+				  enum nl80211_nan_func_term_reason reason,
+				  u64 cookie, gfp_t gfp)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+	struct sk_buff *msg;
+	struct nlattr *func_attr;
+	void *hdr;
+
+	if (WARN_ON(!inst_id))
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+					 wdev->netdev->ifindex)) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+			      NL80211_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
+			      NL80211_ATTR_PAD))
+		goto nla_put_failure;
+
+	func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC);
+	if (!func_attr)
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id) ||
+	    nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason))
+		goto nla_put_failure;
+
+	nla_nest_end(msg, func_attr);
+	genlmsg_end(msg, hdr);
+
+	if (!wdev->owner_nlportid)
+		genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+					msg, 0, NL80211_MCGRP_NAN, gfp);
+	else
+		genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
+				wdev->owner_nlportid);
+
+	return;
+
+nla_put_failure:
+	nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_nan_func_terminated);
+
 static int nl80211_get_protocol_features(struct sk_buff *skb,
 					 struct genl_info *info)
 {
-- 
cgit v1.2.3-71-gd317